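This notebook annotates every word in the transcript with the image-segment numbers the participant was looking at while that word was spoken, by matching each word's start/end times against the timestamps in the eye-gaze file.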

In [90]:
import os
import sys
import csv
import yaml
import math

# ---------------------------------------------------------------------------
# Configuration: input files
# ---------------------------------------------------------------------------

# name of the csv file containing eyedata
eyedata_filename = "../cookietheftdata/PC0039/PC0039_red_eyegaze_aligned.csv"

# name of the transcript file
transcript_filename = "../cookietheftdata/PC0039/PC0039_red_transcript_par.yaml"

# file that maps segments to coordinates (looked up later as map_array[y][x])
map_file = "./output_array_but_better.csv"

# Fail fast with a real exception if any input is missing.  A bare sys.exit()
# would report success (status 0) when run as a script and shows up as an
# opaque SystemExit inside a notebook.
for required_path in (eyedata_filename, transcript_filename, map_file):
    if not os.path.exists(required_path):
        raise FileNotFoundError(f"File '{required_path}' doesn't exist")

# ---------------------------------------------------------------------------
# Load the segment map
# ---------------------------------------------------------------------------
# newline='' is what the csv module asks for when it is handed a file object.
# The handle is deliberately not called `map`, which would shadow the builtin.
with open(map_file, 'r', newline='') as map_fh:
    map_array = list(csv.reader(map_fh))

if not map_array or not map_array[0]:
    raise ValueError(f"Mapping file '{map_file}' is empty")

# Map dimensions: one CSV row per image row, one CSV field per image column.
image_width, image_height = len(map_array[0]), len(map_array)

# ---------------------------------------------------------------------------
# Load the eye-gaze samples (header row dropped)
# ---------------------------------------------------------------------------
with open(eyedata_filename, 'r', newline='') as eyedata:
    reader = csv.reader(eyedata)
    next(reader, None)  # skip the header; harmless if the file is empty
    eye_rows = list(reader)

# ---------------------------------------------------------------------------
# Load the transcript
# ---------------------------------------------------------------------------
with open(transcript_filename, 'r') as transcript:
    data = yaml.safe_load(transcript)

results = []
totalcount = 0  # gaze samples that fell inside some word's [start, end] window
count = 0       # of those, samples with no usable segment (NaN or off-image)

# For every word in the transcript, collect the segment under each gaze sample
# whose timestamp lies within the word's start/end times.
# `data or []` keeps an empty transcript (safe_load -> None) from crashing.
for entry in data or []:
    if 'result' not in entry:
        continue

    for word_data in entry['result']:
        end = word_data.get('end')
        start = word_data.get('start')
        word = word_data.get('word')

        # A word without timing information cannot be aligned.
        if start is None or end is None:
            raise ValueError("ERROR: No start or end time.")

        # Parse the window bounds once instead of once per gaze row.
        start_time = float(start)
        end_time = float(end)

        array = []

        # Visit every data row.  The header was already removed when the file
        # was loaded, so the former range(1, len(eye_rows) - 1) silently
        # dropped the first and the last gaze sample.
        for eye_row in eye_rows:
            # Skip blank lines and rows whose first column is empty.
            if not eye_row or eye_row[0] == '':
                continue

            sample_time = float(eye_row[1])

            # Stop scanning once past the end of the word.
            # NOTE(review): this relies on rows being in ascending time order
            # - confirm against the eye-gaze file.
            if sample_time > end_time:
                break

            # Written as a positive range test so a NaN timestamp is rejected.
            if not (start_time <= sample_time <= end_time):
                continue

            gaze_x = float(eye_row[2])
            gaze_y = float(eye_row[3])

            segment = "-1"  # marker for a lost sample
            if math.isnan(gaze_x) or math.isnan(gaze_y):
                count += 1
            else:
                # Gaze columns are scaled by the map size to get pixel indices.
                # NOTE(review): round() sends values in the last half pixel to
                # `image_width` / `image_height`, which are then counted as
                # lost; confirm whether floor (int()) was intended.
                pixel_x = round(gaze_x * image_width)
                pixel_y = round(gaze_y * image_height)
                if 0 <= pixel_x < image_width and 0 <= pixel_y < image_height:
                    segment = map_array[pixel_y][pixel_x]
                else:
                    count += 1

            totalcount += 1
            array.append(segment)

        # start/end are stored exactly as read so the YAML output keeps the
        # transcript's original values.
        results.append({
            'end': end,
            'start': start,
            'word': word,
            'segment': array
        })

# Report the share of in-window samples that could not be mapped to a segment.
if totalcount:
    print(str(count/totalcount * 100) + "% data lost")
else:
    print("No eye-gaze samples fell inside any word interval")

1.8574677786201668% data lost


In [91]:
# name of the yaml file to write to
text_filename = "./realigned/PC0039_red_transcript_aligned_segments.yaml"

# Create the output directory if needed so the cell also works on a fresh
# checkout; `or "."` covers a filename with no directory component.
os.makedirs(os.path.dirname(text_filename) or ".", exist_ok=True)

# default_flow_style=None and sort_keys=False are passed through to yaml.dump
# unchanged; sort_keys=False keeps the end/start/word/segment key order.
with open(text_filename, 'w') as yamlf:
    yaml.dump(results, yamlf, default_flow_style=None, sort_keys=False)

print(f"Results saved to '{text_filename}'")

Results saved to './realigned/PC0039_red_transcript_aligned_segments.yaml'
