# Gamified Biometric Data Notebook

Created by Liz Beard on 20240724
Last updated on 20240724

In [1]:
# let's load our libraries
import pandas as pd
import os
import glob
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
from scipy.ndimage import gaussian_filter
from scipy.stats import gaussian_kde
from skimage.transform import resize
from matplotlib.colors import ListedColormap

# Data Wrangling/Cleaning

**Sensor Data CSV**

(pulled from [iMotions documentation](https://help.imotions.com/docs/export-sensor-data#the-format-colorcoded))
| Rows | Section | Description|
|----|----|----|
|1-18|INFO|general information about the recorded data (e.g. name of the study, name of the respondent, recording time, information about the hardware systems used, etc.)|
|19-31|METADATA|provides, for each of the data-columns in the export-file, explanatory information about the data contained in the respective data-columns (e.g., where the data comes from, the category of data, a short description of the data, the unit, etc.)|
|32+|DATA| contains the sensor data, but also other types of data (e.g., 'events').|

Data Columns:
| Column | Data Type | Description |
|----|----|----|
|Row|Timeline|Simple row index|
|Timestamp|Timeline|Elapsed time since the recording start (in milliseconds)|
|Eventsource|Slideshow events|Boolean which marks slide events (1 = slide event)|
|StartSlide|Slideshow events|start command is forwarded by iMotions to the media player|
|StartMedia|Slideshow events|confirmation from the media player that the slide has started|
|EndMedia|Slideshow events|confirmation from the media player that the slide has ended|
|EndSlide|Slideshow events|end command is executed by iMotions|
|StimType|Slideshow events|allegedly unimportant?|
|Duration|Slideshow events|Planned duration of the slides (in milliseconds)|
|CollectionPhase|Slideshow events|allegedly unimportant?|
|SourceStimuliName|Slideshow events|User-defined name of the slide|
|Event Source|Sensor data|Boolean which marks data-samples recorded from a given device (1 = data sample)|
|Sensor Data Columns|Sensor data|Data output from a given sensor. Note that post-imported data, unknown data received by iMotions over the API, or R-processed 'continuous' data (e.g., filtered data) are represented in the same way as the sensor data provided directly by the hardware systems.|

In [2]:
# directories
data_dir = os.path.abspath(os.path.join(os.getcwd(), '../summer-2024/iMotions'))

## Separate Main CSV, Organize Files

In [25]:
def separate_data_by_device(metadata_section, data_section, output_dir, subj=None, block=None):
    """Write one CSV per device listed in the metadata section.

    Parameters:
    - metadata_section (DataFrame): metadata rows; the row at position 1 holds
      the device name for each column.
    - data_section (DataFrame): data rows; the row at position 0 is skipped and
      the first two columns (row index, timestamp) are copied into every file.
    - output_dir (str): folder for the device files (created if missing).
    - subj, block: when both are given, files are named
      '<subj>_block-<block>_<device>.csv'; otherwise '<device>.csv'.
    """
    # Columns shared by every device file: the first two columns of the data section
    shared_columns = [data_section.columns[0], data_section.columns[1]]

    # Device label of every column, taken from the second metadata row
    device_by_column = metadata_section.iloc[1]
    device_names = device_by_column.iloc[1:].dropna().unique()

    os.makedirs(output_dir, exist_ok=True)

    for device in device_names:
        # Columns whose metadata device label matches the current device
        matching_columns = metadata_section.columns[device_by_column == device].tolist()
        selection = shared_columns + matching_columns

        filename = (
            f'{subj}_block-{block}_{device}.csv'
            if subj and block is not None
            else f'{device}.csv'
        )

        # Skip the first row of the data section and write without index/header
        data_section.iloc[1:, selection].to_csv(
            os.path.join(output_dir, filename), index=False, header=False
        )

    print(f'Data separated by devices and saved to {output_dir}')

In [None]:
# Sensor-data exports for every subject, across the block 0-7 export folders, in sorted path order
sensor_csv_pattern = os.path.join(
    data_dir, 'exported', 'Gamified Images - Block [0-7]', 'Sensor Data', '*.csv'
)
subj_csvs = sorted(glob.glob(sensor_csv_pattern))

In [35]:
# Split every exported sensor CSV into INFO / METADATA / DATA files plus one file per device.
for csv_path in subj_csvs:
    # Block number = last space-separated token of the folder two levels above the file;
    # subject id = last '_'-separated token of the file name, up to its first '.'.
    # os.path is used instead of splitting on '/' so this also works with other separators.
    block_folder = os.path.basename(os.path.dirname(os.path.dirname(csv_path)))
    block = block_folder.split(' ')[-1]
    subj = os.path.basename(csv_path).split('_')[-1].split('.')[0]

    print('Sorting ',subj, ', block ', block)

    # Everything is read as text with no header row, so all three sections stay in one frame
    data = pd.read_csv(csv_path, header=None, dtype=str)

    # Create a directory to save the specific CSV files
    save_dir = os.path.join(data_dir, subj)
    os.makedirs(save_dir, exist_ok=True)

    # Identify the starting indices of each section from the markers in the first column
    first_column = data.iloc[:, 0]
    metadata_hits = data.index[first_column.str.contains('#METADATA', na=False)]
    data_hits = data.index[first_column.str.contains('#DATA', na=False)]
    if metadata_hits.empty or data_hits.empty:
        # Fail with a message that names the file instead of a bare IndexError
        raise ValueError(f'{csv_path} is missing a #METADATA or #DATA section marker')

    info_start_idx = 0
    metadata_start_idx = metadata_hits[0]
    data_start_idx = data_hits[0]

    # Extract each section
    info_section = data.iloc[info_start_idx:metadata_start_idx]
    metadata_section = data.iloc[metadata_start_idx:data_start_idx]
    data_section = data.iloc[data_start_idx:]

    # Output file paths, one per section
    info_file_path = os.path.join(save_dir, f'{subj}_block-{block}_INFO.csv')
    metadata_file_path = os.path.join(save_dir,f'{subj}_block-{block}_METADATA.csv')
    data_file_path = os.path.join(save_dir, f'{subj}_block-{block}_DATA.csv')

    # Save each section to separate CSV files
    info_section.to_csv(info_file_path, index=False, header=False)
    metadata_section.to_csv(metadata_file_path, index=False, header=False)
    data_section.to_csv(data_file_path, index=False, header=False)

    # Write the per-device files next to the section files
    separate_data_by_device(metadata_section, data_section, save_dir, subj, block)

Sorting  sub-017 , block  1
Data separated by devices and saved to /Users/ebeard/Dropbox (Penn)/gamified/summer-2024/iMotions/sub-017
Sorting  sub-050 , block  1
Data separated by devices and saved to /Users/ebeard/Dropbox (Penn)/gamified/summer-2024/iMotions/sub-050
Sorting  sub-016 , block  1
Data separated by devices and saved to /Users/ebeard/Dropbox (Penn)/gamified/summer-2024/iMotions/sub-016
Sorting  sub-049 , block  1
Data separated by devices and saved to /Users/ebeard/Dropbox (Penn)/gamified/summer-2024/iMotions/sub-049
Sorting  sub-015 , block  1
Data separated by devices and saved to /Users/ebeard/Dropbox (Penn)/gamified/summer-2024/iMotions/sub-015
Sorting  sub-048 , block  1
Data separated by devices and saved to /Users/ebeard/Dropbox (Penn)/gamified/summer-2024/iMotions/sub-048
Sorting  sub-014 , block  1
Data separated by devices and saved to /Users/ebeard/Dropbox (Penn)/gamified/summer-2024/iMotions/sub-014
Sorting  sub-047 , block  1
Data separated by devices and save

## Combine PsychoPy CSVs into Single Subj File

In [None]:
psychopy_dir = os.path.abspath(os.path.join(os.getcwd(), '../summer-2024/tasks/logs'))
subj_dirs = sorted(glob.glob(os.path.join(psychopy_dir, 'sub-0[0-6]*')))
psychopy_file_pattern = '{}_block-[0-7].csv'

# Concatenate each subject's per-block PsychoPy logs into one '<subj>_psychopy.csv'
for subj_dir in subj_dirs:
    subj = os.path.basename(subj_dir)

    # Block files that match the psychopy file pattern for the given subj
    subj_files = sorted(glob.glob(os.path.join(subj_dir, psychopy_file_pattern.format(subj))))

    if not subj_files:
        # pd.concat raises on an empty list, so skip subjects without any block files
        print(f'No PsychoPy block files found for {subj}; skipping')
        continue

    data_frames = []

    for block_file in subj_files:

        # Block number = text after the last '-' in the file name, extension removed
        # (e.g. 'sub-001_block-3.csv' -> '3'); stored as a string
        block_number = os.path.splitext(os.path.basename(block_file))[0].rsplit('-', 1)[-1]
        df = pd.read_csv(block_file)
        df['block'] = block_number  # Add block number as a new column
        data_frames.append(df)

    subj_data = pd.concat(data_frames)

    # save the concatenated data; create the subject folder in case it does not exist yet
    subj_out_dir = os.path.join(data_dir, subj)
    os.makedirs(subj_out_dir, exist_ok=True)
    subj_data.to_csv(os.path.join(subj_out_dir, f'{subj}_psychopy.csv'), index=False)


## Create PsychoPy Screen for correct gaze mapping

In [54]:
def recreate_screen(image_path, output_path, window_size=(1920, 1080), background_color=(128, 128, 128)):
    """
    Recreates a screen with the specified image centered on a background of the given color and size.

    Parameters:
    - image_path (str): Path to the input image.
    - output_path (str): Path to save the recreated screen image. The parent
      folder is created if it does not exist.
    - window_size (tuple): Size of the window (width, height). Default is (1920, 1080).
    - background_color (tuple): Background color in RGB. Default is (128, 128, 128).
    """
    # Load the image; the context manager closes the file once it has been pasted
    with Image.open(image_path) as image:
        # Create a new image with the specified background color
        screen = Image.new('RGB', window_size, background_color)

        # Calculate the top-left position that centers the image on the screen
        image_position = (
            (window_size[0] - image.width) // 2,
            (window_size[1] - image.height) // 2
        )

        # Paste the image onto the screen at the calculated position
        screen.paste(image, image_position)

    # Make sure the destination folder exists before saving
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Save the recreated screen as an image file
    screen.save(output_path)

    print(f'Recreated screen saved to {output_path}')

In [55]:
# test usage
image_id = '2512' # eventually i'll want to loop through all of the images and make sure the image is just the prefix (not the .jpg)
images_path = os.path.abspath(os.path.join(os.getcwd(), '../summer-2024/tasks/study1-images_v1-20240602/'))

# Write the test image to the current user's Downloads folder rather than a hard-coded home directory
downloads_dir = os.path.join(os.path.expanduser('~'), 'Downloads')

recreate_screen(os.path.join(images_path, f'{image_id}.jpg'), os.path.join(downloads_dir, f'{image_id}-Dell.jpg'))

Recreated screen saved to /Users/ebeard/Downloads/2512-Dell.jpg


In [56]:
# Recreate the full screen for every stimulus image and store it under <data_dir>/screen
dell_dir = os.path.join(data_dir, 'screen')
os.makedirs(dell_dir, exist_ok=True)  # the folder is not created anywhere else in this notebook

for image in sorted(glob.glob(os.path.join(images_path, '*.jpg'))):
    # Image id = file name up to its first '.'; note this overwrites the notebook-level `image_id`
    image_id = os.path.basename(image).split('.')[0]
    recreate_screen(image, os.path.join(dell_dir, f'{image_id}-screen.jpg'))

Recreated screen saved to /Users/ebeard/Dropbox (Penn)/gamified/summer-2024/iMotions/screen/1271-screen.jpg
Recreated screen saved to /Users/ebeard/Dropbox (Penn)/gamified/summer-2024/iMotions/screen/1505-screen.jpg
Recreated screen saved to /Users/ebeard/Dropbox (Penn)/gamified/summer-2024/iMotions/screen/1604-screen.jpg
Recreated screen saved to /Users/ebeard/Dropbox (Penn)/gamified/summer-2024/iMotions/screen/1660-screen.jpg
Recreated screen saved to /Users/ebeard/Dropbox (Penn)/gamified/summer-2024/iMotions/screen/1710-screen.jpg
Recreated screen saved to /Users/ebeard/Dropbox (Penn)/gamified/summer-2024/iMotions/screen/2032-screen.jpg
Recreated screen saved to /Users/ebeard/Dropbox (Penn)/gamified/summer-2024/iMotions/screen/2091-screen.jpg
Recreated screen saved to /Users/ebeard/Dropbox (Penn)/gamified/summer-2024/iMotions/screen/2102-screen.jpg
Recreated screen saved to /Users/ebeard/Dropbox (Penn)/gamified/summer-2024/iMotions/screen/2235-screen.jpg
Recreated screen saved to /U

# Heatmaps
Note: in hindsight, separating the data by device may not have been necessary, since everything below could be read directly from the `DATA` spreadsheet.


Using technique described by [pupil labs](https://docs.pupil-labs.com/neon/pupil-cloud/visualizations/heatmap/).

In [10]:
# Define the function to get fixation data between onset and offset
def get_fixation_data_between_onset_offset(events_df, gaze_df, image_id):
    """Return the rows of gaze_df recorded while a given image was shown.

    Parameters:
    - events_df (DataFrame): needs 'MarkerName', 'MarkerDescription' and 'Timestamp' columns.
    - gaze_df (DataFrame): needs a 'Timestamp' column.
    - image_id: image name without extension; events are matched on '<image_id>.jpg'.

    Returns:
    - DataFrame: rows of gaze_df whose 'Timestamp' lies between the onset and
      offset timestamps, both ends included.

    Raises:
    - ValueError: if the image does not have exactly one onset and one offset event.
    """
    # Events that belong to this image
    marker_rows = events_df.loc[events_df['MarkerName'] == f'{image_id}.jpg']
    descriptions = marker_rows['MarkerDescription']

    # Timestamps of the events described as onset / offset
    onset_times = marker_rows.loc[descriptions.str.contains('onset', na=False), 'Timestamp']
    offset_times = marker_rows.loc[descriptions.str.contains('offset', na=False), 'Timestamp']

    # Anything other than a single onset/offset pair is ambiguous
    if not (len(onset_times) == 1 and len(offset_times) == 1):
        raise ValueError("Expected exactly one onset and one offset timestamp for the image.")

    window_start = onset_times.iloc[0]
    window_end = offset_times.iloc[0]

    # Keep gaze samples inside the inclusive [onset, offset] window
    in_window = gaze_df['Timestamp'].between(window_start, window_end)
    return gaze_df[in_window]

In [63]:
def create_heatmap(stimulus_eye_data, image_path, output_path,image_size=(1920, 1080), sampling_rate=30, alpha=0.5, cmap=None):
    """Render a fixation-density heatmap blended over a screen image and save it as PNG.

    Parameters:
    - stimulus_eye_data (DataFrame): needs 'Fixation X', 'Fixation Y' and
      'Fixation Duration' columns.
    - image_path (str): image the heatmap is blended onto.
    - output_path (str): where the blended PNG is written.
    - image_size (tuple): (width, height) of the coordinate space the density
      is evaluated over. Default is (1920, 1080).
    - sampling_rate (int): each fixation is repeated
      int(duration * sampling_rate / 1000) times before density estimation, so
      longer fixations weigh more. (The division by 1000 suggests durations are
      in milliseconds -- TODO confirm.)
    - alpha (float): blend factor handed to Image.blend; 0 keeps only the
      screen image, 1 keeps only the heatmap. Default is 0.5.
    - cmap (callable): maps an array of values in [0, 1] to RGBA floats.
      Defaults to plt.cm.hot.

    Raises:
    - ValueError: if no samples remain after duration weighting.
    """
    # Convert fixation data to numpy arrays
    fixation_x = stimulus_eye_data['Fixation X'].values
    fixation_y = stimulus_eye_data['Fixation Y'].values
    event_duration = stimulus_eye_data['Fixation Duration'].values

    # Upsample for kernel density estimation: one repeat count per fixation, shared by x and y
    repeats = (event_duration * sampling_rate / 1000).astype(int)
    x_samples = np.repeat(fixation_x, repeats)
    y_samples = np.repeat(fixation_y, repeats)

    if x_samples.size == 0:
        raise ValueError("No fixation samples available to build a heatmap.")

    # Perform 2D kernel density estimation on a 300 x 300 grid spanning image_size
    kde = gaussian_kde(np.vstack([x_samples, y_samples]), bw_method='scott')
    x_grid = np.linspace(0, image_size[0], 300)
    y_grid = np.linspace(0, image_size[1], 300)
    x_mesh, y_mesh = np.meshgrid(x_grid, y_grid)
    grid_coords = np.vstack([x_mesh.ravel(), y_mesh.ravel()])
    kde_values = kde(grid_coords).reshape(x_mesh.shape)

    # Normalize the kde values to the maximum (skipped when the density is zero everywhere)
    peak = kde_values.max()
    if peak > 0:
        kde_values /= peak

    # Colour the density; rows of kde_values follow y_grid, columns follow x_grid
    colormap = plt.cm.hot if cmap is None else cmap
    heatmap_image = Image.fromarray(np.uint8(colormap(kde_values) * 255))

    # Load the screen image; convert() returns an independent copy, so the file can be closed
    with Image.open(image_path) as source_image:
        screen_image = source_image.convert('RGBA')

    # Blend the heatmap with the screen image (Image.blend requires an explicit alpha)
    heatmap_image = heatmap_image.resize(screen_image.size).convert('RGBA')
    blended_image = Image.blend(screen_image, heatmap_image, alpha)

    # Save the blended image
    blended_image.save(output_path, format='PNG')

    print(f'Blended image saved to {output_path}')


In [30]:
# we need to add all subjects fixations for a given stimuli
subj_dirs = sorted(glob.glob(os.path.join(data_dir, 'sub-*')))

image_id = '2512' # eventually i'll want to loop through all of the images and make sure the image is just the prefix (not the .jpg)
images_path = os.path.abspath(os.path.join(os.getcwd(), '../summer-2024/tasks/study1-images_v1-20240602/'))

# NOTE(review): `image_path` and `output_path` were used below without being defined in this
# notebook. The screen image is assumed to be the recreated '<image_id>-screen.jpg' under
# <data_dir>/screen, and the output location is a placeholder -- confirm both.
image_path = os.path.join(data_dir, 'screen', f'{image_id}-screen.jpg')
heatmap_dir = os.path.join(data_dir, 'heatmaps')
output_path = os.path.join(heatmap_dir, f'{image_id}-heatmap.png')

fixation_columns = ['Fixation X', 'Fixation Y', 'Fixation Duration']
fixation_frames = []

for subj_dir in subj_dirs:
    subj = os.path.basename(subj_dir)

    # need to find correct subj ET data path from the sub-001_psychopy.csv folder
    psychopy_path = os.path.join(subj_dir, f'{subj}_psychopy.csv')
    if not os.path.exists(psychopy_path):
        print(f"No PsychoPy file found for {subj}")
        continue

    psychopy_csv = pd.read_csv(psychopy_path)
    block = psychopy_csv['block'].loc[psychopy_csv['image_file']==f'{image_id}.jpg']

    if block.empty:
        print(f"No block found for image_id {image_id} for {subj}")
        continue

    block = block.iloc[0]

    # Check if block is a single digit
    if len(str(block)) != 1:
        print("Block is not a single digit.")

    try:
        # load the data for the correct events and gaze file
        print(f'Loading data for {subj} block {block}...')
        events_df = pd.read_csv(os.path.join(subj_dir, f'{subj}_block-{block}_External Events API (v0).csv'))
        gaze_df = pd.read_csv(os.path.join(subj_dir, f'{subj}_block-{block}_R Analysis GazeAnalysis I-VT filter.csv'))

        # Get fixation data for the specific image
        fixation_data = get_fixation_data_between_onset_offset(events_df, gaze_df, image_id)

        # One row per distinct fixation, dropping every column that still contains NaN
        valid_fixations = (
            fixation_data
            .dropna(subset=fixation_columns)
            .drop_duplicates(subset=fixation_columns)
            .dropna(axis=1)
        )

        # Drop the first fixation, then tag the rows with subj, image_id and block.
        # assign() returns a new frame, so nothing is written onto a slice.
        valid_fixations = valid_fixations.iloc[1:].assign(
            subj=subj,
            image_id=f'{image_id}.jpg',
            block=block,
        )

        fixation_frames.append(valid_fixations)

    except Exception as e:
        # Best effort: report the subject that failed and carry on with the rest
        print(f"An error occurred while processing {subj} block {block}: {e}")
        continue

if not fixation_frames:
    # pd.concat raises an unhelpful error on an empty list
    raise ValueError(f'No fixation data collected for image_id {image_id}')

all_fixation_data = pd.concat(fixation_frames)
os.makedirs(heatmap_dir, exist_ok=True)
create_heatmap(all_fixation_data, image_path, output_path,image_size=(1920, 1080), sampling_rate=30)


No block found for image_id 2512 for sub-001
Loading data for sub-002 block 4...
Loading data for sub-003 block 2...
No block found for image_id 2512 for sub-004
Loading data for sub-005 block 2...
Loading data for sub-006 block 3...
Loading data for sub-007 block 3...
Loading data for sub-008 block 2...
Loading data for sub-009 block 2...
No block found for image_id 2512 for sub-010
Loading data for sub-011 block 4...
Loading data for sub-012 block 3...
Loading data for sub-013 block 3...
Loading data for sub-014 block 5...
Loading data for sub-015 block 2...
Loading data for sub-016 block 4...
Loading data for sub-017 block 1...
Loading data for sub-018 block 3...
Loading data for sub-019 block 5...
Loading data for sub-020 block 1...
Loading data for sub-021 block 1...
Loading data for sub-022 block 4...
Loading data for sub-023 block 2...
Loading data for sub-024 block 4...
Loading data for sub-025 block 4...
Loading data for sub-026 block 1...
Loading data for sub-027 block 5...
L

## Earlier attempt: histogram-based heatmaps

In [43]:
def compute_subject_heatmap(events_df, gaze_df, image_id, image_path, nbins_x=300, hist_range=None):
    """Compute a Gaussian-smoothed 2D gaze histogram for a single subject and image.

    Parameters:
    - events_df (DataFrame): event markers, passed to get_fixation_data_between_onset_offset.
    - gaze_df (DataFrame): gaze samples; needs 'Fixation X', 'Fixation Y',
      'Fixation Duration', 'Gaze X' and 'Gaze Y' columns.
    - image_id: image name without extension.
    - image_path (str): folder containing '<image_id>.jpg'; only the image size is read.
    - nbins_x (int): number of bins along x. Bins along y are scaled by the image's
      height/width ratio.
    - hist_range: optional [[xmin, xmax], [ymin, ymax]] forwarded to np.histogram2d.
      With the default (None) the bin edges come from this subject's own data range.

    Returns:
    - ndarray of shape (nbins_x, nbins_y); x runs along the first axis.

    Raises:
    - ValueError: if no samples fall between the image's onset and offset.
    """
    # Only the image dimensions are needed, so close the file straight away
    with Image.open(os.path.join(image_path, f'{image_id}.jpg')) as original_image:
        img_width, img_height = original_image.size

    # Compute the number of bins for the y-axis based on the aspect ratio of the image
    nbins_y = int(nbins_x * img_height / img_width)

    fixation_data = get_fixation_data_between_onset_offset(events_df, gaze_df, image_id)

    if fixation_data.empty:
        raise ValueError("No fixation_data for specific image found")

    # Keep samples that belong to a fixation and have gaze coordinates; a NaN gaze
    # value would make np.histogram2d fail when it derives the bin range.
    # NOTE(review): rows are selected on the Fixation columns but binned on Gaze X / Gaze Y
    # -- confirm that gaze (not fixation) coordinates are the intended input.
    valid_fixations = fixation_data.dropna(
        subset=['Fixation X', 'Fixation Y', 'Fixation Duration', 'Gaze X', 'Gaze Y']
    )

    # NOTE(review): with hist_range=None each subject gets its own bin edges, so heatmaps
    # from different subjects do not necessarily cover the same area -- confirm before summing.
    heatmap, xedges, yedges = np.histogram2d(
        valid_fixations['Gaze X'],
        valid_fixations['Gaze Y'],
        bins=[nbins_x, nbins_y],
        range=hist_range,
    )
    heatmap = gaussian_filter(heatmap, sigma=0.01 * max(nbins_x, nbins_y))
    return heatmap

In [5]:
def aggregate_heatmaps(heatmaps):
    """Aggregate heatmaps by summing and then normalizing to a maximum of 1.

    Parameters:
    - heatmaps: non-empty sequence of equally shaped arrays.

    Returns:
    - ndarray (float): element-wise sum divided by its maximum. If the sum is
      zero everywhere it is returned unscaled instead of becoming NaN.

    Raises:
    - ValueError: if `heatmaps` is empty.
    """
    # Work in float so integer inputs can be divided
    arrays = [np.asarray(heatmap, dtype=float) for heatmap in heatmaps]
    if not arrays:
        raise ValueError("No heatmaps to aggregate")

    aggregated_heatmap = np.sum(arrays, axis=0)

    # Avoid 0/0 when every heatmap is empty
    peak = np.max(aggregated_heatmap)
    if peak != 0:
        aggregated_heatmap = aggregated_heatmap / peak
    return aggregated_heatmap

In [40]:
# Stimulus under analysis and the folder that holds the stimulus images
image_id = '2512' # eventually i'll want to loop through all of the images and make sure the image is just the prefix (not the .jpg)
images_path = os.path.abspath(os.path.join(os.getcwd(), '../summer-2024/tasks/study1-images_v1-20240602/'))

# Every per-subject folder under the iMotions data directory, in sorted order
subject_glob = os.path.join(data_dir, 'sub-*')
subj_dirs = sorted(glob.glob(subject_glob))

In [44]:
# Build one smoothed gaze histogram per subject for the current image_id
subject_heatmaps = []

for subj_dir in subj_dirs:

    subj = os.path.basename(subj_dir)

    # need to find correct subj ET data path from the sub-001_psychopy.csv folder
    psychopy_path = os.path.join(subj_dir, f'{subj}_psychopy.csv')
    if not os.path.exists(psychopy_path):
        print(f"No PsychoPy file found for {subj}")
        continue

    psychopy_csv = pd.read_csv(psychopy_path)
    block = psychopy_csv['block'].loc[psychopy_csv['image_file']==f'{image_id}.jpg']

    if block.empty:
        print(f"No block found for image_id {image_id} for {subj}")
        continue

    block = block.iloc[0]

    # Check if block is a single digit
    if len(str(block)) != 1:
        print("Block is not a single digit.")

    try:
        # load the data for the correct events and gaze file
        print(f'Loading data for {subj} block {block}...')
        events_df = pd.read_csv(os.path.join(subj_dir, f'{subj}_block-{block}_External Events API (v0).csv'))
        gaze_df = pd.read_csv(os.path.join(subj_dir, f'{subj}_block-{block}_R Analysis GazeAnalysis I-VT filter.csv'))

        # compute the heatmap for the subject
        print(f'Processing heatmap for {subj} block {block}...')
        subject_heatmap = compute_subject_heatmap(events_df, gaze_df, image_id, images_path)
        subject_heatmaps.append(subject_heatmap)

    except Exception as e:
        # Best effort: report the subject that failed and carry on with the rest
        print(f"An error occurred while processing {subj} block {block}: {e}")
        continue

No block found for image_id 2512 for sub-001
Loading data for sub-002 block 4...
Processing heatmap for sub-002 block 4...
Loading data for sub-003 block 2...
Processing heatmap for sub-003 block 2...
No block found for image_id 2512 for sub-004
Loading data for sub-005 block 2...
Processing heatmap for sub-005 block 2...
Loading data for sub-006 block 3...
Processing heatmap for sub-006 block 3...
Loading data for sub-007 block 3...
Processing heatmap for sub-007 block 3...
Loading data for sub-008 block 2...
Processing heatmap for sub-008 block 2...
Loading data for sub-009 block 2...
Processing heatmap for sub-009 block 2...
No block found for image_id 2512 for sub-010
Loading data for sub-011 block 4...
Processing heatmap for sub-011 block 4...
Loading data for sub-012 block 3...
Processing heatmap for sub-012 block 3...
Loading data for sub-013 block 3...
Processing heatmap for sub-013 block 3...
Loading data for sub-014 block 5...
Processing heatmap for sub-014 block 5...
Loading

In [45]:
aggregated_heatmap = aggregate_heatmaps(subject_heatmaps)

original_image = Image.open(os.path.join(images_path, f'{image_id}.jpg'))
img_width, img_height = original_image.size

# np.histogram2d bins x along the first axis and y along the second, while an image array is
# indexed (row = y, column = x), so transpose before resizing to (img_height, img_width).
# order=3 selects bi-cubic spline interpolation in skimage.transform.resize.
final_heatmap = resize(aggregated_heatmap.T, (img_height, img_width), order=3)

# Cubic interpolation can overshoot slightly; keep values inside the colormap's [0, 1] range
final_heatmap = np.clip(final_heatmap, 0, 1)

# Convert heatmap to image
heatmap_image = Image.fromarray(np.uint8(plt.cm.hot(final_heatmap) * 255))

# Blend the heatmap with the original image
blended_image = Image.blend(original_image.convert('RGBA'), heatmap_image.convert('RGBA'), alpha=0.5)

# Save the heatmap overlay image in the current user's Downloads folder
output_image_path = os.path.join(os.path.expanduser('~'), 'Downloads', f'heatmap_overlay_image-{image_id}_v2.png')
blended_image.save(output_image_path)

print(f'Heatmap overlay image saved at: {output_image_path}')

Heatmap overlay image saved at: /Users/ebeard/Downloads/heatmap_overlay_image-2512_v2.png


In [53]:
# NOTE(review): `sample_image_path` and `fixation_data` are not defined in this cell; it relies
# on values left in the kernel by earlier cells -- define them explicitly before running.

# Load the sample image
sample_image = Image.open(sample_image_path)
img_width, img_height = sample_image.size

# Drop samples without gaze coordinates; copy so the casts below do not write onto a slice
gaze_points = fixation_data.dropna(subset=['Gaze X', 'Gaze Y']).copy()

# Convert gaze coordinates to integers if they are not already
gaze_points['Gaze X'] = gaze_points['Gaze X'].astype(int)
gaze_points['Gaze Y'] = gaze_points['Gaze Y'].astype(int)

# Compute the 2D histogram
nbins_x = 300
nbins_y = int(300 * img_height / img_width)
heatmap, xedges, yedges = np.histogram2d(gaze_points['Gaze X'], gaze_points['Gaze Y'], bins=[nbins_x, nbins_y])

# Apply a 2D Gaussian blur to the histogram
heatmap = gaussian_filter(heatmap, sigma=0.01 * max(nbins_x, nbins_y))

# Normalize the resulting values to the maximum (skipped when the histogram is empty)
heatmap_peak = np.max(heatmap)
if heatmap_peak > 0:
    heatmap /= heatmap_peak

# np.histogram2d bins x along the first axis and y along the second, while an image array is
# indexed (row = y, column = x), so transpose before resizing to (img_height, img_width).
# order=3 selects bi-cubic spline interpolation in skimage.transform.resize.
final_heatmap = resize(heatmap.T, (img_height, img_width), order=3)

# Cubic interpolation can overshoot slightly; keep values inside the colormap's [0, 1] range
final_heatmap = np.clip(final_heatmap, 0, 1)

# Convert heatmap to image
heatmap_image = Image.fromarray(np.uint8(plt.cm.hot(final_heatmap) * 255))

# Blend the heatmap with the original image
blended_image = Image.blend(sample_image.convert('RGBA'), heatmap_image.convert('RGBA'), alpha=0.5)

# Save the heatmap overlay image in the current user's Downloads folder
output_image_path = os.path.join(os.path.expanduser('~'), 'Downloads', 'heatmap_overlay.png')
blended_image.save(output_image_path)

print(f'Heatmap overlay image saved at: {output_image_path}')


Heatmap overlay image saved at: /Users/ebeard/Downloads/heatmap_overlay.png


#### archive