In [1]:
import pandas as pd
# Opt in to pandas' future behaviour: results of operations such as replace()
# keep their dtype instead of being silently downcast.
pd.set_option('future.no_silent_downcasting', True)

In [2]:
# Read the raw survey export into a DataFrame.
# header=0: the first line of the file supplies the column names.
file_path = '../Data/Raw/survey_data.csv'
df = pd.read_csv(file_path, header=0)

In [3]:
# Output column names for the 12 rating values taken from each video block,
# listed in the order in which they appear within the block.
ratings_cols = [
    'joy', 'sadness', 'fear', 'anger', 'disgust', 'surprise',
    'other', 'other_text',
    'pleasure', 'arousal', 'dominance',
    'appraisal',
]

In [4]:
# Layout constants for the survey export
cols_per_video = 16  # number of consecutive columns that make up one video's block
num_videos = 10  # NOTE(review): presumably the number of videos shown to each participant - confirm

In [5]:
# Define a function to extract the video identifier from the column index
def extract_video_id(col_index, first_video_col=45, block_width=None):
    """Return the 1-based id of the video block that contains a column.

    Parameters
    ----------
    col_index : int
        Positional index of a column in the raw survey DataFrame.
    first_video_col : int, default 45
        Positional index of the first column of the first video block.
    block_width : int, optional
        Number of columns per video block. Defaults to the notebook-level
        ``cols_per_video`` when omitted.

    Returns
    -------
    int
        ``(col_index - first_video_col) // block_width + 1``; columns located
        before ``first_video_col`` therefore yield values below 1.
    """
    if block_width is None:
        # Fall back to the notebook-wide setting, as the original one-liner did.
        block_width = cols_per_video
    return (col_index - first_video_col) // block_width + 1

In [6]:
# Accumulator for the reshaped records (one dict per participant/video pair)
processed_rows = list()

In [7]:
# Reshape the wide survey export into long format: one record per
# (participant, video) pair for which the participant has any data.
#
# Column layout relied on below: the video blocks start at position 45, each
# block is `cols_per_video` columns wide, and 512 blocks are scanned
# (45 + 512 * 16 == 8237). Inside a block the first 4 columns are skipped and
# the remaining ones are paired, in order, with `ratings_cols`.
first_video_col = 45
total_videos = 512
ratings_offset = 4  # NOTE(review): presumably non-rating columns at the start of each block - confirm
last_video_col = first_video_col + total_videos * cols_per_video

# Start from an empty list so that re-running this cell does not append a
# second copy of every participant's rows.
processed_rows = []

# The first two rows of the DataFrame are skipped.
# NOTE(review): presumably extra header rows of the survey export - confirm.
rows = df.iloc[2:, :].iterrows()

for _, row in rows:
    # Blocks are visited in ascending column order, so each participant's
    # records come out sorted by video id without a separate sort step.
    for col_index in range(first_video_col, last_video_col, cols_per_video):
        video_block = row.iloc[col_index:col_index + cols_per_video]
        # Skip videos for which every cell of the block is empty.
        if not video_block.notna().any():
            continue
        ratings_vals = video_block.iloc[ratings_offset:]
        processed_rows.append({
            'participant_id': row.PROLIFIC_PID,
            'video_id': extract_video_id(col_index),
            **dict(zip(ratings_cols, ratings_vals)),
        })

In [8]:
# Turn the accumulated records into a DataFrame (one row per record)
processed_df = pd.DataFrame(data=processed_rows)

In [9]:
# Remove every row belonging to the two excluded participants
# (5f09fab615ee6b0ab5235b15 and 6632140e3df7840cd7a81131)
excluded_participants = ["5f09fab615ee6b0ab5235b15", "6632140e3df7840cd7a81131"]
excluded_mask = processed_df["participant_id"].isin(excluded_participants)
processed_df = processed_df.drop(processed_df[excluded_mask].index)

In [10]:
# Convert the verbal intensity labels to their 1-5 codes. The mapping is
# applied only to the rating columns, so a free-text answer in `other_text`
# or `appraisal` that happens to equal a label (e.g. "High") is left untouched.
label_to_score = {'Very Low': 1, 'Low': 2, 'Average': 3, 'High': 4, 'Very High': 5}
label_cols = ['joy', 'sadness', 'fear', 'anger', 'disgust', 'surprise', 'other', 'pleasure', 'arousal', 'dominance']
# Nested mapping {column: {label: score}} restricts the replacement per column.
processed_df = processed_df.replace({col: label_to_score for col in label_cols})

In [11]:
# Rating columns that are converted to float; the free-text columns
# (`other_text`, `appraisal`) are not in this list.
num_cols = [
    'joy', 'sadness', 'fear', 'anger', 'disgust', 'surprise', 'other',
    'pleasure', 'arousal', 'dominance',
]
processed_df = processed_df.astype({col: float for col in num_cols})

In [12]:
# Inspect the column dtypes after the numeric conversion
processed_df.dtypes

participant_id     object
video_id            int64
joy               float64
sadness           float64
fear              float64
anger             float64
disgust           float64
surprise          float64
other             float64
other_text         object
pleasure          float64
arousal           float64
dominance         float64
appraisal          object
dtype: object

In [13]:
# Display the reshaped table (one row per participant/video pair)
processed_df

Unnamed: 0,participant_id,video_id,joy,sadness,fear,anger,disgust,surprise,other,other_text,pleasure,arousal,dominance,appraisal
0,5c791e24266f210012af518d,1,,1.0,1.0,,,2.0,,,4.0,3.0,3.0,i honestly dont know about this one
1,5c791e24266f210012af518d,6,,4.0,,,,,,,1.0,1.0,2.0,it seemed like something had made the robot sa...
2,5c791e24266f210012af518d,59,,5.0,1.0,,,,,,1.0,1.0,1.0,it seems to be sad and maybe a little confused...
3,5c791e24266f210012af518d,61,,1.0,,1.0,1.0,,,,4.0,3.0,3.0,it seemed to be kinda all over the place
4,5c791e24266f210012af518d,219,,,4.0,,,1.0,,,2.0,7.0,4.0,"it seems like something made the robot scared,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
575,5fb45df6ce203a13f34ac0af,301,5.0,,3.0,,,4.0,4.0,Love,9.0,9.0,6.0,I think the robot just met another attractive ...
576,5fb45df6ce203a13f34ac0af,392,,1.0,,1.0,2.0,,5.0,boredom,3.0,1.0,1.0,The robot's weekend plans fell through and now...
577,5fb45df6ce203a13f34ac0af,415,1.0,5.0,1.0,1.0,1.0,1.0,,,1.0,1.0,2.0,The robot looks like its heart is broken. Its ...
578,5fb45df6ce203a13f34ac0af,460,3.0,2.0,2.0,,,2.0,4.0,Curiosity,7.0,3.0,4.0,I think this robot was looking for something o...


In [14]:
# Spot-check all rows recorded for a single participant
processed_df[processed_df.participant_id == "5e0e72587855d35d4dd68c7d"]

Unnamed: 0,participant_id,video_id,joy,sadness,fear,anger,disgust,surprise,other,other_text,pleasure,arousal,dominance,appraisal
110,5e0e72587855d35d4dd68c7d,19,,,,,,,,,5.0,3.0,3.0,The robot is exploring its surroundings
111,5e0e72587855d35d4dd68c7d,136,,,,,,,2.0,Boredom,5.0,3.0,3.0,The robot is awaiting instructions
112,5e0e72587855d35d4dd68c7d,164,3.0,,,,,,,,6.0,4.0,2.0,The robot is having fun navigating through the...
113,5e0e72587855d35d4dd68c7d,214,,,,,,,2.0,Calm,4.0,3.0,4.0,The robot is very calmly exploring the area
114,5e0e72587855d35d4dd68c7d,221,1.0,1.0,2.0,1.0,1.0,2.0,,,4.0,5.0,4.0,The robot is navigating and figuring out the s...
115,5e0e72587855d35d4dd68c7d,342,2.0,,,,,,,,6.0,2.0,2.0,The robot is happy exploring the area
116,5e0e72587855d35d4dd68c7d,390,1.0,,,,,,,,4.0,3.0,3.0,The robot is very calmly navigating through th...
117,5e0e72587855d35d4dd68c7d,422,,,,,,,2.0,Patience,5.0,3.0,3.0,The robot is calmly waiting for instruction
118,5e0e72587855d35d4dd68c7d,493,1.0,1.0,2.0,1.0,1.0,1.0,,,2.0,5.0,3.0,The robot is panicking as it tries to figure o...
119,5e0e72587855d35d4dd68c7d,494,2.0,,,,,,,,5.0,2.0,2.0,The robot is exploring the surrounding area.


In [15]:
# Save the processed data to a new CSV file.
# index=False leaves the DataFrame index out of the written file.
processed_df.to_csv('../Data/Processed/rating.csv', index=False)

In [16]:
# Distinct video ids that appear in at least one row
videos_covered = set(processed_df["video_id"].unique())

In [17]:
# Number of distinct video ids that appear in the processed data
len(videos_covered)

512

In [18]:
# Every video id from 1 to 512 inclusive
all_videos = set(range(1, 512 + 1))

In [19]:
# Video ids in 1..512 that do not appear in the processed data
all_videos - videos_covered

set()

In [20]:
# How many video ids in 1..512 do not appear in the processed data
len(all_videos - videos_covered)

0

In [21]:
# For each row, the total number of rows that share its video_id
counts = processed_df.groupby("video_id").transform("size")

# Keep only the rows whose video appears in more than one row
appears_more_than_once = counts.gt(1)
filtered_df = processed_df[appears_more_than_once]

In [22]:
# Ascending list of the video ids that appear in more than one row
sorted(filtered_df["video_id"].unique())

[35,
 50,
 54,
 75,
 83,
 87,
 102,
 122,
 126,
 152,
 161,
 162,
 166,
 168,
 188,
 192,
 205,
 224,
 233,
 243,
 244,
 246,
 252,
 253,
 279,
 280,
 300,
 301,
 311,
 315,
 319,
 345,
 347,
 348,
 355,
 356,
 358,
 359,
 366,
 378,
 392,
 403,
 413,
 415,
 424,
 425,
 427,
 456,
 460,
 477,
 481,
 491,
 493,
 494,
 498,
 501,
 505,
 510]

In [23]:
# Number of distinct video ids that appear in more than one row
len(filtered_df["video_id"].unique())

58

In [24]:
# Consistency check: distinct ids present plus ids from 1..512 that are absent
distinct_ids_present = len(processed_df["video_id"].unique())
distinct_ids_present + len(all_videos - videos_covered)

512