In [1]:
import pandas as pd
# Opt in to pandas' future "no silent downcasting" behaviour (see the pandas
# options documentation for the exact operations this affects).
pd.set_option('future.no_silent_downcasting', True)

In [2]:
# Load the raw survey export. The first line of the file supplies the column
# names, and every column is read as object dtype instead of being type-inferred.
file_path = '../Data/Raw/survey_data_numeric.csv'
df = pd.read_csv(filepath_or_buffer=file_path, dtype=object, header=0)

In [3]:
# Preview the frame without its first two rows (the processing loop skips them too)
df.iloc[2:]

Unnamed: 0,StartDate,EndDate,Status,Progress,Duration (in seconds),Finished,RecordedDate,ResponseId,DistributionChannel,UserLanguage,...,__js_Progress,__js_CaptchaSeen,__js_OpeningStatementSeen,__js_CommitmentRequestSeen,__js_DemographicsSeen,__js_ExampleQuestionSeen,__js_PastVideoQIDs,PROLIFIC_PID,STUDY_ID,SESSION_ID
2,5/1/24 21:30,5/1/24 21:46,0,100,950,1,5/1/24 21:46,R_2mFyXuMw0xGhq49,anonymous,EN,...,15,1,1,1,1,1,,5c791e24266f210012af518d,66323697033362689828dfdf,6632984eea600cf193cc86f7
3,5/1/24 21:32,5/1/24 21:51,0,100,1132,1,5/1/24 21:51,R_5Mg1k6wP14yRnih,anonymous,EN,...,15,1,1,1,1,1,,5a631a73b9e3b50001a6ebd8,66323697033362689828dfdf,663298b162f251a71d7c09ab
4,5/1/24 21:26,5/1/24 21:54,0,100,1645,1,5/1/24 21:54,R_5B8lXO4YHfKHfUJ,anonymous,EN,...,15,1,1,1,1,1,,5eebc1fd5feed239a73d693d,66323697033362689828dfdf,66329754c89ade67073a406b
5,5/1/24 21:31,5/1/24 21:55,0,100,1423,1,5/1/24 21:55,R_2ErW8XvC6N2tDZA,anonymous,EN,...,15,1,1,1,1,1,,6089aa8120d7418a70f3eba7,66323697033362689828dfdf,6632987acacfc3f0baf7bf0f
6,5/1/24 21:24,5/1/24 21:59,0,100,2091,1,5/1/24 21:59,R_3P5E5KoGwh5SatC,anonymous,EN,...,15,1,1,1,1,1,,63fc7ac4c0ba10b71a25a63d,66323697033362689828dfdf,663296d669a5872195662459
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
322,5/10/24 15:23,5/10/24 15:46,0,100,1359,1,5/10/24 15:46,R_5dmSacDanFPOOa5,anonymous,EN,...,15,1,1,1,1,1,"306,722,551,618,315,000,000,000,000,000,000,00...",5fe1053fad8fbab3583b4ecf,663a6eb03adf2ec50b7b89f2,663e1fd0eacfdc715c839f5b
323,5/11/24 14:04,5/11/24 14:29,0,100,1459,1,5/11/24 14:29,R_41iYm9SyiCFRoPv,anonymous,EN,...,15,1,1,1,1,1,"31,233,263,128,256,800,000,000,000,000,000,000...",577bfa3c9fecfe0001a5bcbf,663a6eb03adf2ec50b7b89f2,663f5ec72814424115da5964
324,5/11/24 14:04,5/11/24 14:30,0,100,1547,1,5/11/24 14:30,R_7P1bLiBuznSTRyY,anonymous,EN,...,15,1,1,1,1,1,"251,412,822,843,274,000,000,000,000,000,000,00...",5e332dec41d1cd30b905bdfe,663a6eb03adf2ec50b7b89f2,663f5ec02e128dbb738c9864
325,5/11/24 14:05,5/11/24 14:31,0,100,1577,1,5/11/24 14:31,R_1fcOe9zF7A8itEh,anonymous,EN,...,15,1,1,1,1,1,"31,442,829,359,988,300,000,000,000,000,000,000...",62c1f284b5ce5f4d8840ea7c,663a6eb03adf2ec50b7b89f2,663f5eef7d50f1ec92e1423d


In [4]:
# Output names for the rating fields of one video, in the order in which they
# are paired with the values taken from each video's block of columns.
intensity_columns = [
    'joy',
    'sadness',
    'fear',
    'anger',
    'disgust',
    'surprise',
    'other',
]
sam_columns = ['pleasure', 'arousal', 'dominance']
rating_columns = [*intensity_columns, 'other_text', *sam_columns, 'appraisal']

In [5]:
# Layout constants for the wide survey export
cols_per_video = 16  # width of the block of consecutive columns belonging to one video
num_videos = 10      # number of rated videos every participant is required to have

In [6]:
# Define a function to extract the video identifier from the column index
def extract_video_id(col_index, first_col=45, block_width=None):
    """Return the 1-based number of the video block that contains a column.

    Video columns are laid out as consecutive fixed-width blocks, so the
    block number follows from integer division of the offset into that region.

    Parameters
    ----------
    col_index : int
        Positional (iloc-style) index of a column.
    first_col : int, default 45
        Positional index of the first column of the first video block.
    block_width : int, optional
        Number of columns per video block. When omitted, the notebook-level
        ``cols_per_video`` constant is used, as before.

    Returns
    -------
    int
        1 for any column in the first block, 2 for the second, and so on.
        Positions before ``first_col`` are not rejected and yield values <= 0.
    """
    if block_width is None:
        # Fall back to the notebook-wide setting so existing one-argument calls are unchanged
        block_width = cols_per_video
    return (col_index - first_col) // block_width + 1

In [7]:
# Accumulator for the long-format records (one dict per participant/video pair)
processed_rows = list()

In [8]:
# Reshape the wide export into long format: one record per (participant, video).
#
# The list is (re)created here so that re-running this cell rebuilds the
# records from scratch instead of appending every participant a second time.
processed_rows = []

first_video_col = 45   # positional index where the first video's block of columns starts
last_video_col = 8237  # stop position of the scan (one past the last block examined)
leading_cols = 4       # columns at the start of each block that are not part of rating_columns
                       # NOTE(review): their content is not visible here — TODO document

# The first two rows of df are skipped.
# NOTE(review): presumably extra header rows of the survey export — confirm.
for _, row in df.iloc[2:, :].iterrows():
    # Start positions of every video block in which this participant has at
    # least one non-null cell
    rated_starts = [
        block_start
        for block_start in range(first_video_col, last_video_col, cols_per_video)
        if row.iloc[block_start:block_start + cols_per_video].notna().any()
    ]
    # Blocks are scanned left to right, so the IDs come out in ascending order
    videos_to_keep = [extract_video_id(block_start) for block_start in rated_starts]

    # Every participant must have data for exactly num_videos videos
    if len(videos_to_keep) != num_videos:
        raise ValueError(f"Participant {row.QID3948} rated {len(videos_to_keep)} videos.")

    for block_start, video_id in zip(rated_starts, videos_to_keep):
        # Rating cells of this block, skipping its leading non-rating columns
        ratings_vals = row.iloc[block_start + leading_cols:block_start + cols_per_video]

        processed_row = {
            'participant_id': row.QID3948,
            'start_time': row.StartDate,
            # NOTE(review): end_time is read from RecordedDate, not EndDate — confirm intended
            'end_time': row.RecordedDate,
            'video_id': video_id,
        }
        # Pair the rating names with the block's values positionally
        processed_row.update(zip(rating_columns, ratings_vals))
        processed_rows.append(processed_row)

In [9]:
# Turn the accumulated per-video records into a long-format DataFrame
processed_df = pd.DataFrame(data=processed_rows)

In [10]:
# Inspect the rows of a single participant (this ID is among those excluded in the next cell)
processed_df.loc[processed_df["participant_id"] == "6151797ab0b64a8275ab4f1e"]

Unnamed: 0,participant_id,start_time,end_time,video_id,joy,sadness,fear,anger,disgust,surprise,other,other_text,pleasure,arousal,dominance,appraisal
2780,6151797ab0b64a8275ab4f1e,5/10/24 10:50,5/10/24 11:22,30,3,,1,,,3,,,9,1,1,The lights and movements of robot were slow. T...
2781,6151797ab0b64a8275ab4f1e,5/10/24 10:50,5/10/24 11:22,38,1,1.0,1,1.0,3.0,3,,,3,5,9,The robot moves and makes sound which makes me...
2782,6151797ab0b64a8275ab4f1e,5/10/24 10:50,5/10/24 11:22,99,2,1.0,1,1.0,1.0,2,,,9,1,1,the robot moving slow and light are not to dis...
2783,6151797ab0b64a8275ab4f1e,5/10/24 10:50,5/10/24 11:22,149,2,1.0,1,1.0,1.0,4,,,9,3,1,The lights of the robot are less moving which ...
2784,6151797ab0b64a8275ab4f1e,5/10/24 10:50,5/10/24 11:22,298,1,1.0,1,1.0,1.0,1,,,1,1,1,the robot moves only in small part of his terr...
2785,6151797ab0b64a8275ab4f1e,5/10/24 10:50,5/10/24 11:22,302,1,1.0,2,1.0,1.0,2,,,7,2,3,"The white light give me unsafe feeling, althou..."
2786,6151797ab0b64a8275ab4f1e,5/10/24 10:50,5/10/24 11:22,311,1,1.0,1,1.0,2.0,2,,,4,5,3,the robot moves too slow and i don't feel to s...
2787,6151797ab0b64a8275ab4f1e,5/10/24 10:50,5/10/24 11:22,347,2,1.0,1,1.0,1.0,2,,,9,2,3,The robot moves slow and lights are slow blink...
2788,6151797ab0b64a8275ab4f1e,5/10/24 10:50,5/10/24 11:22,354,1,1.0,1,1.0,1.0,1,,,9,1,1,"The color of the light were white, I find them..."
2789,6151797ab0b64a8275ab4f1e,5/10/24 10:50,5/10/24 11:22,438,1,1.0,1,1.0,1.0,1,,,9,1,3,"I feel this robot knows where is he going, it ..."


In [11]:
# Exclude participants
# NOTE(review): the reason for each exclusion is not recorded in this notebook — TODO document.
excluded_participant_ids = [
    "5f09fab615ee6b0ab5235b15",
    "65c349313a148edd8311e6a6",
    "64033cb754a87f6c919f305a",
    "6278270bfb2439bf3abf192d",
    "6347c42aae670afaf618bb62",
    "62bd685ac3edc9ad0d77812b",
    "6151797ab0b64a8275ab4f1e",
    "58954e2dc88c680001dd4b7e",
    "647ffbcfcf8adc8070bc2291",
]

# Keep only rows whose participant is not on the list, then renumber the index from 0
is_excluded = processed_df.participant_id.isin(excluded_participant_ids)
processed_df = processed_df[~is_excluded].reset_index(drop=True)

In [12]:
# Display the long-format table after the exclusions above
processed_df

Unnamed: 0,participant_id,start_time,end_time,video_id,joy,sadness,fear,anger,disgust,surprise,other,other_text,pleasure,arousal,dominance,appraisal
0,5c791e24266f210012af518d,5/1/24 21:30,5/1/24 21:46,1,,1,1,,,2,,,4,3,3,i honestly dont know about this one
1,5c791e24266f210012af518d,5/1/24 21:30,5/1/24 21:46,6,,4,,,,,,,1,1,2,it seemed like something had made the robot sa...
2,5c791e24266f210012af518d,5/1/24 21:30,5/1/24 21:46,59,,5,1,,,,,,1,1,1,it seems to be sad and maybe a little confused...
3,5c791e24266f210012af518d,5/1/24 21:30,5/1/24 21:46,61,,1,,1,1,,,,4,3,3,it seemed to be kinda all over the place
4,5c791e24266f210012af518d,5/1/24 21:30,5/1/24 21:46,219,,,4,,,1,,,2,7,4,"it seems like something made the robot scared,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3155,62a447d91e71bfacd2e8c006,5/11/24 14:44,5/11/24 15:19,298,,5,,,,,,,1,6,2,"Robot was very sad, and just bumbling about, w..."
3156,62a447d91e71bfacd2e8c006,5/11/24 14:44,5/11/24 15:19,409,,4,,,,,,,1,8,5,It was sad and was singing a sad song - it was...
3157,62a447d91e71bfacd2e8c006,5/11/24 14:44,5/11/24 15:19,438,4,,,,,,,,7,7,6,"Robot seemed quite happy, and was going about ..."
3158,62a447d91e71bfacd2e8c006,5/11/24 14:44,5/11/24 15:19,499,,,4,,,3,,,3,8,5,"Robot was agitated, probably fearful."


In [13]:
# Size of each row's video_id group, broadcast back onto the rows via transform.
# NOTE(review): this yields one value per rating row, not one per video, so any
# average taken over `counts` weights each video by its own number of ratings.
# If a plain per-video average is wanted, groupby("video_id").size() is the
# per-video series — confirm which is intended.
counts = processed_df.groupby("video_id").transform("size")

In [14]:
# Mean of the values in `counts` (taken over its entries as built in the previous cell)
counts.mean()

6.19873417721519