In [1]:
import pandas as pd
pd.set_option('future.no_silent_downcasting', True)

In [2]:
# Load the raw survey export. dtype=object keeps every cell as read (no
# numeric/boolean coercion); header=0 takes the column names from the first line.
file_path = '../Data/Raw/survey_data.csv'
df = pd.read_csv(filepath_or_buffer=file_path, dtype=object, header=0)

In [3]:
# Show all records from positional row 2 onward (rows 0-1 are skipped;
# presumably extra header rows from the survey export - TODO confirm).
df.iloc[2:]

Unnamed: 0,StartDate,EndDate,Status,Progress,Duration (in seconds),Finished,RecordedDate,ResponseId,DistributionChannel,UserLanguage,...,__js_Progress,__js_CaptchaSeen,__js_OpeningStatementSeen,__js_CommitmentRequestSeen,__js_DemographicsSeen,__js_ExampleQuestionSeen,__js_PastVideoQIDs,PROLIFIC_PID,STUDY_ID,SESSION_ID
2,5/1/24 21:30,5/1/24 21:46,IP Address,100,950,TRUE,5/1/24 21:46,R_2mFyXuMw0xGhq49,anonymous,EN,...,15,1,1,1,1,1,,5c791e24266f210012af518d,66323697033362689828dfdf,6632984eea600cf193cc86f7
3,5/1/24 21:32,5/1/24 21:51,IP Address,100,1132,TRUE,5/1/24 21:51,R_5Mg1k6wP14yRnih,anonymous,EN,...,15,1,1,1,1,1,,5a631a73b9e3b50001a6ebd8,66323697033362689828dfdf,663298b162f251a71d7c09ab
4,5/1/24 21:26,5/1/24 21:54,IP Address,100,1645,TRUE,5/1/24 21:54,R_5B8lXO4YHfKHfUJ,anonymous,EN,...,15,1,1,1,1,1,,5eebc1fd5feed239a73d693d,66323697033362689828dfdf,66329754c89ade67073a406b
5,5/1/24 21:31,5/1/24 21:55,IP Address,100,1423,TRUE,5/1/24 21:55,R_2ErW8XvC6N2tDZA,anonymous,EN,...,15,1,1,1,1,1,,6089aa8120d7418a70f3eba7,66323697033362689828dfdf,6632987acacfc3f0baf7bf0f
6,5/1/24 21:24,5/1/24 21:59,IP Address,100,2091,TRUE,5/1/24 21:59,R_3P5E5KoGwh5SatC,anonymous,EN,...,15,1,1,1,1,1,,63fc7ac4c0ba10b71a25a63d,66323697033362689828dfdf,663296d669a5872195662459
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
322,5/10/24 15:23,5/10/24 15:46,IP Address,100,1359,TRUE,5/10/24 15:46,R_5dmSacDanFPOOa5,anonymous,EN,...,15,1,1,1,1,1,"306,722,551,618,315,000,000,000,000,000,000,00...",5fe1053fad8fbab3583b4ecf,663a6eb03adf2ec50b7b89f2,663e1fd0eacfdc715c839f5b
323,5/11/24 14:04,5/11/24 14:29,IP Address,100,1459,TRUE,5/11/24 14:29,R_41iYm9SyiCFRoPv,anonymous,EN,...,15,1,1,1,1,1,"31,233,263,128,256,800,000,000,000,000,000,000...",577bfa3c9fecfe0001a5bcbf,663a6eb03adf2ec50b7b89f2,663f5ec72814424115da5964
324,5/11/24 14:04,5/11/24 14:30,IP Address,100,1547,TRUE,5/11/24 14:30,R_7P1bLiBuznSTRyY,anonymous,EN,...,15,1,1,1,1,1,"251,412,822,843,274,000,000,000,000,000,000,00...",5e332dec41d1cd30b905bdfe,663a6eb03adf2ec50b7b89f2,663f5ec02e128dbb738c9864
325,5/11/24 14:05,5/11/24 14:31,IP Address,100,1577,TRUE,5/11/24 14:31,R_1fcOe9zF7A8itEh,anonymous,EN,...,15,1,1,1,1,1,"31,442,829,359,988,300,000,000,000,000,000,000...",62c1f284b5ce5f4d8840ea7c,663a6eb03adf2ec50b7b89f2,663f5eef7d50f1ec92e1423d


In [4]:
# Output names for one video's rating values. Order matters: the names are
# paired positionally with the values read from each video's column block.
intensity_columns = [
    'joy',
    'sadness',
    'fear',
    'anger',
    'disgust',
    'surprise',
    'other',
]
sam_columns = [
    'pleasure',
    'arousal',
    'dominance',
]
rating_columns = [*intensity_columns, 'other_text', *sam_columns, 'appraisal']

In [5]:
# Layout constants: the number of videos each participant must have rated,
# and the number of consecutive columns that make up one video's block.
num_videos, cols_per_video = 10, 16

In [6]:
# Define a function to extract the video identifier from the column index
def extract_video_id(col_index, first_col=45, block_width=None):
    """Map a positional column index to the 1-based id of its video block.

    Columns from ``first_col`` onward are treated as consecutive blocks of
    ``block_width`` columns, one block per video; the first block is video 1.

    Args:
        col_index: Positional column index (expected to be >= ``first_col``).
        first_col: Positional index of the first column of video 1.
        block_width: Number of columns per video. ``None`` (the default)
            falls back to the notebook-level ``cols_per_video``.

    Returns:
        The 1-based video id as an int.
    """
    if block_width is None:
        # Resolved at call time so later changes to the notebook constant are honoured.
        block_width = cols_per_video
    return (col_index - first_col) // block_width + 1

In [7]:
# Accumulator for the reshaped records (one dict per participant/video pair)
processed_rows = list()

In [8]:
# Reshape the wide survey export into one record per (participant, video).
# Start from an empty list so that re-running this cell does not append a
# second copy of every record to results left over from an earlier run.
processed_rows = []

first_video_col = 45   # positional index where the first video's column block starts
end_video_col = 8237   # positional index just past the last video's column block
leading_cols = 4       # columns at the start of each block that are skipped (not mapped to rating_columns)

# Rows 0-1 are skipped; participant responses are read from positional row 2 onward.
for _, row in df.iloc[2:, :].iterrows():
    # Videos this participant answered: every block with at least one non-null cell.
    videos_to_keep = sorted({
        extract_video_id(block_start)
        for block_start in range(first_video_col, end_video_col, cols_per_video)
        if row.iloc[block_start:block_start + cols_per_video].notna().any()
    })

    # Each participant is expected to have rated exactly `num_videos` videos.
    if len(videos_to_keep) != num_videos:
        raise ValueError(f"Participant {row.QID3948} rated {len(videos_to_keep)} videos.")

    for video_id in videos_to_keep:
        block_start = first_video_col + (video_id - 1) * cols_per_video
        block_end = block_start + cols_per_video
        # Skip the leading columns; the remaining values pair up positionally
        # with the names in rating_columns.
        ratings_vals = row.iloc[block_start + leading_cols:block_end]

        processed_row = {
            'participant_id': row.QID3948,
            'start_time': row.StartDate,
            'end_time': row.RecordedDate,
            'video_id': video_id,
        }
        processed_row.update(zip(rating_columns, ratings_vals))
        processed_rows.append(processed_row)

In [9]:
# Assemble the collected per-video records into a single table
processed_df = pd.DataFrame(data=processed_rows)

In [10]:
# Participants removed from the analysis; every row of theirs is dropped.
# NOTE(review): the previous header comment also named 6632140e3df7840cd7a81131,
# which the code never excluded - confirm whether it should be in this list.
excluded_participants = [
    "5f09fab615ee6b0ab5235b15",
    "65c349313a148edd8311e6a6",
    "64033cb754a87f6c919f305a",
    "6278270bfb2439bf3abf192d",
    "6347c42aae670afaf618bb62",
    # "59b044b4d98aab00019bad5a",  # not sure about this one
    "62bd685ac3edc9ad0d77812b",
    "6151797ab0b64a8275ab4f1e",
    "58954e2dc88c680001dd4b7e",
    "647ffbcfcf8adc8070bc2291",
]
# A single membership filter replaces the repeated per-id drop calls; the
# index is renumbered afterwards so it is contiguous again.
processed_df = processed_df[~processed_df.participant_id.isin(excluded_participants)].reset_index(drop=True)

In [11]:
# Persist the processed ratings table; the row index is not written to the file
processed_df.to_csv(path_or_buf='../Data/Processed/rating.csv', index=False)

In [12]:
# Display the processed table (one row per participant/video pair)
processed_df

Unnamed: 0,participant_id,start_time,end_time,video_id,joy,sadness,fear,anger,disgust,surprise,other,other_text,pleasure,arousal,dominance,appraisal
0,5c791e24266f210012af518d,5/1/24 21:30,5/1/24 21:46,1,,Very Low,Very Low,,,Low,,,4,3,3,i honestly dont know about this one
1,5c791e24266f210012af518d,5/1/24 21:30,5/1/24 21:46,6,,High,,,,,,,1,1,2,it seemed like something had made the robot sa...
2,5c791e24266f210012af518d,5/1/24 21:30,5/1/24 21:46,59,,Very High,Very Low,,,,,,1,1,1,it seems to be sad and maybe a little confused...
3,5c791e24266f210012af518d,5/1/24 21:30,5/1/24 21:46,61,,Very Low,,Very Low,Very Low,,,,4,3,3,it seemed to be kinda all over the place
4,5c791e24266f210012af518d,5/1/24 21:30,5/1/24 21:46,219,,,High,,,Very Low,,,2,7,4,"it seems like something made the robot scared,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3155,62a447d91e71bfacd2e8c006,5/11/24 14:44,5/11/24 15:19,298,,Very High,,,,,,,1,6,2,"Robot was very sad, and just bumbling about, w..."
3156,62a447d91e71bfacd2e8c006,5/11/24 14:44,5/11/24 15:19,409,,High,,,,,,,1,9,5,It was sad and was singing a sad song - it was...
3157,62a447d91e71bfacd2e8c006,5/11/24 14:44,5/11/24 15:19,438,High,,,,,,,,7,7,6,"Robot seemed quite happy, and was going about ..."
3158,62a447d91e71bfacd2e8c006,5/11/24 14:44,5/11/24 15:19,499,,,High,,,Average,,,3,9,5,"Robot was agitated, probably fearful."


In [13]:
# Distinct participant ids remaining in the processed table
processed_df["participant_id"].unique()

array(['5c791e24266f210012af518d', '5a631a73b9e3b50001a6ebd8',
       '5eebc1fd5feed239a73d693d', '6089aa8120d7418a70f3eba7',
       '63fc7ac4c0ba10b71a25a63d', '5e58436ebdccf5057ddd9190',
       '656cbb93ba4ac7120453d910', '5b09da13641b1200010eab05',
       '648ddf3aeff70c1f26ff2652', '60089c477baa6b3fba6dbe80',
       '55898937fdf99b7ae47f5c7f', '5e0e72587855d35d4dd68c7d',
       '6165e0b51a883e9db8cc7146', '5e7a4371b86374017acf58c3',
       '59e71a3b1ac65d0001eb47ac', '5e53d2941b233d02960a0dde',
       '5c12521fbf739a0001cfeb03', '589c431f0da7f10001de7407',
       '5f0cfeef098f1e53ff201909', '59b536d60516f6000197c3f9',
       '64543fa5dfed1d0ba8024957', '5a1683aeab721b0001ef8e4c',
       '5f2ac70ca7fb433824928131', '60cdad61f2fc16483e5db0a7',
       '5fbb4b0d5fdea9c55f813ace', '646baaaa73f3e86f2af26ac8',
       '60827ccde4706170115a3bf5', '5d57d77df9a0cd00177fc3da',
       '60a80fbd13aba726d46b8f2e', '63c53d59463365d1a0a23b1e',
       '5b5f15f2fc072a00017da73d', '5d96a61b5d0f1c00117

In [14]:
# Number of distinct participants remaining in the processed table
len(processed_df["participant_id"].unique())

316

In [15]:
# For every row, the number of rows that share its participant_id
counts = processed_df.groupby(by="participant_id").transform("size")

# Keep only the rows of participants that have more than 10 rows
filtered_df = processed_df.loc[counts > 10]

In [16]:
# Display the current contents of filtered_df
filtered_df

Unnamed: 0,participant_id,start_time,end_time,video_id,joy,sadness,fear,anger,disgust,surprise,other,other_text,pleasure,arousal,dominance,appraisal


In [17]:
# Distinct video ids that appear in at least one processed row
videos_covered = set(processed_df["video_id"].unique())

In [18]:
# Number of distinct video ids present in the processed table
len(videos_covered)

512

In [19]:
# Reference set of video ids: every integer from 1 through 512
all_videos = set(range(1, 512 + 1))

In [20]:
# Reference video ids that do not appear in the processed table
all_videos - videos_covered

set()

In [21]:
# How many reference video ids are absent from the processed table
len(all_videos - videos_covered)

0

In [29]:
# For every row, the number of rows that share its video_id.
# NOTE: this rebinds `counts`, which an earlier cell used for per-participant sizes.
counts = processed_df.groupby(by="video_id").transform("size")

# Keep only the rows of videos that have exactly 5 rows
filtered_df = processed_df.loc[counts == 5]

In [30]:
# Ascending list of the distinct video ids present in filtered_df
sorted(filtered_df["video_id"].unique())

[393, 464, 466]

In [31]:
# Number of distinct video ids present in filtered_df
len(filtered_df["video_id"].unique())

3

In [32]:
# Coverage histogram: for each n in 1..10, how many distinct videos have
# exactly n rows in processed_df. The per-video tally is computed here rather
# than read from the shared `counts` variable, which different cells bind to
# different groupings, so the result does not depend on cell execution order.
# `filtered_df` is deliberately left untouched.
ratings_per_video = processed_df["video_id"].value_counts()
count = {n: int((ratings_per_video == n).sum()) for n in range(1, 11)}

In [38]:
# Display the mapping {rows per video: number of distinct videos with that many rows}
count

{1: 0, 2: 0, 3: 0, 4: 0, 5: 3, 6: 420, 7: 88, 8: 0, 9: 1, 10: 0}

In [34]:
# Distinct video ids in the table plus reference ids missing from it
len(processed_df["video_id"].unique()) + len(all_videos - videos_covered)

512

In [35]:
# Inspect every processed row recorded for video 331
processed_df.loc[processed_df["video_id"] == 331]

Unnamed: 0,participant_id,start_time,end_time,video_id,joy,sadness,fear,anger,disgust,surprise,other,other_text,pleasure,arousal,dominance,appraisal
468,653fcb1b0f5d689c00a97629,5/3/24 16:57,5/3/24 18:04,331,Low,Low,High,Average,High,High,,,3,7,7,The robot is facing a challenge or obstacle th...
679,5ea6df209d688606829ac644,5/8/24 18:39,5/8/24 19:07,331,,Very Low,High,,,Very Low,,,5,9,6,The robot looks like it's doing something susp...
1134,5b5df59dd113f600012de8e9,5/8/24 21:15,5/8/24 21:54,331,,,Very High,High,,High,,,3,6,6,The robot may be confused or in search of some...
1927,6156fef09d1ab07148870099,5/9/24 13:10,5/9/24 13:38,331,,Low,Very Low,Very Low,,,,,4,5,4,seemed like trying to find something
2444,58174d95a22f0f0001a3f0be,5/9/24 20:12,5/9/24 20:32,331,,Low,High,,,Very Low,,,1,7,3,The robot seemed frightened to move around too...
2524,65de18036cf5d10beb26c723,5/9/24 20:13,5/9/24 21:20,331,,Very High,,Average,,,,,2,4,3,"the robot looks sad and angry, that is why it ..."
2546,65bd29a33fe10dcc8e183b52,5/9/24 13:39,5/9/24 22:11,331,Low,Very Low,Very Low,Average,Average,Very Low,,,2,9,7,"The robot is alert, and moderately angry and d..."
2886,63e96aac1a64f6442aeadf72,5/10/24 11:52,5/10/24 12:12,331,,,Average,,,,,,1,4,1,Robot seems to be alarmed by something
2987,6545c72ce36b05a1f97d2052,5/10/24 12:38,5/10/24 13:06,331,,Low,Very Low,Very Low,,,,,2,1,2,Robot is going around in circles and staying t...
