### The below code

- Identifies each participant response videos along with the participant predictions (Well / Poor)
- Calculates the distribution of the responses across all participants

In [21]:
import os
import re
import ast
import numpy as np
import pandas as pd

from tqdm import tqdm
from moviepy.editor import VideoFileClip
from moviepy.editor import *

In [50]:
files_to_ignore = ['.DS_Store']
pilot_studyData_path = '../../pilot/collected_data/pilot_videos/'
survey_information_path = '../../pilot/collected_data/Bad_Idea_pilot_November 6, 2023_15.44.csv'

In [40]:
survey_df = pd.read_csv(survey_information_path)
# survey_df

In [41]:
pilot_participants = sorted([participant for participant in os.listdir(pilot_studyData_path) if participant not in files_to_ignore])
pilot_participants

['1162', '2196', '2404', '4439', '5977', '6109', '7192', '8240']

##### Temp pre-processing 
To map the `QID` of the stimulusVideo to its corresponding TRUE outcome

To FIX: Rearrange the stimulusVideos in the qualtrics survey for easier parsing

In [36]:
pilot_studyDataVideo_true_outcome_dict = {}

for participant in pilot_participants:
    participant_path = pilot_studyData_path + participant
    participant_studyData_path = participant_path + '/mp4StudyVideo/'
    studyVideos = sorted([video.split('_')[0] for video in os.listdir(participant_studyData_path)])
    
    for video in studyVideos:
        pilot_studyDataVideo_true_outcome_dict[video] = input(f'{video}: True Outcome = ')
    
    break

QID160: True Outcome = Bad
QID967: True Outcome = Good
QID973: True Outcome = Good
QID978: True Outcome = Good
QID983: True Outcome = Bad
QID988: True Outcome = Bad
QID993: True Outcome = Bad
QID998: True Outcome = Bad


In [42]:
pilot_studyDataVideo_true_outcome_dict

{'QID160': 'Bad',
 'QID967': 'Good',
 'QID973': 'Good',
 'QID978': 'Good',
 'QID983': 'Bad',
 'QID988': 'Bad',
 'QID993': 'Bad',
 'QID998': 'Bad'}

In [43]:
pilot_studyDataVideo_true_outcome_dict = {'QID160': 'Bad',
 'QID967': 'Good',
 'QID973': 'Good',
 'QID978': 'Good',
 'QID983': 'Bad',
 'QID988': 'Bad',
 'QID993': 'Bad',
 'QID998': 'Bad'}

##### Expected answers
Here, we read in the information and the expected outcomes of the stimulus videos to compare as to how the `perceived_outcome` varies as to the `expected_outcome` and `true_outcome`

In [45]:
survey_summary_data_path = '../../pilot/analysis_data/summary_data.xlsx'
sheet_name = 'Sheet2'
stimulusVideo_expected_answer_df = pd.read_excel(survey_summary_data_path, sheet_name)
stimulusVideo_expected_answer_df

Unnamed: 0,stimulus_video_order (qualtrics),stimulus_video_QID (qualtrics),Cut_stimulus_video_ID,Expected Answer,Comments
0,1,QID160,9,BAD,
1,2,QID967,33,BAD,
2,3,QID973,44,BAD,
3,4,QID978,41,BAD,
4,5,QID983,12,BAD,(maybe)
5,6,QID988,15,GOOD,
6,7,QID993,30,BAD,
7,8,QID998,7,BAD,


In [53]:
pilot_studyDataVideo_expected_answer = {}

for index, row in stimulusVideo_expected_answer_df.iterrows():
    pilot_studyDataVideo_expected_answer[row['stimulus_video_QID (qualtrics)']] = (row['Expected Answer'], row['Cut_stimulus_video_ID'])
pilot_studyDataVideo_expected_answer

{'QID160': ('BAD', 9),
 'QID967': ('BAD', 33),
 'QID973': ('BAD', 44),
 'QID978': ('BAD', 41),
 'QID983': ('BAD', 12),
 'QID988': ('GOOD', 15),
 'QID993': ('BAD', 30),
 'QID998': ('BAD', 7)}

Obtain the column names of the QIDs that ask about the `perceived_outcome` of the stimulusVideos

In [54]:
# Extract keys, add 2 to their integer part, and create a list of values
pilot_perceived_outcome_QIDs = ['Q' + str(int(key[3:]) + 2) for key in pilot_studyDataVideo_true_outcome_dict]
pilot_perceived_outcome_QIDs

['Q162', 'Q969', 'Q975', 'Q980', 'Q985', 'Q990', 'Q995', 'Q1000']

Create a new df to parse only the required columns and participants

In [55]:
participant_data = []
columns = ['Duration (in seconds)', 'Q959', 'Q726', 'Q11.4', 'Q11.1', 'randomID']
columns.extend(pilot_perceived_outcome_QIDs)

pilot_df = survey_df.loc[4: , columns]
pilot_df

Unnamed: 0,Duration (in seconds),Q959,Q726,Q11.4,Q11.1,randomID,Q162,Q969,Q975,Q980,Q985,Q990,Q995,Q1000
4,240,1,23,Male,"Middle Eastern or North African (e.g., Lebanes...",7192,Yes (bad idea),Yes (bad idea),Yes (bad idea),Yes (bad idea),No (good idea),No (good idea),Yes (bad idea),Yes (bad idea)
5,223,2,46,Male,Caucasian/European American/White,2196,Yes (bad idea),No (good idea),Yes (bad idea),Yes (bad idea),Yes (bad idea),No (good idea),No (good idea),Yes (bad idea)
6,225,3,28,Male,Asian/Asian American,1162,No (good idea),Yes (bad idea),Yes (bad idea),No (good idea),Yes (bad idea),No (good idea),No (good idea),No (good idea)
7,231,4,26,Male,Caucasian/European American/White,8240,Yes (bad idea),Yes (bad idea),Yes (bad idea),Yes (bad idea),Yes (bad idea),No (good idea),No (good idea),Yes (bad idea)
8,269,5,21,Female,Asian/Asian American,5977,Yes (bad idea),Yes (bad idea),Yes (bad idea),Yes (bad idea),No (good idea),No (good idea),Yes (bad idea),Yes (bad idea)
9,339,6,20,Female,Asian/Asian American,4439,Yes (bad idea),Yes (bad idea),Yes (bad idea),Yes (bad idea),Yes (bad idea),No (good idea),Yes (bad idea),Yes (bad idea)


#### Obtain Participant Information

The below methods perform:

- `getParticipantInformation()`: Extracts the participants information regarding the survey
- `videoDuration()`: calculates the video duration length

In [70]:
def videoDuration(response_stimulus_flag, stimulusVideo_ID, stimulusVideo_path):
    videos = sorted([video for video in os.listdir(stimulusVideo_path) if video not in files_to_ignore])
    for vid in videos:
        # Load the video file
        video_path = stimulusVideo_path + vid
        
        # Obtain the required video and calculate the duration of the video
        if response_stimulus_flag == 'responseVideo':
            vidName = vid.split('_')[0]
        elif response_stimulus_flag == 'stimulusVideo':
            vidName = vid.split('.')[0]
        
        if vidName == stimulusVideo_ID:
            video = VideoFileClip(video_path)
            
            # Get the duration of the video in seconds
            duration = video.duration
            return duration, vid

        
def getParticipantInformation(df, studyData_true_outcome_dict, perceived_outcome_QIDs, studyData_path):
    """
    Args():
        - df : takes in the df containing the records of participants that needs to be extracted
        - studyData_true_outcome_dict : A dict containing the TRUE stimulusVideo outcome value
        - perceived_outcome_QIDs : A list of values that contains the column name present in the dataframe
                                   that contains the participant's perceived outcome of the stimulusVideos
        - studyData_path : path where the studyResponse data is stored - to calculate the responseVideo duration length
    Returns:
        - participant_info : a list of tuples that contain relevant information for each reponseVideo for all the participants recorded in the qualtrics survey
    """
    # A list that contain tuples of information regarding each studyResponse video's information and metadata for each participant
    participant_info = []

    ### Iterate through the row of participant data
    for index, row in df.iterrows():
        ### Iterate through the columns that contain information regarding the perceived_outcome for each stimulusVideo for the given participant
        for perceived_outcome_id in tqdm(perceived_outcome_QIDs, desc='Participants Processed: '):
            study_participant_id = row['Q959']
            qualtrics_participant_id = row['randomID']

            ### Obtain the stimulusVideo's corresponding Qualtrics QID in the survey
            stimulusVideo_QID = 'QID' + str(int(perceived_outcome_id[1:]) - 2)

            ### From the dict obtained above (manually) - retrieve the TRUE stimulusVideo outcome
            true_video_outcome = studyData_true_outcome_dict[stimulusVideo_QID].lower()
            
            ### Expected outcome of the stimulusVidei
            expected_video_outcome = pilot_studyDataVideo_expected_answer[stimulusVideo_QID][0].lower()

            ### Obtain the PERCEIVED Outcome recorded in the survey
            perceived_outcome = row[perceived_outcome_id].split(' ')[1].split('(')[1].strip()

            # print(true_video_outcome, perceived_outcome)
            # print(true_video_outcome, perceived_outcome_dict[stimulusVideo_QID])
            # print('---')

            ### If the stimulusVideo's TRUE outcome matches the PERCEIVED outcome, then the user's agreement in video perspective is True.
            if true_video_outcome == perceived_outcome:
                agreement = 'Y'
            else:
                agreement = 'N'
            
            ### Specify the directory where the final stimulusVideo used in the survey is stored at - to obtain the video duration length
            survey_stimulus_cut_vidoes_path = '../pilot/stimulus_cut_videos/'
            stimulusVideo_cut_id = 'new_' + str(pilot_studyDataVideo_expected_answer[stimulusVideo_QID][1])
            stimulusVideo_duration, stimulusVideo_cut_id = videoDuration('stimulusVideo', stimulusVideo_cut_id, survey_stimulus_cut_vidoes_path)
            
            ### Specify the directory where the responseVideo of the participant is stored at - to obtain the responseVideo duration
            stimulusVideoResponse_path = studyData_path + qualtrics_participant_id + '/' + 'mp4StudyVideo/'
            stimulusVideoResponse_video_duration, stimulusVideoResponse_id = videoDuration('responseVideo', stimulusVideo_QID, stimulusVideoResponse_path)
            
            if abs(stimulusVideo_duration - stimulusVideoResponse_video_duration) <= 1.0:
                recorded_with_accurate_length = 'Y'
            else:
                recorded_with_accurate_length = 'N'
            # print(f'Participant ID: {qualtrics_participant_id}')
            # print(f'Stimulus Video: {stimulusVideo_QID}')
            # print(f'True Outcome: {studyData_true_outcome_dict[stimulusVideo_QID]}')
            # print(f"Perceived Outcome: {perceived_outcome_dict[stimulusVideo_QID].split(' ')[0]}")
            # print('-----')        

            # manual_qualtrics_participant_id = row['Q959']
            # age = row['Q726']
            # gender = row['Q11.4']
            # ethnicity = row['Q11.1'] 

            ### Append all the required data
            participant_info.append(
                (
                    study_participant_id, 
                    qualtrics_participant_id, 
                    qualtrics_participant_id, 
                    stimulusVideoResponse_id, 
                    stimulusVideo_duration, 
                    stimulusVideoResponse_video_duration, 
                    recorded_with_accurate_length, 
                    true_video_outcome, 
                    expected_video_outcome, 
                    row[perceived_outcome_id], 
                    agreement, 
                    None
                )
            )
    
    return participant_info

In [71]:
participant_info = getParticipantInformation(pilot_df, pilot_studyDataVideo_true_outcome_dict, pilot_perceived_outcome_QIDs, pilot_studyData_path)

Participants Processed: 100%|█████████████████████| 8/8 [00:01<00:00,  5.19it/s]
Participants Processed: 100%|█████████████████████| 8/8 [00:01<00:00,  5.10it/s]
Participants Processed: 100%|█████████████████████| 8/8 [00:01<00:00,  5.03it/s]
Participants Processed: 100%|█████████████████████| 8/8 [00:01<00:00,  5.44it/s]
Participants Processed: 100%|█████████████████████| 8/8 [00:01<00:00,  5.04it/s]
Participants Processed: 100%|█████████████████████| 8/8 [00:01<00:00,  5.35it/s]


In [72]:
summary_columns = [
    'Participant_ID (study)', 
    'Participant_ID (qualtrics)', 
    'aws_id', 
    'Video', 
    'Stimulus Video Duration (s)', 
    'Study Response Video Duration (s)', 
    'Recorded w/ accurate length??', 
    'True Outcome', 
    'Expected Outcome', 
    'Perceived Outcome', 
    'Agreement', 
    'Comments'
]
summary_data_df = pd.DataFrame(participant_info, columns=summary_columns)
summary_data_df

Unnamed: 0,Participant_ID (study),Participant_ID (qualtrics),aws_id,Video,Stimulus Video Duration (s),Study Response Video Duration (s),Recorded w/ accurate length??,True Outcome,Expected Outcome,Perceived Outcome,Agreement,Comments
0,1,7192,7192,QID160_1699105482544.mp4,9.97,9.93,Y,bad,bad,Yes (bad idea),Y,
1,1,7192,7192,QID967_1699105505776.mp4,8.2,8.2,Y,good,bad,Yes (bad idea),N,
2,1,7192,7192,QID973_1699105587711.mp4,7.47,7.47,Y,good,bad,Yes (bad idea),N,
3,1,7192,7192,QID978_1699105570140.mp4,7.47,7.47,Y,good,bad,Yes (bad idea),N,
4,1,7192,7192,QID983_1699105526010.mp4,6.97,6.98,Y,bad,bad,No (good idea),N,
5,1,7192,7192,QID988_1699105605093.mp4,7.0,7.0,Y,bad,good,No (good idea),N,
6,1,7192,7192,QID993_1699105456879.mp4,7.47,7.47,Y,bad,bad,Yes (bad idea),Y,
7,1,7192,7192,QID998_1699105547444.mp4,8.3,8.27,Y,bad,bad,Yes (bad idea),Y,
8,2,2196,2196,QID160_1699106002221.mp4,9.97,9.87,Y,bad,bad,Yes (bad idea),Y,
9,2,2196,2196,QID967_1699106043440.mp4,8.2,8.18,Y,good,bad,No (good idea),Y,


In [74]:
summary_data_df.to_csv('../../pilot/analysis_data/summary_data.csv', index = False)

In [75]:
agreement_counts = summary_data_df['Agreement'].value_counts().to_dict()
agreement_counts

{'N': 29, 'Y': 19}