<h3><b>CREATE DATA FRAMES.IPYNB</b></h3>
<p>Creates dataframes with all data and without excluded participants</p>
<p>outputs: chosen_stim, ratings, questionnaire_scores, questionnaire_complete, complete_task (having excluded faulty participants) </p>
<br>
<p>This script has been run using a subset of pilot data to confirm that it works</p>

In [43]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import jsonlines
from functools import reduce
import statistics
import math
import os
import json
import ast
import warnings
# Notebook-wide pandas / warning settings.
# Turn on pandas' copy-on-write mode.
pd.options.mode.copy_on_write = True
# Show the full contents of each cell when a frame is displayed (no column-width truncation).
pd.set_option('display.max_colwidth', None)
# Opt in to the future pandas behaviour of not silently downcasting dtypes.
pd.set_option('future.no_silent_downcasting', True)
# NOTE: this silences every FutureWarning, so deprecation notices raised by the
# code in this notebook will not be shown.
warnings.simplefilter(action='ignore', category=FutureWarning)

#uses various functions
from dataclean_func import *

In [44]:
# Load the task data and the demographic data.
#   For this example script: a subset of pilot data is used to show how the data cleaning works.
# The 'Unnamed: 0' column is dropped from each frame straight after reading
#   (presumably the index column written when the csv was saved - confirm).
df = pd.read_csv("csvs/dummy_data.csv", low_memory=False).drop(columns='Unnamed: 0')
demographics = pd.read_csv("csvs/dummy_demographic.csv", low_memory=False).drop(columns='Unnamed: 0')

In [45]:
# Running list of excluded participants (participant numbers).
# Starts empty; the exclusion checks in the cells below add to it.
excluded_list=[]

DEMOGRAPHICS, DIGITSPAN AND DIAGNOSIS

In [46]:
## Check for people whose demographic info doesn't align.
## Rows with any missing value are dropped, so only complete records can be excluded here.
misaligned = demographics[demographics.aligns != True].dropna()
excluded_list.extend(misaligned.participant_no)

# Digit span and diagnosis (helpers presumably provided by the dataclean_func star import).
digit_span = make_digit_span(df)
diagnosis = make_diagnosis(df)

## Exclude those with a digit span of 0.
zero_span = digit_span[digit_span.digit_span == 0]
excluded_list.extend(zero_span.participant_no)

# Combine diagnosis, digit span and demographics on participant_no.
# merge() defaults to an inner join, so only participants present in all three frames are kept.
dem_df = (
    diagnosis
    .merge(digit_span, on='participant_no')
    .merge(demographics, on='participant_no')
)

TIME LIMIT

In [47]:
# Inspect participants whose 'time taken' is above the limit of 120.
over_time_limit = demographics['time taken'] > 120
demographics[over_time_limit]

Unnamed: 0,prolific_age,prolific_sex,time taken,participant_no,self_report_gender,self_report_assigned-at-birth,self_report_age,aligns


In [48]:
# Exclude those who took too long ('time taken' above 120).
too_slow = demographics.loc[demographics['time taken'] > 120, 'participant_no']
excluded_list.extend(too_slow)
excluded_list

[]

VIDEO RATINGS

In [49]:
# Create the video ratings dataframes.
# vid_ratings (presumably from the dataclean_func star import) is called three times per
# participant, once for each requested output ("rating_vids", "chosen_stim", "points_rating").
# The per-participant results are collected in lists and concatenated once after the loop,
# instead of re-concatenating (and so re-copying) an ever-growing DataFrame on every iteration.
total_participant = len(set(df.participant_no))

rating_frames = []
chosen_stim_frames = []
points_rating_frames = []
for i in set(df.participant_no):
    ##vid_ratings(df, i, "plot")
    rating_frames.append(vid_ratings(df, i, "rating_vids"))
    chosen_stim_frames.append(vid_ratings(df, i, "chosen_stim"))

    ###NB WILL NEED TO ALTER LINE 124 ONCE HAVE ACTUAL POINTS RATINGS###
        #this is new to the full study - without these ratings in the pilot data
    points_rating_frames.append(vid_ratings(df, i, "points_rating"))

# The leading empty frame keeps each result a (possibly empty) DataFrame even when
# there is nothing to concatenate, exactly as when the frames were grown from pd.DataFrame().
ratings_df = pd.concat([pd.DataFrame(), *rating_frames])
chosen_stim_df = pd.concat([pd.DataFrame(), *chosen_stim_frames])
points_rating_df = pd.concat([pd.DataFrame(), *points_rating_frames])

# Stack the points ratings underneath the chosen stimuli, replace missing values with 0,
# order by participant and renumber the rows from 0.
chosen_stim_df = pd.concat([chosen_stim_df, points_rating_df])
chosen_stim_df = (
    chosen_stim_df
    .fillna(0)
    .sort_values(by='participant_no')
    .reset_index(drop=True)
)

In [50]:
## Exclude any participant with 0 for a rating of interest:
##   disgusting_2 on their disgust stimulus, frightening_2 on their fear stimulus.
## (Missing values in chosen_stim_df were filled with 0 when it was built.)
disgust_stim_check = chosen_stim_df[chosen_stim_df.trial_type == "disgust"]
fear_stim_check = chosen_stim_df[chosen_stim_df.trial_type == "fear"]
rating_checks = (
    (disgust_stim_check, "disgusting_2"),  # original note on this check: "ONLY 2 RATINGS"
    (fear_stim_check, "frightening_2"),
)

exclude = []
for participant in set(chosen_stim_df.participant_no):
    # Disgust is checked before fear; a participant failing both is listed twice.
    for stim_rows, rating_column in rating_checks:
        own_rows = stim_rows[stim_rows.participant_no == participant].reset_index()
        # only the first matching row is inspected
        if own_rows[rating_column][0] == 0:
            exclude.append(participant)

# Add the ones with a zero to the running exclusion list.
excluded_list.extend(exclude)
excluded_list

[]

In [51]:
# Combine with demographic information, matching on participant_no.
#   Outer join: participants present in only one of the two frames are still kept.
chosen_stim_df = chosen_stim_df.merge(dem_df, on='participant_no', how='outer')

BEHAVIOURAL TASK

In [52]:
## Create the dataframe with all task information.
## create_task_df is presumably provided by the dataclean_func star import; the "no plot"
## argument presumably switches off its plotting - confirm against the helper.
complete_task_df=create_task_df(df, "no plot")

In [54]:
# Exclusions on the basis of the task dataframe:
# anyone whose task_understood value is "No" is added to the exclusion list.
task_understood = make_task_understood(df, complete_task_df, "no plot")

not_understood = task_understood.loc[task_understood.task_understood == "No", 'participant_no']
excluded_list.extend(not_understood)
excluded_list

[]

In [55]:
# Extract task outcomes - i.e., error types, error rates, win-stay/lose-shift.
# The 'index' column of complete_task_df is removed before it is handed to the helper.
task_summary = make_task_outcomes(complete_task_df.drop(columns='index'))
task_summary

Unnamed: 0,percentage_correct,mean_perseverative_er,mean_regressive_er,median_till_correct,mean_till_correct,win_stay,lose_shift,timed_out,block_no,block_type,disgust_block,fear_block,points_block,participant_no,fractals
0,0.485,2.4,17.2,11.5,33.333333,0.68932,0.447917,1,1.0,Disgust,1,0,0,8.0,"['F014', 'F020']"
0,0.465,7.0,45.0,52.0,66.666667,0.526316,0.490385,1,3.0,Fear,0,1,0,8.0,"['F009', 'F012']"
0,0.565,0.5,13.666667,11.0,28.571429,0.542857,0.553191,1,2.0,Points,0,0,1,8.0,"['F010', 'F018']"
0,0.525,2.285714,10.571429,16.5,25.0,0.626168,0.695652,1,2.0,Disgust,1,0,0,9.0,"['F010', 'F020']"
0,0.507538,0.857143,12.285714,15.0,24.875,0.677083,0.686275,0,1.0,Fear,0,1,0,9.0,"['F012', 'F015']"
0,0.445,2.0,24.75,23.0,40.0,0.693182,0.567568,1,3.0,Points,0,0,1,9.0,"['F009', 'F018']"
0,0.78481,0.714286,1.0,9.0,9.875,0.924528,0.52,0,2.0,Disgust,1,0,0,7.0,"['F000', 'F020']"
0,0.729927,0.0,4.714286,13.0,17.125,0.875,0.729167,0,1.0,Fear,0,1,0,7.0,"['F012', 'F014']"
0,0.61745,1.142857,6.0,12.0,18.625,0.896552,0.540984,0,3.0,Points,0,0,1,7.0,"['F015', 'F018']"


Add relevant video rating outcomes and demographic info to this 'task_summary' df - for hypothesis testing

In [56]:
## Adding video information to the task dataframe (for hypothesis testing).
## For every participant this builds:
##   - stim_ratings_covariates: one row of fear-minus-disgust rating differences
##   - block_feedback: three rows (Fear / Disgust / Points) naming the feedback for that block


def _single_rating(rows, column):
    """Return the single value of `column` in `rows` as an int.

    Uses Series.item(), which raises ValueError unless `rows` holds exactly one row.
    This replaces calling int() directly on a one-element Series, which pandas has deprecated.
    """
    return int(rows[column].item())


covariate_records = []
feedback_records = []
for participant_no in set(chosen_stim_df.participant_no):
    participant_df = chosen_stim_df[chosen_stim_df.participant_no == participant_no]
    disgust = participant_df[participant_df.trial_type == "disgust"]
    fear = participant_df[participant_df.trial_type == "fear"]

    fear_unpleasant_1 = _single_rating(fear, 'unpleasant_1')
    fear_unpleasant_2 = _single_rating(fear, 'unpleasant_2')
    fear_arousing_1 = _single_rating(fear, 'arousing_1')
    fear_arousing_2 = _single_rating(fear, 'arousing_2')
    disgust_unpleasant_1 = _single_rating(disgust, 'unpleasant_1')
    disgust_unpleasant_2 = _single_rating(disgust, 'unpleasant_2')
    disgust_arousing_1 = _single_rating(disgust, 'arousing_1')
    disgust_arousing_2 = _single_rating(disgust, 'arousing_2')

    covariate_records.append({
        'participant_no': participant_no,
        # fear minus disgust on the first rating
        'valence_diff': fear_unpleasant_1 - disgust_unpleasant_1,
        'arousal_diff': fear_arousing_1 - disgust_arousing_1,
        # (rating 1 - rating 2) for fear, minus the same change for disgust
        # ("hab" presumably stands for habituation - confirm)
        'valence_habdiff': (fear_unpleasant_1 - fear_unpleasant_2)
                           - (disgust_unpleasant_1 - disgust_unpleasant_2),
        'arousal_habdiff': (fear_arousing_1 - fear_arousing_2)
                           - (disgust_arousing_1 - disgust_arousing_2),
    })

    # The last 'Vid' entry of each trial type is recorded as that block's feedback;
    # the points block always uses the fixed text below.
    feedback_by_block = {
        'Fear': fear['Vid'].iloc[-1],
        'Disgust': disgust['Vid'].iloc[-1],
        'Points': 'Lose 10 Points',
    }
    for block_type, feedback_details in feedback_by_block.items():
        feedback_records.append({
            'participant_no': participant_no,
            'block_type': block_type,
            'feedback_details': feedback_details,
        })

# Build each frame once from the collected records rather than concatenating inside the loop.
stim_ratings_covariates = pd.DataFrame(covariate_records)
block_feedback = pd.DataFrame(feedback_records)

In [57]:
## Combine demographics, video ratings and task performance into one dataframe.
# Task outcomes + the per-participant rating covariates.
task_and_ratings_summary = task_summary.merge(
    stim_ratings_covariates, on='participant_no', how='outer'
)
# + the feedback used in each block, matched on participant and block type.
task_and_ratings_summary = task_and_ratings_summary.merge(
    block_feedback, on=['participant_no', 'block_type'], how='outer'
)
# + demographics: outer-merge the frames in dfs from left to right on participant_no.
dfs = [task_and_ratings_summary, dem_df]
dem_vids_task = dfs[0]
for next_frame in dfs[1:]:
    dem_vids_task = dem_vids_task.merge(next_frame, on=['participant_no'], how='outer')

SAVE OUT as CSVs

In [58]:
## De-duplicate the excluded participants.
## Every empty-list placeholder ([]) is filtered out first. Removing only the first one would
## leave any further [] entries in place, and set() cannot hold lists (TypeError: unhashable).
## NB: from here on excluded_list is a set, despite its name.
excluded_list = {
    participant
    for participant in excluded_list
    if not (isinstance(participant, list) and not participant)
}
excluded_list

set()

In [59]:
## Remove the excluded participants from every output frame (and save out as csvs).
## NB: despite the 'excluded_' prefix, each frame below holds the participants that are KEPT,
##     i.e. the data with the excluded participants removed.
## Each frame is filtered once with a single isin() mask, instead of being re-filtered
## once per excluded participant.
excluded_ids = list(excluded_list)

excluded_chosen_stim = chosen_stim_df[~chosen_stim_df.participant_no.isin(excluded_ids)]
excluded_ratings = ratings_df[~ratings_df.participant_no.isin(excluded_ids)]
excluded_complete_task = complete_task_df[~complete_task_df.participant_no.isin(excluded_ids)]
excluded_demographics = demographics[~demographics.participant_no.isin(excluded_ids)]
excluded_dem_vids_task = dem_vids_task[~dem_vids_task.participant_no.isin(excluded_ids)]

# Saving is currently switched off; uncomment to write the csvs.
#excluded_chosen_stim.to_csv('csvs/chosen_stim_excluded.csv')
#excluded_ratings.to_csv('csvs/ratings_excluded.csv')
#excluded_complete_task.to_csv('csvs/complete_task_excluded.csv')
#excluded_demographics.to_csv('csvs/demographics_excluded.csv')
#excluded_dem_vids_task.to_csv('csvs/dem_vids_task_excluded.csv')

<p><b>GET DEMOGRAPHIC INFORMATION</b></p>
<p>e.g., for summary in results section</p>

In [60]:
# Mean of the prolific_age column for the participants kept after exclusions.
excluded_demographics.prolific_age.mean()

40.333333333333336

In [61]:
# Number of kept participants in each prolific_sex category (raw counts, not proportions).
excluded_demographics.prolific_sex.value_counts()

prolific_sex
Female    2
Male      1
Name: count, dtype: int64