<h3><b>CREATE DATA FRAMES.IPYNB</b></h3>
<p>Creates dataframes for use in subsequent analysis, excluding all inattentive participants.</p>
<p>Outputs: chosen_stim, ratings, questionnaire_scores, questionnaire_complete, complete_task (full and excluded)</p>
<br>
<b>KEY RESULT:</b> 3 of 14 participants would be excluded using the chosen exclusion criteria (with only 1 failing to reach the performance criteria on the reversal learning task)
<p>NB: this calculation does not account for exclusions that would be made on the basis of self-reported demographic and digit-span data, because not all pilot participants had the opportunity to complete the digit-span task and demographic questionnaire (due to coding errors and an initial failure to include these tasks in the experiment). We expect that all or almost all participants will meet these criteria, and thus will not be excluded on the basis of their digit-span score or self-reported demographic information.</p>

In [64]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import jsonlines
from functools import reduce
import statistics
import seaborn as sns
import math
import os
import json
import ast
import warnings
# Silence every FutureWarning for the rest of the notebook.
# NOTE(review): this is a global filter, so deprecation notices raised inside the
# imported helper modules are hidden as well.
warnings.simplefilter(action='ignore', category=FutureWarning)
# Opt in to pandas Copy-on-Write mode (chained in-place assignment no longer
# updates the parent frame; pandas emits ChainedAssignmentError warnings instead).
pd.options.mode.copy_on_write = True
# Never truncate long cell values when a frame is displayed.
pd.set_option('display.max_colwidth', None)

In [65]:
#uses various functions
from vid_ratings_func import vid_ratings
from task_func import *
from digit_span_func import make_digit_span

In [66]:
# Load the blinded task data and the blinded demographics table, discarding the
# 'Unnamed: 0' column that each CSV carries.
df = (
    pd.read_csv("csvs/blinded_data_full.csv", low_memory=False)
    .drop(columns='Unnamed: 0')
)
demographics = (
    pd.read_csv("csvs/blinded_demography.csv")
    .drop(columns='Unnamed: 0')
)

In [67]:
# Sanity check: display the loaded demographics table to confirm the file was read correctly.
# (`df` can be displayed the same way by uncommenting the next line.)
#df
demographics 
# Note: self-report demographic information is missing for a number of pilot
# participants because some of them were never asked for it
# (this has been remedied for the final study).

Unnamed: 0,prolific_age,prolific_sex,time taken,participant_no,self_report_gender,self_report_assigned-at-birth,self_report_age,aligns
0,45,Male,51.166667,1,,,,
1,25,Female,72.533333,2,,,,
2,25,Male,42.066667,3,Male,Yes,25.0,True
3,49,Male,133.85,4,,,,
4,30,Female,128.8,5,Female,Yes,30.0,True
5,48,Male,49.833333,6,,,,
6,38,Female,43.383333,7,,,,
7,49,Female,101.6,8,,,,
8,34,Male,98.95,9,Male,Yes,34.0,True
9,20,Female,58.533333,10,Female,Yes,20.0,True


In [68]:
# Running collection of participant numbers to exclude; each check below adds to it.
excluded_list = list()

DEMOGRAPHICS

In [69]:
# Show participants whose demographic information does not align.
# Rows where `aligns` is missing also pass the "not True" test, so dropna() is
# applied afterwards to remove rows with missing data (early batches did not
# manually collect demographic data).
not_aligned = demographics['aligns'].ne(True)
demographics.loc[not_aligned].dropna()

Unnamed: 0,prolific_age,prolific_sex,time taken,participant_no,self_report_gender,self_report_assigned-at-birth,self_report_age,aligns


In [70]:
# Add every participant with complete but non-aligning demographic data to the exclusion list.
misaligned_rows = demographics[demographics.aligns != True].dropna()
excluded_list.extend(misaligned_rows.participant_no.tolist())
excluded_list

[]

TIME LIMIT

In [71]:
# Participants whose 'time taken' exceeds the limit of 120
# (presumably minutes - TODO confirm the unit against the data source).
over_time_limit = demographics['time taken'].gt(120)
demographics.loc[over_time_limit]

Unnamed: 0,prolific_age,prolific_sex,time taken,participant_no,self_report_gender,self_report_assigned-at-birth,self_report_age,aligns
3,49,Male,133.85,4,,,,
4,30,Female,128.8,5,Female,Yes,30.0,True


In [72]:
# Exclude participants who took too long ('time taken' above 120).
slow_participants = demographics.loc[demographics['time taken'] > 120, 'participant_no']
excluded_list.extend(slow_participants.tolist())
excluded_list

[4, 5]

VIDEO RATINGS

In [73]:
# Build the video-rating table and the chosen-stimulus table for all participants.
#
# nunique() ignores missing participant numbers, whereas len(set(...)) counts each
# NaN as a separate participant and would inflate the total.
# Participants are assumed to be numbered 1..total_participant.
total_participant = int(df.participant_no.nunique())

# Collect the per-participant frames first and concatenate once at the end:
# growing a DataFrame with pd.concat inside the loop is quadratic, and starting
# from an empty pd.DataFrame() relies on concat behaviour that pandas has deprecated.
rating_frames = []
chosen_stim_frames = []
for i in range(1, total_participant + 1):
    ##vid_ratings(df, i, "plot")
    rating_frames.append(vid_ratings(df, i, "rating_vids"))
    chosen_stim_frames.append(vid_ratings(df, i, "chosen_stim"))

# pd.concat raises on an empty list, so fall back to an empty frame (as before)
# when there are no participants.
ratings_df = pd.concat(rating_frames) if rating_frames else pd.DataFrame()
chosen_stim_df = pd.concat(chosen_stim_frames) if chosen_stim_frames else pd.DataFrame()

u:\Documents\Disgust learning project\github\disgust_reversal_learning-pilot\vid_ratings_func.py:19: ChainedAssignmentError: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
When using the Copy-on-Write mode, such inplace method never works to update the original DataFrame or Series, because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' instead, to perform the operation inplace on the original object.


  rating_vids_df['response'].replace('  ', np.nan, inplace=True)
u:\Documents\Disgust learning project\github\disgust_reversal_learning-pilot\vid_ratings_func.py:19: ChainedAssignmentError: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
When using the Copy-on-Write mode, such inplace method never works to upd

NEXT SECTION CAN BE REMOVED FOR REAL ANALYSIS -- CORRECTING AN ERROR IN CODE IN 1st BATCH OF PILOT DATA


In [74]:
# When this batch-1 correction section is removed for later batches, also remove
# the matching lines from vid_ratings_func.py (around line 136):
##rating_vids['batch']=sub_df.reset_index().batch[0]
##rating_vids['batch1_participant_no']=sub_df.reset_index()['batch1_participant_no']

In [75]:
# There was an issue with the video selection data for batch 1.
# This table lists, per batch-1 participant, the disgust and fear videos that
# should have been recorded, so the error can be corrected below.
batch1_corrections = [
    # (batch1_participant_no, disgust_vid, fear_vid)
    (1, '1414', '0548'),
    (2, '0888', '0877'),
    (3, '1414', '0548'),
    (4, '0888', '0877'),
]
correct_stim_df = pd.DataFrame(
    batch1_corrections,
    columns=['batch1_participant_no', 'disgust_vid', 'fear_vid'],
)
correct_stim_df

Unnamed: 0,batch1_participant_no,disgust_vid,fear_vid
0,1,1414,548
1,2,888,877
2,3,1414,548
3,4,888,877


In [76]:
# Mark, in separate columns of ratings_df, which videos should have been the
# chosen disgust / fear stimulus for each batch-1 participant.
for fix in correct_stim_df.itertuples(index=False):
    is_participant = ratings_df['batch1_participant_no'] == fix.batch1_participant_no
    is_disgust_vid = ratings_df['Vid'] == fix.disgust_vid
    is_fear_vid = ratings_df['Vid'] == fix.fear_vid
    ratings_df.loc[is_participant & is_disgust_vid, ['correct_disgust_stim']] = 1
    ratings_df.loc[is_participant & is_fear_vid, ['correct_fear_stim']] = 1

In [77]:
# Rebuild chosen_stim_df: keep the rows of every batch other than batch 1 as
# they are, and take the batch-1 rows from the ratings flagged as the correct stimuli.
other_batches = chosen_stim_df[chosen_stim_df.batch != 1]
batch1_disgust = ratings_df[ratings_df['correct_disgust_stim'] == 1]
batch1_fear = ratings_df[ratings_df['correct_fear_stim'] == 1]
chosen_stim_df = pd.concat([other_batches, batch1_disgust, batch1_fear])
chosen_stim_df

Unnamed: 0,Vid,trial_type,unpleasant_1,unpleasant_2,unpleasant_3,arousing_1,arousing_2,arousing_3,disgusting_1,disgusting_2,...,fear_stim,participant_no,batch,batch1_participant_no,unpleasant_total,arousing_total,disgusting_total,frightening_total,correct_disgust_stim,correct_fear_stim
1,1414,disgust,4,4,4,0,0,0,4,4,...,0,1.0,2,,8,0,9,0,,
5,46,fear,4,4,0,0,0,0,0,0,...,1,1.0,2,,4,0,0,3,,
1,1414,disgust,6,6,6,5,5,5,7,8,...,0,3.0,3,,12,10,15,0,,
6,374,fear,5,6,5,7,7,5,1,0,...,1,3.0,3,,11,12,0,14,,
1,1414,disgust,8,8,8,8,4,5,8,8,...,0,5.0,3,,16,9,16,0,,
7,548,fear,0,0,1,8,4,3,0,0,...,1,5.0,3,,1,7,0,5,,
1,1414,disgust,4,3,4,7,6,3,2,4,...,0,6.0,2,,7,9,6,1,,
7,548,fear,3,3,2,4,6,3,0,0,...,1,6.0,2,,5,9,0,12,,
3,1987,disgust,7,7,8,5,7,4,6,6,...,0,8.0,2,,15,11,14,14,,
6,374,fear,8,8,6,8,7,6,8,6,...,1,8.0,2,,14,13,11,15,,


RETURNING TO NORMAL (HAVE NOW FIXED ERROR IN BATCH 1 CODE)

In [78]:
# Flag participants who gave a 0 on the rating of interest for a chosen stimulus:
# `disgusting_2` for their disgust video, `frightening_2` for their fear video.
# Only the first matching row per participant and stimulus type is inspected.
disgust_stim_check = chosen_stim_df[chosen_stim_df.trial_type == "disgust"]
fear_stim_check = chosen_stim_df[chosen_stim_df.trial_type == "fear"]
rating_checks = (
    (disgust_stim_check, "disgusting_2"),
    (fear_stim_check, "frightening_2"),
)

exclude = []
for pid in set(chosen_stim_df.participant_no):
    for stim_rows, rating_col in rating_checks:
        participant_rows = stim_rows[stim_rows.participant_no == pid].reset_index()
        # A participant can be appended twice (once per stimulus type).
        if participant_rows[rating_col][0] == 0:
            exclude.append(pid)

In [79]:
# Participants flagged by the zero-rating check (a participant appears twice if
# both their disgust and fear ratings were 0).
exclude

[]

In [80]:
# Plot the video ratings of each flagged participant (once per participant),
# or report that nobody was flagged.
if not exclude:
    print("no exclusion")
else:
    for flagged in set(exclude):
        vid_ratings(df, flagged, "plot")

no exclusion


In [81]:
# Add the participants flagged for a zero rating to the exclusion list.
excluded_list.extend(exclude)
excluded_list

[4, 5]

DIGIT SPAN

In [82]:
# Get the digit span data and combine it with the demographic data.
# NB digit span was only added for later batches, so only some participants have it.
digit_span = make_digit_span(df)

# Drop any digit_span column left over from a previous run of this cell before
# merging; otherwise re-running the cell produces duplicated, suffixed columns
# (digit_span_x / digit_span_y) instead of a single digit_span column.
demographics = pd.merge(
    demographics.drop(columns='digit_span', errors='ignore'),
    digit_span,
    on='participant_no',
    how='outer',
)

In [83]:
##due to an error in my first version of the code --> will remove for later batches
# Manually set participant 6's digit span to 7.
digit_span.loc[digit_span['participant_no']==6, ['digit_span']]=7
# The digit span data has already been merged into `demographics` at this point,
# and that merged copy is what gets written to the demographics CSV later, so the
# same correction has to be applied there too or the saved file keeps the wrong value.
if 'digit_span' in demographics.columns:
    demographics.loc[demographics['participant_no']==6, ['digit_span']]=7
digit_span

Unnamed: 0,digit_span,participant_no
0,task failed,1.0
0,task failed,2.0
0,9.0,3.0
0,task failed,4.0
0,8.0,5.0
0,7,6.0
0,task failed,7.0
0,6.0,8.0
0,task failed,9.0
0,12.0,10.0


In [84]:
# Participants with a digit span of exactly 0.
has_zero_span = digit_span['digit_span'].eq(0)
digit_span.loc[has_zero_span]

Unnamed: 0,digit_span,participant_no


In [85]:
# Exclude participants whose digit span is 0.
zero_span_participants = digit_span.loc[digit_span.digit_span == 0, 'participant_no']
excluded_list.extend(zero_span_participants.tolist())
excluded_list

[4, 5]

BEHAVIOURAL TASK

In [86]:
# Build the behavioural-task dataframe from the raw data with the project helper
# create_task_df (presumably provided by `from task_func import *` - TODO confirm).
# "no plot" is passed as the second argument; presumably it suppresses plotting.
complete_task_df=create_task_df(df, "no plot")

In [87]:
# Build the per-participant task summary and show the participants whose
# `task_understood` value is "No".
task_understood = make_task_understood(df, complete_task_df, "no plot")
did_not_understand = task_understood['task_understood'] == "No"
task_understood.loc[did_not_understand]

Unnamed: 0,participant_no,attention_checks,long_breaks,total_time,timed_out_d,criteria_d,timed_out_f,criteria_f,timed_out_p,criteria_p,timed_out_total,criteria_total,task_understood,breaks_details
0,4.0,6,Yes,117.993517,0,0,1,1,0,0,1,1,No,21.446017
0,5.0,5,Yes,110.3537,1,1,0,0,0,0,1,1,No,11.201817
0,14.0,6,No,77.6045,1,1,1,1,1,1,3,3,No,


In [88]:
# Full task_understood table for all participants (not only those marked "No").
task_understood

Unnamed: 0,participant_no,attention_checks,long_breaks,total_time,timed_out_d,criteria_d,timed_out_f,criteria_f,timed_out_p,criteria_p,timed_out_total,criteria_total,task_understood,breaks_details
0,1.0,6,No,39.6671,0,0,0,0,0,0,0,0,Yes,
0,2.0,6,No,57.04905,0,0,1,1,0,0,1,1,Yes,
0,3.0,5,No,24.508767,0,0,0,0,0,0,0,0,Yes,
0,4.0,6,Yes,117.993517,0,0,1,1,0,0,1,1,No,21.446017
0,5.0,5,Yes,110.3537,1,1,0,0,0,0,1,1,No,11.201817
0,6.0,6,No,34.406667,0,0,0,0,0,0,0,0,Yes,
0,7.0,6,No,35.407667,0,0,0,0,0,0,0,0,Yes,
0,8.0,6,No,81.701967,1,0,1,1,1,0,3,1,Yes,
0,9.0,5,No,66.0957,1,0,0,0,1,1,2,1,Yes,
0,10.0,6,No,41.706117,0,0,1,0,0,0,1,0,Yes,


In [89]:
# Exclude participants whose `task_understood` value is "No".
not_understood = task_understood.loc[task_understood.task_understood == "No", 'participant_no']
excluded_list.extend(not_understood.tolist())
excluded_list

[4, 5, 4.0, 5.0, 14.0]

In [90]:
##remove excluded participants
if [] in excluded_list:
    excluded_list.remove([])
excluded_list=(set(excluded_list))
excluded_list

{4, 5, 14.0}

REMOVE PARTICIPANTS AND SAVE CSVS

In [91]:
# Remove the excluded participants from every output table and save the results.
def _without_excluded(frame):
    """Return `frame` with the rows of every participant in excluded_list filtered out."""
    for excluded_participant in excluded_list:
        frame = frame.loc[frame.participant_no != excluded_participant]
    return frame


excluded_chosen_stim = _without_excluded(chosen_stim_df)
excluded_ratings = _without_excluded(ratings_df)
excluded_complete_task = _without_excluded(complete_task_df)
excluded_demographics = _without_excluded(demographics)

# Write each filtered table to its CSV (the index is written as well, as before).
csv_outputs = (
    (excluded_chosen_stim, 'csvs/chosen_stim_excluded.csv'),
    (excluded_ratings, 'csvs/ratings_excluded.csv'),
    (excluded_complete_task, 'csvs/complete_task_excluded.csv'),
    (excluded_demographics, 'csvs/demographics_excluded.csv'),
)
for filtered_frame, csv_path in csv_outputs:
    filtered_frame.to_csv(csv_path)

GET DEMOGRAPHIC INFORMATION

In [92]:
# Mean Prolific-reported age of the participants remaining after exclusion.
excluded_demographics['prolific_age'].mean()

33.72727272727273

In [93]:
# Number of remaining participants per Prolific-reported sex (raw counts, not proportions).
excluded_demographics.prolific_sex.value_counts()

prolific_sex
Male      6
Female    5
Name: count, dtype: int64