<h3><b>CREATE DATA FRAMES.IPYNB</b></h3>
<p>Creates dataframes containing all data, plus versions with excluded participants removed.</p>
<p>Outputs (CSV): chosen_stim, ratings, complete_task, dem_vids_task (full and excluded versions) and demographics (excluded version only).</p>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import jsonlines
from functools import reduce
import statistics
import math
import os
import json
import ast
import warnings
pd.options.mode.copy_on_write = True
pd.set_option('display.max_colwidth', None)
pd.set_option('future.no_silent_downcasting', True)
warnings.simplefilter(action='ignore', category=FutureWarning)

#uses various functions
from dataclean_func import *

In [2]:
# Load the raw task data and the demographic data.
# Both CSVs carry a leftover 'Unnamed: 0' column, which is dropped straight away.
# NOTE(review): these are absolute, machine-specific paths; consider a configurable data directory.
df = pd.read_csv(
    "/Documents/Disgust learning project/pilot/pilot_analysis_all/dummy_data.csv",
    low_memory=False,
).drop(columns='Unnamed: 0')
demographics = pd.read_csv(
    "/Documents/Disgust learning project/pilot/pilot_analysis_all/dummy_demographic.csv",
    low_memory=False,
).drop(columns='Unnamed: 0')

In [3]:
# Running list of participant numbers to exclude; later cells append to it.
excluded_list = list()

DEMOGRAPHICS, DIGITSPAN AND DIAGNOSIS

In [4]:
# Exclude participants whose demographic info doesn't align.
# `!= True` (rather than `== False`) also selects rows where `aligns` is missing;
# dropna() then removes every row that has a missing value in ANY column.
# NOTE(review): if only a missing `aligns` should be ignored, dropna(subset=['aligns'])
# would be more precise -- confirm the intended rule.
misaligned = demographics[demographics.aligns != True].dropna()
excluded_list.extend(misaligned.participant_no.tolist())

# Digit span and diagnosis tables (built by helpers from dataclean_func).
digit_span = make_digit_span(df)
diagnosis = make_diagnosis(df)

# Exclude those with a digit span of 0.
excluded_list.extend(
    digit_span.loc[digit_span['digit_span'] == 0, 'participant_no'].tolist()
)

# One combined table of diagnosis, digit span and demographics per participant.
# Default (inner) merges: only participants present in all three tables are kept.
dem_df = (diagnosis.merge(digit_span, on='participant_no')).merge(demographics, on='participant_no')

VIDEO RATINGS

In [5]:
# Create the video ratings dataframes.
# Per-participant frames are collected in lists and concatenated once at the end,
# instead of re-concatenating a growing dataframe on every iteration.
total_participant = len(set(df.participant_no))

ratings_frames = []
chosen_stim_frames = []
points_rating_frames = []
for participant in set(df.participant_no):
    # vid_ratings(df, participant, "plot") can be called here to plot instead.
    ratings_frames.append(vid_ratings(df, participant, "rating_vids"))
    chosen_stim_frames.append(vid_ratings(df, participant, "chosen_stim"))
    ###NB WILL NEED TO ALTER LINE 124 ONCE HAVE ACTUAL POINTS RATINGS
    # NOTE(review): "line 124" presumably refers to dataclean_func -- confirm.
    points_rating_frames.append(vid_ratings(df, participant, "points_rating"))

# pd.concat raises on an empty list, so fall back to an empty frame when there are no participants.
ratings_df = pd.concat(ratings_frames) if ratings_frames else pd.DataFrame()
chosen_stim_df = pd.concat(chosen_stim_frames) if chosen_stim_frames else pd.DataFrame()
points_rating_df = pd.concat(points_rating_frames) if points_rating_frames else pd.DataFrame()

# Stack the points ratings under the chosen stimuli, replace missing values with 0,
# order by participant and renumber the rows from 0.
chosen_stim_df = pd.concat([chosen_stim_df, points_rating_df])
chosen_stim_df = (
    chosen_stim_df
    .fillna(0)
    .sort_values(by='participant_no')
    .reset_index(drop=True)
)

wide_ratings_df = wide_ratings(chosen_stim_df)

In [6]:
# Exclude any participant with a 0 on the rating of interest for their chosen stimulus:
# `disgusting_2` on the disgust row, `frightening_2` on the fear row.
# Missing values were filled with 0 when chosen_stim_df was built, so they count as 0 here.
disgust_stim_check = chosen_stim_df.loc[chosen_stim_df["trial_type"] == "disgust"]
fear_stim_check = chosen_stim_df.loc[chosen_stim_df["trial_type"] == "fear"]

exclude = []
for pid in set(chosen_stim_df["participant_no"]):
    # Only the first matching row per participant is inspected (label 0 after reset_index).
    # NOTE(review): the [0] lookup raises KeyError if a participant has no row of that trial type.
    participant_disgust = disgust_stim_check.loc[disgust_stim_check["participant_no"] == pid].reset_index()
    if participant_disgust["disgusting_2"][0] == 0:  ##ONLY 2 RATINGS
        exclude.append(pid)

    participant_fear = fear_stim_check.loc[fear_stim_check["participant_no"] == pid].reset_index()
    if participant_fear["frightening_2"][0] == 0:
        exclude.append(pid)

# Add everyone flagged above to the running exclusion list and display it.
excluded_list.extend(exclude)
excluded_list

[]

BEHAVIOURAL TASK

In [7]:
# Create the dataframe with all task information, built by create_task_df (dataclean_func).
# NOTE(review): the "no plot" argument presumably suppresses plotting -- confirm against the helper.
complete_task_df=create_task_df(df, "no plot")

In [8]:
# Exclusions based on the task dataframe: anyone whose `task_understood` value is "No".
task_understood = make_task_understood(df, complete_task_df, "no plot")

# Inspect the flagged rows with: task_understood[task_understood.task_understood=="No"]
excluded_list.extend(
    task_understood.loc[task_understood["task_understood"] == "No", "participant_no"].tolist()
)
excluded_list

[7.0]

In [9]:
# Extract task outcomes (error types, error rates, win-stay/lose-shift) from the
# task dataframe; its 'index' column is dropped before it is passed to the helper.
task_summary = make_task_outcomes(complete_task_df.drop(columns='index'))
task_summary

Unnamed: 0,percentage_correct,mean_perseverative_er,mean_regressive_er,median_till_correct,mean_till_correct,win_stay,lose_shift,timed_out,block_no,block_type,disgust_block,fear_block,points_block,participant_no
0,0.704545,0.428571,2.428571,9.5,11.0,0.822581,0.76,0,2.0,Disgust,1,0,0,8.0
0,0.575,0.6,15.6,30.5,33.333333,0.663158,0.557692,1,1.0,Fear,0,1,0,8.0
0,0.690909,0.142857,4.0,13.0,13.75,0.876923,0.772727,0,3.0,Points,0,0,1,8.0
0,0.6625,0.857143,2.285714,8.5,10.0,0.931818,0.542857,0,1.0,Disgust,1,0,0,9.0
0,0.58,1.428571,3.571429,8.0,12.5,0.976744,0.553571,0,2.0,Fear,0,1,0,9.0
0,0.671233,0.285714,2.142857,6.5,9.125,1.0,0.655172,0,3.0,Points,0,0,1,9.0
0,0.455,5.0,49.0,46.0,66.666667,0.478261,0.448598,1,3.0,Disgust,1,0,0,7.0
0,0.561728,0.571429,9.0,19.5,20.25,0.608696,0.507246,0,1.0,Fear,0,1,0,7.0
0,0.563953,1.857143,8.285714,21.0,21.5,0.645161,0.525641,0,2.0,Points,0,0,1,7.0


In [17]:
# Combine task performance, demographics and video ratings into one dataframe.
# The frames are outer-merged left to right on participant_no, so a participant
# missing from one table is still kept.
dfs = [task_summary, dem_df, wide_ratings_df]
dem_vids_task = dfs[0]
for next_frame in dfs[1:]:
    dem_vids_task = pd.merge(dem_vids_task, next_frame, on=['participant_no'], how='outer')

SAVE OUT as CSVs

In [18]:
# Save the full (pre-exclusion) dataframes.
# to_csv does not create missing folders, so make sure the output directory exists first.
os.makedirs('csvs', exist_ok=True)

# The row index is written as the first (unnamed) column of each file.
chosen_stim_df.to_csv('csvs/chosen_stim_full.csv')
ratings_df.to_csv('csvs/ratings_full.csv')
complete_task_df.to_csv('csvs/complete_task_full.csv')
dem_vids_task.to_csv('csvs/dem_vids_task_full.csv')

In [24]:
# Finalise the exclusions: drop every empty-list placeholder, then de-duplicate.
# All empty lists are filtered out (not just the first one), because a leftover []
# is unhashable and would make the set construction raise TypeError.
# NOTE(review): nothing visible in this notebook appends []; the guard is kept from the original.
# After this cell `excluded_list` is a set, not a list.
excluded_list = {
    participant
    for participant in excluded_list
    if not (isinstance(participant, list) and len(participant) == 0)
}
excluded_list

{7.0}

In [25]:
# Remove the excluded participants from every dataframe and save the results.
# Each frame is filtered once with a single membership mask: rows whose
# participant_no appears in excluded_list are dropped, all other rows are kept.
excluded_ids = list(excluded_list)

excluded_chosen_stim = chosen_stim_df.loc[~chosen_stim_df.participant_no.isin(excluded_ids)]
excluded_ratings = ratings_df.loc[~ratings_df.participant_no.isin(excluded_ids)]
excluded_complete_task = complete_task_df.loc[~complete_task_df.participant_no.isin(excluded_ids)]
excluded_demographics = demographics.loc[~demographics.participant_no.isin(excluded_ids)]
excluded_dem_vids_task = dem_vids_task.loc[~dem_vids_task.participant_no.isin(excluded_ids)]

# to_csv does not create missing folders, so make sure the output directory exists first.
os.makedirs('csvs', exist_ok=True)

excluded_chosen_stim.to_csv('csvs/chosen_stim_excluded.csv')
excluded_ratings.to_csv('csvs/ratings_excluded.csv')
excluded_complete_task.to_csv('csvs/complete_task_excluded.csv')
excluded_demographics.to_csv('csvs/demographics_excluded.csv')
excluded_dem_vids_task.to_csv('csvs/dem_vids_task_excluded.csv')

GET DEMOGRAPHIC INFORMATION

In [14]:
# Mean age (prolific_age) of the participants remaining after exclusions.
excluded_demographics["prolific_age"].mean()

np.float64(31.333333333333332)

In [15]:
# Raw counts (not proportions) of each prolific_sex value among the remaining participants.
excluded_demographics.prolific_sex.value_counts()

prolific_sex
Female    2
Male      1
Name: count, dtype: int64