# STATISTICAL LEARNING

# import python packages

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as plt
import copy

#### define parameters of the study

In [2]:
study_number = 1 # 1 or 2

# read data files

In [3]:
df_raw = pd.read_csv('/raw_data/raw_study_1.csv')

  df_raw = pd.read_csv('C:/Users/Cinti/Documents/Kutatás/AUTI/_to_publish/raw_data/raw_study_3.csv')


# preprocessing

#### filter unnecessary columns

In [4]:
if study_number == 1:
    df = df_raw.loc[:, ['Subject', 'Block', 'Epoch', 'finalRT', 'ACC', 'triplet_type', 'trial_number', 'is_practice', 'first_response']]
elif study_number == 2:
    df = df_raw.loc[:, ['Group', 'Subject', 'Block', 'Epoch', 'finalRT', 'ACC', 'triplet_type', 'trial_number']] 

#### drop participants

In [5]:
if study_number == 1:
    participants_to_drop = ['zfbr1xmr', 'ewa5a6u1', 'oah1o7vp', 'lc8zqd2x', 'i2dl9e3g', 'ln7eke9h', '3ippg8nb', 'qzmrvkrj', 'dzlz88d0', 'humixa7l', 'j21b81w9', 'ogw7lvdc', '0fp60qq4', 'n80gbsvq', 'i07caajj', 'isfjzkx5', '9l7yus87', 'wmltviex', 'j21b81w9', 'ogw7lvdc', '0fp60qq4', 'n80gbsvq', 'i07caajj', 'isfjzkx5', '9l7yus87', 'wmltviex', '1x978q7u', '2k3uo9o4', 'b8xtm1zh', 'f0akmxef', 'gt3prp03', 'h6qpfwbn', 'ls9k0t3b', 'nu06la2p', 'orjkqq2x', 'zbizxozj', '5rkxbz6t', 'arq89iuv', 'jiu3yo02', 'vjyzdao1', 'wokamkap', '1r2rgkqt', '1jgexvhk', 'u81zondt', 's1v4un3e', 'xvipcks4', 'xbiiwl18', 'smu50mjz', '8dcnhu5e', 'e550q34s', 'mfzonm1o', 'y25hszee', 'p7juo6b2', 'kd1s2st3', '89bnh3gg', 'hqqp11vo', '3yl5creu', '73fy2ybt', '4h9lbkef', 'gh8haajx', 'wxsg8bsv', 'tozvyp0y', 'ag4evd41', 'hbo6llus', 'hejp7uzx', 'qry7e8zc']
elif study_number == 2:
    participants_to_drop = [2, 7, 28]

In [6]:
df = df[~df['Subject'].isin(participants_to_drop)].reset_index(inplace=False)

#### split based on median AQ points

In [7]:
if study_number == 1:
    # load dataframe with the AQ points
    AQ_df = pd.read_excel('/raw_data/aq_recoded.xlsx')
    AQ_df = AQ_df[['Subject', 'AQ_total']]
    
    # merge with dataframe
    df = df.merge(AQ_df, on = 'Subject', how = 'left')
    
    # split based on the median AQ
    AQ_median = df['AQ_total'].median()
    df['Group'] = df['AQ_total'].apply(lambda x: 0 if x < AQ_median else 1)

In [8]:
print(df[df['Group']==0]['Subject'].nunique())
print(df[df['Group']==1]['Subject'].nunique())

77
91


#### drop the practice blocks

In [9]:
if study_number == 1:
    indexNames = df[df['is_practice'] == 1 ].index
    df.drop(indexNames , inplace=True)
    df = df.reset_index(drop=True)

#### drop trials that were not first responses to a given stimulus

In [10]:
if study_number == 1:
    indexNames1 = df[df['first_response'] == 0].index
    df.drop(indexNames1, inplace=True)
    df = df.reset_index(drop=True)

#### exclude first trials, trills and repetitions

In [11]:
indexNames = df[df['triplet_type'] == 'X'].index
df.drop(indexNames, inplace=True)
indexNames2 = df[df['triplet_type'] == 'T'].index
df.drop(indexNames2, inplace=True)
indexNames3 = df[df['triplet_type'] == 'R'].index
df.drop(indexNames3, inplace=True)
df = df.reset_index(drop=True)

#### define outliers based on boxplot

In [12]:
def get_outliers(data, value):
    # define outlier boundaries (for each subject, for each epoch)
    data['outlier'] = 0
    n_epoch = data['Epoch'].nunique()
    n_group = data['Group'].nunique()
    
    for g in range(n_group):
        subjects = data[data['Group'] == g]['Subject'].unique()
        n_subject = data[data['Group'] == g]['Subject'].nunique()
        
        for s in range(n_subject):
            q1 = data[data['Group'] == g][data['Subject'] == subjects[s]].groupby('Epoch')[value].quantile(q = 0.25).to_numpy()
            q3 = data[data['Group'] == g][data['Subject'] == subjects[s]].groupby('Epoch')[value].quantile(q = 0.75).to_numpy()
            iqr = q3 - q1
            lower = q1 - 1.5 * iqr
            upper = q3 + 1.5 * iqr
            
            # create a new variale indicating if the trial is an outlier
            for i in range(len(data)):
                if data['Group'][i] == g:
                    if data['Subject'][i] == subjects[s]:
                        for e in range(n_epoch):
                            if data['Epoch'][i] == e + 1:
                                if (data[value][i] < lower[e]) | (data[value][i] > upper[e]):
                                    data['outlier'][i] = 1
                                else:
                                    data['outlier'][i] = 0
    return data          

In [13]:
df = get_outliers(df, 'finalRT')

  q1 = data[data['Group'] == g][data['Subject'] == subjects[s]].groupby('Epoch')[value].quantile(q = 0.25).to_numpy()
  q3 = data[data['Group'] == g][data['Subject'] == subjects[s]].groupby('Epoch')[value].quantile(q = 0.75).to_numpy()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['outlier'][i] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['outlier'][i] = 1
  q1 = data[data['Group'] == g][data['Subject'] == subjects[s]].groupby('Epoch')[value].quantile(q = 0.25).to_numpy()
  q3 = data[data['Group'] == g][data['Subject'] == subjects[s]].groupby('Epoch')[value].quantile(q = 0.75).to_numpy()
A value is trying to be set on a copy of a slice from a DataFrame


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['outlier'][i] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['outlier'][i] = 1
  q1 = data[data['Group'] == g][data['Subject'] == subjects[s]].groupby('Epoch')[value].quantile(q = 0.25).to_numpy()
  q3 = data[data['Group'] == g][data['Subject'] == subjects[s]].groupby('Epoch')[value].quantile(q = 0.75).to_numpy()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['outlier'][i] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See 

KeyboardInterrupt: 

In [None]:
indexNames0 = df[df['outlier'] == 1].index
df.drop(indexNames0, inplace=True)
df = df.reset_index(drop=True)

#### drop incorrect trials

In [None]:
indexNames = df[df['ACC'] == 0 ].index
df.drop(indexNames, inplace=True)
df = df.reset_index(drop=True)

# Calculating statistical learning

#### Calculate H and L scores by epochs

In [None]:
if study_number == 1:
    SL_RT_wide = df.pivot_table(index=['Group', 'Subject', 'AQ_total'], columns=['Epoch','triplet_type'], values='finalRT', aggfunc=np.median)
    SL_RT_wide = SL_RT_wide.rename(columns={1.0: 'e1', 2.0: 'e2', 3.0:'e3', 4.0:'e4', 5.0: 'e5'}, level=0)
    SL_RT_wide.columns = SL_RT_wide.columns.map('_'.join).str.strip('_') 
    for x in [1,2,3,4,5]:
        SL_RT_wide["e"+str(x)+"_SL"] = SL_RT_wide["e"+str(x)+"_L"] - SL_RT_wide["e"+str(x)+"_H"]
elif study_number == 2:
    SL_RT_wide = df.pivot_table(index=['Group','Subject'], columns=['Epoch','triplet_type'], values='finalRT', aggfunc=np.median)
    SL_RT_wide = SL_RT_wide.rename(columns={1.0: 'e1', 2.0: 'e2', 3.0:'e3', 4.0:'e4'}, level=0)
    SL_RT_wide.columns = SL_RT_wide.columns.map('_'.join).str.strip('_')
    for x in [1,2,3,4]:
        SL_RT_wide["e"+str(x)+"_SL"] = SL_RT_wide["e"+str(x)+"_L"] - SL_RT_wide["e"+str(x)+"_H"]

SL_RT_wide.head()

#### Save data in CSV

In [None]:
#SL_RT_wide.to_csv('/processed_data/study_1/data_study_1_statistical_learning.csv',index=True)

# Calculating visuomotor performance

#### Calculate median RT by epochs

In [None]:
if study_number == 1:
    VP_RT_wide = df.pivot_table(index=['Group', 'Subject', 'AQ_total'], columns=['Epoch'], values='finalRT', aggfunc=np.median)
    VP_RT_wide = VP_RT_wide.rename(columns={1.0: 'e1', 2.0: 'e2', 3.0:'e3', 4.0:'e4', 5.0: 'e5'}, level=0)
elif study_number == 2:
    VP_RT_wide = df.pivot_table(index=['Group', 'Subject'], columns=['Epoch'], values='finalRT', aggfunc=np.median)
    VP_RT_wide = VP_RT_wide.rename(columns={1.0: 'e1', 2.0: 'e2', 3.0:'e3', 4.0:'e4'}, level=0)
    
VP_RT_wide.head()

#### Save data in CSV

In [None]:
#VP_RT_wide.to_csv('/processed_data/study_1/data_study_1_visuomotor_performance.csv',index=True)