In [1]:
# Load Dependencies

import pickle
import numpy as np
import pandas as pd
from scipy.stats import pearsonr

In [2]:
# After step 1 is complete
# Load in the preprocessed and structured EEG data

# Update filename based on epoch duration.
# The `with` block guarantees the file handle is closed even if unpickling fails.
# NOTE(review): pickle.load can execute arbitrary code while loading; only open
# files that were produced by a trusted source.
with open("preprocessed_data_1s_epoch.obj", "rb") as file:
    # Read the pickled object; the cells below index it as
    # preprocess_dict[subj][ses] -> dataframe
    preprocess_dict = pickle.load(file)

In [3]:
# Restructure data as a long dataframe

# Accumulators: one dataframe per (subject, session), plus subject and session
# labels repeated once per dataframe row so they line up after concatenation
eeg_long_df_list = []
subj_list = []
ses_list = []

# Walk the nested mapping: subject -> session -> dataframe
for subj, sessions in preprocess_dict.items():
    for ses, session_df in sessions.items():
        n_rows = len(session_df)
        eeg_long_df_list.append(session_df)
        # Extend the flat label lists directly instead of flattening afterwards
        subj_list.extend([subj] * n_rows)
        ses_list.extend([ses] * n_rows)

In [4]:
# Stack the per-session dataframes row-wise into one long dataframe.
# reset_index() replaces the concatenated index with a fresh 0..N-1 index and
# keeps the previous index values as an extra column.
eeg_long_df = pd.concat(eeg_long_df_list, axis=0).reset_index(drop=False)

In [5]:
# Attach the per-row subject and session labels built alongside the dataframe list
for label_col, label_values in (("subj", subj_list), ("ses", ses_list)):
    eeg_long_df[label_col] = label_values

In [6]:
# Preview the combined long-format dataframe (bare expression -> rich notebook display)
eeg_long_df

Unnamed: 0,level_0,index,epoch,time,TP9-1,AF7-1,AF8-1,TP10-1,TP9-2,AF7-2,AF8-2,TP10-2,subj,ses
0,0,0,0,0.000000,0.222848,11.608067,0.223902,0.058120,-0.483754,-5.345793,-0.345295,-0.346974,001,001
1,1,1,0,0.003906,-1.673274,-25.432445,-2.610390,0.434157,2.526975,151.898427,2.187296,-2.175046,001,001
2,2,2,0,0.007812,-3.492170,-54.826924,-4.886415,0.673630,4.259710,290.713836,4.544382,-4.696435,001,001
3,3,3,0,0.011719,-5.221177,-70.724021,-6.137424,0.643151,3.749553,396.613306,6.575249,-8.378026,001,001
4,4,4,0,0.015625,-6.957718,-70.413833,-6.059825,0.200740,0.575384,462.006339,8.172946,-13.309297,001,001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26439931,16635,10491,64,0.980469,0.000000,0.000000,0.000000,0.000000,4.080540,-1.120093,13.346628,34.021331,150,003
26439932,16636,10492,64,0.984375,0.000000,0.000000,0.000000,0.000000,4.055455,-1.418954,13.809336,35.572413,150,003
26439933,16637,10493,64,0.988281,0.000000,0.000000,0.000000,0.000000,3.644284,-2.067256,13.820666,36.526438,150,003
26439934,16638,10494,64,0.992188,0.000000,0.000000,0.000000,0.000000,3.044645,-2.834532,13.519710,36.891907,150,003


In [None]:
# For rejected epochs, autoreject replaces the values with zeros.
# That is a problem when calculating ISCs, because long runs of zeros
# artificially reduce the correlation magnitude.
# Replace zeros with NaN in every EEG channel so those samples are ignored.
eeg_channel_cols = [
    "TP9-1", "AF7-1", "AF8-1", "TP10-1",
    "TP9-2", "AF7-2", "AF8-2", "TP10-2",
]
eeg_long_df[eeg_channel_cols] = eeg_long_df[eeg_channel_cols].replace(0, np.nan)

In [None]:
# Measure EEG inter-subject correlation (ISC)
# For every (subj, ses) group, correlate the "<electrode>-1" column with the
# matching "<electrode>-2" column, one electrode at a time.

group_keys = ["subj", "ses"]
electrodes = ["TP9", "AF7", "AF8", "TP10"]

# Build the grouping once and reuse it for every electrode
grouped_sessions = eeg_long_df.groupby(group_keys)

isc_by_electrode = {}
for electrode in electrodes:
    first_col, second_col = f"{electrode}-1", f"{electrode}-2"
    # 2x2 correlation matrix per group, indexed by (subj, ses, column name)
    pair_corr = grouped_sessions[[first_col, second_col]].corr()
    # Pick the off-diagonal cell by label (row = first_col, column = second_col)
    # rather than by position, leaving one value per (subj, ses)
    isc_by_electrode[electrode] = pair_corr.xs(first_col, level=-1)[second_col]

# Combine the per-electrode series into one dataframe; the (subj, ses) index
# becomes regular columns, so the keys no longer need a separate correlation pass
corr_df = pd.concat(isc_by_electrode, axis=1).reset_index()

# Keep the column order: electrodes first, then subj and ses
corr_df = corr_df[electrodes + group_keys]

In [None]:
# Load the structured task performance table from disk

task_performance_path = "task_performance.csv"
task_performance_df = pd.read_csv(task_performance_path)

In [None]:
# Cast the 'subj' and 'ses' key columns of corr_df to integers in a single call
corr_df = corr_df.astype({"subj": int, "ses": int})

In [None]:
# Left-join the task performance columns onto corr_df using the ('subj', 'ses') keys.
# Every corr_df row is kept; rows with no match in task_performance_df get NaN
# in the columns that come from task_performance_df.

corr_df = pd.merge(
    corr_df,
    task_performance_df,
    how="left",
    on=["subj", "ses"],
)

In [None]:
# Drop every row of corr_df that has a missing value (NaN) in any column

corr_df = corr_df.dropna(axis=0, how="any")

In [None]:
# Number of distinct subjects remaining in corr_df:
# take the array of unique 'subj' values and count its entries

len(corr_df["subj"].unique())

In [None]:
# Write the merged correlation dataframe to CSV for later analysis
# Update filename based on epoch duration

output_csv_path = "merged_corr_df_1s_epoch.csv"
corr_df.to_csv(output_csv_path)