In [None]:
# Load Dependencies

import pickle
import numpy as np
import pandas as pd
from scipy.stats import pearsonr

In [None]:
# After step 1 is complete
# Load in the preprocessed and structured EEG data

# Update filename based on epoch duration
# NOTE: pickle.load can execute arbitrary code -- only open files you created yourself
# The with-statement guarantees the file handle is closed even if unpickling fails
with open("preprocessed_interpolate_bandpass_.1_20/preprocessed_data_1s_epoch.obj", 'rb') as file:
    # Read the pickled object back into memory
    preprocess_dict = pickle.load(file)

In [None]:
# Restructure data as a long dataframe

# Accumulators: one dataframe per (subj, ses), plus one subj/ses label per row
eeg_long_df_list = []
subj_list = []
ses_list = []

# Walk preprocess_dict (subj -> ses -> dataframe) and collect each dataframe
# together with labels repeated once for every row it contains
for subj, sessions in preprocess_dict.items():
    for ses, session_df in sessions.items():
        n_rows = session_df.shape[0]
        eeg_long_df_list.append(session_df)
        # extend keeps the label lists flat, so no flattening pass is needed afterwards
        subj_list.extend([subj] * n_rows)
        ses_list.extend([ses] * n_rows)

In [None]:
# Stack the per-session dataframes into one long dataframe;
# reset_index() moves the original row index into a column and renumbers the rows
eeg_long_df = (
    pd.concat(eeg_long_df_list)
    .reset_index()
)

In [None]:
# Attach the per-row subject and session labels as the 'subj' and 'ses' columns
eeg_long_df = eeg_long_df.assign(subj=subj_list, ses=ses_list)

In [None]:
# Display the combined long-format dataframe to inspect the result
eeg_long_df

In [None]:
# Write the long-format dataframe to CSV (without the row index) for later processing
# Update file name depending on epoch duration

eeg_long_df.to_csv(
    path_or_buf='preprocessed_interpolate_bandpass_.1_20/eeg_long_df_1s_epoch.csv',
    index=False,
)

In [None]:
# Empty dataframe that will hold the number of cells rejected by autoreject,
# broken down by subj, by session, by electrode
zero_counts_df = pd.DataFrame(
    columns=[
        'subj',
        'ses',
        'electrode',
        'zero_count',
        'nonzero_count',
        'total',
        'percent',
    ]
)

In [None]:
# Electrode columns of eeg_long_df to filter by:
# the four sites with the "-1" suffix first, then the same four with the "-2" suffix
electrodes = [
    f"{site}-{suffix}"
    for suffix in (1, 2)
    for site in ("AF7", "AF8", "TP9", "TP10")
]

In [None]:
# Collect one record per (subj, ses, electrode); converted to a DataFrame at the end
results = []

# For each subj, ses, and electrode:
#   zero_count    - rows whose value is exactly 0
#   nonzero_count - rows holding any other non-missing value
#   total         - zero_count + nonzero_count
#   percent       - share of rows that are zero, i.e. dropped
for (subj, ses), group in eeg_long_df.groupby(['subj', 'ses']):
    for electrode in electrodes:
        values = group[electrode]
        zero_count = (values == 0).sum()
        # Count every non-missing value that is not zero. Testing `> 0` would leave
        # negative samples out of both counts, shrinking `total` and inflating `percent`.
        nonzero_count = (values.notna() & (values != 0)).sum()
        total = zero_count + nonzero_count
        percent = (zero_count / total) * 100 if total != 0 else 0  # To handle division by zero

        results.append({
            'subj': subj,
            'ses': ses,
            'electrode': electrode,
            'zero_count': zero_count,
            'nonzero_count': nonzero_count,
            'total': total,
            'percent': percent
        })

# Convert list of dicts to DataFrame
zero_counts_df = pd.DataFrame(results)

In [None]:
# Display the per-subject, per-session, per-electrode zero counts
zero_counts_df

In [None]:
# Average share of dropped data: the mean of the 'percent' column of zero_counts_df

average_percent = zero_counts_df.loc[:, 'percent'].mean()
print("Average percentage of zero counts:", average_percent)

In [None]:
# Save as csv for subsequent processing
# Update file path name as necessary (the epoch label must match the input data)

# Wrap the scalar in a one-row DataFrame so it can be written as a CSV
df_average_percent = pd.DataFrame({'Average_Percent': [average_percent]})

# Save the DataFrame to a CSV file
# Every other path in this notebook refers to the 1s-epoch data, so the output is
# labelled 1s as well; the previous '4s' label would have mislabelled these results
df_average_percent.to_csv('data_dropped/average_percent_1s_epoch.csv', index=False)

In [None]:
# For rejected epochs, autoreject replaces the value with a zero
# That creates a problem when calculating ISCs,
# because lots of zeros artificially reduce correlation magnitude
# Replace zero with NaN in the electrode columns to deal with that issue
paired_electrode_cols = ["TP9-1", "AF7-1", "AF8-1", "TP10-1", "TP9-2", "AF7-2", "AF8-2", "TP10-2"]
eeg_long_df[paired_electrode_cols] = eeg_long_df.loc[:, paired_electrode_cols].replace(0, np.nan)

In [None]:
# Measure EEG inter-subject correlation
# For each (subj, ses) group and each electrode site, correlate the "-1" column
# with the "-2" column, then assemble one row per group with one column per site

# Output column name -> the pair of eeg_long_df columns that are correlated
isc_pairs = {
    "TP9": ("TP9-1", "TP9-2"),
    "AF7": ("AF7-1", "AF7-2"),
    "AF8": ("AF8-1", "AF8-2"),
    "TP10": ("TP10-1", "TP10-2"),
}

# Build the grouping once and reuse it for every site
session_groups = eeg_long_df.groupby(["subj", "ses"])

isc_by_site = {}
for site, (first_col, second_col) in isc_pairs.items():
    # 2x2 correlation matrix per group, indexed by (subj, ses, column name)
    pair_corr = session_groups[[first_col, second_col]].corr()
    # Select the off-diagonal cell by label instead of by position:
    # row `first_col`, column `second_col`, leaving a Series indexed by (subj, ses)
    isc_by_site[site] = pair_corr.xs(first_col, level=2)[second_col]

# Combine the per-site Series (aligned on their shared (subj, ses) index) and turn
# the group keys back into ordinary columns
corr_df = pd.DataFrame(isc_by_site).reset_index()
# Keep the established column order: the four sites first, then subj and ses
corr_df = corr_df[["TP9", "AF7", "AF8", "TP10", "subj", "ses"]]

In [None]:
# Load the structured task performance data from CSV

task_performance_df = pd.read_csv(filepath_or_buffer="task_performance.csv")

In [None]:
# Cast the 'subj' and 'ses' columns of corr_df to integers in a single call
corr_df = corr_df.astype({"subj": int, "ses": int})

In [None]:
# Left-join task_performance_df onto corr_df using 'subj' and 'ses' as keys
# Every corr_df row is kept; the task_performance_df columns are added where the keys match
# Rows without a match get NaN in the added columns

corr_df = pd.merge(
    corr_df,
    task_performance_df,
    how="left",
    on=["subj", "ses"],
)

In [None]:
# Drop every row of corr_df that has at least one missing value (NaN)

corr_df = corr_df.dropna(axis=0, how="any")

In [None]:
# Report how many distinct subjects are present in corr_df:
# take the array of unique values in the 'subj' column and return its length

len(corr_df["subj"].unique())

In [None]:
# Export the correlation dataframe to a csv file for later analysis
# Update filename based on epoch duration
# index=False matches the other exports in this notebook; without it, the row index
# left over after dropna() would be written as an unnamed first column

corr_df.to_csv("preprocessed_interpolate_bandpass_.1_20/merged_corr_df_1s_epoch.csv", index=False)