In [1]:
# Load Dependencies

import pickle
import numpy as np
import pandas as pd
from scipy.stats import pearsonr

In [2]:
# After step 1 is complete
# Load in the preprocessed and structured eeg data

# This code defines file for opening a pickeled object
# Update filename based on epoch duration
file = open("preprocessed_interpolate_bandpass_.1_20/preprocessed_data_1s_epoch.obj",'rb')


# This code reads in the pickle file
preprocess_dict = pickle.load(file)
file.close()

In [3]:
# Restructure data as a long dataframe

# Initalize empty lists for structuring data
eeg_long_df_list = []
subj_list = []
ses_list = []

# For loop that reads in data from preprocess_dict and stores it with the new variables
for subj in list(preprocess_dict.keys()):
    for ses in list(preprocess_dict[subj].keys()):
        temp_df = preprocess_dict[subj][ses]
        eeg_long_df_list.append(temp_df)
        subj_list.append([subj] * temp_df.shape[0])
        ses_list.append([ses] * temp_df.shape[0])
        
subj_list = [item for sublist in subj_list for item in sublist]
ses_list = [item for sublist in ses_list for item in sublist]

In [4]:
# Convert the list to a dataframe
eeg_long_df = pd.concat(eeg_long_df_list).reset_index()

In [5]:
# Add subj_list and ses_list to eeg_long_df
eeg_long_df["subj"] = subj_list
eeg_long_df["ses"] = ses_list

In [6]:
eeg_long_df

Unnamed: 0,level_0,index,epoch,time,TP9-1,AF7-1,AF8-1,TP10-1,TP9-2,AF7-2,AF8-2,TP10-2,subj,ses
0,0,0,0,0.000000,0.222848,11.608067,0.223902,0.058120,-0.483754,-5.345793,-0.345295,-0.346974,001,001
1,1,1,0,0.003906,-1.673274,-25.432445,-2.610390,0.434157,2.526975,151.898427,2.187296,-2.175046,001,001
2,2,2,0,0.007812,-3.492170,-54.826924,-4.886415,0.673630,4.259710,290.713836,4.544382,-4.696435,001,001
3,3,3,0,0.011719,-5.221177,-70.724021,-6.137424,0.643151,3.749553,396.613306,6.575249,-8.378026,001,001
4,4,4,0,0.015625,-6.957718,-70.413833,-6.059825,0.200740,0.575384,462.006339,8.172946,-13.309297,001,001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32646907,75003,53755,292,0.980469,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,181,003
32646908,75004,53756,292,0.984375,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,181,003
32646909,75005,53757,292,0.988281,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,181,003
32646910,75006,53758,292,0.992188,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,181,003


In [None]:
# Save the DataFrame as a CSV file for later processing 
# Update file name depending on epoch duration

eeg_long_df.to_csv('preprocessed_interpolate_bandpass_.1_20/eeg_long_df_1s_epoch.csv', index=False)

In [7]:
# Create a df
# This is where you will store the number of cells rejected by autoreject
# Filtered by subj by session by electrode
zero_counts_df = pd.DataFrame(columns=['subj', 'ses', 'electrode', 'zero_count', 'nonzero_count', 'total', 'percent'])

In [8]:
# Define electrode columns that you will filter by in eeg_long_df
electrodes = ['AF7-1', 'AF8-1', 'TP9-1', 'TP10-1', 'AF7-2', 'AF8-2', 'TP9-2', 'TP10-2']

In [9]:
# Create a new DataFrame to store results
results = []

# Count zeros and non-zeros for each subj, ses, and electrode
# Calculate total number or rows for each subj for each session for each electrode
# Calculate the percentage of rows dropped for each subj for each session for each electrode
for (subj, ses), group in eeg_long_df.groupby(['subj', 'ses']):
    for electrode in electrodes:
        zero_count = (group[electrode] == 0).sum()
        nonzero_count = (group[electrode] > 0).sum()
        total = zero_count + nonzero_count
        percent = (zero_count / total) * 100 if total != 0 else 0  # To handle division by zero

        results.append({
            'subj': subj,
            'ses': ses,
            'electrode': electrode,
            'zero_count': zero_count,
            'nonzero_count': nonzero_count,
            'total': total,
            'percent': percent
        })

# Convert list of dicts to DataFrame
zero_counts_df = pd.DataFrame(results)

In [10]:
zero_counts_df

Unnamed: 0,subj,ses,electrode,zero_count,nonzero_count,total,percent
0,001,001,AF7-1,13312,21634,34946,38.093058
1,001,001,AF8-1,13312,24934,38246,34.806254
2,001,001,TP9-1,13312,24901,38213,34.836312
3,001,001,TP10-1,13312,25407,38719,34.381053
4,001,001,AF7-2,13312,22220,35532,37.464820
...,...,...,...,...,...,...,...
2595,181,003,TP10-1,53760,11817,65577,81.979962
2596,181,003,AF7-2,30976,22170,53146,58.284725
2597,181,003,AF8-2,30976,22087,53063,58.375893
2598,181,003,TP9-2,30976,23596,54572,56.761709


In [11]:
# Get the average number of data dropped:

average_percent = zero_counts_df['percent'].mean()
print("Average percentage of zero counts:", average_percent)

Average percentage of zero counts: 72.11443898783055


In [12]:
# Save as csv for subsequent processing
# Update file path name as necessary

df_average_percent = pd.DataFrame({'Average_Percent': [average_percent]})

# Save the DataFrame to a CSV file
df_average_percent.to_csv('data_dropped/average_percent_1s_epoch.csv', index=False)

In [13]:
# For rejected epochs, autoreject replaces that value with a zero
# That creates a problem when trying to calculate ISCs
# This is because lots of zeros artifically reduce correlation magnitude
# This code replaces zero wit nan to deal with that issue
eeg_long_df[["TP9-1", "AF7-1", "AF8-1", "TP10-1", "TP9-2", "AF7-2", "AF8-2", "TP10-2"]] = \
eeg_long_df.loc[:, ["TP9-1", "AF7-1", "AF8-1", "TP10-1", "TP9-2", "AF7-2", "AF8-2", "TP10-2"]].replace(0, np.nan)

In [14]:
# Measure EEG inter-subject correlation
# Make a big correlation matrix for each electrode for each pair of participants
# Then index that correlation matrix, selecting the correct cell and restructuring as a dataframe

corr_df_TP9 = eeg_long_df.groupby(["subj", "ses"])[["TP9-1", "TP9-2"]].corr().iloc[0::2,-1].reset_index().iloc[:,-1]
corr_df_AF7 = eeg_long_df.groupby(["subj", "ses"])[["AF7-1", "AF7-2"]].corr().iloc[0::2,-1].reset_index().iloc[:,-1]
corr_df_AF8 = eeg_long_df.groupby(["subj", "ses"])[["AF8-1", "AF8-2"]].corr().iloc[0::2,-1].reset_index().iloc[:,-1]
corr_df_TP10 = eeg_long_df.groupby(["subj", "ses"])[["TP10-1", "TP10-2"]].corr().iloc[0::2,-1].reset_index().iloc[:,-1]

# Combine individual electrode correlation dataframes into one big dataframe 
corr_df = pd.concat([corr_df_TP9, corr_df_AF7, corr_df_AF8, corr_df_TP10], axis=1)
corr_df.columns = ["TP9", "AF7", "AF8", "TP10"]
corr_df["subj"] = eeg_long_df.groupby(["subj", "ses"])[["TP9-1", "TP9-2"]].corr().iloc[0::2,-1].reset_index().iloc[:,0]
corr_df["ses"] = eeg_long_df.groupby(["subj", "ses"])[["TP9-1", "TP9-2"]].corr().iloc[0::2,-1].reset_index().iloc[:,1]

In [15]:
# Read in the structured task performance data

task_performance_df = pd.read_csv("task_performance.csv")

In [16]:
# Convert the 'subj' and 'ses' columns of the corr_df dataframe from their current data types to integers
corr_df["subj"] = corr_df["subj"].astype(int)
corr_df["ses"] = corr_df["ses"].astype(int)

In [17]:
# Merge the 'corr_df' dataframe with 'task_performance_df' on columns 'subj' and 'ses', using a left join
# This operation adds the columns from 'task_performance_df' to 'corr_df' based on matching 'subj' and 'ses' values
# Rows in 'corr_df' that do not match in 'task_performance_df' will be included but have NaN values for the new columns

corr_df = corr_df.merge(task_performance_df, on=["subj", "ses"], how="left")

In [42]:
# Read in the raw self-report data

self_report_df = pd.read_csv("self_report_data.csv")

In [43]:
# Drop the columns "Q40_participant1" and "Q40_participant2" (these are "Other" entries for ethnicity)

self_report_df = self_report_df.drop(columns=['Q40_participant1', 'Q40_participant2'])

In [44]:
# Calculate the mean of cognitive and affective trust for each session

for ses in range(1, 4):
    # Affective trust columns for this session
    
    affective_cols = [col for col in self_report_df.columns if 'affecttrust' in col and f'ses{ses}' in col]
    self_report_df[f'average_affecttrust_ses{ses}'] = self_report_df[affective_cols].mean(axis=1)
    
    # Cognitive trust columns for this session
    
    cognitive_cols = [col for col in self_report_df.columns if 'cognitivetrust' in col and f'ses{ses}' in col]
    self_report_df[f'average_cognitivetrust_ses{ses}'] = self_report_df[cognitive_cols].mean(axis=1)

In [45]:
# Create a new df with unique subj and ses combinations

subj_ses_combinations = pd.DataFrame([(s, ses) for s in self_report_df['subj'].unique() for ses in range(1, 4)],
                                      columns=['subj', 'ses'])

# Merge the averages into this new df

for ses in range(1, 4):
    subj_ses_combinations = subj_ses_combinations.merge(
        self_report_df[['subj', f'average_affecttrust_ses{ses}', f'average_cognitivetrust_ses{ses}']],
        on='subj', how='left')

    # Assign the values only for the current session
    
    subj_ses_combinations.loc[subj_ses_combinations['ses'] == ses, 'affective_trust'] = subj_ses_combinations[f'average_affecttrust_ses{ses}']
    subj_ses_combinations.loc[subj_ses_combinations['ses'] == ses, 'cognitive_trust'] = subj_ses_combinations[f'average_cognitivetrust_ses{ses}']

In [46]:
# Drop the temporary average columns

for ses in range(1, 4):
    subj_ses_combinations.drop(columns=[f'average_affecttrust_ses{ses}', f'average_cognitivetrust_ses{ses}'], inplace=True)


In [47]:
subj_ses_combinations

Unnamed: 0,subj,ses,affective_trust,cognitive_trust
0,1,1,2.2,3.250000
1,1,2,2.6,3.500000
2,1,3,2.6,3.416667
3,3,1,2.6,3.583333
4,3,2,2.6,4.000000
...,...,...,...,...
466,180,2,3.4,4.000000
467,180,3,3.7,4.083333
468,181,1,2.6,3.750000
469,181,2,3.2,3.750000


In [49]:
# Print out all column names to understand their structure
print(self_report_df.columns.tolist())

['subj', 'Participant_1', 'Q1_participant1_ses1_affecttrust1', 'Q2_participant1_ses1_affecttrust2', 'Q3_participant1_ses1_affecttrust3', 'Q4_participant1_ses1_affecttrust4', 'Q5_participant1_ses1_affecttrust5', 'Q6_participant1_ses1_cognitivetrust1', 'Q7_participant1_ses1_cognitivetrust2', 'Q8_participant1_ses1_cognitivetrust3', 'Q9_participant1_ses1_cognitivetrust4', 'Q10_participant1_ses1_cognitivetrust5', 'Q11_participant1_ses1_cognitivetrust6', 'Q12_participant1_ses2_affecttrust1', 'Q13_participant1_ses2_affecttrust2', 'Q14_participant1_ses2_affecttrust3', 'Q15_participant1_ses2_affecttrust4', 'Q16_participant1_ses2_affecttrust5', 'Q17_participant1_ses2_cognitivetrust1', 'Q18_participant1_ses2_cognitivetrust2', 'Q19_participant1_ses2_cognitivetrust3', 'Q20_participant1_ses2_cognitivetrust4', 'Q21_participant1_ses2_cognitivetrust5', 'Q22_participant1_ses2_cognitivetrust6', 'Q23_participant1_ses3_affecttrust1', 'Q24_participant1_ses3_affecttrust2', 'Q25_participant1_ses3_affecttrust3

In [57]:
import pandas as pd
import numpy as np

# Assuming self_report_df is loaded

# Create DataFrame for subj and ses combinations
subj_ses_combinations = pd.DataFrame([(s, ses) for s in self_report_df['subj'].unique() for ses in range(1, 4)],
                                      columns=['subj', 'ses'])

sessions = range(1, 4)
trust_types = [('affecttrust', 5), ('cognitivetrust', 6)]  # Adjust numbers based on actual questions per session

for ses in sessions:
    for trust_type, num_questions in trust_types:
        # Column for storing correlations
        correlation_col_name = f"correlation_{trust_type}_ses{ses}"
        subj_ses_combinations[correlation_col_name] = np.nan  # Initialize with NaNs

        for index, row in self_report_df.iterrows():
            # Generate column names for participant 1 and participant 2
            p1_cols = [f"Q{i+1}_participant1_ses{ses}_{trust_type}{i+1}" for i in range(num_questions)]
            p2_cols = [f"Q{i+1}_participant2_ses{ses}_{trust_type}{i+1}" for i in range(num_questions)]

            # Only proceed if all required columns exist in the DataFrame
            if all(col in self_report_df.columns for col in p1_cols + p2_cols):
                # Extract the data and drop NAs to ensure equal length
                p1_values = row[p1_cols].dropna()
                p2_values = row[p2_cols].dropna()

                # Ensure both have the same length and more than one value to calculate correlation
                if len(p1_values) == len(p2_values) and len(p1_values) > 1:
                    # Check if there is any variance, as correlation requires variability in data
                    if p1_values.std() > 0 and p2_values.std() > 0:
                        correlation = np.corrcoef(p1_values, p2_values)[0, 1]
                    else:
                        correlation = np.nan
                else:
                    correlation = np.nan

                subj_ses_combinations.at[index, correlation_col_name] = correlation

# Output the results
print(subj_ses_combinations.head())

# Additional debug output for sessions with NaN correlations
for ses in sessions:
    for trust_type, num_questions in trust_types:
        # Print sample data to understand why correlations might be NaN
        if 'cognitive' in trust_type:  # Adjust according to your focus
            print(f"Debugging Session {ses}, Trust Type: {trust_type}")
            p1_cols = [f"Q{i+1}_participant1_ses{ses}_{trust_type}{i+1}" for i in range(num_questions)]
            p2_cols = [f"Q{i+1}_participant2_ses{ses}_{trust_type}{i+1}" for i in range(num_questions)]

            # Check if columns exist
            if all(col in self_report_df.columns for col in p1_cols + p2_cols):
                # Extract data for the first few rows to check
                sample_data = self_report_df.head()[p1_cols + p2_cols]
                print(sample_data.describe())
                # Check for variance and NaNs in the sample data
                print("Variance:\n", sample_data.var())
                print("NaN counts:\n", sample_data.isna().sum())



   subj  ses  correlation_affecttrust_ses1  correlation_cognitivetrust_ses1  \
0     1    1                      0.157243                              NaN   
1     1    2                      0.102062                              NaN   
2     1    3                     -0.612372                              NaN   
3     3    1                           NaN                              NaN   
4     3    2                           NaN                              NaN   

   correlation_affecttrust_ses2  correlation_cognitivetrust_ses2  \
0                           NaN                              NaN   
1                           NaN                              NaN   
2                           NaN                              NaN   
3                           NaN                              NaN   
4                           NaN                              NaN   

   correlation_affecttrust_ses3  correlation_cognitivetrust_ses3  
0                           NaN                  

In [56]:
# Additional debug output for sessions with NaN correlations
for ses in sessions:
    for trust_type, num_questions in trust_types:
        # Print sample data to understand why correlations might be NaN
        if 'cognitive' in trust_type:  # Adjust according to your focus
            print(f"Debugging Session {ses}, Trust Type: {trust_type}")
            p1_cols = [f"Q{i+1}_participant1_ses{ses}_{trust_type}{i+1}" for i in range(num_questions)]
            p2_cols = [f"Q{i+1}_participant2_ses{ses}_{trust_type}{i+1}" for i in range(num_questions)]

            # Check if columns exist
            if all(col in self_report_df.columns for col in p1_cols + p2_cols):
                # Extract data for the first few rows to check
                sample_data = self_report_df.head()[p1_cols + p2_cols]
                print(sample_data.describe())
                # Check for variance and NaNs in the sample data
                print("Variance:\n", sample_data.var())
                print("NaN counts:\n", sample_data.isna().sum())


Debugging Session 1, Trust Type: cognitivetrust
Debugging Session 2, Trust Type: cognitivetrust
Debugging Session 3, Trust Type: cognitivetrust


In [37]:
subj_ses_combinations

Unnamed: 0,subj,ses,correlation_affecttrust_ses1,correlation_cognitivetrust_ses1,correlation_affecttrust_ses2,correlation_cognitivetrust_ses2,correlation_affecttrust_ses3,correlation_cognitivetrust_ses3
0,1,1,,,,,,
1,1,2,,,,,,
2,1,3,,,,,,
3,3,1,,,,,,
4,3,2,,,,,,
...,...,...,...,...,...,...,...,...
466,180,2,,,,,,
467,180,3,,,,,,
468,181,1,,,,,,
469,181,2,,,,,,


In [None]:
# Merge the 'corr_df' dataframe with 'subj_ses_combinations' on columns 'subj' and 'ses', using a left join
# This operation adds the columns from 'subj_ses_combinations' to 'corr_df' based on matching 'subj' and 'ses' values
# Rows in 'corr_df' that do not match in 'subj_ses_combinations' will be included but have NaN values for the new columns

corr_df = corr_df.merge(subj_ses_combinations, on=["subj", "ses"], how="left")

In [None]:
corr_df

In [None]:
# Remove all rows from the 'corr_df' dataframe that contain any missing values (NaNs)

corr_df = corr_df.dropna()

In [None]:
# Count the unique number of subjects in the 'subj' column of the 'corr_df' dataframe
# This is achieved by selecting the 'subj' column, calling .unique() to get an array of unique values,
# and then accessing the first element of the .shape attribute, which represents the number of unique subjects

corr_df.subj.unique().shape[0]

In [None]:
# Export the correlation dataframe to a csv file for later analysis
# Update filename based on epoch duration

corr_df.to_csv("preprocessed_interpolate_bandpass_.1_20/merged_corr_df_1s_epoch.csv")