# Read In Data

In [1]:
# import libraries
import pandas as pd
import numpy as np
import scipy.stats as stats
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
import matplotlib.pyplot as plt

In [2]:
# File names of the two CSV exports used by this notebook
TRAIN_VALIDATE_CSV = 'Train_and_Validate_EEG.csv'
TEST_CSV = 'Test_Set_EEG.csv'

# df holds the train/validate set (it carries the 'main.disorder' target used later);
# df_test holds the test set that predictions are written for at the end
df = pd.read_csv(TRAIN_VALIDATE_CSV)
df_test = pd.read_csv(TEST_CSV)

# Data Processing

## Data Cleaning and Transformation

In [3]:
# drop unnamed column between psd columns and coherence columns
# (the column label differs between the two files)
df = df.drop(columns='Unnamed: 122')
df_test = df_test.drop(columns='Unnamed: 120')

# fill in missing values for iq and education with the average value.
# The averages are computed on the training data only and reused for the test set,
# so that test-set statistics never feed into preprocessing (same convention as the
# StandardScaler further down, which is fitted on the training matrix only).
for column in ['education', 'IQ']:
    train_mean = df[column].mean()  # mean() skips missing values
    df[column] = df[column].fillna(train_mean)
    df_test[column] = df_test[column].fillna(train_mean)

# one-hot encode sex (drop_first keeps a single indicator column, used later as 'sex_M')
df = pd.get_dummies(df, columns=['sex'], dtype=int, drop_first=True)
df_test = pd.get_dummies(df_test, columns=['sex'], dtype=int, drop_first=True)

## Feature Engineering

### Training data

In [4]:
# PSD block of the training frame: every column from the first to the last 'AB.' column
psd = df.loc[:, 'AB.A.delta.a.FP1':'AB.F.gamma.s.O2']

# Region labels and the electrodes belonging to each region.
# The two lists are index-aligned: electrode_names[k] lists the electrodes of zone_names[k].
zone_names = ['FP', 'F', 'T', 'C', 'P', 'O']
electrode_names = [['a.FP1', 'b.FP2'], ['c.F7', 'd.F3', 'e.Fz', 'f.F4', 'g.F8'], ['h.T3','l.T4','m.T5','q.T6'], ['i.C3','j.Cz','k.C4',], ['n.P3','o.Pz','p.P4'], ['r.O1','s.O2']]
band_names = ['A.delta', 'B.theta', 'C.alpha', 'D.beta', 'E.highbeta', 'F.gamma']

# One summed column per (frequency band, region); collected here, converted to a DataFrame below
psd_aggregate_dict = {}

for band in band_names:
    for zone, zone_electrodes in zip(zone_names, electrode_names):
        psd_column_name = f'AB.{band}.{zone}'
        # Running per-row sum over every electrode of this region
        region_total = np.zeros(len(psd))
        for electrode in zone_electrodes:
            region_total += psd[f'AB.{band}.{electrode}']
        psd_aggregate_dict[psd_column_name] = region_total

# Converting dictionary into DataFrame
psd_aggregate = pd.DataFrame(psd_aggregate_dict)

In [5]:
# Selecting coherence data: every column from the first 'COH.' column to the end of the frame
coherence = df.loc[:, 'COH.A.delta.a.FP1.b.FP2':]

# Creating dictionary that will store our results before converting into DataFrame
coherence_hemispheres_aggregate_dict = {}

# Different categories used to identify the regions and types of coherence connections
# The index of the region corresponds to the index of the set of electrodes.
# The parity of the index of electrodes in the region corresponds to the hemisphere location.
# NOTE(review): the 'z' electrodes (Fz, Cz, Pz) sit at even indices, so the parity rule
# groups them with the index-0 side of their region - confirm this is intended.
zone_names = ['FP','F', 'T', 'C', 'P', 'O']
electrode_names = [['a.FP1', 'b.FP2'], ['c.F7', 'f.F4', 'd.F3', 'g.F8', 'e.Fz'], ['h.T3','l.T4','m.T5','q.T6'], ['i.C3','k.C4','j.Cz'], ['n.P3','p.P4','o.Pz'], ['r.O1','s.O2']]
band_names = ['A.delta', 'B.theta', 'C.alpha', 'D.beta', 'E.highbeta', 'F.gamma']

# Iterating over the frequency bands
for band in band_names:
    # Iterating over the different regions, taking care to not double count coherences
    for i in range(6):
        for j in range(i, 6):
            coh_column_name = 'COH.hemispheres.'+band+'.'+zone_names[i]+'.'+zone_names[j]
            pair_total = np.zeros(len(coherence))
            # Iterating over every electrode coherence combination
            for a in range(len(electrode_names[i])):
                for b in range(len(electrode_names[j])):
                    # Intra-region coherence: visit each unordered electrode pair only once
                    if i == j and a >= b:
                        continue
                    # Only coherences between electrodes in different hemispheres are counted
                    # (per the authors' note, a strong indicator of mental disorders in the literature).
                    # Electrodes in different hemispheres have different index parities.
                    if (a + b) % 2 != 1:
                        continue
                    first = electrode_names[i][a]
                    second = electrode_names[j][b]
                    # The dataset stores each pair under one name order only; look the column up
                    # explicitly instead of relying on a bare except to detect the wrong order.
                    pair_column = 'COH.'+band+'.'+first+'.'+second
                    if pair_column not in coherence.columns:
                        # A KeyError here means the pair is missing in both orders
                        pair_column = 'COH.'+band+'.'+second+'.'+first
                    pair_total += coherence[pair_column].values
            coherence_hemispheres_aggregate_dict[coh_column_name] = pair_total

# Converting dictionary into DataFrame
coherence_hemispheres_aggregate = pd.DataFrame(coherence_hemispheres_aggregate_dict)

In [6]:
# sex, age, education and iq are taken directly from the cleaned training frame
covariates = df[['sex_M', 'age', 'education', 'IQ']]

# features matrix = covariates followed by the psd aggregates and coherence aggregates
# (see above blocks), joined column-wise
X = pd.concat([covariates, psd_aggregate, coherence_hemispheres_aggregate], axis=1)

In [7]:
# target vector: the main disorder label of each training row
y = df['main.disorder']

# print shape of target vector (to confirm dimension matches features matrix)
print("Shape of target vector:",y.shape)

# class labels, in the order their groups are passed to the ANOVA test
disorder_labels = [
    "Mood disorder",
    "Healthy control",
    "Schizophrenia",
    "Obsessive compulsive disorder",
    "Addictive disorder",
    "Trauma and stress related disorder",
    "Anxiety disorder",
]

# one sub-frame of the features matrix per main condition
mood, healthy, schizo, ocd, addict, trauma, anxiety = (
    X[y == label] for label in disorder_labels
)

# one-way ANOVA across the seven groups; yields one F statistic / p-value per feature column
f_value, p_value = stats.f_oneway(mood, healthy, schizo, ocd, addict, trauma, anxiety)
p_value_df = pd.DataFrame({'Feature': X.columns, 'p_value': p_value})

# keep only the features whose p-value is below 0.05
is_significant = p_value_df['p_value'] < 0.05
significant_features = p_value_df.loc[is_significant, 'Feature'].tolist()
X = X[significant_features]

# print new shape of features matrix
print("Shape of significant features matrix:", X.shape)

Shape of target vector: (852,)
Shape of significant features matrix: (852, 27)


In [8]:
# encode the class labels of the target vector as integers.
# The former `y.astype("category")` line was removed: astype returns a new Series,
# which was discarded, so the statement never changed y. LabelEncoder is fitted on
# the label column as-is.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y)

### Testing data

In [9]:
# PSD block of the test frame: every column from the first to the last 'AB.' column
psd_test = df_test.loc[:, 'AB.A.delta.a.FP1':'AB.F.gamma.s.O2']

# Region labels and the electrodes belonging to each region.
# The two lists are index-aligned: electrode_names[k] lists the electrodes of zone_names[k].
zone_names = ['FP', 'F', 'T', 'C', 'P', 'O']
electrode_names = [['a.FP1', 'b.FP2'], ['c.F7', 'd.F3', 'e.Fz', 'f.F4', 'g.F8'], ['h.T3','l.T4','m.T5','q.T6'], ['i.C3','j.Cz','k.C4',], ['n.P3','o.Pz','p.P4'], ['r.O1','s.O2']]
band_names = ['A.delta', 'B.theta', 'C.alpha', 'D.beta', 'E.highbeta', 'F.gamma']

# One summed column per (frequency band, region); collected here, converted to a DataFrame below
psd_aggregate_dict_test = {}

for band in band_names:
    for zone, zone_electrodes in zip(zone_names, electrode_names):
        psd_column_name = f'AB.{band}.{zone}'
        # Running per-row sum over every electrode of this region
        region_total = np.zeros(len(psd_test))
        for electrode in zone_electrodes:
            region_total += psd_test[f'AB.{band}.{electrode}']
        psd_aggregate_dict_test[psd_column_name] = region_total

# Converting dictionary into DataFrame
psd_aggregate_test = pd.DataFrame(psd_aggregate_dict_test)

In [10]:
# Selecting coherence data: every column from the first 'COH.' column to the end of the test frame
coherence_test = df_test.loc[:, 'COH.A.delta.a.FP1.b.FP2':]

# Creating dictionary that will store our results before converting into DataFrame
coherence_hemispheres_aggregate_dict_test = {}

# Different categories used to identify the regions and types of coherence connections
# The index of the region corresponds to the index of the set of electrodes.
# The parity of the index of electrodes in the region corresponds to the hemisphere location.
# NOTE(review): the 'z' electrodes (Fz, Cz, Pz) sit at even indices, so the parity rule
# groups them with the index-0 side of their region - confirm this is intended.
zone_names = ['FP','F', 'T', 'C', 'P', 'O']
electrode_names = [['a.FP1', 'b.FP2'], ['c.F7', 'f.F4', 'd.F3', 'g.F8', 'e.Fz'], ['h.T3','l.T4','m.T5','q.T6'], ['i.C3','k.C4','j.Cz'], ['n.P3','p.P4','o.Pz'], ['r.O1','s.O2']]
band_names = ['A.delta', 'B.theta', 'C.alpha', 'D.beta', 'E.highbeta', 'F.gamma']

# Iterating over the frequency bands
for band in band_names:
    # Iterating over the different regions, taking care to not double count coherences
    for i in range(6):
        for j in range(i, 6):
            coh_column_name = 'COH.hemispheres.'+band+'.'+zone_names[i]+'.'+zone_names[j]
            pair_total = np.zeros(len(coherence_test))
            # Iterating over every electrode coherence combination
            for a in range(len(electrode_names[i])):
                for b in range(len(electrode_names[j])):
                    # Intra-region coherence: visit each unordered electrode pair only once
                    if i == j and a >= b:
                        continue
                    # Only coherences between electrodes in different hemispheres are counted
                    # (per the authors' note, a strong indicator of mental disorders in the literature).
                    # Electrodes in different hemispheres have different index parities.
                    if (a + b) % 2 != 1:
                        continue
                    first = electrode_names[i][a]
                    second = electrode_names[j][b]
                    # The dataset stores each pair under one name order only; look the column up
                    # explicitly instead of relying on a bare except to detect the wrong order.
                    pair_column = 'COH.'+band+'.'+first+'.'+second
                    if pair_column not in coherence_test.columns:
                        # A KeyError here means the pair is missing in both orders
                        pair_column = 'COH.'+band+'.'+second+'.'+first
                    pair_total += coherence_test[pair_column].values
            coherence_hemispheres_aggregate_dict_test[coh_column_name] = pair_total

# Converting dictionary into DataFrame
coherence_hemispheres_aggregate_test = pd.DataFrame(coherence_hemispheres_aggregate_dict_test)

In [11]:
# sex, age, education and iq are taken directly from the cleaned test frame
covariates_test = df_test[['sex_M', 'age', 'education', 'IQ']]

# testing matrix = covariates followed by the psd aggregates and coherence aggregates
# (see above blocks), joined column-wise
X_test_set = pd.concat(
    [covariates_test, psd_aggregate_test, coherence_hemispheres_aggregate_test],
    axis=1,
)

# restrict to the features selected on the training data, in the same column order
X_test_set = X_test_set[significant_features]

# print new shape of testing matrix (to confirm 27 significant features included)
print("Shape of testing matrix:", X_test_set.shape)

Shape of testing matrix: (93, 27)


# Logistic Regression Model

In [12]:
# logistic regression model
# scale features matrix (the scaler is fitted on the training matrix here and reused
# unchanged when the test matrix is transformed)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X)

# train the model
# The 'saga' solver shuffles the data while optimising, so random_state is fixed to make
# the fitted coefficients - and therefore the predictions - repeatable across kernel restarts.
model = LogisticRegression(penalty='elasticnet', solver='saga', max_iter=3000, l1_ratio=0.5, random_state=42)
model.fit(X_train_scaled, y_train)

## Applying the model on the testing data

In [13]:
# apply the scaler fitted on the training data to the test features
X_test_scaled = scaler.transform(X_test_set)

# predicted (integer-encoded) class for every test row
y_pred = model.predict(X_test_scaled)

# map the encoded predictions back to the original label strings and show them
decoded_predictions = label_encoder.inverse_transform(y_pred)
print(decoded_predictions)

# keep the labels as a plain list for the CSV export
y_pred_labels = list(decoded_predictions)

['Mood disorder' 'Addictive disorder' 'Mood disorder' 'Addictive disorder'
 'Addictive disorder' 'Mood disorder' 'Mood disorder' 'Mood disorder'
 'Mood disorder' 'Schizophrenia' 'Mood disorder' 'Mood disorder'
 'Mood disorder' 'Addictive disorder' 'Mood disorder' 'Mood disorder'
 'Mood disorder' 'Trauma and stress related disorder' 'Addictive disorder'
 'Mood disorder' 'Mood disorder' 'Addictive disorder' 'Mood disorder'
 'Trauma and stress related disorder' 'Anxiety disorder' 'Mood disorder'
 'Healthy control' 'Mood disorder' 'Mood disorder' 'Mood disorder'
 'Mood disorder' 'Mood disorder' 'Addictive disorder' 'Schizophrenia'
 'Mood disorder' 'Anxiety disorder' 'Mood disorder' 'Addictive disorder'
 'Mood disorder' 'Anxiety disorder' 'Healthy control' 'Addictive disorder'
 'Mood disorder' 'Trauma and stress related disorder' 'Mood disorder'
 'Addictive disorder' 'Addictive disorder' 'Addictive disorder'
 'Addictive disorder' 'Addictive disorder' 'Mood disorder'
 'Addictive disorder' 'H

# Output CSV

In [14]:
import csv

# Rows to write: a header row followed by one (ID, predicted label) pair per test row
firstcol = list(df_test['ID'])
data = [('ID', 'main.disorder.class'), *zip(firstcol, y_pred_labels)]

# Write to CSV
# newline='' is what the csv module expects for files it writes to; without it the
# writer's own '\r\n' row terminator gets translated again on Windows, producing blank rows.
# The with-block closes the file even if writing fails part-way.
with open('Test_Prediction_EEG.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerows(data)


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=dc94faa8-178e-4415-9314-a4f7e3905a4b' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>