### the EEG data has already been preprocessed in matlab using EEGlab
This project aims to detect cognitive distraction in drivers using their EEG data. Due to privacy concerns, the data cannot be shared.

In [4]:
# !pip install pyEDFlib pandas

## Data combining and labeling

In [13]:
import pyedflib
import pandas as pd
import os

def edf_to_csv(edf_file_path, csv_file_path):
    # Open the EDF file
    f = pyedflib.EdfReader(edf_file_path)
    
    # Extract signal labels
    signal_labels = f.getSignalLabels()
    
    # Initialize a dictionary to store the data
    data_dict = {label: [] for label in signal_labels}
    
    # Extract the data for each signal
    for i in range(f.signals_in_file):
        data_dict[signal_labels[i]] = f.readSignal(i)
    
    # Close the EDF file
    f.close()
    
    # Create a DataFrame
    df = pd.DataFrame(data_dict)
    
    # Save the DataFrame as a CSV file
    df.to_csv(csv_file_path, index=False)


# Specify the directory containing the EDF files
#edf_directory = r"C:\Users\DELL\Documents\AllStudentsRecords\clean\name"

# Specify the directory where the CSV files should be saved
csv_directory = r"C:\Users\DELL\Documents\AllStudentsRecords\clean\data"


# Iterate over all EDF files in the specified directory
for filename in os.listdir(edf_directory):
    if filename.endswith(".edf"):
        edf_file_path = os.path.join(edf_directory, filename)
        csv_file_path = os.path.join(csv_directory, os.path.splitext(filename)[0] + ".csv")
        
        # Convert the EDF file to CSV
        edf_to_csv(edf_file_path, csv_file_path)


In [14]:
def add_label_to_csv(csv_file_path, labeled_csv_path):
    # Extract the label from the file name 
    filename = os.path.basename(csv_file_path)
    label = filename.split('_')[-1].split('.')[0] 
    
    # Read the CSV file into a DataFrame
    df = pd.read_csv(csv_file_path)
    
    # Add a new column 'Label' with the extracted label
    df['Label'] = label
    
    # Save the labeled DataFrame to the new CSV file
    df.to_csv(labeled_csv_path, index=False)


#directory containing the original CSV files
csv_directory = r"C:\Users\DELL\Documents\AllStudentsRecords\clean\data"

#directory where the labeled CSV files should be saved
labeled_csv_directory = r"C:\Users\DELL\Documents\AllStudentsRecords\clean\labeled"


In [15]:
# Iterate over all CSV files in the specified directory
for filename in os.listdir(csv_directory):
    if filename.endswith(".csv"):
        csv_file_path = os.path.join(csv_directory, filename)
        labeled_csv_path = os.path.join(labeled_csv_directory, filename)
        add_label_to_csv(csv_file_path, labeled_csv_path)

In [None]:
import pandas as pd
import os

def remove_status_and_combine_csv_files(directory, output_file):
    # List to hold DataFrames
    dfs = []
    
    # Iterate over all CSV files in the directory
    for filename in os.listdir(directory):
        if filename.endswith(".csv"):
            file_path = os.path.join(directory, filename)
            # Read each CSV file into a DataFrame
            df = pd.read_csv(file_path)
            
            # Remove the 'Status' column if it exists
            if 'Status' in df.columns:
                df = df.drop(columns=['Status'])
            
            # Append the modified DataFrame to the list
            dfs.append(df)
    
    # Concatenate all DataFrames into a single DataFrame
    combined_df = pd.concat(dfs, ignore_index=True)
    
    # Save the combined DataFrame to a single CSV file
    combined_df.to_csv(output_file, index=False)
    


In [21]:
# Specify the directory containing the labeled CSV files
labeled_csv_directory = r"C:\Users\DELL\Documents\AllStudentsRecords\clean\labeled"

# Specify the path for the output combined CSV file
output_file = r"C:\Users\DELL\Documents\AllStudentsRecords\clean\combined_labeled_data.csv"

# Combine all the labeled CSV files into one and remove 'Status' column
remove_status_and_combine_csv_files(labeled_csv_directory, output_file)

##### Load the Data and set parameters

In [22]:
data = pd.read_csv(output_file)

#set parametrs 
window_size = 100  
sampling_rate = 128  #I set this value according to the sampling rate of the Emotiv device

#lists to store features and labels
all_features = []
all_labels = []

##### Function to extract time-domain and frequency-domain features from a each segment

In [23]:
import numpy as np
from scipy.stats import skew, kurtosis
from scipy.signal import welch

def extract_features_from_channel(segment, sf):
    features = []
    
    # Time-Domain Features
    features.append(np.mean(segment))                 # Mean
    features.append(np.std(segment))                  # Standard Deviation
    features.append(np.var(segment))                  # Variance
    features.append(skew(segment))                    # Skewness
    features.append(kurtosis(segment))                # Kurtosis
    features.append(np.max(segment) - np.min(segment))  # Peak-to-Peak Amplitude
    
    # Frequency-Domain Features
    f, Pxx = welch(segment, fs=sf)  # Compute Power Spectral Density (PSD)
    
    # Band powers
    delta_power = np.sum(Pxx[(f >= 0.5) & (f < 4)])
    theta_power = np.sum(Pxx[(f >= 4) & (f < 8)])
    alpha_power = np.sum(Pxx[(f >= 8) & (f < 13)])
    beta_power = np.sum(Pxx[(f >= 13) & (f < 30)])
    gamma_power = np.sum(Pxx[(f >= 30) & (f <= 45)])
    
    features.extend([delta_power, theta_power, alpha_power, beta_power, gamma_power])
    
    
    return features

##### To address the issue of a small dataset size, the data is divided into smaller segments. This segmentation helps in managing and processing the data more effectively

In [24]:
# Group data by labels to ensure segments don't mix labels
grouped_data = data.groupby('Label')

# Loop over each group to create segments and extract features
for label, group in grouped_data:
    # Loop through the group in windows
    for start in range(0, len(group) - window_size + 1, window_size):
        segment = group.iloc[start:start + window_size]  # Define the segment
        
        segment_features = []
        
        for channel in group.columns[:-1]:  # Exclude the 'Label' column
            channel_data = segment[channel].values  # Get the segment data for this channel
            channel_features = extract_features_from_channel(channel_data, sampling_rate)
            segment_features.extend(channel_features)  
        
        all_features.append(segment_features)
        
        # Assign the label to this segment
        all_labels.append(label)

  freqs, _, Pxy = _spectral_helper(x, y, fs, window, nperseg, noverlap,


In [25]:
from sklearn.preprocessing import LabelEncoder
# Convert features and labels to DataFrame and Series
X_features = pd.DataFrame(all_features)
y = pd.Series(all_labels)

# Encode the labels into integers
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [26]:
from sklearn.preprocessing import StandardScaler

# Normalize the feature data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_features)

In [27]:
from tensorflow.keras.utils import to_categorical
# Reshape X to be 3D [samples, time steps, features] as required by GRU
X_reshaped = X_scaled.reshape((X_scaled.shape[0], 1, X_scaled.shape[1]))

# Convert labels to categorical (one-hot encoding) if it's a classification problem
y_categorical = to_categorical(y_encoded)

### Split the data

In [28]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets with stratification
X_train, X_test, y_train, y_test = train_test_split(
    X_reshaped, y_categorical, test_size=0.2, random_state=42, stratify=y_encoded )

In [29]:
print('number of samples: ', X_train.shape[0])
print('number of time steps per sample: ', X_train.shape[1])
print('number of features per time step: ', X_train.shape[2])

number of samples:  909
number of time steps per sample:  1
number of features per time step:  352


### Build the GRU model

In [30]:
import joblib
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense, Dropout



# Build the GRU model
model = Sequential()
model.add(GRU(100, input_shape=(X_train.shape[1], X_train.shape[2]), return_sequences=True))
model.add(Dropout(0.2))
model.add(GRU(100))
model.add(Dropout(0.2))
model.add(Dense(100, activation='relu'))
model.add(Dense(y_categorical.shape[1], activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Print the model summary
model.summary()


  super().__init__(**kwargs)


### Train the model

In [31]:
history = model.fit(X_train, y_train, epochs=12, batch_size=32, validation_data=(X_test, y_test), verbose=2)

Epoch 1/12
29/29 - 3s - 114ms/step - accuracy: 0.5699 - loss: 0.9468 - val_accuracy: 0.7149 - val_loss: 0.7157
Epoch 2/12
29/29 - 0s - 6ms/step - accuracy: 0.7844 - loss: 0.5399 - val_accuracy: 0.8421 - val_loss: 0.3948
Epoch 3/12
29/29 - 0s - 6ms/step - accuracy: 0.8669 - loss: 0.3226 - val_accuracy: 0.8509 - val_loss: 0.3043
Epoch 4/12
29/29 - 0s - 6ms/step - accuracy: 0.9186 - loss: 0.2027 - val_accuracy: 0.8816 - val_loss: 0.2665
Epoch 5/12
29/29 - 0s - 6ms/step - accuracy: 0.9571 - loss: 0.1323 - val_accuracy: 0.8860 - val_loss: 0.2542
Epoch 6/12
29/29 - 0s - 6ms/step - accuracy: 0.9648 - loss: 0.1013 - val_accuracy: 0.9035 - val_loss: 0.2393
Epoch 7/12
29/29 - 0s - 7ms/step - accuracy: 0.9615 - loss: 0.0877 - val_accuracy: 0.9298 - val_loss: 0.2262
Epoch 8/12
29/29 - 0s - 5ms/step - accuracy: 0.9791 - loss: 0.0582 - val_accuracy: 0.9167 - val_loss: 0.2264
Epoch 9/12
29/29 - 0s - 6ms/step - accuracy: 0.9802 - loss: 0.0523 - val_accuracy: 0.9079 - val_loss: 0.2258
Epoch 10/12
29/29

### Evaluate the model on the test set

In [32]:
loss, accuracy = model.evaluate(X_test, y_test, verbose=2)
print('Test Accuracy: ', accuracy )

8/8 - 0s - 6ms/step - accuracy: 0.9254 - loss: 0.2124
Test Accuracy:  0.9254385828971863


In [33]:
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)
y_true = np.argmax(y_test, axis=1)


report = classification_report(y_true, y_pred_classes, target_names=label_encoder.classes_)
print("classification Report:")
print(report)


[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 49ms/step
classification Report:
              precision    recall  f1-score   support

        chat       0.97      0.99      0.98        72
        math       0.91      0.88      0.90        73
      nochat       0.89      0.92      0.90        83

    accuracy                           0.93       228
   macro avg       0.93      0.93      0.93       228
weighted avg       0.93      0.93      0.93       228

