In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import sys
import numpy as np
sys.path.append('..')

import dataproc.preprocessing as dp

In [14]:
# Load the EEG signal CSV once and keep only the leading fraction of its rows.

import pandas as pd

# NOTE(review): absolute, machine-specific path — consider deriving it from a
# configurable data directory so the notebook runs on other machines.
filename = '/home/francesco/git/esn_rpy/data/signal_eeg.csv'

df_origin = pd.read_csv(filename)
nrow = df_origin.shape[0]

print(f'Origin shape: {df_origin.shape}')

# Fraction of rows kept for the rest of the notebook.
f_rate = 0.2

# Slice the frame that is already in memory instead of parsing the CSV a second time.
# The subset is the first ceil(nrow * f_rate) rows in file order (a truncation, not a
# random sample); .copy() makes df independent of df_origin.
df = df_origin.iloc[:int(np.ceil(nrow * f_rate))].copy()

print(f'Resampled df: {df.shape}')


Origin shape: (249720, 17)
Resampled df: (49944, 17)


In [21]:
# Clean the loaded frame, then separate the features from the target labels.
# clean_data is given a copy, so df keeps its loaded contents whatever the helper
# does to its argument.
df_clean = dp.clean_data(df.copy())
X, y = dp.split_data_target(df_clean)

# Report the resulting shapes and how many distinct labels are present.
print(
    f'Shape of the data: {X.shape}',
    f'Shape of the label: {y.shape}',
    f'Number of classes: {len(np.unique(y))}',
    sep='\n',
)

Shape of the data: (31339, 12)
Shape of the label: (31339,)
Number of classes: 5


In [None]:
# Feature reduction: LDA projection followed by a variance-based component selection.

# Variance threshold passed to dp.get_data_for_variance.
required_var = 0.95

# NOTE(review): the projection is computed from the full (X, y), before the train/test
# split performed further down. If lda_process fits on the labels, information from the
# future test rows leaks into the features — confirm, and consider fitting on the
# training split only.
lda_reduced, lda = dp.lda_process(X, y, n_components=None)

X_reduced = dp.get_data_for_variance(
    X=lda_reduced,
    lda=lda,
    required_variance=required_var,
)

print(
    f'Number of components required to reach {required_var*100}% variance: {X_reduced.shape[1]}',
    f'Shape of the reduced data: {X_reduced.shape}',
    sep='\n',
)


Number of components required to reach 95.0% variance: 3
Shape of the reduced data: (31339, 3)


In [22]:
# Split the reduced features and their labels into a training and a test set.
from sklearn.model_selection import train_test_split

# 20% of the samples are held out; stratify=y keeps the class proportions alike in
# both parts, and the fixed random_state makes the shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_reduced,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [23]:
# Summarise the split: array shapes, number of distinct labels overall and per
# subset, and the per-class sample counts of each subset.
print(
    f'Shape of the training data: {X_train.shape}',
    f'Shape of the test data: {X_test.shape}',
    f'Number of classes: {len(np.unique(y))}',
    # Distinct labels present in each subset.
    f'Number of classes in training set: {len(np.unique(y_train))}',
    f'Number of classes in test set: {len(np.unique(y_test))}',
    # np.bincount counts occurrences per label value (index i -> count of label i).
    f'Distribution of classes in training set: {np.bincount(y_train)}',
    f'Distribution of classes in test set: {np.bincount(y_test)}',
    sep='\n',
)

Shape of the training data: (25071, 3)
Shape of the test data: (6268, 3)
Number of classes: 5
Number of classes in training set: 5
Number of classes in test set: 5
Distribution of classes in training set: [6150 6150 2490 5148 5133]
Distribution of classes in test set: [1538 1538  622 1287 1283]


In [20]:
# Classify the data loaded above with an RBF-kernel SVM and report test-set metrics.
# NOTE(review): in the saved notebook this cell's execution count is lower than those
# of the cleaning and splitting cells above, so the recorded metrics may come from an
# earlier X_train / X_test — re-run the notebook top to bottom to confirm them.
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.svm import SVC

# NOTE(review): confusion_matrix is imported here but not used in this cell.

# Build the classifier and fit it on the training split (fit returns the estimator).
svm_model = SVC(
    C=1.0,
    kernel='rbf',
    decision_function_shape='ovr',
    random_state=42,
).fit(X_train, y_train)

# Predict the held-out samples and compare them with the true labels.
y_pred = svm_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy of the SVM model: {accuracy * 100:.2f}%')

# Per-class precision, recall and F1 on the test set.
print("Classification Report:", classification_report(y_test, y_pred), sep='\n')



Accuracy of the SVM model: 28.11%
Classification Report:
              precision    recall  f1-score   support

           0       0.30      0.27      0.28      1538
           1       0.26      0.79      0.39      1538
           2       0.88      0.02      0.05       622
           3       0.62      0.03      0.05      1287
           4       0.68      0.06      0.11      1283

    accuracy                           0.28      6268
   macro avg       0.55      0.23      0.18      6268
weighted avg       0.49      0.28      0.20      6268

