In [1]:
# Enable IPython's autoreload extension in mode 2, so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import numpy as np
# Extend the module search path with the parent directory before importing
# dataproc (presumably the project-local package lives one level above this
# notebook -- confirm against the repository layout).
sys.path.append('..')

import dataproc.preprocessing as dp

In [3]:
# Load the signal_eeg.csv table once and keep the leading fraction of its rows.
import os

import pandas as pd

# Default location of the CSV. Set the EEG_DATA_CSV environment variable to
# point at a copy elsewhere, so the notebook is not tied to one machine's
# home directory; without it the original path is used unchanged.
filename = os.environ.get('EEG_DATA_CSV', '/home/francesco/git/esn_rpy/data/signal_eeg.csv')

df_origin = pd.read_csv(filename)
nrow = df_origin.shape[0]

print(f'Origin shape: {df_origin.shape}')

# Fraction of leading rows to keep (1 keeps every row).
f_rate = 1
# Slice the frame that is already in memory instead of parsing the CSV a
# second time. .copy() keeps df independent of df_origin, as it was when the
# file was read separately.
df = df_origin.iloc[:int(np.ceil(nrow * f_rate))].copy()

print(f'Resampled df: {df.shape}')


Origin shape: (249720, 17)
Resampled df: (249720, 17)


In [19]:
# Run the project's cleaning step (scaling enabled) on a copy of the loaded
# frame, then separate the feature matrix from the label vector.
# No feature reduction happens in this cell.
df_clean = dp.clean_data(df.copy(), scale=True)
X, y = dp.split_data_target(df_clean)

# Summarise what came out of the cleaning step.
print(f'Shape of the data: {X.shape}')
print(f'Shape of the label: {y.shape}')
print(f'Number of classes: {np.unique(y).size}')

Shape of the data: (130931, 12)
Shape of the label: (130931,)
Number of classes: 20


In [20]:
# Feature reduction, LDA variant: project X using the labels y, without
# capping the number of components (n_components=None).
# NOTE(review): this projection is fit on every labelled sample; any held-out
# evaluation should refit it on the training portion only.
X_transformed, lda = dp.lda_process(X, y, n_components=None)

# Fraction of variance the retained components must reach.
required_var = 0.95

X_reduced_lda = dp.get_data_for_variance(
    required_variance=required_var,
    ca=lda,
    X=X_transformed,
)
print(f'Number of components required to reach {required_var*100}% variance: {X_reduced_lda.shape[1]}')
print(f'Shape of the reduced data: {X_reduced_lda.shape}')


Number of components required to reach 95.0% variance: 6
Shape of the reduced data: (130931, 6)


In [21]:
# Feature reduction, PCA variant: project X without capping the number of
# components (n_components=None).
X_transformed_pca, pca = dp.pca_process(X, n_components=None)

# Fraction of variance the retained components must reach.
required_var = 0.95

X_reduced_pca = dp.get_data_for_variance(
    required_variance=required_var,
    ca=pca,
    X=X_transformed_pca,
)
print(f'Number of components required to reach {required_var*100}% variance: {X_reduced_pca.shape[1]}')
print(f'Shape of the reduced data: {X_reduced_pca.shape}')

Number of components required to reach 95.0% variance: 9
Shape of the reduced data: (130931, 9)


In [23]:
# Preview the full LDA projection next to its variance-truncated version.
# The labels now say what is printed: rows of data, not a shape.
print('LDA-transformed data (first 5 rows):')
print(X_transformed[:5])
print('Reduced LDA components (first 5 rows):')
print(X_reduced_lda[:5])

Feature reduced data shape:
[[ 0.03230148 -0.02892469 -0.01809318  0.0316722  -0.00390796 -0.01955923
   0.03988031 -0.06356248  0.01131036 -0.06934005  0.01959182  0.06672294]
 [ 0.1173437  -0.21019148  0.34212868 -0.15926629 -0.35378641 -0.49194994
  -0.28289487  0.00173375  0.14276417 -0.10536552  0.02026901 -0.31303716]
 [ 0.18314029 -0.39510052  0.63863478 -0.27685971 -0.63957579 -0.84706545
  -0.58943897  0.07372448  0.24522465 -0.15941737 -0.04253329 -0.52986506]
 [ 0.2153767  -0.5826665   0.82627876 -0.27797511 -0.81244762 -1.0100013
  -0.86301783  0.1551087   0.28163312 -0.23277422 -0.18835125 -0.49912707]
 [ 0.20956397 -0.76556128  0.89258409 -0.1614915  -0.84611663 -0.97386061
  -1.08662526  0.24194129  0.2061177  -0.3031868  -0.3766472  -0.25088056]]
Extract component:
[[ 0.03230148 -0.02892469 -0.01809318  0.0316722  -0.00390796 -0.01955923]
 [ 0.1173437  -0.21019148  0.34212868 -0.15926629 -0.35378641 -0.49194994]
 [ 0.18314029 -0.39510052  0.63863478 -0.27685971 -0.63957

In [24]:
# Transform a single sample. The transformer is given a 2-D input, so row 0
# is taken as a one-row slice rather than indexed and reshaped.
print('Transforming a single sample...')
lda.transform(X[0:1, :])

Transforming a single sample...


array([[ 0.03230148, -0.02892469, -0.01809318,  0.0316722 , -0.00390796,
        -0.01955923,  0.03988031, -0.06356248,  0.01131036, -0.06934005,
         0.01959182,  0.06672294]])

In [25]:
# Split the data into train and test sets, then build the LDA features.
from sklearn.model_selection import train_test_split

# Split the un-projected features first. LDA is supervised, so fitting it on
# all samples before splitting lets the test labels shape the features the
# classifier is later scored on (data leakage).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the projection on the training portion only and keep the components
# needed to reach required_var. A separate name (lda_train) is used so the
# exploratory `lda` fitted on the full data is left untouched.
X_train_lda, lda_train = dp.lda_process(X_train_raw, y_train, n_components=None)
X_train = dp.get_data_for_variance(required_variance=required_var, ca=lda_train, X=X_train_lda)

# Project the held-out samples with the training-fitted transform and keep the
# same leading components. This assumes get_data_for_variance keeps the first
# k columns, as the preview output in this notebook shows -- confirm in dataproc.
X_test = lda_train.transform(X_test_raw)[:, :X_train.shape[1]]

In [26]:
# Report the size of each split.
print(f'Shape of the training data: {X_train.shape}')
print(f'Shape of the test data: {X_test.shape}')

# Count distinct labels overall and on each side of the split.
print(f'Number of classes: {np.unique(y).size}')
print(f'Number of classes in training set: {np.unique(y_train).size}')
print(f'Number of classes in test set: {np.unique(y_test).size}')

# Per-class sample counts on each side of the split.
print(f'Distribution of classes in training set: {np.bincount(y_train)}')
print(f'Distribution of classes in test set: {np.bincount(y_test)}')

Shape of the training data: (104744, 6)
Shape of the test data: (26187, 6)
Number of classes: 20
Number of classes in training set: 20
Number of classes in test set: 20
Distribution of classes in training set: [6150 6150 5146 5158 5120 5152 5107 5127 5148 5139 5133 5133 5126 5139
 5133 5133 5139 5152 5139 5120]
Distribution of classes in test set: [1538 1538 1286 1290 1280 1288 1277 1281 1287 1285 1283 1283 1282 1285
 1283 1283 1285 1288 1285 1280]


In [None]:
# Classify the train/test split prepared above with an RBF-kernel SVM.
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC

# Build the classifier and fit it on the training split; fit() returns the
# estimator itself, so construction and training are chained.
svm_model = SVC(C=1.0, kernel='rbf').fit(X_train, y_train)

# Predict the held-out split and report overall accuracy.
y_pred = svm_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy of the SVM model: {accuracy * 100:.2f}%')

# Per-class precision / recall / F1 breakdown.
print("Classification Report:")
print(classification_report(y_test, y_pred))

