# Studio 03

# Importing libraires

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# 1 - Data processing

### Concatnating the datasets

In [2]:
#Concatenating the data
file_paths = [f'ampc/w{i}.csv' for i in range(1, 5)]

data = pd.concat([pd.read_csv(f) for f in file_paths], ignore_index=True)

data.to_csv('./combined_data.csv', index = False)

data.head()

Unnamed: 0,acc_mean_x_right,acc_mean_y_right,acc_mean_z_right,acc_mean_xyz_right,acc_mean_xy_right,acc_mean_yz_right,acc_mean_zx_right,acc_mean_pitch_right,acc_mean_roll_right,acc_std_x_right,...,gyro_max_yz_left,gyro_max_zx_left,gyro_peak_x_left,gyro_peak_y_left,gyro_peak_z_left,gyro_peak_xyz_left,gyro_peak_xy_left,gyro_peak_yz_left,gyro_peak_zx_left,class
0,-0.1733,0.14864,0.98128,1.1065,0.44735,1.0611,1.0332,9.9751,52.281,0.25398,...,137.85,79.286,4,4,3,2,2,2,4,2
1,-0.40618,0.24715,0.79471,1.0178,0.52388,0.86595,0.96693,-30.421,48.213,0.26456,...,269.08,103.56,3,1,2,2,2,2,1,2
2,-0.4967,0.37167,0.70283,1.0402,0.68213,0.80958,0.9651,-23.068,52.897,0.35638,...,158.42,114.7,2,3,2,1,1,2,2,2
3,-0.2878,0.15882,0.91688,1.0974,0.50834,1.0276,0.99884,3.2451,31.009,0.29577,...,283.65,120.46,3,2,2,3,4,3,2,2
4,-0.56189,0.36946,0.68668,1.3085,0.91759,1.0218,1.1201,-24.118,47.579,0.5681,...,199.69,93.039,4,2,2,3,3,3,2,2


### Shuffle data

In [3]:
shuffled_data = data.sample(n=len(data)).reset_index(drop=True)

shuffled_data.to_csv('./all_data.csv', index = False)

shuffled_data.head()

Unnamed: 0,acc_mean_x_right,acc_mean_y_right,acc_mean_z_right,acc_mean_xyz_right,acc_mean_xy_right,acc_mean_yz_right,acc_mean_zx_right,acc_mean_pitch_right,acc_mean_roll_right,acc_std_x_right,...,gyro_max_yz_left,gyro_max_zx_left,gyro_peak_x_left,gyro_peak_y_left,gyro_peak_z_left,gyro_peak_xyz_left,gyro_peak_xy_left,gyro_peak_yz_left,gyro_peak_zx_left,class
0,-0.63301,0.091617,0.61149,1.0991,0.85542,0.77485,0.99276,-35.276,28.415,0.4245,...,349.3,151.52,2,4,2,3,3,5,2,2
1,-0.76782,-0.50672,0.5005,1.2313,1.0761,0.92489,0.94408,-50.987,-11.631,0.18505,...,70.476,19.381,6,4,4,5,5,5,5,2
2,-0.49158,-0.68475,0.48799,1.3326,1.114,1.0898,0.91036,-17.526,-25.436,0.49022,...,2.2089,3.0778,4,8,5,6,7,10,4,0
3,-0.029878,-0.75385,0.65347,1.0005,0.75527,0.99943,0.65522,21.196,-87.016,0.036613,...,4.8291,4.461,4,5,4,4,5,5,4,0
4,-0.028538,0.39652,0.90336,1.0604,0.5119,1.0063,0.96149,33.567,68.161,0.32079,...,328.27,440.46,6,7,4,5,8,5,6,2


# 2 - Model Training 

### Split features and target variable

In [4]:
from sklearn.model_selection import train_test_split

# Split features and target vairable
X = shuffled_data.drop('class', axis=1)
y = shuffled_data['class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) 


### Train-test split model training 

In [5]:
from sklearn import svm
from sklearn.metrics import accuracy_score

clf = svm.SVC() # Linear Kernel
clf.fit(X_train, y_train) # Training the model
y_pred = clf.predict(X_test) # Predicting the model
accuracy_score = accuracy_score(y_test, y_pred) # Calculating the accuracy of the model

f"Accuracy of the model: {accuracy_score*100:2f}%"

'Accuracy of the model: 88.879335%'

### 10-fold cross validation mean accuracy 

In [6]:
from sklearn import svm
from sklearn.model_selection import cross_val_score

clf = svm.SVC()
scores = cross_val_score(clf, X, y, cv = 10) # 10-fold cross validation

f"{scores.mean()*100:2f}%"

'89.216490%'

# 3 - Hyperparameter Tuning

### Use GridSeachCV to find the best set of values for the SVC model

In [7]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

 # Radial Basis Function kernel
param_grid = {
    'C' : [0.1, 1, 10, 100, 1000],
    'gamma' : [1, 0.1, 0.01, 0.001, 0.0001],
    'kernel' : ['rbf']
}

grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3)

grid.fit(X_train, y_train)

grid.best_params_

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.838 total time=  12.7s
[CV 2/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.837 total time=  12.9s
[CV 3/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.837 total time=  12.8s
[CV 4/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.837 total time=   9.0s
[CV 5/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.837 total time=   8.7s
[CV 1/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.838 total time=   9.1s
[CV 2/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.837 total time=   8.6s
[CV 3/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.837 total time=   8.4s
[CV 4/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.837 total time=   8.3s
[CV 5/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.837 total time=   8.3s
[CV 1/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.838 total time=   8.6s
[CV 2/5] END .....C=0.1, gamma=0.01, kernel=rbf

{'C': 10, 'gamma': 0.0001, 'kernel': 'rbf'}

### Train-test split training with hyperparameter tuning

In [8]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

# best_C = 10
# best_gamma = 0.0001
# best_kernel = rbf

best_C = grid.best_params_['C']
best_gamma = grid.best_params_['gamma']
best_kernel = grid.best_params_['kernel']

svc_hyp = SVC(C = best_C, gamma = best_gamma, kernel = best_kernel)

svc_hyp.fit(X_train, y_train)

y_pred_hyp = svc_hyp.predict(X_test)

accuracy_score_new = accuracy_score(y_test, y_pred_hyp)

print(f"Accuracy of the model after hyperparameter tuning: {accuracy_score_new*100:2f}%")

Accuracy of the model after hyperparameter tuning: 83.490972%


### 10-fold cross validation mean accuracy

In [9]:
# Perform 10-fold cross-validation
cv_scores = cross_val_score(svc_hyp, X, y, cv=10)

# Calculate and print the mean accuracy across all 10 folds
cv_accuracy = cv_scores.mean()

f"10-Fold Cross-Validation Accuracy after hyperparameter tuning: {cv_accuracy * 100:.2f}%"


'10-Fold Cross-Validation Accuracy after hyperparameter tuning: 84.27%'

# 4 - Feature Selection

In [10]:
# Import libraries
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score

### Select features and split data based on selected features

In [11]:
# Select top 100 features
selector = SelectKBest(f_classif, k=100)

# Create a pipeline with a feature selector and classifier
X_selected = selector.fit_transform(X, y)

X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)

### Train-test split training with feature selection and hyperparameter tuning

In [12]:
svc_hyp_selected = SVC(C = best_C, gamma = best_gamma, kernel = best_kernel)

# Fit/train the model
svc_hyp_selected.fit(X_train, y_train)

# Predict the model
y_pred_hyp_selected = svc_hyp_selected.predict(X_test)

accuracy_score_selected = accuracy_score(y_test, y_pred_hyp_selected)

print(f"Accuracy of the model after feature selection: {accuracy_score_selected*100:.2f}%")

Accuracy of the model after feature selection: 85.61%


# 10-fold cross validation mean accuracy score

In [13]:
cv_scores_selected = cross_val_score(svc_hyp_selected, X_selected, y, cv=10)

cv_accuracy_selected = cv_scores_selected.mean()

f"10-Fold Cross-Validation Accuracy after feature selection: {cv_accuracy_selected * 100:.2f}%"

'10-Fold Cross-Validation Accuracy after feature selection: 85.60%'

# 5 - Dimensionality reduction

In [14]:
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score

### Perform PCA to reduce dimensionality

In [15]:
# initialize PCA to reduce the dimensionality to 10 components
pca = PCA(n_components=10)

# fit PCA on the training data
X_pca = pca.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.3, random_state=42)


### Train-test split model training with reduced dimensionality

In [16]:
svc_hyp_pca = SVC(C = best_C, gamma = best_gamma, kernel = best_kernel)

# Fit/train the model
svc_hyp_pca.fit(X_train, y_train)

# Predict the model
y_pred_hyp_pca = svc_hyp_pca.predict(X_test)

accuracy_score_pca = accuracy_score(y_test, y_pred_hyp_pca)

print(f"Accuracy of the model after PCA: {accuracy_score_pca*100:.2f}%")


Accuracy of the model after PCA: 84.69%


### 10-fold cross validation mean accuracy

In [17]:
# Perform 10-fold cross-validation
cv_scores_pca = cross_val_score(svc_hyp_pca, X_pca, y, cv=10)

cv_accuracy_pca = cv_scores_pca.mean()

f"10-Fold Cross-Validation Accuracy after PCA: {cv_accuracy_pca * 100:.2f}%"

'10-Fold Cross-Validation Accuracy after PCA: 84.34%'

# 6 - Testing with other classifiers

### Import libraries

In [18]:
import pandas as pd

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score

from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

### Load dataset and split data 

In [19]:
df = pd.read_csv('all_data.csv')

X = df.drop('class', axis=1)
y = df['class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

## SGD - Stochastic Gradient Descent 

### Train and test accuracy of SGD model

In [20]:
sgd_model = SGDClassifier(random_state=42)

sgd_model.fit(X_train, y_train)

y_pred = sgd_model.predict(X_test)

accuracy_score_sgd = accuracy_score(y_test, y_pred)

f"Accuracy of the model using Stochastic Gradient Descent: {accuracy_score_sgd*100:.2f}%"

'Accuracy of the model using Stochastic Gradient Descent: 88.39%'

### 10-fold cross validation mean accuracy score

In [21]:
cv_scores_sgd = cross_val_score(sgd_model, X, y, cv=10)

cv_accuracy_sgd = cv_scores_sgd.mean()

f"10-Fold Cross-Validation Accuracy using Stochastic Gradient Descent: {cv_accuracy_sgd * 100:.2f}%"

'10-Fold Cross-Validation Accuracy using Stochastic Gradient Descent: 86.59%'

## Random Forest Classifier

### Train-test split training

In [22]:
rf_model = RandomForestClassifier(random_state = 42)

rf_model.fit(X_train, y_train)

y_pred_rf = rf_model.predict(X_test)

accuracy_score_rf = accuracy_score(y_test, y_pred_rf)

f"Accuracy of the model using Random Forest: {accuracy_score_rf*100:.2f}%"

'Accuracy of the model using Random Forest: 92.00%'

### 10-fold cross validation mean accuracy

In [23]:
cv_scores_rf = cross_val_score(rf_model, X, y, cv=10)

cv_accuracy_rf = cv_scores_rf.mean()

f"10-Fold Cross-Validation Accuracy using Random Forest: {cv_accuracy_rf * 100:.2f}%"

'10-Fold Cross-Validation Accuracy using Random Forest: 92.65%'

## MLP Classifier

### Train-test split training

In [24]:
mlp_model = MLPClassifier(random_state = 42)

mlp_model.fit(X_train, y_train)

y_pred_mlp = mlp_model.predict(X_test)

accuracy_score_mlp = accuracy_score(y_test, y_pred_mlp)

f"Accuracy of the model using Multi-Layer Perceptron: {accuracy_score_mlp*100:.2f}%"

'Accuracy of the model using Multi-Layer Perceptron: 86.84%'

### 10-fold cross-validation mean accuracy

In [25]:
cv_scores_mlp = cross_val_score(mlp_model, X, y, cv=10)

cv_accuracy = cv_scores_mlp.mean()

f"10-Fold Cross-Validation Accuracy using Multi-Layer Perceptron: {cv_accuracy * 100:.2f}%"

'10-Fold Cross-Validation Accuracy using Multi-Layer Perceptron: 84.74%'