In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Data Processing

In [2]:
train_feat = np.load("datasets/train/train_feature.npz", allow_pickle=True)
train_feat_X = train_feat['features']
train_feat_Y = train_feat['label']
val_feat = np.load("datasets/valid/valid_feature.npz", allow_pickle=True)
val_feat_X = val_feat['features']
val_feat_Y = val_feat['label']

In [3]:
# flatten the features  
train_feat_X = np.array([x.flatten() for x in train_feat_X])
val_feat_X = np.array([x.flatten() for x in val_feat_X])

## Model Training before PCA

In [6]:
#apply xgboost
from xgboost import XGBClassifier
model = XGBClassifier()
model.fit(train_feat_X, train_feat_Y)
y_pred = model.predict(val_feat_X)
accuracy = accuracy_score(val_feat_Y, y_pred)

Accuracy: 


In [8]:
print("Accuracy",(accuracy * 100.0))

Accuracy 98.3640081799591


In [17]:
# apply ada boost 
from sklearn.ensemble import AdaBoostClassifier
model = AdaBoostClassifier(n_estimators=100, random_state=0)
model.fit(train_feat_X, train_feat_Y)
y_pred = model.predict(val_feat_X)
accuracy = accuracy_score(val_feat_Y, y_pred)
print("Accuracy",(accuracy * 100.0))

Accuracy 98.15950920245399


## After PCA

In [27]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(train_feat_X)

pca = PCA(n_components=128)  
X_pca = pca.fit_transform(X_scaled)

In [28]:
print(X_pca.shape)

(7080, 128)


In [29]:
# Apply XG boost classifier
from xgboost import XGBClassifier
model = XGBClassifier()
model.fit(X_pca, train_feat_Y)

# Predict the validation set
val_scaled = scaler.transform(val_feat_X)
val_pca = pca.transform(val_scaled)
val_pred = model.predict(val_pca)

# Calculate the accuracy
accuracy = accuracy_score(val_feat_Y, val_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.9795501022494888


In [24]:
# Apply adaboost classifier
from sklearn.ensemble import AdaBoostClassifier
model = AdaBoostClassifier()
model.fit(X_pca, train_feat_Y)

# Predict the validation set
val_pred = model.predict(val_pca)

# Calculate the accuracy
accuracy = accuracy_score(val_feat_Y, val_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.9631901840490797


In [25]:
# import randomforest classifier and train using the pca reduced dat
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
model.fit(X_pca, train_feat_Y)

# Predict the validation set
val_pred = model.predict(val_pca)

# Calculate the accuracy
accuracy = accuracy_score(val_feat_Y, val_pred)
print(f"Accuracy: {accuracy}")


Accuracy: 0.9059304703476483


In [26]:
X_train_20, X_test_20, y_train_20, y_test_20 = train_test_split(train_feat_X, train_feat_Y, test_size=0.8, random_state=42)
X_train_40, X_test_40, y_train_40, y_test_40 = train_test_split(train_feat_X, train_feat_Y, test_size=0.6, random_state=42)
X_train_60, X_test_60, y_train_60, y_test_60 = train_test_split(train_feat_X, train_feat_Y, test_size=0.4, random_state=42)
X_train_80, X_test_80, y_train_80, y_test_80 = train_test_split(train_feat_X, train_feat_Y, test_size=0.2, random_state=42)

# Checking Performance Analysis on amount of data

### For XGBoost without PCA

In [30]:
# Apply XG boost classifier on 20% of the data
model20 = XGBClassifier()
model20.fit(X_train_20, y_train_20)
y_pred20 = model20.predict(X_test_20)
accuracy20 = accuracy_score(y_test_20, y_pred20)
print(f"Accuracy: {accuracy20}")

Accuracy: 0.955861581920904


In [31]:
model40 = XGBClassifier()
model40.fit(X_train_40, y_train_40)
y_pred40 = model40.predict(X_test_40)
accuracy40 = accuracy_score(y_test_40, y_pred40)
print(f"Accuracy: {accuracy40}")


Accuracy: 0.9722222222222222


In [32]:
model60 = XGBClassifier()
model60.fit(X_train_60, y_train_60)
y_pred60 = model60.predict(X_test_60)
accuracy60 = accuracy_score(y_test_60, y_pred60)
print(f"Accuracy: {accuracy60}")

Accuracy: 0.9781073446327684


In [33]:
model80 = XGBClassifier()
model80.fit(X_train_80, y_train_80)
y_pred80 = model80.predict(X_test_80)
accuracy80 = accuracy_score(y_test_80, y_pred80)
print(f"Accuracy: {accuracy80}")

Accuracy: 0.9837570621468926


### For Adaboost without PCA

In [34]:
ada20 = AdaBoostClassifier()
ada20.fit(X_train_20, y_train_20)
y_pred20 = ada20.predict(X_test_20)
accuracy20 = accuracy_score(y_test_20, y_pred20)
print(f"Accuracy: {accuracy20}")

Accuracy: 0.9470338983050848


In [35]:
ada40 = AdaBoostClassifier()
ada40.fit(X_train_40, y_train_40)
y_pred40 = ada40.predict(X_test_40)
accuracy40 = accuracy_score(y_test_40, y_pred40)
print(f"Accuracy: {accuracy40}")

Accuracy: 0.9677495291902072


In [36]:
ada60 = AdaBoostClassifier()
ada60.fit(X_train_60, y_train_60)
y_pred60 = ada60.predict(X_test_60)
accuracy60 = accuracy_score(y_test_60, y_pred60)
print(f"Accuracy: {accuracy60}")

Accuracy: 0.9692796610169492


In [37]:
ada80 = AdaBoostClassifier()
ada80.fit(X_train_80, y_train_80)
y_pred80 = ada80.predict(X_test_80)
accuracy80 = accuracy_score(y_test_80, y_pred80)
print(f"Accuracy: {accuracy80}")

Accuracy: 0.9774011299435028


### AdaBoost with PCA

In [38]:
# split into sets of data but with pca reduced data
X_train_20, X_test_20, y_train_20, y_test_20 = train_test_split(X_pca, train_feat_Y, test_size=0.8, random_state=42)
X_train_40, X_test_40, y_train_40, y_test_40 = train_test_split(X_pca, train_feat_Y, test_size=0.6, random_state=42)
X_train_60, X_test_60, y_train_60, y_test_60 = train_test_split(X_pca, train_feat_Y, test_size=0.4, random_state=42)
X_train_80, X_test_80, y_train_80, y_test_80 = train_test_split(X_pca, train_feat_Y, test_size=0.2, random_state=42)

In [39]:
ada20 = AdaBoostClassifier()
ada20.fit(X_train_20, y_train_20)
y_pred20 = ada20.predict(X_test_20)
accuracy20 = accuracy_score(y_test_20, y_pred20)
print(f"Accuracy: {accuracy20}")

Accuracy: 0.9385593220338984


In [40]:
ada40 = AdaBoostClassifier()
ada40.fit(X_train_40, y_train_40)
y_pred40 = ada40.predict(X_test_40)
accuracy40 = accuracy_score(y_test_40, y_pred40)
print(f"Accuracy: {accuracy40}")

Accuracy: 0.949623352165725


In [41]:
ada60 = AdaBoostClassifier()
ada60.fit(X_train_60, y_train_60)
y_pred60 = ada60.predict(X_test_60)
accuracy60 = accuracy_score(y_test_60, y_pred60)
print(f"Accuracy: {accuracy60}")

Accuracy: 0.9576271186440678


In [42]:
ada80 = AdaBoostClassifier()
ada80.fit(X_train_80, y_train_80)
y_pred80 = ada80.predict(X_test_80)
accuracy80 = accuracy_score(y_test_80, y_pred80)
print(f"Accuracy: {accuracy80}")

Accuracy: 0.9625706214689266
