In [15]:
import math

import pandas as pd
import numpy as np
from sklearn import *
import matplotlib.pyplot as plt
import xgboost as xgb
from pandas.plotting import scatter_matrix
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.preprocessing import PowerTransformer, PolynomialFeatures


In [16]:
# Load the pickled dataset and report basic facts about its three splits.
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling,
# so only run this cell on a pickle file obtained from a trusted source.
data = pd.read_pickle('./ass2.pickle')
train = data['train']
dev = data['dev']
test = data['test']

# One column ('target') holds the label, so it is excluded from the feature count.
feature_count = train.shape[1] - 1
print(f"number of features: {feature_count}")
print(f"types of labels: {train['target'].unique()}")

# Row count of each split, printed in train / dev / test order.
for split_name, split_frame in (('train', train), ('dev', dev), ('test', test)):
    print(f"number of rows in {split_name}: {len(split_frame)}")


number of features: 42
types of labels: [2 1 0]
number of rows in train: 40533
number of rows in dev: 13512
number of rows in test: 13512


In [17]:
# Count the missing entries in every column of the training split
# (the resulting Series is displayed as the cell output).
missing_per_column = train.isna().sum()
missing_per_column

f0        0
f1        0
f2        0
f3        0
f4        0
f5        0
f6        0
f7        0
f8        0
f9        0
f10       0
f11       0
f12       0
f13       0
f14       0
f15       0
f16       0
f17       0
f18       0
f19       0
f20       0
f21       0
f22       0
f23       0
f24       0
f25       0
f26       0
f27       0
f28       0
f29       0
f30       0
f31       0
f32       0
f33       0
f34       0
f35       0
f36       0
f37       0
f38       0
f39       0
f40       0
f41       0
target    0
dtype: int64

There are no missing values: every column of the training set, including `target`, has a null count of 0.

In [18]:
# Summary statistics of every column of the training split, computed
# separately for each value of 'target'.
target_groups = train.groupby('target')
target_groups.describe()

Unnamed: 0_level_0,f0,f0,f0,f0,f0,f0,f0,f0,f1,f1,...,f40,f40,f41,f41,f41,f41,f41,f41,f41,f41
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
target,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,3917.0,1.061527,0.833138,0.0,0.0,1.0,2.0,2.0,3917.0,0.669135,...,0.0,2.0,3917.0,0.003574,0.074868,0.0,0.0,0.0,0.0,2.0
1,9882.0,1.082676,0.908831,0.0,0.0,1.0,2.0,2.0,9882.0,0.572759,...,0.0,2.0,9882.0,0.000911,0.038952,0.0,0.0,0.0,0.0,2.0
2,26734.0,0.89908,0.798188,0.0,0.0,1.0,2.0,2.0,26734.0,0.509052,...,0.0,2.0,26734.0,0.001272,0.037681,0.0,0.0,0.0,0.0,2.0


The "count" column shows that the training data is imbalanced: label 2 has 26,734 samples, versus 9,882 for label 1 and only 3,917 for label 0.

In [19]:
# Separate each split into its feature columns (everything except 'target')
# and its label column ('target').
train_features = train.drop(columns='target')
train_labels = train['target']

dev_features = dev.drop(columns='target')
dev_labels = dev['target']

test_features = test.drop(columns='target')
test_labels = test['target']

In [20]:
# Check XG Boost
# Pipeline: Yeo-Johnson power transform -> polynomial feature expansion -> XGBoost.
pipe = Pipeline(steps=[
    ('scalar', PowerTransformer(method='yeo-johnson')),
    ('polynomial_features', PolynomialFeatures()),
    ('classifier', xgb.XGBClassifier(objective='multi:softmax', num_class=3)),
])
# Hyper-parameter grid for the 'classifier' step, explored by cross-validation.
xg_param_space = [{'classifier__learning_rate': [0.2, 0.3, 0.4],
                   'classifier__n_estimators': [100, 250, 400]}]
clf = GridSearchCV(pipe, xg_param_space, verbose=5)
clf.fit(train_features, train_labels)

# Evaluate the selected pipeline on the dev split.
y_pred = clf.predict(dev_features)
print(clf.best_estimator_)
# Call the explicitly imported accuracy_score instead of reaching through the
# `metrics` name, which is only in scope because of `from sklearn import *`.
print(accuracy_score(dev_labels, y_pred))
confusion_matrix(y_true=dev_labels, y_pred=y_pred, labels=[0,1,2])

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV 1/5] END classifier__gamma=0, classifier__lambda=1, classifier__learning_rate=0.2, classifier__n_estimators=100;, score=0.826 total time=  10.7s
[CV 2/5] END classifier__gamma=0, classifier__lambda=1, classifier__learning_rate=0.2, classifier__n_estimators=100;, score=0.825 total time=   9.8s
[CV 3/5] END classifier__gamma=0, classifier__lambda=1, classifier__learning_rate=0.2, classifier__n_estimators=100;, score=0.822 total time=  10.4s
[CV 4/5] END classifier__gamma=0, classifier__lambda=1, classifier__learning_rate=0.2, classifier__n_estimators=100;, score=0.827 total time=  10.4s
[CV 5/5] END classifier__gamma=0, classifier__lambda=1, classifier__learning_rate=0.2, classifier__n_estimators=100;, score=0.829 total time=  10.5s
[CV 1/5] END classifier__gamma=0, classifier__lambda=1, classifier__learning_rate=0.2, classifier__n_estimators=250;, score=0.835 total time=  25.3s
[CV 2/5] END classifier__gamma=0, classifier__

array([[ 381,  342,  547],
       [ 181, 2751,  441],
       [ 194,  310, 8365]], dtype=int64)

In [ ]:
# Check Random Forest
# Pipeline: Yeo-Johnson power transform -> polynomial feature expansion -> random forest.
# The forest is seeded (same seed value as the under-sampling cell) so that a
# fresh Restart & Run All reproduces the same scores and confusion matrix.
pipe = Pipeline(steps=[
    ('scalar', PowerTransformer(method='yeo-johnson')),
    ('polynomial_features', PolynomialFeatures()),
    ('classifier', RandomForestClassifier(random_state=42)),
])
# Hyper-parameter grid for the 'classifier' step, explored by cross-validation.
forest_param_space = [{'classifier__n_estimators': [100, 250, 400],
                       'classifier__max_depth': [None, 30, 50],
                       'classifier__min_samples_split': [2, 3, 4]}]
clf = GridSearchCV(pipe, forest_param_space, verbose=5)
clf.fit(train_features, train_labels)

# Evaluate the selected pipeline on the dev split.
y_pred = clf.predict(dev_features)
print(clf.best_estimator_)
# Call the explicitly imported accuracy_score instead of reaching through the
# `metrics` name, which is only in scope because of `from sklearn import *`.
print(accuracy_score(dev_labels, y_pred))
confusion_matrix(y_true=dev_labels, y_pred=y_pred, labels=[0,1,2])

In [ ]:
# Check Gradient Boosting
# Pipeline: Yeo-Johnson power transform -> polynomial feature expansion -> gradient boosting.
# The classifier is seeded (same seed value as the under-sampling cell) so that
# repeated runs of the notebook give the same result.
pipe = Pipeline(steps=[
    ('scalar', PowerTransformer(method='yeo-johnson')),
    ('polynomial_features', PolynomialFeatures()),
    ('classifier', GradientBoostingClassifier(random_state=42)),
])
# Hyper-parameter grid for the 'classifier' step, explored by cross-validation.
gb_param_space = [{'classifier__learning_rate': [0.1, 0.2],
                   'classifier__max_depth': [2, 3],
                   'classifier__min_samples_split': [2, 3],
                   'classifier__n_estimators': [100, 250, 400]}]
clf = GridSearchCV(pipe, gb_param_space, verbose=5)
clf.fit(train_features, train_labels)

# Evaluate the selected pipeline on the dev split.
y_pred = clf.predict(dev_features)
print(clf.best_estimator_)
# Call the explicitly imported accuracy_score instead of reaching through the
# `metrics` name, which is only in scope because of `from sklearn import *`.
print(accuracy_score(dev_labels, y_pred))
confusion_matrix(y_true=dev_labels, y_pred=y_pred, labels=[0,1,2])

In [ ]:
# Check SVC
# Pipeline: Yeo-Johnson power transform -> support vector classifier
# (no polynomial expansion step in this pipeline).
pipe = Pipeline(steps=[
    ('scalar', PowerTransformer(method='yeo-johnson')),
    ('classifier', SVC(gamma='auto')),
])
# Hyper-parameter grid: with/without class re-weighting, three values of C.
svc_param_space = [{'classifier__class_weight': [None, 'balanced'],
                    'classifier__C': [1, 2, 3]}]
clf = GridSearchCV(pipe, svc_param_space, verbose=5)
clf.fit(train_features, train_labels)

# Evaluate the selected pipeline on the dev split.
y_pred = clf.predict(dev_features)
print(clf.best_estimator_)
# Call the explicitly imported accuracy_score instead of reaching through the
# `metrics` name, which is only in scope because of `from sklearn import *`.
print(accuracy_score(dev_labels, y_pred))
confusion_matrix(y_true=dev_labels, y_pred=y_pred, labels=[0,1,2])

In [None]:
#Under sampling
# Randomly under-sample the training data (strategy 'not minority'), fit a
# random forest on the resampled data and score it on the untouched dev split.
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(random_state=42, sampling_strategy='not minority')
train_features_resampled, train_labels_resampled = rus.fit_resample(train_features, train_labels)

# The sampler was already seeded; seed the forest as well so the whole cell
# gives the same accuracy and confusion matrix on every run.
rf = RandomForestClassifier(random_state=42)
rf.fit(train_features_resampled, train_labels_resampled)

y_pred = rf.predict(dev_features)
accuracy = accuracy_score(dev_labels, y_pred)
print("Accuracy:", accuracy)
confusion_matrix(y_true=dev_labels, y_pred=y_pred, labels=[0,1,2])

In [None]:
#Over sampling
# Over-sample the training data with SMOTE, fit a random forest on the
# resampled data and score it on the untouched dev split.
from imblearn.over_sampling import SMOTE
# Seed both the sampler and the forest (same seed value as the under-sampling
# cell) so this comparison is reproducible from a fresh kernel.
smote = SMOTE(random_state=42)
train_features_resampled, train_labels_resampled = smote.fit_resample(train_features, train_labels)

rf = RandomForestClassifier(random_state=42)
rf.fit(train_features_resampled, train_labels_resampled)

y_pred = rf.predict(dev_features)
accuracy = accuracy_score(dev_labels, y_pred)
print("Accuracy:", accuracy)
confusion_matrix(y_true=dev_labels, y_pred=y_pred, labels=[0,1,2])

In [None]:
# Disabled experiment: univariate feature selection (SelectKBest with chi2,
# k=40) followed by a default RandomForestClassifier scored on the dev split.
# NOTE(review): the whole cell is commented out and the reason is not recorded;
# confirm whether it should be restored or removed.
# from sklearn.feature_selection import SelectKBest, chi2
# 
# k_best_selector = SelectKBest(chi2, k=40)
# train_features_new = k_best_selector.fit_transform(train_features, train_labels)
# rf = RandomForestClassifier()
# rf.fit(train_features_new, train_labels)
# y_pred = rf.predict(dev_features[k_best_selector.get_feature_names_out()])
# accuracy = accuracy_score(dev_labels, y_pred)
# print("Accuracy:", accuracy)
# confusion_matrix(y_true=dev_labels, y_pred=y_pred, labels=[0,1,2])