In [1]:
import math

import pandas as pd
import numpy as np
from sklearn import *
import matplotlib.pyplot as plt
import xgboost as xgb
from pandas.plotting import scatter_matrix
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
# Read the pre-split dataset from disk.
# NOTE: unpickling can execute arbitrary code, so './ass2.pickle' must come
# from a trusted source.
data = pd.read_pickle('./ass2.pickle')
train = data['train']
dev = data['dev']
test = data['test']

# Column count minus one: the 'target' column holds the label, not a feature.
print(f"number of features: {len(train.columns) - 1}")
print(f"types of labels: {train['target'].unique()}")

# Number of rows in each split.
print(f"number of rows in train: {len(train)}")
print(f"number of rows in dev: {len(dev)}")
print(f"number of rows in test: {len(test)}")


number of features: 42
types of labels: [2 1 0]
number of rows in train: 40533
number of rows in dev: 13512
number of rows in test: 13512


In [3]:
# Count the missing values in every column of the training split.
train.isna().sum()

f0        0
f1        0
f2        0
f3        0
f4        0
f5        0
f6        0
f7        0
f8        0
f9        0
f10       0
f11       0
f12       0
f13       0
f14       0
f15       0
f16       0
f17       0
f18       0
f19       0
f20       0
f21       0
f22       0
f23       0
f24       0
f25       0
f26       0
f27       0
f28       0
f29       0
f30       0
f31       0
f32       0
f33       0
f34       0
f35       0
f36       0
f37       0
f38       0
f39       0
f40       0
f41       0
target    0
dtype: int64

In [4]:
# Summary statistics of every column, computed separately for each label value.
(
    train
    .groupby(by='target')
    .describe()
)

Unnamed: 0_level_0,f0,f0,f0,f0,f0,f0,f0,f0,f1,f1,...,f40,f40,f41,f41,f41,f41,f41,f41,f41,f41
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
target,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,3917.0,1.061527,0.833138,0.0,0.0,1.0,2.0,2.0,3917.0,0.669135,...,0.0,2.0,3917.0,0.003574,0.074868,0.0,0.0,0.0,0.0,2.0
1,9882.0,1.082676,0.908831,0.0,0.0,1.0,2.0,2.0,9882.0,0.572759,...,0.0,2.0,9882.0,0.000911,0.038952,0.0,0.0,0.0,0.0,2.0
2,26734.0,0.89908,0.798188,0.0,0.0,1.0,2.0,2.0,26734.0,0.509052,...,0.0,2.0,26734.0,0.001272,0.037681,0.0,0.0,0.0,0.0,2.0


In [5]:
# For each split, separate the 'target' column (labels) from the remaining
# columns (features).
train_features = train.drop(columns='target')
train_labels = train['target']

dev_features = dev.drop(columns='target')
dev_labels = dev['target']

test_features = test.drop(columns='target')
test_labels = test['target']

In [6]:
# Baseline: random forest with default hyper-parameters, scored on the dev split.
# random_state is fixed so that re-running the cell rebuilds the same forest and
# therefore reproduces the same accuracy and confusion matrix.
rf = RandomForestClassifier(random_state=42)
rf.fit(train_features, train_labels)
y_pred = rf.predict(dev_features)
accuracy = accuracy_score(dev_labels, y_pred)
print("Accuracy:", accuracy)
# Rows are true labels, columns are predicted labels, both in the order 0, 1, 2.
confusion_matrix(y_true=dev_labels, y_pred=y_pred, labels=[0,1,2])

Accuracy: 0.8079484902309059


array([[ 144,  332,  794],
       [  84, 2268, 1021],
       [  68,  296, 8505]], dtype=int64)

In [7]:
# Hyper-parameter tuning: 5-fold cross-validated grid search over the number of
# trees and the maximum tree depth of a random forest.
from sklearn.model_selection import GridSearchCV

param_grid = {"n_estimators": [100, 200, 250], 'max_depth': [None, 1, 2, 3]}
# A fresh, seeded estimator is passed instead of the global `rf` (which other
# cells rebind), so the search is self-contained and the cross-validation
# scores are reproducible. n_jobs=-1 runs the candidate fits in parallel.
rf_hyper_tuned = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
    n_jobs=-1,
).fit(train_features, train_labels)
# With the default refit=True the best configuration is refitted on the whole
# training split, and predict() delegates to that refitted model.
y_pred = rf_hyper_tuned.predict(dev_features)
accuracy = accuracy_score(dev_labels, y_pred)
print("Accuracy:", accuracy)
# Rows are true labels, columns are predicted labels, both in the order 0, 1, 2.
confusion_matrix(y_true=dev_labels, y_pred=y_pred, labels=[0,1,2])

Accuracy: 0.8128330373001776


array([[ 143,  342,  785],
       [  80, 2312,  981],
       [  55,  286, 8528]], dtype=int64)

In [8]:
# Gradient boosting with 100 estimators and a fixed seed, scored on the dev split.
clf = GradientBoostingClassifier(n_estimators=100, random_state=0)
clf.fit(train_features, train_labels)

y_pred = clf.predict(dev_features)
accuracy = accuracy_score(y_true=dev_labels, y_pred=y_pred)
print("Accuracy:", accuracy)

# Rows are true labels, columns are predicted labels, both in the order 0, 1, 2.
confusion_matrix(dev_labels, y_pred, labels=[0, 1, 2])

Accuracy: 0.7551805802249852


array([[  11,  240, 1019],
       [   1, 1701, 1671],
       [   3,  374, 8492]], dtype=int64)

In [9]:
# XGBoost classifier with the multi-class soft-probability objective and a
# fixed seed, scored on the dev split.
xgb_model = xgb.XGBClassifier(
    objective="multi:softprob",
    random_state=42,
).fit(train_features, train_labels)

y_pred = xgb_model.predict(dev_features)
accuracy = accuracy_score(y_true=dev_labels, y_pred=y_pred)
print("Accuracy:", accuracy)

# Rows are true labels, columns are predicted labels, both in the order 0, 1, 2.
confusion_matrix(dev_labels, y_pred, labels=[0, 1, 2])

Accuracy: 0.8335553582001184


array([[ 199,  385,  686],
       [  94, 2629,  650],
       [  72,  362, 8435]], dtype=int64)

In [10]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Standardise the features, then fit an SVC (gamma='auto'); both steps live in
# one pipeline so the scaler fitted on train is reused when predicting on dev.
clf = make_pipeline(
    StandardScaler(),
    SVC(gamma='auto'),
).fit(train_features, train_labels)

y_pred = clf.predict(dev_features)
accuracy = accuracy_score(y_true=dev_labels, y_pred=y_pred)
print("Accuracy:", accuracy)

# Rows are true labels, columns are predicted labels, both in the order 0, 1, 2.
confusion_matrix(dev_labels, y_pred, labels=[0, 1, 2])

Accuracy: 0.7556986382474837


array([[   2,  258, 1010],
       [   3, 1692, 1678],
       [   3,  349, 8517]], dtype=int64)

In [11]:
# Under-sampling: resample every class except the minority one
# (sampling_strategy='not minority'), then train a random forest on the
# resampled training data and score it on the untouched dev split.
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(random_state=42, sampling_strategy = 'not minority')
train_features_resampled, train_labels_resampled = rus.fit_resample(train_features, train_labels)
# The sampler was already seeded; seed the forest too so the whole experiment
# is reproducible, not just the resampling step.
rf = RandomForestClassifier(random_state=42)
rf.fit(train_features_resampled, train_labels_resampled)
y_pred = rf.predict(dev_features)
accuracy = accuracy_score(dev_labels, y_pred)
print("Accuracy:", accuracy)
# Rows are true labels, columns are predicted labels, both in the order 0, 1, 2.
confusion_matrix(y_true=dev_labels, y_pred=y_pred, labels=[0,1,2])

Accuracy: 0.6736974541148608


array([[ 798,  284,  188],
       [ 803, 2275,  295],
       [1799, 1040, 6030]], dtype=int64)

In [12]:
# Over-sampling: generate synthetic training samples with SMOTE, then train a
# random forest on the resampled training data and score it on the untouched
# dev split.
from imblearn.over_sampling import SMOTE
# Both the sampler and the forest are seeded so that the synthetic samples and
# the resulting model are the same on every run.
# NOTE(review): the describe() output suggests the features take small integer
# values; SMOTE interpolates between neighbours and may produce non-integer
# values. Confirm this is acceptable for these features.
smote = SMOTE(random_state=42)
train_features_resampled, train_labels_resampled = smote.fit_resample(train_features, train_labels)
rf = RandomForestClassifier(random_state=42)
rf.fit(train_features_resampled, train_labels_resampled)
y_pred = rf.predict(dev_features)
accuracy = accuracy_score(dev_labels, y_pred)
print("Accuracy:", accuracy)
# Rows are true labels, columns are predicted labels, both in the order 0, 1, 2.
confusion_matrix(y_true=dev_labels, y_pred=y_pred, labels=[0,1,2])

Accuracy: 0.7711663706335109


array([[ 456,  402,  412],
       [ 380, 2472,  521],
       [ 673,  704, 7492]], dtype=int64)

In [13]:
from sklearn.feature_selection import SelectKBest, chi2

# Feature selection: keep the 40 features ranked best by the chi-squared
# statistic against the labels, then train a random forest on them.
k_best_selector = SelectKBest(chi2, k=40)
train_features_new = k_best_selector.fit_transform(train_features, train_labels)
# Push the dev split through the SAME fitted selector. The previous code
# re-indexed the dev DataFrame by column name, so the model was fitted on the
# output of fit_transform (a plain array by default) but asked to predict on a
# DataFrame. transform() yields the same columns in the same representation as
# the training input.
dev_features_new = k_best_selector.transform(dev_features)
# Seeded so that the comparison against the other random-forest runs is repeatable.
rf = RandomForestClassifier(random_state=42)
rf.fit(train_features_new, train_labels)
y_pred = rf.predict(dev_features_new)
accuracy = accuracy_score(dev_labels, y_pred)
print("Accuracy:", accuracy)
# Rows are true labels, columns are predicted labels, both in the order 0, 1, 2.
confusion_matrix(y_true=dev_labels, y_pred=y_pred, labels=[0,1,2])



Accuracy: 0.8100947306098283


array([[ 150,  340,  780],
       [  89, 2306,  978],
       [  72,  307, 8490]], dtype=int64)