In [2]:
import pandas as pd
import numpy as np
from sklearn import *
import matplotlib.pyplot as plt
import xgboost as xgb
from pandas.plotting import scatter_matrix
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV


In [3]:
# Load the pickled dataset and report the basic shape of each split.
# NOTE(review): unpickling runs arbitrary code, so only load this file from a trusted source.
data = pd.read_pickle('./ass2.pickle')
train = data['train']
dev = data['dev']
test = data['test']

# Every column except 'target' is a feature.
print(f"number of features: {train.shape[1] - 1}")
print(f"types of labels: {train['target'].unique()}")

print(f"number of rows in train: {train.shape[0]}")
print(f"number of rows in dev: {dev.shape[0]}")
print(f"number of rows in test: {test.shape[0]}")


number of features: 42
types of labels: [2 1 0]
number of rows in train: 40533
number of rows in dev: 13512
number of rows in test: 13512


In [4]:
# Count the missing values in every column of the training split
train.isna().sum()

f0        0
f1        0
f2        0
f3        0
f4        0
f5        0
f6        0
f7        0
f8        0
f9        0
f10       0
f11       0
f12       0
f13       0
f14       0
f15       0
f16       0
f17       0
f18       0
f19       0
f20       0
f21       0
f22       0
f23       0
f24       0
f25       0
f26       0
f27       0
f28       0
f29       0
f30       0
f31       0
f32       0
f33       0
f34       0
f35       0
f36       0
f37       0
f38       0
f39       0
f40       0
f41       0
target    0
dtype: int64

In [11]:
# Summary statistics of every training column, computed separately per label
train.groupby(by='target').describe()

Unnamed: 0_level_0,f0,f0,f0,f0,f0,f0,f0,f0,f1,f1,...,f40,f40,f41,f41,f41,f41,f41,f41,f41,f41
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
target,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,3917.0,1.061527,0.833138,0.0,0.0,1.0,2.0,2.0,3917.0,0.669135,...,0.0,2.0,3917.0,0.003574,0.074868,0.0,0.0,0.0,0.0,2.0
1,9882.0,1.082676,0.908831,0.0,0.0,1.0,2.0,2.0,9882.0,0.572759,...,0.0,2.0,9882.0,0.000911,0.038952,0.0,0.0,0.0,0.0,2.0
2,26734.0,0.89908,0.798188,0.0,0.0,1.0,2.0,2.0,26734.0,0.509052,...,0.0,2.0,26734.0,0.001272,0.037681,0.0,0.0,0.0,0.0,2.0


In [6]:
# Separate each split into its label column ('target') and the remaining feature columns
train_labels = train['target']
train_features = train.drop(columns='target')

dev_labels = dev['target']
dev_features = dev.drop(columns='target')

test_labels = test['target']
test_features = test.drop(columns='target')

In [12]:
def model_grid_search(model, param_grid, features=None, labels=None):
    """
    Run an exhaustive grid search over a model's hyperparameters.

    :param model: An unfitted estimator object.
    :param param_grid: dictionary of {hyperparam_1 : [value_1, ..., value_n], ..., hyperparam_n : [value_1, ..., value_n]}
    :param features: Feature matrix to fit on; it must NOT contain the label column.
                     Defaults to the notebook-level ``train_features``.
    :param labels: Labels aligned with ``features``. Defaults to the notebook-level ``train_labels``.
    :return: The fitted GridSearchCV object. With the default ``refit=True`` its ``predict``
             uses the estimator refitted with the best hyperparameters found, so it must be
             given feature-only data (the same columns as ``features``).
    """
    # Fit on the feature-only frame. `train` still carries the 'target' column,
    # so searching on it would leak the label into every candidate model.
    if features is None:
        features = train_features
    if labels is None:
        labels = train_labels

    # No `scoring` is passed, so candidates are ranked by the estimator's own default scorer.
    GS = GridSearchCV(estimator=model, param_grid=param_grid, verbose=5)
    best_model = GS.fit(features, labels)
    print("Best Model:", GS.best_estimator_)
    return best_model

In [8]:
# Check XGBoost: grid-search the booster, then evaluate the search result on the dev split
xg_params = {"learning_rate": [0.4, 0.3, 0.2], "gamma": [0, 1], "lambda": [1, 2, 3], "n_estimators": [100, 125, 150, 175]}
xg_model = xgb.XGBClassifier(objective='multi:softmax', num_class=3)
best_xg_model = model_grid_search(xg_model, xg_params)
# Predict from the feature columns only: `dev` still contains the 'target' column,
# which must never be fed to the model as an input.
y_pred = best_xg_model.predict(dev_features)
# Use the explicitly imported accuracy_score instead of relying on the wildcard sklearn import.
print("Accuracy:", accuracy_score(dev_labels, y_pred))
confusion_matrix(y_true=dev_labels, y_pred=y_pred, labels=[0,1,2])

Accuracy: 0.7551805802249852


array([[  11,  240, 1019],
       [   1, 1701, 1671],
       [   3,  374, 8492]], dtype=int64)

In [9]:
# Check Random Forest: grid-search the forest, then evaluate the search result on the dev split
tree_params = {"max_depth": [None, 30, 50], "n_estimators": [100, 125, 150, 175], "min_samples_split":[2, 3, 4]}
tree_model = RandomForestClassifier()
best_tree_model = model_grid_search(tree_model, tree_params)
# Predict from the feature columns only: `dev` still contains the 'target' column,
# which must never be fed to the model as an input.
y_pred = best_tree_model.predict(dev_features)
# Use the explicitly imported accuracy_score instead of relying on the wildcard sklearn import.
print("Accuracy:", accuracy_score(dev_labels, y_pred))
confusion_matrix(y_true=dev_labels, y_pred=y_pred, labels=[0,1,2])

Accuracy: 0.8335553582001184


array([[ 199,  385,  686],
       [  94, 2629,  650],
       [  72,  362, 8435]], dtype=int64)

In [ ]:
# Check Gradient Boost: grid-search the booster, then evaluate the search result on the dev split
gb_params = {"max_depth": [2, 3], "n_estimators": [100, 125, 150, 175], "min_samples_split": [2, 3], "learning_rate": [0.1, 0.01]}
gb_model = GradientBoostingClassifier()
best_gb_model = model_grid_search(gb_model, gb_params)
# Predict from the feature columns only: `dev` still contains the 'target' column,
# which must never be fed to the model as an input.
y_pred = best_gb_model.predict(dev_features)
# Use the explicitly imported accuracy_score instead of relying on the wildcard sklearn import.
print("Accuracy:", accuracy_score(dev_labels, y_pred))
confusion_matrix(y_true=dev_labels, y_pred=y_pred, labels=[0,1,2])

In [14]:
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# SVM baseline: standardise the features, then fit an SVC with gamma='auto'.
# Pipeline.fit returns the pipeline itself, so `clf` is the fitted pipeline.
clf = make_pipeline(StandardScaler(), SVC(gamma='auto')).fit(train_features, train_labels)

# Evaluate on the dev split
y_pred = clf.predict(dev_features)
accuracy = accuracy_score(y_true=dev_labels, y_pred=y_pred)
print("Accuracy:", accuracy)
confusion_matrix(y_true=dev_labels, y_pred=y_pred, labels=[0,1,2])

Accuracy: 0.7556986382474837


array([[   2,  258, 1010],
       [   3, 1692, 1678],
       [   3,  349, 8517]], dtype=int64)

In [None]:
# Under-sampling: resample every class except the minority one, then fit a default random forest
from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler(sampling_strategy='not minority', random_state=42)
train_features_resampled, train_labels_resampled = rus.fit_resample(train_features, train_labels)

# fit() returns the estimator itself, so `rf` is the fitted forest
rf = RandomForestClassifier().fit(train_features_resampled, train_labels_resampled)

# Evaluate on the untouched dev split
y_pred = rf.predict(dev_features)
accuracy = accuracy_score(y_true=dev_labels, y_pred=y_pred)
print("Accuracy:", accuracy)
confusion_matrix(y_true=dev_labels, y_pred=y_pred, labels=[0,1,2])

In [None]:
# Over-sampling: rebalance the training split with SMOTE, then fit a default random forest
from imblearn.over_sampling import SMOTE

smote = SMOTE()
train_features_resampled, train_labels_resampled = smote.fit_resample(train_features, train_labels)

# fit() returns the estimator itself, so `rf` is the fitted forest
rf = RandomForestClassifier().fit(train_features_resampled, train_labels_resampled)

# Evaluate on the untouched dev split
y_pred = rf.predict(dev_features)
accuracy = accuracy_score(y_true=dev_labels, y_pred=y_pred)
print("Accuracy:", accuracy)
confusion_matrix(y_true=dev_labels, y_pred=y_pred, labels=[0,1,2])

In [13]:
from sklearn.feature_selection import SelectKBest, chi2

# Feature selection: keep the 40 features ranked highest by the chi-squared score,
# then fit a default random forest on the reduced training matrix.
k_best_selector = SelectKBest(chi2, k=40)
train_features_new = k_best_selector.fit_transform(train_features, train_labels)
rf = RandomForestClassifier()
rf.fit(train_features_new, train_labels)
# Reduce dev with the same fitted selector so train and dev go through one identical
# transformation. Indexing the DataFrame by the selected column names instead hands the
# forest a name-indexed frame although it was fitted on the selector's array output,
# which by default triggers sklearn's feature-name mismatch warning.
y_pred = rf.predict(k_best_selector.transform(dev_features))
accuracy = accuracy_score(dev_labels, y_pred)
print("Accuracy:", accuracy)
confusion_matrix(y_true=dev_labels, y_pred=y_pred, labels=[0,1,2])



Accuracy: 0.8087625814091178


array([[ 167,  336,  767],
       [  93, 2283,  997],
       [  73,  318, 8478]], dtype=int64)

In [None]:
def train_with_2_svc(features: pd.DataFrame, labels: pd.Series):
    category_not_2_features = features[labels != 2]
    category_not_2_labels = labels[labels != 2]
    