In [ ]:
import math

import pandas as pd
import numpy as np
from sklearn import *
import matplotlib.pyplot as plt
import xgboost as xgb
from pandas.plotting import scatter_matrix
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV


In [ ]:
# Load the pickled dataset and pull out its three splits, then print a quick
# overview: feature count, the distinct label values, and the size of each split.
data = pd.read_pickle('./ass2.pickle')
train, dev, test = (data[split_name] for split_name in ('train', 'dev', 'test'))

# Every column except 'target' is counted as a feature.
print(f"number of features: {len(train.columns) - 1}")
print(f"types of labels: {train['target'].unique()}")

print(f"number of rows in train: {len(train)}")
print(f"number of rows in dev: {len(dev)}")
print(f"number of rows in test: {len(test)}")


In [ ]:
# Count the missing entries in every column of the training split
train.isna().sum()

In [ ]:
# Summary statistics of the training columns, computed separately per target value
train.groupby(by='target').describe()

In [ ]:
# Separate each split into its feature columns (everything except 'target')
# and its label column ('target').
train_features = train.drop(columns='target')
train_labels = train['target']

dev_features = dev.drop(columns='target')
dev_labels = dev['target']

test_features = test.drop(columns='target')
test_labels = test['target']

In [ ]:
def model_grid_search(model, param_grid, features=None, labels=None):
    """
    Perform a grid search over the hyperparameters of ``model``.

    :param model: A model object (an unfitted estimator accepted by GridSearchCV).
    :param param_grid: dictionary of {hyperparam_1 : [value_1, ..., value_n], ..., hyperparam_n : [value_1, ..., value_n]}
    :param features: Training features to search on. When omitted, the
        notebook-level ``train_features`` is used.
    :param labels: Training labels to search on. When omitted, the
        notebook-level ``train_labels`` is used.
    :return: The fitted ``GridSearchCV`` object (not the bare estimator). The
        winning estimator is available as ``.best_estimator_``; calling
        ``.predict`` on the returned object uses that estimator as long as
        GridSearchCV's default refit behaviour is in effect.
    """
    # Fall back to the notebook globals so existing two-argument calls keep working.
    if features is None:
        features = train_features
    if labels is None:
        labels = train_labels

    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, verbose=5)
    grid_search.fit(features, labels)
    print("Best Model:", grid_search.best_estimator_)
    return grid_search

In [ ]:
# Check XGBoost: grid-search its hyperparameters on the training split,
# then evaluate the result on the dev split.
xg_params = {
    "learning_rate": [0.4, 0.3, 0.2],
    "gamma": [0, 1],
    "lambda": [1, 2, 3],
    "n_estimators": [100, 125, 150, 175],
}
xg_model = xgb.XGBClassifier(objective='multi:softmax', num_class=3)
best_xg_model = model_grid_search(xg_model, xg_params)

# Predict on dev_features, not dev: dev still contains the 'target' column,
# so passing it would hand the model the label as an extra input column.
y_pred = best_xg_model.predict(dev_features)
print("Accuracy:", accuracy_score(dev_labels, y_pred))
confusion_matrix(y_true=dev_labels, y_pred=y_pred, labels=[0,1,2])

In [ ]:
# Check Random Forest: grid-search its hyperparameters on the training split,
# then evaluate the result on the dev split.
tree_params = {
    "max_depth": [None, 30, 50],
    "n_estimators": [100, 125, 150, 175],
    "min_samples_split": [2, 3, 4],
}
# Fixed seed so that re-running the cell compares the same forests.
tree_model = RandomForestClassifier(random_state=42)
best_tree_model = model_grid_search(tree_model, tree_params)

# Predict on dev_features, not dev: dev still contains the 'target' column,
# so passing it would hand the model the label as an extra input column.
y_pred = best_tree_model.predict(dev_features)
print("Accuracy:", accuracy_score(dev_labels, y_pred))
confusion_matrix(y_true=dev_labels, y_pred=y_pred, labels=[0,1,2])

In [ ]:
# Check Gradient Boost: grid-search its hyperparameters on the training split,
# then evaluate the result on the dev split.
gb_params = {
    "max_depth": [2, 3],
    "n_estimators": [100, 125, 150, 175],
    "min_samples_split": [2, 3],
    "learning_rate": [0.1, 0.01],
}
# Fixed seed so that re-running the cell gives the same result.
gb_model = GradientBoostingClassifier(random_state=42)
best_gb_model = model_grid_search(gb_model, gb_params)

# Predict on dev_features, not dev: dev still contains the 'target' column,
# so passing it would hand the model the label as an extra input column.
y_pred = best_gb_model.predict(dev_features)
print("Accuracy:", accuracy_score(dev_labels, y_pred))
confusion_matrix(y_true=dev_labels, y_pred=y_pred, labels=[0,1,2])

In [ ]:
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Pipeline: standardise the features, keep the k best-scoring ones, then classify.
# The 'classifier' step is itself a searched parameter, so a single grid search
# compares several model families together with their own hyperparameters.
pipe = Pipeline([('scaler', StandardScaler()),
                 ('selector', SelectKBest(k=30)),
                 ('classifier', xgb.XGBClassifier(objective='multi:softmax', num_class=3))])

# Each dict is an independent sub-grid. The first varies only the number of
# selected features and keeps the pipeline's default classifier; the others
# swap the classifier and tune it. Stochastic candidates get a fixed seed so
# that re-running the search ranks the same models.
search_space = [{'selector__k': [30, 35, 40, 42]},
                {'classifier': [RandomForestClassifier(random_state=42)],
                 'classifier__n_estimators': [100, 125, 150, 175],
                 'classifier__max_depth': [None, 30, 50],
                 'classifier__min_samples_split': [2, 3, 4]},
                {'classifier': [GradientBoostingClassifier(random_state=42)],
                 'classifier__learning_rate': [0.1, 0.2],
                 'classifier__max_depth': [2, 3],
                 'classifier__min_samples_split': [2, 3],
                 'classifier__n_estimators': [100, 125, 150, 175]},
                {'classifier': [xgb.XGBClassifier(objective='multi:softmax', num_class=3)],
                 'classifier__learning_rate': [0.2, 0.3, 0.4],
                 'classifier__gamma': [0, 1],
                 'classifier__lambda': [1, 2, 3],
                 'classifier__n_estimators': [100, 125, 150, 175]},
                {'classifier': [SVC(gamma='auto')]}]
clf = GridSearchCV(pipe, search_space, verbose=5)
clf.fit(train_features, train_labels)

# Predict on dev_features, not dev: dev still contains the 'target' column,
# so passing it would hand the pipeline the label as an extra input column.
y_pred = clf.predict(dev_features)
print(clf.best_estimator_)
print(accuracy_score(dev_labels, y_pred))

In [ ]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
# Baseline: standardise the features, fit an SVC on the training split,
# and report accuracy plus the confusion matrix on the dev split.
clf = make_pipeline(StandardScaler(), SVC(gamma='auto')).fit(train_features, train_labels)

y_pred = clf.predict(dev_features)
accuracy = accuracy_score(dev_labels, y_pred)
print("Accuracy:", accuracy)

# Rows are true labels, columns are predicted labels, both in the order 0, 1, 2.
confusion_matrix(dev_labels, y_pred, labels=[0, 1, 2])

In [None]:
#Under sampling
from imblearn.under_sampling import RandomUnderSampler
# Randomly under-sample the training split ('not minority' strategy), train a
# random forest on the resampled data, and evaluate it on the untouched dev split.
rus = RandomUnderSampler(random_state=42, sampling_strategy = 'not minority')
train_features_resampled, train_labels_resampled = rus.fit_resample(train_features, train_labels)

# Seed the forest as well: the sampler was already seeded, but an unseeded
# forest still made the reported accuracy change from run to run.
rf = RandomForestClassifier(random_state=42)
rf.fit(train_features_resampled, train_labels_resampled)
y_pred = rf.predict(dev_features)
accuracy = accuracy_score(dev_labels, y_pred)
print("Accuracy:", accuracy)
confusion_matrix(y_true=dev_labels, y_pred=y_pred, labels=[0,1,2])

In [None]:
#Over sampling
from imblearn.over_sampling import SMOTE
# Over-sample the training split with SMOTE, train a random forest on the
# resampled data, and evaluate it on the untouched dev split.
# Both the sampler and the forest are seeded so the cell is reproducible.
smote = SMOTE(random_state=42)
train_features_resampled, train_labels_resampled = smote.fit_resample(train_features, train_labels)
rf = RandomForestClassifier(random_state=42)
rf.fit(train_features_resampled, train_labels_resampled)
y_pred = rf.predict(dev_features)
accuracy = accuracy_score(dev_labels, y_pred)
print("Accuracy:", accuracy)
confusion_matrix(y_true=dev_labels, y_pred=y_pred, labels=[0,1,2])

In [13]:
from sklearn.feature_selection import SelectKBest, chi2

# Keep the 40 features ranked best by the chi2 score function, train a random
# forest on them, and evaluate on the dev split.
k_best_selector = SelectKBest(chi2, k=40)
train_features_new = k_best_selector.fit_transform(train_features, train_labels)

# Reduce dev with the same fitted selector instead of indexing columns by name:
# train and dev then reach the forest in the same form, and the step no longer
# depends on the column labels matching get_feature_names_out().
dev_features_new = k_best_selector.transform(dev_features)

# Fixed seed so that re-running the cell gives the same result.
rf = RandomForestClassifier(random_state=42)
rf.fit(train_features_new, train_labels)
y_pred = rf.predict(dev_features_new)
accuracy = accuracy_score(dev_labels, y_pred)
print("Accuracy:", accuracy)
confusion_matrix(y_true=dev_labels, y_pred=y_pred, labels=[0,1,2])



Accuracy: 0.8087625814091178


array([[ 167,  336,  767],
       [  93, 2283,  997],
       [  73,  318, 8478]], dtype=int64)