In [ ]:
import math
import pandas as pd
import numpy as np
from sklearn import *
import matplotlib.pyplot as plt
import xgboost as xgb
from pandas.plotting import scatter_matrix
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.preprocessing import PowerTransformer, PolynomialFeatures
import seaborn as sns

# **Data inspection and preprocessing**

In [ ]:
# Load the pickled dataset and pull out its three splits.
# NOTE(review): unpickling can execute arbitrary code — only load this file from a trusted source.
data = pd.read_pickle('./ass2.pickle')
train, dev, test = (data[split] for split in ('train', 'dev', 'test'))

In [ ]:
# Basic dataset facts: feature count (every column except 'target'),
# the distinct label values, and the number of rows in each split.
n_features = train.shape[1] - 1
label_values = train['target'].unique()
print(f"number of features: {n_features}")
print(f"types of labels: {label_values}")

for split_name, split_frame in (("train", train), ("dev", dev), ("test", test)):
    print(f"number of rows in {split_name}: {split_frame.shape[0]}")


In [3]:
# Count the missing entries in each column of the training split
train.isna().sum()

f0        0
f1        0
f2        0
f3        0
f4        0
f5        0
f6        0
f7        0
f8        0
f9        0
f10       0
f11       0
f12       0
f13       0
f14       0
f15       0
f16       0
f17       0
f18       0
f19       0
f20       0
f21       0
f22       0
f23       0
f24       0
f25       0
f26       0
f27       0
f28       0
f29       0
f30       0
f31       0
f32       0
f33       0
f34       0
f35       0
f36       0
f37       0
f38       0
f39       0
f40       0
f41       0
target    0
dtype: int64

There are no missing values in the training data.

In [ ]:
# Show per-column summary statistics of the training data (mean, std, etc.)
train.describe()

In [ ]:
# The same summary statistics for the training data, computed separately for each value of 'target'
train.groupby('target').describe()

The "count" column shows that the data is imbalanced: there are far more samples with label "2" than with the other labels.

In [5]:
# Separate each split into a feature frame (all columns but 'target')
# and a label series (the 'target' column).
train_features = train.drop(columns='target')
train_labels = train['target']

dev_features = dev.drop(columns='target')
dev_labels = dev['target']

test_features = test.drop(columns='target')
test_labels = test['target']

In [ ]:
# Check for redundant features. We decided against removing any features,
# since the pairwise correlations are not strong enough.
feature_corr = train_features.corr()

# Draw on an explicit Axes so the figure can carry a title and stands alone when skimmed;
# the trailing ';' keeps the cell from echoing the matplotlib object repr.
fig, ax = plt.subplots()
sns.heatmap(feature_corr, ax=ax)
ax.set_title("Pairwise correlation of training features");

For each pipeline in the learning section, we apply a preprocessing step using PowerTransformer to map the features to a more normal (Gaussian) distribution and make learning easier. The per-feature statistics shown above (for example, the means) suggest that this preprocessing will achieve better results.

In [ ]:
# After preprocessing, the features should be closer to a Gaussian distribution.
processed = PowerTransformer(method="yeo-johnson").fit_transform(train_features)

# The transformer returns a plain array; re-attach the original column names and index
# so the summary below is labelled per feature (f0, f1, ...) instead of 0, 1, ...
pd.DataFrame(processed, columns=train_features.columns, index=train_features.index).describe()

# **Learning**

We try a few models, and for each one we try different hyperparameters using grid search. Finally, we use the accuracy score to evaluate each model on the dev set, and also show the confusion matrix.

In [ ]:
# XGBoost: grid-search over the learning rate and the number of estimators.
pipe = Pipeline(
    steps=[
        ('scalar', PowerTransformer(method='yeo-johnson')),
        ('classifier', xgb.XGBClassifier(objective='multi:softmax', num_class=3)),
    ]
)
xg_param_space = [
    {
        'classifier__learning_rate': [0.2, 0.3, 0.4],
        'classifier__n_estimators': [50, 100, 200],
    }
]
xg_clf = GridSearchCV(estimator=pipe, param_grid=xg_param_space, verbose=5)
xg_clf.fit(train_features, train_labels)

# Score the selected model on the dev split and show its confusion matrix.
y_pred = xg_clf.predict(dev_features)
print(xg_clf.best_estimator_)
print("Accuracy:", accuracy_score(dev_labels, y_pred))
confusion_matrix(y_true=dev_labels, y_pred=y_pred, labels=[0,1,2])

In [ ]:
# Random Forest: grid-search over the number of trees, the maximum depth and
# the minimal number of samples required to split a node.
# The forest is seeded so that re-running the notebook reproduces the same scores.
pipe = Pipeline(
    steps=[
        ('scalar', PowerTransformer(method='yeo-johnson')),
        ('polynomial_features', PolynomialFeatures()),
        ('classifier', RandomForestClassifier(random_state=42)),
    ]
)
forest_param_space = [
    {
        'classifier__n_estimators': [100, 250, 400],
        'classifier__max_depth': [None, 30, 50],
        'classifier__min_samples_split': [2, 3, 4],
    }
]
rf_clf = GridSearchCV(pipe, forest_param_space, verbose=5)
rf_clf.fit(train_features, train_labels)

# Score the selected model on the dev split and show its confusion matrix.
# accuracy_score is the explicitly imported function (same one the resampling cells use),
# rather than the `metrics` module that is only in scope through the wildcard import.
y_pred = rf_clf.predict(dev_features)
print(rf_clf.best_estimator_)
print("Accuracy:", accuracy_score(dev_labels, y_pred))
confusion_matrix(y_true=dev_labels, y_pred=y_pred, labels=[0,1,2])

In [ ]:
# Gradient Boosting: grid-search over the learning rate, tree depth,
# minimal split size and number of boosting stages.
# The estimator is seeded so that re-running the notebook reproduces the same scores.
pipe = Pipeline(
    steps=[
        ('scalar', PowerTransformer(method='yeo-johnson')),
        ('classifier', GradientBoostingClassifier(random_state=42)),
    ]
)
gb_param_space = [
    {
        'classifier__learning_rate': [0.2, 0.3],
        'classifier__max_depth': [2, 3],
        'classifier__min_samples_split': [2, 3],
        'classifier__n_estimators': [100, 250, 400],
    }
]
gb_clf = GridSearchCV(pipe, gb_param_space, verbose=5)
gb_clf.fit(train_features, train_labels)

# Score the selected model on the dev split and show its confusion matrix.
# accuracy_score is the explicitly imported function (same one the resampling cells use),
# rather than the `metrics` module that is only in scope through the wildcard import.
y_pred = gb_clf.predict(dev_features)
print(gb_clf.best_estimator_)
print("Accuracy:", accuracy_score(dev_labels, y_pred))
confusion_matrix(y_true=dev_labels, y_pred=y_pred, labels=[0,1,2])

We note that we tried using PolynomialFeatures for XGBoost and gradient boosting as well, but it caused our models to run extremely slowly.

In [ ]:
# SVC: grid-search over class weighting and the regularisation constant C.
pipe = Pipeline(
    steps=[
        ('scalar', PowerTransformer(method='yeo-johnson')),
        ('classifier', SVC(gamma='auto')),
    ]
)
svc_param_space = [
    {
        'classifier__class_weight': [None, 'balanced'],
        'classifier__C': [1, 2, 3],
    }
]
svc_clf = GridSearchCV(estimator=pipe, param_grid=svc_param_space, verbose=5)
svc_clf.fit(train_features, train_labels)

# Score the selected model on the dev split and show its confusion matrix.
y_pred = svc_clf.predict(dev_features)
print(svc_clf.best_estimator_)
print("Accuracy:", accuracy_score(dev_labels, y_pred))
confusion_matrix(y_true=dev_labels, y_pred=y_pred, labels=[0,1,2])

In [None]:
# Since the data is unbalanced, we try over- and under-sampling approaches.
# Under-sampling: resample the training split with RandomUnderSampler ('not minority' strategy).
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(random_state=42, sampling_strategy = 'not minority')
train_features_resampled, train_labels_resampled = rus.fit_resample(train_features, train_labels)

# Seed the forest with the same value as the sampler: with only the sampler seeded,
# the reported accuracy still changed from run to run.
rf = RandomForestClassifier(random_state=42)
rf.fit(train_features_resampled, train_labels_resampled)

# Evaluate on the untouched dev split.
y_pred = rf.predict(dev_features)
accuracy = accuracy_score(dev_labels, y_pred)
print("Accuracy:", accuracy)
confusion_matrix(y_true=dev_labels, y_pred=y_pred, labels=[0,1,2])

In [None]:
# Over-sampling with SMOTE
from imblearn.over_sampling import SMOTE
# Seed SMOTE (same seed as the under-sampler) so the synthetic samples are reproducible.
smote = SMOTE(random_state=42)
train_features_resampled, train_labels_resampled = smote.fit_resample(train_features, train_labels)

# clone() yields an unfitted copy of the random-forest grid search, so the whole
# search is run again, this time on the resampled training data.
# NOTE(review): the data is resampled before cross-validation, so synthetic points derived
# from one fold's samples can end up in the other folds and the CV scores may be optimistic.
# Moving SMOTE into an imblearn Pipeline would avoid this — confirm whether it matters here.
rf_over_sampling = clone(rf_clf)
# Seed the forest inside the cloned search so that repeated runs give the same result.
rf_over_sampling.set_params(estimator__classifier__random_state=42)
rf_over_sampling.fit(train_features_resampled, train_labels_resampled)

# Evaluate on the untouched dev split.
y_pred = rf_over_sampling.predict(dev_features)
accuracy = accuracy_score(dev_labels, y_pred)
print("Accuracy:", accuracy)
confusion_matrix(y_true=dev_labels, y_pred=y_pred, labels=[0,1,2])

# **Test**

In [None]:
# Final check: accuracy of the tuned XGBoost search on the test split
y_pred = xg_clf.predict(test_features)
test_accuracy = accuracy_score(test_labels, y_pred)
print("Test accuracy:", test_accuracy)