In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from xgboost import plot_importance

from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

from collections import Counter


Using TensorFlow backend.


In [3]:
df = pd.read_csv("./data/Training_Data_0611N_cleaned.csv")
df.shape

(19595, 50)

### Removing categorical variables

In [4]:
df_non_cat = df.select_dtypes(exclude=['object'])

X = df_non_cat.drop(columns='profitable_flag').values
y = df_non_cat['profitable_flag'].values
print(Counter(y))

Counter({0.0: 15951, 1.0: 3644})


### Benchmarking using XGBoost

In [5]:
def model_accuracy(X,Y,model=XGBClassifier):
    
    test_size = 0.33
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size)

    model = model()
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)
    predictions = [round(value) for value in y_pred]
    accuracy = accuracy_score(y_test, predictions)
    from sklearn.metrics import classification_report
    print(classification_report(y_test,predictions))
    return model, accuracy

### Oversampling

In [6]:
oversample = RandomOverSampler(sampling_strategy='minority')
X_over, y_over = oversample.fit_resample(X, y)
print(Counter(y_over))

_, accuracy = model_accuracy(X_over, y_over)

Counter({0.0: 15951, 1.0: 15951})
              precision    recall  f1-score   support

         0.0       0.90      0.79      0.84      5309
         1.0       0.81      0.91      0.85      5219

    accuracy                           0.85     10528
   macro avg       0.85      0.85      0.85     10528
weighted avg       0.85      0.85      0.85     10528



### Undersampling

In [7]:
undersample = RandomUnderSampler(sampling_strategy='majority')
X_under, y_under = undersample.fit_resample(X, y)
print(Counter(y_under))

_, accuracy = model_accuracy(X_under, y_under)

Counter({0.0: 3644, 1.0: 3644})
              precision    recall  f1-score   support

         0.0       0.59      0.55      0.57      1227
         1.0       0.57      0.61      0.59      1179

    accuracy                           0.58      2406
   macro avg       0.58      0.58      0.58      2406
weighted avg       0.58      0.58      0.58      2406



### SMOTE