In [9]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [10]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from xgboost import plot_importance

from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

from collections import Counter


ModuleNotFoundError: No module named 'imblearn'

In [3]:
df = pd.read_csv("./data/Training_Data_0611N_cleaned.csv")
df.shape

(19595, 50)

### Removing categorical variables

In [4]:
df_non_cat = df.select_dtypes(exclude=['object'])

X = df_non_cat.drop(columns='profitable_flag').values
Y = df_non_cat['profitable_flag'].values

test_size = 0.33
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size)

print(Counter(Y_train))

Counter({0.0: 10657, 1.0: 2471})


### Benchmarking using XGBoost

In [8]:
def model_accuracy(X_train, Y_train, X_test, Y_test, model=XGBClassifier):

    model = model()
    model.fit(X_train, Y_train)
    
    Y_pred = model.predict(X_test)
    predictions = [round(value) for value in Y_pred]
    accuracy = accuracy_score(Y_test, predictions)
    from sklearn.metrics import classification_report
    print(classification_report(Y_test,predictions))
    return model, accuracy

### Oversampling

In [9]:
oversample = RandomOverSampler(sampling_strategy='minority')
X_train_over, Y_train_over = oversample.fit_resample(X_train, Y_train)
print(Counter(Y_train_over))

_, accuracy = model_accuracy(X_train_over, Y_train_over, X_test, Y_test)

Counter({0.0: 10657, 1.0: 10657})
              precision    recall  f1-score   support

         0.0       0.84      0.83      0.84      5294
         1.0       0.28      0.29      0.28      1173

    accuracy                           0.73      6467
   macro avg       0.56      0.56      0.56      6467
weighted avg       0.74      0.73      0.74      6467



## Hyperparameter tuning with lightgbm

In [3]:
import lightgbm as lgb

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size)


In [5]:
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
space={'max_depth': hp.quniform("max_depth", 3, 18, 1),
        'gamma': hp.uniform ('gamma', 1,9),
        'reg_alpha' : hp.quniform('reg_alpha', 40,180,1),
        'reg_lambda' : hp.uniform('reg_lambda', 0,1),
        'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1),
        'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1),
        'n_estimators': 180,
        'seed': 0
      }

In [6]:
def objective(space):
    clf=xgb.XGBClassifier(
                    n_estimators =space['n_estimators'], max_depth = int(space['max_depth']), gamma = space['gamma'],
                    reg_alpha = int(space['reg_alpha']),min_child_weight=int(space['min_child_weight']),
                    colsample_bytree=int(space['colsample_bytree']))
    
    evaluation = [( X_train, y_train), ( X_test, y_test)]
    
    clf.fit(X_train, y_train,
            eval_set=evaluation, eval_metric="auc",
            early_stopping_rounds=10,verbose=False)
    

    pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, pred>0.5)
    print ("SCORE:", accuracy)
    return {'loss': -accuracy, 'status': STATUS_OK }

In [7]:
trials = Trials()

best_hyperparams = fmin(fn = objective,
                        space = space,
                        algo = tpe.suggest,
                        max_evals = 100,
                        trials = trials)

  0%|          | 0/100 [00:00<?, ?trial/s, best loss=?]

job exception: name 'xgb' is not defined


  0%|          | 0/100 [00:00<?, ?trial/s, best loss=?]


NameError: name 'xgb' is not defined

In [None]:
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size)
model = model()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_test, predictions)
from sklearn.metrics import classification_report
print(classification_report(y_test,predictions))
return model, accuracy

### Undersampling

In [7]:
undersample = RandomUnderSampler(sampling_strategy='majority')
X_train_under, Y_train_under = undersample.fit_resample(X_train, Y_train)
print(Counter(Y_train_under))

_, accuracy = model_accuracy(X_train_under, Y_train_under, X_test, Y_test)

Counter({0.0: 2471, 1.0: 2471})
              precision    recall  f1-score   support

         0.0       0.86      0.57      0.69      5294
         1.0       0.23      0.58      0.33      1173

    accuracy                           0.57      6467
   macro avg       0.55      0.58      0.51      6467
weighted avg       0.75      0.57      0.62      6467



### SMOTE