In [1]:
from fastai.basics import *
from fastai.tabular.all import *
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
@patch
def export(self:TabularPandas, fname='export.pkl', pickle_protocol=2):
    "Export the contents of `self` without the items"
    old_to = self
    self = self.new_empty()
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        pickle.dump(self, open(Path(fname), 'wb'), protocol=pickle_protocol)
        self = old_to

In [3]:
def get_data(data_pth,is_train_ds=False):
    
    df = pd.read_csv(data_pth, low_memory=False)

    with open('artifacts/features.txt') as json_file:
            features = json.load(json_file)
    
    cont = features['cont']
    cat = features['cat']
    dep_var = features['dep_var']
    cols = cat+cont+[dep_var]
    
    df = df[cols]
    
    if is_train_ds:
        
        procs_nn = [Categorify, FillMissing]
        data_proc = TabularPandas(df, procs_nn, cat, cont, splits=None, y_names=dep_var)
        data_proc.export('artifacts/data-proc.pkl')
    
    else:
        with open('artifacts/data-proc.pkl', 'rb') as preproc_file:
            preproc = pickle.load(preproc_file)

            data_proc = preproc.train.new(df)
            data_proc.process()
    
    X,y = data_proc.train.xs,data_proc.train.y
    

    return X,y

In [4]:
X, y = get_data('data/train-sample.csv',is_train_ds=True)

In [45]:
# X.drop(['TransactionDT'],axis=1,inplace=True)

In [46]:
w = (y[y==0]).count() / (y[y==1]).count() 

In [47]:
w

27.571428571428573

In [48]:
 params = {
        # defines booster, gblinear for linear functions.
        "booster": "gbtree",
        # sampling ratio for training data.
        "subsample": 0.8,
        # sampling according to each tree.
        "colsample_bytree": 0.7,
        #number of trees
        "n_estimators": 400,
        # maximum depth of the tree, signifies complexity of the tree.
        "max_depth": 14,
        # minimum child weight, larger the term more conservative the tree.
        "min_child_weight": 14,
        "learning_rate": 0.0069,
        "objective": "binary:logistic",
        "scale_pos_weight": w
 }


In [49]:
model = XGBClassifier(**params)

In [50]:
aucs = []
accuracies = []
#Create stratified folds to avoid our model overfitting. we create 5 folds for this model
kf = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)

for idx in kf.split(X=X,y=y):
    train_idx, valid_idx = idx[0], idx[1]
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_valid,  y_valid = X.iloc[valid_idx], y.iloc[valid_idx]

    model.fit(
        X_train, 
        y_train, 
        eval_metric='auc', 
        eval_set=[(X_valid, y_valid)], 
        verbose=False, 
        early_stopping_rounds = 100
    )

    predictions = model.predict(X_valid)


    auc = roc_auc_score(y_valid, predictions)
    aucs.append(auc)
    accuracy = accuracy_score(y_valid, predictions)
    accuracies.append(accuracy)


# calculate the average of metrics for all folds
avg_auc = np.mean(aucs)
avg_accuracy = np.mean(accuracies)
print(f'Average AUC: {avg_auc}, Average Accuracy: {avg_accuracy}')







Average AUC: 0.722901554404145, Average Accuracy: 0.9488


In [51]:
predictions = model.predict(X_valid)


auc = roc_auc_score(y_valid, predictions)
auc

0.7236861584011843

In [52]:
from explainerdashboard import ClassifierExplainer, InlineExplainer

In [53]:
explainer = ClassifierExplainer(model, X_valid, y_valid)

Detected XGBClassifier model: Changing class type to XGBClassifierExplainer...
Note: model_output=='probability'. For XGBClassifier shap values normally get calculated against X_background, but paramater X_background=None, so using X instead
Generating self.shap_explainer = shap.TreeExplainer(model, X, model_output='probability', feature_perturbation='interventional')...
Note: Shap interaction values will not be available. If shap values in probability space are not necessary you can pass model_output='logodds' to get shap values in logodds without the need for a background dataset and also working shap interaction values...


In [54]:
ie = InlineExplainer(explainer)

In [55]:
ie.importances()


The 'environ['werkzeug.server.shutdown']' function is deprecated and will be removed in Werkzeug 2.1.



Calculating shap values...


In [56]:
ie.classifier.roc_auc()

 18%|====                | 886/5000 [00:18<01:23]       

 26%|=====               | 1290/5000 [00:26<01:14]       

Calculating roc auc curves...


 27%|=====               | 1340/5000 [00:27<01:13]       

Calculating prediction probabilities...




In [57]:
ie.classifier.confusion_matrix()


The 'environ['werkzeug.server.shutdown']' function is deprecated and will be removed in Werkzeug 2.1.




Calculating confusion matrices...


