<a href="https://colab.research.google.com/github/frankfaisca/machine-learning/blob/main/ex4_calibration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Load libs

import pickle
import warnings

import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn import utils

from sklearn.utils import resample


#Graphs
import matplotlib.pyplot as plt #visualization
from matplotlib import pyplot #visualization
import plotly.express as px #interactive calibration plots
%matplotlib inline
#static images of your plot embedded in the workbook

from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import GradientBoostingClassifier

#Evaluation
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, classification_report

warnings.filterwarnings("ignore")


In [None]:
# Load the pickled train / validation / test splits

outfilename = "/content/sample_data/A652.pickle"
# NOTE(security): pickle.load can execute arbitrary code; only load files from a trusted source.
# The context manager guarantees the handle is closed even if unpickling fails.
with open(outfilename, 'rb') as file:
    (X_train , y_train , X_val , y_val , X_test , y_test) = pickle.load(file)
print("Shapes: ", X_train.shape , X_val.shape , X_test.shape)


Shapes:  (10178, 24) (2565, 24) (765, 24)


In [None]:

# Wrap every split in a DataFrame so the cells below can use column-based access.
X_train = pd.DataFrame(X_train)
y_train = pd.DataFrame(y_train)
X_val = pd.DataFrame(X_val)
y_val = pd.DataFrame(y_val)
X_test = pd.DataFrame(X_test)
y_test = pd.DataFrame(y_test)


# Binarize the targets held in column 0: every strictly positive label becomes 1,
# all other values are left untouched. Series.mask is vectorized and replaces the
# element-wise DataFrame.applymap call, which is deprecated in recent pandas.
y_train[0] = y_train[0].mask(y_train[0] > 0, 1)
y_val[0] = y_val[0].mask(y_val[0] > 0, 1)
y_test[0] = y_test[0].mask(y_test[0] > 0, 1)


In [None]:
# calibration function

def calibration_data(y_true, y_pred, bin_width=0.05, min_count=10):
    """Compute the points of a calibration (reliability) curve.

    Predictions are grouped into fixed-width buckets; for each bucket the mean
    of ``y_true`` is computed. Buckets are represented by their midpoint.

    Parameters
    ----------
    y_true : array-like
        Observed labels, paired positionally with ``y_pred``.
    y_pred : array-like
        Predicted scores to bucket.
    bin_width : float, default 0.05
        Width of each bucket; must be positive.
    min_count : int, default 10
        Only buckets holding strictly more than ``min_count`` samples are kept.

    Returns
    -------
    tuple of numpy.ndarray
        ``(mean_true, bucket_midpoint)``, ordered by increasing bucket midpoint.

    Raises
    ------
    ValueError
        If ``bin_width`` is not positive.
    """
    if bin_width <= 0:
        raise ValueError("bin_width must be positive")

    # Coerce to flat arrays so lists, Series and single-column frames all work
    # and the two inputs are paired by position rather than by index label.
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()

    # Floor each prediction to the lower edge of its bucket, then shift to the midpoint.
    buckets = (y_pred // bin_width) * bin_width + bin_width / 2

    stats = (
        pd.DataFrame({'y_true': y_true, 'y_pred_bucket': buckets})
        .groupby('y_pred_bucket')['y_true']
        .agg(['mean', 'count'])
    )
    # Drop sparsely populated buckets: their mean is too noisy to plot.
    stats = stats[stats['count'] > min_count]
    return stats['mean'].to_numpy(), stats.index.to_numpy()


In [None]:
seed = 1


## Train a gradient-boosting classifier calibrated with isotonic regression.
## `method` must be passed explicitly: CalibratedClassifierCV defaults to 'sigmoid',
## which would make this plot a duplicate of the sigmoid one.
calibrated_rf = CalibratedClassifierCV(
     GradientBoostingClassifier(random_state=seed), method = 'isotonic')
# y_train is a single-column DataFrame; pass column 0 so the target is 1-D,
# matching the y_test[0] usage below.
calibrated_rf.fit(X_train, y_train[0])

## Bucket the calibrated positive-class probabilities on the test data
prob_true_calib, prob_pred_calib  = calibration_data(y_test[0], calibrated_rf.predict_proba(X_test)[:,1])

# "expected" repeats "predicted" so the chart also draws the y = x reference line.
chart_df = pd.DataFrame({
    "actuals": prob_true_calib,
    "predicted": prob_pred_calib,
    "expected": prob_pred_calib
})
fig = px.line(
        data_frame = chart_df,
        markers = True,
        x = "predicted",
        y = ["actuals", "expected"],
        template = "plotly_dark")
fig.update_layout(
        title  = {"text": "Calibration Plot: Isotonic", "y": 0.95, "x": 0.5},
        xaxis_title="Predicted Probability",
        yaxis_title="Actual Probability",
        font = dict(size=15)
)
fig.show(renderer='colab')

In [None]:
## Train a gradient-boosting classifier calibrated with the sigmoid (Platt scaling) method
calibrated_sigmoid = CalibratedClassifierCV(
     GradientBoostingClassifier(random_state=seed), method = 'sigmoid')
# y_train is a single-column DataFrame; pass column 0 so the target is 1-D,
# matching the y_test[0] usage below.
calibrated_sigmoid.fit(X_train, y_train[0])

## Bucket the calibrated positive-class probabilities on the test data
prob_true_calib, prob_pred_calib  = calibration_data(y_test[0], calibrated_sigmoid.predict_proba(X_test)[:,1])

# "expected" repeats "predicted" so the chart also draws the y = x reference line.
chart_df = pd.DataFrame({
    "actuals": prob_true_calib,
    "predicted": prob_pred_calib,
    "expected": prob_pred_calib
})
fig = px.line(
        data_frame = chart_df,
        markers = True,
        x = "predicted",
        y = ["actuals", "expected"],
        template = "plotly_dark")
fig.update_layout(
        title  = {"text": "Calibration Plot : Sigmoid", "y": 0.95, "x": 0.5},
        xaxis_title="Predicted Probability",
        yaxis_title="Actual Probability",
        font = dict(size=15)
)
fig.show(renderer='colab')