In [None]:
import pandas as pd
import os
import sys
import pickle

In [None]:
# Put the pipelines directory on the import path so that modules inside it can
# be imported by bare name. The path is relative, so it is resolved against the
# kernel's current working directory.
PIPELINES_DIR = "../src/ga4_mlops/pipelines"
if PIPELINES_DIR not in sys.path:  # keep the cell idempotent when it is re-run
    sys.path.append(PIPELINES_DIR)

In [None]:
from data_preparation_utils import extract_column_names

In [None]:
def predict(abt_predict: pd.DataFrame, model) -> pd.DataFrame:
    """Score a data frame with a fitted classifier.

    The columns of ``abt_predict`` are split by ``extract_column_names`` into
    info, numerical and categorical groups. The model is applied to the
    numerical columns followed by the categorical columns, and column 1 of
    its ``predict_proba`` output is returned next to the info columns.

    Args:
        abt_predict (pd.DataFrame): data frame to predict on
        model: any object exposing a ``predict_proba`` method

    Returns:
        pd.DataFrame: the info columns of ``abt_predict`` plus a ``y_score``
            column with the predicted scores
    """
    # logger.info("Applying model to get predictions...")

    info_cols, num_cols, cat_cols, _ = extract_column_names(abt_predict)
    feature_cols = num_cols + cat_cols

    # Only column 1 of the probability matrix is kept as the score.
    probability_matrix = model.predict_proba(abt_predict[feature_cols])
    y_score_values = probability_matrix[:, 1]

    # Build the result from the info columns and attach the scores.
    return abt_predict.loc[:, info_cols].assign(y_score=y_score_values)

In [None]:
# Read abt_test.csv into a data frame; every later cell in this notebook works on it.
# The path is relative, so it depends on the kernel's working directory.
abt_test = pd.read_csv('../data/05_model_input/abt_test.csv')

In [None]:
# Load the pickled model object from disk.
# NOTE: unpickling can execute arbitrary code, so only load artifacts from a trusted source.
with open('../data/06_models/model.pkl', 'rb') as pickle_file:
    model = pickle.load(pickle_file)

In [None]:
# Split the columns of abt_test into the four groups returned by the project helper:
# info, numerical, categorical and target.
info_cols, num_cols, cat_cols, target_col = extract_column_names(abt_test)

In [None]:
# Inspect the model's full predict_proba output on the feature columns
# (numerical columns first, then categorical).
model.predict_proba(abt_test[num_cols + cat_cols])

In [None]:
# Scores of the uncalibrated model: info columns plus a "y_score" column.
raw_preds = predict(abt_test, model)
raw_preds

In [None]:
from sklearn.calibration import CalibratedClassifierCV, calibration_curve

In [None]:
# Wrap the loaded model in an isotonic calibrator. With cv="prefit" scikit-learn
# does not refit `model`; only the calibrator is learned on the data passed to fit().
# NOTE(review): recent scikit-learn releases deprecate cv="prefit" - confirm
# against the installed version.
calibration_model = CalibratedClassifierCV(model, method="isotonic", cv="prefit")

In [None]:
# Learn the calibration mapping from the feature columns and the target column
# of abt_test.
# NOTE(review): the calibrator is fitted on the same frame that the cells below
# use to evaluate it, so the calibrated results are likely optimistic. A separate
# calibration subset would give a fairer comparison.
calibration_model.fit(abt_test[num_cols + cat_cols], abt_test[target_col])

In [None]:
# Scores of the calibrated model, in the same layout as raw_preds
# (info columns plus "y_score").
calibrated_preds = predict(abt_test, calibration_model)
calibrated_preds

In [None]:
import numpy as np

In [None]:
# (min, mean, max) of the uncalibrated scores.
tuple(stat(raw_preds["y_score"]) for stat in (np.min, np.mean, np.max))

In [None]:
# (min, mean, max) of the calibrated scores.
tuple(stat(calibrated_preds["y_score"]) for stat in (np.min, np.mean, np.max))

In [None]:
# Number of bins passed to calibration_curve for both curves.
# Disabled alternative that scales the bin count with the number of rows:
# n_bins = int(max(5, abt_test.shape[0]/100))
n_bins = 25

In [None]:
# Calibration curve of the uncalibrated scores, using quantile bins.
# Per the scikit-learn docs the result is the pair (prob_true, prob_pred).
raw_calibration_curve = calibration_curve(
    abt_test[target_col],
    raw_preds["y_score"],
    n_bins=n_bins,
    strategy="quantile",
)

In [None]:
# Calibration curve of the calibrated scores, using quantile bins.
# Per the scikit-learn docs the result is the pair (prob_true, prob_pred).
calibrated_calibration_curve = calibration_curve(
    abt_test[target_col],
    calibrated_preds["y_score"],
    n_bins=n_bins,
    strategy="quantile",
)

In [None]:
import matplotlib.pyplot as plt
import matplotlib.lines as mlines

# Reliability diagram comparing the raw and the calibrated scores.
# calibration_curve returns (prob_true, prob_pred), so element 1 (mean predicted
# probability) goes on the x-axis and element 0 (observed fraction of positives)
# on the y-axis, matching the axis labels set below.
fig, ax = plt.subplots()
ax.plot(
    raw_calibration_curve[1],
    raw_calibration_curve[0],
    marker='o',
    linewidth=1,
    label='raw',
)
ax.plot(
    calibrated_calibration_curve[1],
    calibrated_calibration_curve[0],
    marker='o',
    linewidth=1,
    label='calibrated',
)

# Diagonal reference line: predicted probability equals observed frequency.
ax.add_line(mlines.Line2D([0, 1], [0, 1], color='black'))
fig.suptitle('Calibration plot (test subset)')
ax.set_xlabel('Predicted probability')
ax.set_ylabel('Fraction of positives')
ax.set_xlim(0, 1)
ax.set_ylim(0, 1)
ax.legend()
plt.show()

In [None]:
# Column 1 of predict_proba for the raw and the calibrated model, computed
# directly rather than through predict().
# NOTE(review): this rebinds `raw_preds`, which earlier held the DataFrame
# returned by predict(). Cells that index raw_preds["y_score"] no longer see
# that DataFrame if they are re-run after this one.
test_features = abt_test[num_cols + cat_cols]
raw_preds = model.predict_proba(test_features)[:, 1]
cal_preds = calibration_model.predict_proba(test_features)[:, 1]

In [None]:
# Inspect the container types of the two score objects computed directly from predict_proba.
type(raw_preds), type(cal_preds)