In [3]:
# Runs the shared initialisation notebook before anything else in this notebook.
# NOTE(review): later cells use names that are never imported here (pickle, pd,
# log, save_fig, roc_curve, auc, classification_report_imbalanced, X_val, y_val);
# presumably common_init.ipynb provides them - confirm before reordering cells.
%run ./common_init.ipynb



<Figure size 432x288 with 0 Axes>

Setup logging to file: out.log
Figure output directory saved in figure_output at /home/datarian/OneDrive/unine/Master_Thesis/ma-thesis-report/figures
 cwd: /data/home/datarian/git/master-thesis-msc-statistics/code


In [4]:
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pathlib

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler, RobustScaler, PowerTransformer, FunctionTransformer, maxabs_scale

from kdd98.config import Config

In [5]:
# "model_store" is the directory the result-store helpers below read from and
# write their pickle files to; "data_dir" is presumably the dataset location
# (not read in this section - confirm against kdd98.config).
# NOTE(review): both values are absolute, machine-specific paths; consider
# deriving them from an environment variable or a repository-relative path.
Config.set("model_store", "/data/home/datarian/OneDrive/unine/Master_Thesis/ma-thesis-report/models")
Config.set("data_dir", "/data/home/datarian/OneDrive/unine/Master_Thesis/ma-thesis-report/data")

In [7]:
def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=False,
                          title=None):
    """Draw a confusion matrix as an annotated heat map.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.
    classes : list
        Labels to include. Passed to ``confusion_matrix`` as ``labels`` and
        used as tick labels on both axes.
    normalize : bool, default False
        If True, every row is divided by its sum and a colorbar is added.
        Rows that sum to zero (a class without true samples) are shown as
        0.0 instead of NaN.
    title : str, optional
        Axes title; omitted when falsy.

    Returns
    -------
    The matplotlib axes the matrix was drawn on.
    """
    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred, labels=classes)
    if normalize:
        row_sums = cm.sum(axis=1, keepdims=True)
        # Guarded division: a zero row sum would otherwise yield NaN, which
        # also turns cm.max() (and with it the text colour threshold) into NaN.
        cm = np.divide(cm.astype('float'), row_sums,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_sums != 0)

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=Config.get("seq_color_map"))
    if normalize:
        ax.figure.colorbar(im, ax=ax)
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           xticklabels=classes, yticklabels=classes,
           ylabel='True',
           xlabel='Predicted')
    if title:
        ax.set(title=title)

    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Annotate each cell; switch to white text on the darker half of the scale.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="black" if cm[i, j] < thresh else "white")
    fig.tight_layout()
    return ax

In [8]:
def prepare_result_store(file):
    """Load the grid-search result store, creating it on first use.

    Parameters
    ----------
    file : str
        File name of the store inside the configured "model_store" directory.

    Returns
    -------
    dict
        The unpickled store if the file exists. Otherwise a freshly
        initialised mapping (one placeholder entry per model name), which is
        also written to disk before being returned.
    """
    store_path = pathlib.Path(Config.get("model_store"), file)

    if not store_path.is_file():
        # No store yet: seed one placeholder record per supported model.
        gridsearch_results = {}
        for model_name in ["GBM", "RF", "GLMnet", "NNet", "SVM"]:
            gridsearch_results[model_name] = {
                "best_estimator": None,
                "best_score": -1.0,
                "best_params": None,
                "cv_results": None,
            }
        with open(store_path, "wb") as handle:
            pickle.dump(gridsearch_results, handle)
        return gridsearch_results

    with open(store_path, "rb") as handle:
        return pickle.load(handle)

def update_result(model, gridsearch, results_file, ignore_score=False):
    """Store a grid search in the result store if it beats the saved score.

    Parameters
    ----------
    model : str
        One of "GBM", "RF", "GLMnet", "NNet", "SVM".
    gridsearch : object
        Fitted search exposing ``best_score_``, ``best_params_``,
        ``best_estimator_`` and ``cv_results_``.
    results_file : str
        File name of the store inside the configured "model_store" directory.
    ignore_score : bool, default False
        If True, the entry is overwritten even without a score improvement.

    Raises
    ------
    ValueError
        If ``model`` is not one of the supported names.
    """
    if model not in ["GBM", "RF", "GLMnet", "NNet", "SVM"]:
        raise ValueError("Invalid model name")

    store = prepare_result_store(results_file)
    entry = store[model]
    previous_score = entry["best_score"]

    keep_new = gridsearch.best_score_ > previous_score or ignore_score
    if not keep_new:
        log("Best params: {}".format(gridsearch.best_params_))
        log("No improvement over previous search for {}. Not storing results.".format(model))
        return

    log("Storing result. Score change: {}".format(gridsearch.best_score_-previous_score))
    log("Best params: {}".format(gridsearch.best_params_))
    entry["best_estimator"] = gridsearch.best_estimator_
    entry["best_params"] = gridsearch.best_params_
    entry["best_score"] = gridsearch.best_score_
    entry["cv_results"] = pd.DataFrame(gridsearch.cv_results_)

    # Persist the whole store, not just the updated entry.
    target = pathlib.Path(Config.get("model_store"),results_file)
    with open(target, "wb") as handle:
        pickle.dump(store, handle)

In [10]:
def print_cv_results(refit):
    """Print validation and cross-validation summaries for every stored model.

    Reads ``classifiers_refit_<refit>.pkl`` from the configured "model_store"
    directory. For each model that has a stored estimator it prints:

    * the imbalanced classification report of the estimator's predictions on
      the notebook-level validation data (``X_val`` / ``y_val.TARGET_B``),
    * all ``mean_test_*`` columns of the CV results, sorted descending by
      ``mean_test_<refit>`` and rounded to 3 decimals,
    * the best parameters, if present.

    Parameters
    ----------
    refit : str
        Name of the refit metric; selects both the pickle file and the sort
        column.
    """
    results_file = "classifiers_refit_{}.pkl".format(refit)
    # NOTE: unpickling executes arbitrary code - only load stores you created.
    with open(pathlib.Path(Config.get("model_store"), results_file), "rb") as f:
        gridsearch_results = pickle.load(f)

    for m, stored in gridsearch_results.items():
        estimator = stored["best_estimator"]
        # Compare against the None placeholder explicitly; truth-testing an
        # estimator may call its __len__ and is not a reliable "is set" check.
        if estimator is None:
            continue

        print("Model {}".format(m))
        print(classification_report_imbalanced(y_val.TARGET_B.values, estimator.predict(X_val.values)))

        results = stored["cv_results"].sort_values(by='mean_test_{}'.format(refit), ascending=False)
        # DataFrame.iteritems() was removed in pandas 2.0; only the column
        # labels are needed here anyway.
        mean_score_columns = [c for c in results.columns if c.startswith("mean_test_")]
        print("Mean scores")
        print(results[mean_score_columns].round(3))

        if "best_params" in stored:
            print("Best params")
            print(stored["best_params"])
        print("*******************************************************")

In [11]:
def print_confusion_matrices(refit):
    """Plot and save a confusion matrix for every stored model.

    Loads ``classifiers_refit_<refit>.pkl`` from the configured "model_store"
    directory, predicts on the notebook-level validation data (``X_val``) with
    each stored estimator, plots the matrix against ``y_val.TARGET_B`` with
    class order ``[1, 0]`` and saves it through ``save_fig``.

    Parameters
    ----------
    refit : str
        Name of the refit metric; selects the pickle file and is part of the
        saved figure name.
    """
    store_path = pathlib.Path(
        Config.get("model_store"), "classifiers_refit_{}.pkl".format(refit))
    with open(store_path, "rb") as handle:
        gridsearch_results = pickle.load(handle)

    for model_name, entry in gridsearch_results.items():
        # Skip models for which no estimator has been stored.
        if not entry["best_estimator"]:
            continue
        predictions = entry["best_estimator"].predict(X_val.values)
        entry["y_pred"] = predictions
        print("Confusion matrix for model {}".format(model_name))
        plot_confusion_matrix(y_val.TARGET_B.values, predictions, [1,0], title=model_name)
        save_fig("confusion_matrix_model_{}_refit_{}".format(model_name, refit))

In [12]:
def set_size(w,h, ax=None):
    """Resize the figure of ``ax`` from a target width and height in inches.

    The figure size is set to ``w / (right - left)`` by ``h / (top - bottom)``,
    using the figure's subplot parameters, so ``w`` and ``h`` refer to the
    region between the subplot margins rather than to the whole figure.

    Parameters
    ----------
    w, h : number
        Target width and height in inches.
    ax : axes, optional
        Axes whose figure is resized; defaults to the current axes.

    Returns
    -------
    The axes that was used.
    """
    ax = ax or plt.gca()
    figure = ax.figure
    margins = figure.subplotpars
    fig_width = float(w) / (margins.right - margins.left)
    fig_height = float(h) / (margins.top - margins.bottom)
    figure.set_size_inches(fig_width, fig_height)
    return ax

In [13]:
def print_roc_auc_curve(refit, print_title=True):
    """Plot the validation ROC curves of all stored models in one figure.

    Loads ``classifiers_refit_<refit>.pkl`` from the configured "model_store"
    directory, computes a ROC curve on the notebook-level validation data
    (``X_val`` / ``y_val.TARGET_B``) for every model that has a stored
    estimator, and saves the figure through ``save_fig``.

    Parameters
    ----------
    refit : str
        Name of the refit metric; selects the pickle file and appears in the
        legend title and the saved figure name.
    print_title : bool, default True
        Currently unused; kept so existing calls keep working.
    """
    results_file = "classifiers_refit_{}.pkl".format(refit)
    # NOTE: unpickling executes arbitrary code - only load stores you created.
    with open(pathlib.Path(Config.get("model_store"), results_file), "rb") as f:
        gridsearch_results = pickle.load(f)

    def roc_curve_data(estimator):
        # The previous implementation called estimator.score(X) without y,
        # which raises for classifiers and was masked by a broad except.
        # Select the scoring method explicitly instead: class-1 probability
        # when available, otherwise the decision function.
        if hasattr(estimator, "predict_proba"):
            y_score = estimator.predict_proba(X_val.values)[:, 1]
        else:
            y_score = estimator.decision_function(X_val.values)

        fpr, tpr, thresholds = roc_curve(y_val.TARGET_B.values, y_score, pos_label=1)
        return (fpr, tpr, thresholds)

    plt.figure(figsize=(10,10))

    plt.xlabel("False Positive Rate (FPR)")
    plt.ylabel("True Positive Rate (TPR)")
    plt.xlim(0.0,1.0)
    plt.ylim(0.0,1.0)
    # Diagonal reference line.
    plt.plot([0, 1], [0, 1], color='black', lw=1.5, linestyle='--')

    for m, stored in gridsearch_results.items():
        estimator = stored["best_estimator"]
        if estimator is None:
            continue
        fpr, tpr, thresholds = roc_curve_data(estimator)
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, lw=1, label="{}, area = {:.2}".format(m, roc_auc))
    plt.legend(loc="lower right", title="Metric: {}".format(refit))
    set_size(8,8)
    save_fig("roc_auc_compared_refit_{}".format(refit))

In [14]:
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt
from inspect import signature

def print_precision_recall_curve(refit, print_title=True):
    """Plot the validation precision-recall curves of all stored models.

    Loads ``classifiers_refit_<refit>.pkl`` from the configured "model_store"
    directory, computes a precision-recall curve on the notebook-level
    validation data (``X_val`` / ``y_val.TARGET_B``) for every model that has
    a stored estimator, and saves the figure through ``save_fig``.

    Parameters
    ----------
    refit : str
        Name of the refit metric; selects the pickle file and appears in the
        legend title and the saved figure name.
    print_title : bool, default True
        Currently unused; kept so existing calls keep working.
    """
    results_file = "classifiers_refit_{}.pkl".format(refit)
    # NOTE: unpickling executes arbitrary code - only load stores you created.
    with open(pathlib.Path(Config.get("model_store"), results_file), "rb") as f:
        gridsearch_results = pickle.load(f)

    def p_r_data(estimator):
        # The previous implementation called estimator.score(X) without y,
        # which raises for classifiers and was masked by a broad except.
        # Select the scoring method explicitly instead: class-1 probability
        # when available, otherwise the decision function.
        if hasattr(estimator, "predict_proba"):
            y_score = estimator.predict_proba(X_val.values)[:, 1]
        else:
            y_score = estimator.decision_function(X_val.values)
        p, r, _ = precision_recall_curve(y_val.TARGET_B.values, y_score)
        return (p, r)

    plt.figure(figsize=(10,10))

    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.0])
    plt.xlim([0.0, 1.0])

    for m, stored in gridsearch_results.items():
        estimator = stored["best_estimator"]
        if estimator is None:
            continue
        p, r = p_r_data(estimator)
        plt.plot(r, p, lw=1.5, label="{}".format(m))
    plt.legend(loc="upper right", title="Metric: {}".format(refit))
    set_size(8,8)
    save_fig("prec_rec_compared_refit_{}".format(refit))