In [5]:
import pathlib
import pickle

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, RobustScaler, PowerTransformer, FunctionTransformer, maxabs_scale

from kdd98.config import Config

In [3]:
def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=False,
                          title=None):
    """
    Plot the confusion matrix of a set of predictions as an annotated heatmap.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.
    classes : sequence
        Labels to include, in display order. Passed to ``confusion_matrix``
        as ``labels`` and used for the tick labels of both axes.
    normalize : bool, default False
        If True, every row is divided by its sum so that cells show the
        fraction of each true class. A row whose sum is zero (a class in
        ``classes`` without any true sample) is shown as zeros instead of NaN.
    title : str, optional
        Figure title. When empty, a default depending on ``normalize`` is used.

    Returns
    -------
    The matplotlib Axes the matrix was drawn on.
    """
    if not title:
        title = ('Normalized confusion matrix' if normalize
                 else 'Confusion matrix, without normalization')

    # Rows are true labels, columns are predicted labels, ordered as `classes`.
    cm = confusion_matrix(y_true, y_pred, labels=classes)
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Divide only where the row total is non-zero; the pre-filled zeros
        # stay in place for empty rows, avoiding 0/0 = NaN cells (which would
        # also turn the colour threshold below into NaN).
        cm = np.divide(cm, row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=Config.get("seq_color_map"))
    ax.figure.colorbar(im, ax=ax)
    # Show one tick per class and label the ticks with the class names.
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the x tick labels so that long class names do not overlap.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Annotate every cell; cells at or above half the maximum get white text
    # so the number stays readable on the darker end of the colour map.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2. if cm.size else 0.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="black" if cm[i, j] < thresh else "white")
    fig.tight_layout()
    return ax

In [1]:
def prepare_result_store(file):
    """
    Load the grid-search result store, creating it on first use.

    The store is a pickled dict kept in the directory returned by
    ``Config.get("model_store")``. It maps a model name to a dict with the
    keys ``"best_estimator"``, ``"best_score"`` and ``"cv_results"``.

    Parameters
    ----------
    file : str or path-like
        File name of the store, relative to the model store directory.

    Returns
    -------
    dict
        The stored results if the file exists. Otherwise a fresh store with
        one empty entry per model (best score ``-1.0``), which is also
        written to disk before being returned.
    """
    result_store = pathlib.Path(Config.get("model_store"), file)

    if result_store.is_file():
        # NOTE(security): unpickling can execute arbitrary code. Only point
        # this at store files written by this project.
        with open(result_store, "rb") as f:
            return pickle.load(f)

    gridsearch_results = {
        m: {
            "best_estimator": None,
            "best_score": -1.0,
            "cv_results": None
        } for m in ["GBM", "RF", "GLMnet", "NNet", "SVM"]
    }
    # The model store directory may not exist yet on a fresh checkout;
    # create it so that the first write does not fail.
    result_store.parent.mkdir(parents=True, exist_ok=True)
    with open(result_store, "wb") as f:
        pickle.dump(gridsearch_results, f)
    return gridsearch_results

def update_result(model, gridsearch, results_file):
    """
    Persist a finished search if it beats the best score stored for its model.

    Parameters
    ----------
    model : str
        One of "GBM", "RF", "GLMnet", "NNet" or "SVM", i.e. the names for
        which ``prepare_result_store`` creates an entry.
    gridsearch : fitted search object
        Must expose ``best_score_``, ``best_estimator_`` and ``cv_results_``.
    results_file : str
        File name of the result store inside ``Config.get("model_store")``.

    Raises
    ------
    ValueError
        If ``model`` is not one of the known model names.

    Notes
    -----
    On improvement, the whole search object is pickled to
    ``gridsearch_<model>.pkl`` and the updated store is written back to
    ``results_file``. Otherwise nothing is written.
    ``log`` is not defined in the visible part of this notebook; presumably
    it is provided by the environment that runs it (TODO confirm).
    """
    # "SVM" is included because the result store holds an entry for it;
    # rejecting it here would discard a finished SVM search.
    if model not in ("GBM", "RF", "GLMnet", "NNet", "SVM"):
        raise ValueError("Invalid model name")
    gridsearch_results = prepare_result_store(results_file)

    previous_score = gridsearch_results[model]["best_score"]
    if gridsearch.best_score_ > previous_score:
        log("Storing improved result. Improvement: {}".format(gridsearch.best_score_-previous_score))
        gridsearch_results[model]["best_estimator"] = gridsearch.best_estimator_
        gridsearch_results[model]["best_score"] = gridsearch.best_score_
        gridsearch_results[model]["cv_results"] = pd.DataFrame(gridsearch.cv_results_)

        # Keep the complete search object next to the summary store.
        file_name = "gridsearch_{}.pkl".format(model)
        with open(pathlib.Path(Config.get("model_store"), file_name), "wb") as f:
            pickle.dump(gridsearch, f)
        with open(pathlib.Path(Config.get("model_store"), results_file), "wb") as f:
            pickle.dump(gridsearch_results, f)
    else:
        log("Best params: {}".format(gridsearch.best_estimator_))
        log("No improvement over previous search for {}. Not storing results.".format(model))
        
def run_experiments(X_train, y_train, config, scoring, cv, refit, results_file,
                    random_state=None):
    """
    Run a randomized hyper-parameter search for every enabled model.

    Parameters
    ----------
    X_train, y_train
        Training data handed to ``RandomizedSearchCV.fit``.
    config : dict
        Maps a model name to a dict with the keys ``"run"`` (bool),
        ``"pipeline"`` (estimator), ``"param_grid"`` (parameter
        distributions) and optionally ``"fit_params"`` (extra keyword
        arguments for ``fit``; may be missing, None or empty).
    scoring
        Passed through to ``RandomizedSearchCV``.
    cv
        Cross-validation strategy, passed through to ``RandomizedSearchCV``.
    refit
        Passed through to ``RandomizedSearchCV``; also used to build the
        default results file name.
    results_file : str or None
        Name of the result store. When empty,
        ``"gridsearch_results_dict_refit_<refit>.pkl"`` is used.
    random_state : optional
        Seed for the parameter sampling, passed through to
        ``RandomizedSearchCV``. The default None leaves the search unseeded.

    Notes
    -----
    If fitting one model raises, the error is logged and the loop stops, so
    later models are not run (NOTE(review): ``continue`` may have been
    intended — confirm before changing).
    ``log`` is not defined in the visible part of this notebook; presumably
    it is provided by the environment that runs it (TODO confirm).
    """
    if not results_file:
        results_file = "gridsearch_results_dict_refit_{}.pkl".format(refit)

    for m in config:
        if not config[m]["run"]:
            log("Skipping {}".format(m))
            continue

        params = config[m]["param_grid"]
        pipe = config[m]["pipeline"]
        # A missing, None or empty entry all mean "no extra fit arguments".
        fit_params = config[m].get("fit_params") or {}

        log("Starting gridsearch for {}".format(m))
        gridsearch = RandomizedSearchCV(
            pipe,
            params,
            scoring=scoring,
            n_jobs=-1,
            cv=cv,  # honour the caller's choice instead of a fixed 10 folds
            pre_dispatch=16, # Limit dispatching to prevent memory overflow
            refit=refit,
            return_train_score=True,
            random_state=random_state,
            verbose=10)
        try:
            gridsearch.fit(X_train, y_train, **fit_params)
        except Exception as e:
            log("Fitting failed for {}. Message: {}".format(m, e))
            break
        update_result(m, gridsearch, results_file)

In [None]:
# Tell whoever executes this notebook which helper functions it defines.
print("Set plot_confusion_matrix(), prepare_result_store(), "
      "update_result() and run_experiments()")