In [1]:
import os
import pathlib
import pickle

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, RobustScaler, PowerTransformer, FunctionTransformer, maxabs_scale

from kdd98.config import Config

In [6]:
# Project location on disk. Export KDD98_PROJECT_DIR to point the notebook at
# another checkout; the fallback is the path that was previously hard-coded,
# so the resulting settings are unchanged when the variable is not set.
project_dir = pathlib.Path(os.environ.get(
    "KDD98_PROJECT_DIR",
    "/data/home/datarian/OneDrive/unine/Master_Thesis/ma-thesis-report"))
Config.set("model_store", str(project_dir / "models"))
Config.set("data_dir", str(project_dir / "data"))

In [7]:
# Display the configured model store location as the cell output.
Config.get("model_store")

'/data/home/datarian/OneDrive/unine/Master_Thesis/ma-thesis-report/models'

In [2]:
def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=False,
                          title=None):
    """Draw a confusion matrix as an annotated heatmap.

    Parameters
    ----------
    y_true : array-like
        True labels, forwarded to ``sklearn.metrics.confusion_matrix``.
    y_pred : array-like
        Predicted labels, forwarded to ``sklearn.metrics.confusion_matrix``.
    classes : sequence
        Labels to include. Used both as the ``labels`` argument of
        ``confusion_matrix`` and as the tick labels of both axes.
    normalize : bool, default False
        If True, every row is divided by its row sum, the cells are printed
        with two decimals and a colorbar is added. If False, the raw counts
        are shown and no colorbar is drawn.
    title : str, optional
        Axes title; omitted when falsy.

    Returns
    -------
    matplotlib.axes.Axes
        The axes the matrix was drawn on.
    """
    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred, labels=classes)
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # A label in `classes` without any true sample has a row total of 0.
        # Dividing by it would fill the row (and the text-colour threshold
        # computed below) with NaN, so such rows are left at 0 instead.
        cm = np.divide(cm, row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=Config.get("seq_color_map"))
    if normalize:
        ax.figure.colorbar(im, ax=ax)
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           xticklabels=classes, yticklabels=classes,
           ylabel='True',
           xlabel='Predicted')
    if title:
        ax.set(title=title)

    # Rotate the x tick labels around their right end.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Annotate every cell; cells at or above half the maximum value get white
    # text, all others black text.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="black" if cm[i, j] < thresh else "white")
    fig.tight_layout()
    return ax

In [4]:
def prepare_result_store(file):
    """Load the grid-search result store, creating it if it does not exist.

    Parameters
    ----------
    file : str
        File name of the store, resolved relative to ``Config.get("model_store")``.

    Returns
    -------
    dict
        The unpickled store if the file exists. Otherwise a fresh dict with
        one entry per model name ("GBM", "RF", "GLMnet", "NNet", "SVM"), each
        holding the keys "best_estimator", "best_score" (initialised to -1.0),
        "best_params" and "cv_results". The fresh dict is also pickled to disk.
    """
    result_store = pathlib.Path(
        Config.get("model_store"), file)
    if result_store.is_file():
        # SECURITY: unpickling executes arbitrary code. Only load stores that
        # were written by this project.
        with open(result_store, "rb") as f:
            gridsearch_results = pickle.load(f)
    else:
        gridsearch_results = {m: {
                "best_estimator": None,
                "best_score": -1.0,
                "best_params": None,
                "cv_results": None
            } for m in ["GBM", "RF", "GLMnet", "NNet", "SVM"]}
        # The model store directory may not exist yet on a fresh setup;
        # without it the open() below raises FileNotFoundError.
        result_store.parent.mkdir(parents=True, exist_ok=True)
        with open(result_store, "wb") as f:
            pickle.dump(gridsearch_results, f)
    return gridsearch_results

def update_result(model, gridsearch, results_file, ignore_score=False):
    """Store a finished search in the result store if it improved the score.

    Parameters
    ----------
    model : str
        One of "GBM", "RF", "GLMnet", "NNet", "SVM".
    gridsearch : fitted search object
        Must expose ``best_score_``, ``best_params_``, ``best_estimator_``
        and ``cv_results_``.
    results_file : str
        File name of the store inside ``Config.get("model_store")``.
    ignore_score : bool, default False
        If True, the stored entry is overwritten even when the new score is
        not better than the stored one.

    Raises
    ------
    ValueError
        If ``model`` is not one of the known model names.
    """
    if model not in ["GBM", "RF", "GLMnet", "NNet", "SVM"]:
        raise ValueError("Invalid model name")
    gridsearch_results = prepare_result_store(results_file)

    previous_score = gridsearch_results[model]["best_score"]
    if gridsearch.best_score_ > previous_score or ignore_score:
        log("Storing result. Score change: {}".format(gridsearch.best_score_-previous_score))
        log("Best params: {}".format(gridsearch.best_params_))
        gridsearch_results[model]["best_estimator"] = gridsearch.best_estimator_
        gridsearch_results[model]["best_params"] = gridsearch.best_params_
        gridsearch_results[model]["best_score"] = gridsearch.best_score_
        gridsearch_results[model]["cv_results"] = pd.DataFrame(gridsearch.cv_results_)

        store_path = pathlib.Path(Config.get("model_store"), results_file)
        # Opening the store itself with "wb" truncates it before pickling
        # starts, so a failure while pickling would wipe the results of every
        # model. Write to a sibling temp file and swap it in only on success.
        tmp_path = store_path.with_name(store_path.name + ".tmp")
        try:
            with open(tmp_path, "wb") as f:
                pickle.dump(gridsearch_results, f)
            tmp_path.replace(store_path)
        finally:
            # Only still present if dumping or replacing failed.
            if tmp_path.exists():
                tmp_path.unlink()
    else:
        log("Best params: {}".format(gridsearch.best_params_))
        log("No improvement over previous search for {}. Not storing results.".format(model))

In [9]:
import pickle

# Load the stored results of the searches that were refit on "f1".
# Note: unpickling runs arbitrary code, so only load files this project wrote.
refit_f1_store = pathlib.Path(Config.get("model_store")) / "classifiers_refit_f1.pkl"
with refit_f1_store.open("rb") as f:
    clfs = pickle.load(f)

In [10]:
# Display the entry stored under the "NNet" key of the loaded results.
clfs["NNet"]

{'best_estimator': Pipeline(memory=None,
      steps=[('scaler', PowerTransformer(copy=True, method='yeo-johnson', standardize=True)), ('sampler', BorderlineSMOTE(k_neighbors=5, kind='borderline-1', m_neighbors=10, n_jobs=1,
         random_state=42, sampling_strategy=0.9507610378684456)), ('classifier', MLPClassifier(activation='relu', alpha=0.5...=True, solver='adam', tol=0.0001,
        validation_fraction=0.1, verbose=False, warm_start=False))]),
 'best_score': 0.13082304508831785,
 'best_params': {'classifier__alpha': 0.5,
  'classifier__hidden_layer_sizes': (28, 28),
  'classifier__learning_rate_init': 0.07925436438616416,
  'sampler__sampling_strategy': 0.9507610378684456},
 'cv_results':    mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
 0      74.163042      6.649175         0.425421        0.126196   
 1      78.847192      7.775132         0.325821        0.125901   
 2      80.535002      9.309415         0.381549        0.119747   
 3      80.491182      9

In [6]:
def run_experiments(X_train, y_train, config, scoring, splits, refit):
    """Run a randomized hyper-parameter search for every enabled model.

    Parameters
    ----------
    X_train, y_train
        Training data, passed unchanged to ``RandomizedSearchCV.fit``.
    config : mapping
        Maps a model name to a mapping with the keys "run" (truthy to enable
        the model), "param_grid", "pipeline" and, optionally, "fit_params"
        (keyword arguments for ``fit``; may be missing, None or empty).
    scoring
        Forwarded to ``RandomizedSearchCV(scoring=...)``.
    splits : int
        Number of stratified folds.
    refit
        Forwarded to ``RandomizedSearchCV(refit=...)`` and also used to build
        the result file name ``classifiers_refit_<refit>.pkl``.

    Notes
    -----
    If fitting a model raises, the error is logged and the loop stops, so the
    models that come after it in ``config`` are not run.
    """
    results_file = "classifiers_refit_{}.pkl".format(refit)

    for m in config:
        if not config[m]["run"]:
            log("Skipping {}".format(m))
            continue

        params = config[m]["param_grid"]
        pipe = config[m]["pipeline"]
        fit_params = config[m].get("fit_params") or {}
        log("Starting gridsearch for {}".format(m))
        # No random_state here: without shuffle=True it never influenced the
        # folds, and newer scikit-learn releases reject the combination of a
        # random_state with shuffle=False by raising a ValueError.
        cv = StratifiedKFold(n_splits=splits)
        gridsearch = RandomizedSearchCV(
            pipe,
            params,
            scoring=scoring,
            n_jobs=-1,
            cv=cv,
            pre_dispatch=4, # Limit dispatching to prevent memory overflow
            refit=refit,
            return_train_score=True,
            verbose=10)
        try:
            gridsearch.fit(X_train, y_train, **fit_params)
        except Exception as e:
            log("Fitting failed for {}. Message: {}".format(m, e))
            # NOTE(review): this aborts all remaining models, not only the
            # failing one - confirm that this is intended.
            break
        print("Fitting done. Best score: {}, best parameters: {}".format(gridsearch.best_score_, gridsearch.best_params_))
        update_result(m, gridsearch, results_file)

In [None]:
# Report the names of the helper functions defined by the cells of this notebook.
print("Set plot_confusion_matrix(), prepare_result_store(), update_result() and run_experiments()")