In [60]:
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [61]:
%run ./common_init.ipynb

In [62]:
%run ./learning_init.ipynb

In [63]:
%autoreload 2
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

# Load custom code
import kdd98.data_handler as dh
from kdd98.config import Config
import pickle

In [64]:
# Where to save the figures
PROJECT_ROOT_DIR = "../../"
CHAPTER_ID = "random_forest"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "figures", CHAPTER_ID)

# exist_ok=True creates the directory (including parents) only when it is
# missing, without the check-then-create race of a separate os.path.exists().
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension=("pdf", "png"), resolution=300):
    """Save the current matplotlib figure into IMAGES_PATH, once per format.

    Parameters
    ----------
    fig_id : str
        File name without extension; the extension is appended per format.
    tight_layout : bool
        When true, plt.tight_layout() is called before saving.
    fig_extension : str or iterable of str
        Output format(s). Each one is used both as the file extension and as
        the ``format`` argument of plt.savefig. A single string such as
        "png" is accepted and treated as one format.
    resolution : int
        Passed to plt.savefig as ``dpi``.

    Returns
    -------
    None
    """
    if tight_layout:
        plt.tight_layout()

    # A bare string would otherwise be iterated character by character.
    if isinstance(fig_extension, str):
        extensions = [fig_extension]
    else:
        extensions = fig_extension

    # Plain loop: the calls are made for their side effect only.
    for extension in extensions:
        plt.savefig(pathlib.Path(IMAGES_PATH, fig_id + "." + extension),
                    format=extension,
                    dpi=resolution,
                    transparent=True,
                    bbox_inches='tight')

# Loading Data

In [65]:
def _load_pickle(file_name):
    """Unpickle and return one object stored under the Config "df_store" directory."""
    # NOTE: pickle.load can execute arbitrary code contained in the file; only
    # load artifacts that were written by this project's own pipeline.
    with open(pathlib.Path(Config.get("df_store"), file_name), "rb") as f:
        return pickle.load(f)


kdd98_learn_feat = _load_pickle("X_train.pd.pkl")
kdd98_learn_targets = _load_pickle("y_train.pd.pkl")
kdd98_test_feat = _load_pickle("X_test.pd.pkl")
kdd98_test_targets = _load_pickle("y_test.pd.pkl")
kdd98_learn_feat_all_relevant = _load_pickle("X_train_all_relevant.pd.pkl")
kdd98_test_feat_all_relevant = _load_pickle("X_test_all_relevant.pd.pkl")

# Extracting the data as plain arrays. The TARGET_B column is cast to int64;
# its label values are left unchanged (no relabelling happens here).
X_train = kdd98_learn_feat.values
X_train_all_relevant = kdd98_learn_feat_all_relevant.values
y_train = kdd98_learn_targets.loc[:,"TARGET_B"].astype("int64").values
X_test = kdd98_test_feat.values
X_test_all_relevant = kdd98_test_feat_all_relevant.values
y_test = kdd98_test_targets.loc[:,"TARGET_B"].astype("int64").values

In [8]:
from imblearn.over_sampling import RandomOverSampler

# Randomly oversample the minority class of the all-relevant training set,
# seeded from the configured random seed.
# NOTE(review): X_resampled / y_resampled are not referenced by any later cell
# visible here (the grid search fits on X_train_all_relevant) - confirm whether
# this cell is still needed.
ovs = RandomOverSampler(
    sampling_strategy="minority",
    random_state=Config.get("random_seed"),
)
X_resampled, y_resampled = ovs.fit_resample(X_train_all_relevant, y_train)

# Setting up the classifier

In [25]:
# Setting up the classifier 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import make_scorer, roc_auc_score, log_loss, confusion_matrix
from sklearn.pipeline import Pipeline

In [68]:
# Hyper-parameter candidates: only the number of trees is varied.
param_grid = {
    "class_weight": ["balanced_subsample"],
    "n_estimators": [500, 800, 1000],
}

# We use the same strategy as for splitting into train / test set to provide cv splits:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=Config.get("random_seed"))

# Use the built-in "roc_auc" scorer: it evaluates ROC AUC on the classifier's
# continuous scores (predict_proba / decision_function). A plain
# make_scorer(roc_auc_score) feeds the hard labels from predict() into the
# metric, which collapses the ROC curve to a single operating point.
# The "roc" key is kept so the cv_results_ column names stay the same.
scoring = {"roc": "roc_auc",
           "neg_log_loss": "neg_log_loss"}

# Base estimator; the grid above fills in class_weight and n_estimators.
rf_classif = RandomForestClassifier(max_depth=8,
                                    oob_score=True,
                                    random_state=Config.get("random_seed"))

In [69]:
# Exhaustive search over param_grid, scored with every metric in `scoring`.
# refit="neg_log_loss" selects the candidate with the best neg_log_loss and
# refits it, which is what best_estimator_ / predict use afterwards.
# NOTE(review): pre_dispatch=2 is lower than n_jobs=4 - presumably to cap
# memory use, but it may leave workers idle; confirm this is intended.
grid_search = GridSearchCV(
    estimator=rf_classif,
    param_grid=param_grid,
    scoring=scoring,
    refit="neg_log_loss",
    cv=cv,
    n_jobs=4,
    pre_dispatch=2,
    return_train_score=True,
    verbose=3,
)

# Fitting

In [None]:
# Run the cross-validated grid search on the all-relevant training features
# (the original, non-resampled training data).
grid_search.fit(X_train_all_relevant, y_train)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


In [None]:
import pickle
import pathlib

# Persist the whole fitted search object under the configured model store.
gridsearch_path = pathlib.Path(Config.get("model_store"), "random_forest_gridsearch.pkl")
with open(gridsearch_path, "wb") as f:
    pickle.dump(grid_search, f)

In [None]:
# Cross-validation results of the fitted search, kept for inspection.
results = grid_search.cv_results_

In [None]:
# Estimator refit by the search for the candidate selected via refit="neg_log_loss".
best_estimator = grid_search.best_estimator_

In [None]:
# Display the full parameter set of the selected estimator.
best_estimator.get_params()

In [None]:
# Display the column names of the all-relevant training feature frame.
kdd98_learn_feat_all_relevant.columns.values

In [None]:
# Feature importances of the selected forest, labelled with the column names
# of the all-relevant training frame.
gs_features = best_estimator.feature_importances_
gs_important_features = pd.DataFrame(
    data=gs_features,
    index=kdd98_learn_feat_all_relevant.columns.values,
    columns=["feature_importance"],
)

In [None]:
# Order the features from most to least important.
gs_important_features = gs_important_features.sort_values(
    by="feature_importance",
    ascending=False,
)

In [None]:
# Changes the global default figure size, so it also applies to figures drawn
# by later cells.
plt.rcParams['figure.figsize'] = (18, 10)

# First 40 rows of the importance frame, re-sorted ascending so that the
# largest bar is drawn last (at the top of the horizontal bar chart).
(
    gs_important_features
    .head(n=40)
    .sort_values(by="feature_importance", ascending=True)
    .plot.barh()
)

# Predictions

In [None]:
# Class predictions of the fitted search for the all-relevant test features.
y_predict = grid_search.predict(X_test_all_relevant)

In [None]:
from sklearn.metrics import confusion_matrix

# labels=[1, 0] orders rows and columns with class 1 first, then class 0.
confusion_matrix(y_true=y_test, y_pred=y_predict, labels=[1, 0])

In [None]:
# Plot the normalized confusion matrix with class 1 listed first.
# NOTE(review): plot_confusion_matrix is not defined in the cells visible here -
# presumably provided by one of the %run init notebooks; confirm.
plot_confusion_matrix(y_test,y_predict, [1,0], normalize=True, cmap=Config.get("color_map"))

In [None]:
from sklearn.metrics import roc_curve, auc

# Scores for the ROC curve: second predict_proba column.
# NOTE(review): assumes column 1 corresponds to label 1 - confirm via classes_.
positive_class_scores = grid_search.predict_proba(X_test_all_relevant)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, positive_class_scores, pos_label=1)
roc_auc = auc(fpr, tpr)

plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
# Dashed diagonal as the reference line, then the classifier's ROC curve.
plt.plot([0, 1], [0, 1], color='navy', lw=1, linestyle='--')
plt.plot(fpr, tpr, lw=1, label='ROC curve (area = %0.2f)' % roc_auc)
plt.legend(loc="lower right")
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.0)