In [1]:
%load_ext autoreload

In [2]:
%run ./common_init.ipynb

Setup logging to file: out.log
Figure output directory saved in figure_output at /home/datarian/OneDrive/unine/Master_Thesis/ma-thesis-report/figures


In [3]:
%autoreload 2
import numpy as np
from sklearn.impute import SimpleImputer
import pickle

# Load custom code
import kdd98.data_handler as dh
from kdd98.config import Config

from fancyimpute import KNN, IterativeImputer, BiScaler

Using TensorFlow backend.


In [4]:
# Figures from this notebook go into their own sub-folder of the shared figure
# directory; the folder (and any missing parents) is created up front.
# `pathlib` and `figure_output` are not defined in this cell - presumably they
# are provided by common_init.ipynb (TODO confirm).
IMAGES_PATH = pathlib.Path(figure_output).joinpath('complete_analysis')
IMAGES_PATH.mkdir(parents=True, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension=["pdf", "png"], resolution=300):
    """Save the current matplotlib figure under IMAGES_PATH, once per format.

    Parameters
    ----------
    fig_id : str
        Base file name without extension; each file is written as
        ``IMAGES_PATH / (fig_id + "." + extension)``.
    tight_layout : bool, default True
        Call ``plt.tight_layout()`` before saving.
    fig_extension : str or iterable of str, default ["pdf", "png"]
        Output format(s). A single string such as ``"png"`` is accepted and
        treated as one format.
    resolution : int, default 300
        Passed to ``plt.savefig`` as ``dpi``.

    Returns
    -------
    None
    """
    # A bare string would otherwise be iterated character by character.
    # The default list is only read, never mutated.
    if isinstance(fig_extension, str):
        extensions = [fig_extension]
    else:
        extensions = list(fig_extension)

    if tight_layout:
        plt.tight_layout()

    # Plain loop: savefig is called for its side effect only.
    for extension in extensions:
        plt.savefig(pathlib.Path(IMAGES_PATH, fig_id + "." + extension),
                    format=extension,
                    dpi=resolution,
                    transparent=True,
                    bbox_inches='tight')

# Loading data

In [14]:
# One data provider per raw input file; the learning file is handled first,
# then the second ("VAL") file that serves as the test set in this notebook.
learning_provider, test_provider = (
    dh.KDD98DataProvider(raw_file)
    for raw_file in ("cup98LRN.txt", "cup98VAL.txt")
)

In [15]:
# Pull the numeric representation from each provider. The cells that follow
# index the result with "data" and "targets", so it behaves like a mapping.
learning, test = (
    learning_provider.numeric_data,
    test_provider.numeric_data,
)

# Imputation

In [17]:
# Sanity check: column names present in the learning data but missing from
# the test data. An empty set means the imputer and selector fitted on the
# learning data see no learning-only columns in the test data.
learning_columns = set(learning["data"].columns.values)
learning_columns.difference(test["data"].columns.values)

set()

In [18]:
from sklearn.impute import SimpleImputer

# Median imputation of missing values. The imputer is fitted on the learning
# data only (inside fit_transform) and the learned medians are then applied
# unchanged to the test data, so no test-set statistics enter the model.
# `pd` is not imported in this notebook's visible cells - presumably it is
# provided by common_init.ipynb (TODO confirm).
imp_median = SimpleImputer(missing_values=np.nan, strategy='median')

# NOTE: learning["data"] / test["data"] are overwritten with their imputed
# versions; index and column labels are carried over from the originals.
learning["data"] = pd.DataFrame(data=imp_median.fit_transform(learning["data"]),
                                index=learning["data"].index,
                                columns=learning["data"].columns)
test["data"] = pd.DataFrame(data=imp_median.transform(test["data"]),
                            index=test["data"].index,
                            columns=test["data"].columns)

# Feature Selection

In [22]:
from boruta import BorutaPy
from sklearn.ensemble import RandomForestClassifier

In [23]:
# Boruta feature selection is expensive, so a previously fitted selector is
# reused from the model store whenever one can be loaded.
# NOTE(review): pickle.load can execute arbitrary code from the file - only
# keep trusted files in the model store.
boruta_cache = pathlib.Path(Config.get("model_store"), "feature_selection_boruta.pkl")

try:
    with open(boruta_cache, "rb") as f:
        feat_selector = pickle.load(f)
except Exception as load_error:
    # No usable cache (missing or unreadable file): say so instead of failing
    # silently, then fit from scratch. `Exception` (not a bare except) keeps
    # KeyboardInterrupt / SystemExit from being swallowed.
    print("Could not load cached Boruta selector ({!r}); fitting a new one.".format(load_error))

    # Random forest using all cores, with class weights balanced against the
    # label frequencies.
    rf = RandomForestClassifier(n_jobs=-1,
                                max_depth=6,
                                class_weight='balanced')
    feat_selector = BorutaPy(rf,
                             n_estimators='auto',
                             max_iter=120,
                             perc=100,  # controls how conservatively we select features. Lower means more false positives
                             verbose=2,
                             random_state=Config.get("random_seed"))
    feat_selector.fit(learning["data"].values, learning["targets"].loc[:, "TARGET_B"].values)

    # Persist the fitted selector so the next run can take the fast path above.
    boruta_cache.parent.mkdir(parents=True, exist_ok=True)
    with open(boruta_cache, "wb") as f:
        pickle.dump(feat_selector, f)

In [24]:
# Names of the columns kept by the selector: the selector's boolean support
# mask is applied to the array of column labels.
all_feature_names = np.asarray(learning["data"].columns.values)
extracted_feature_names = all_feature_names[feat_selector.support_]

In [25]:
# Reduce both datasets to the selected features. The selector works on plain
# arrays, so each result is wrapped in a DataFrame again with the selected
# column names and the original row index.
# NOTE: this overwrites the "data" entries, so the cell is not re-runnable
# without first re-running the cells that build learning/test.
for dataset in (learning, test):
    selected_values = feat_selector.transform(dataset["data"].values)
    dataset["data"] = pd.DataFrame(selected_values,
                                   columns=extracted_feature_names,
                                   index=dataset["data"].index)
    dataset["feature_names"] = extracted_feature_names

# Predictions

In [29]:
from imblearn.over_sampling import BorderlineSMOTE
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVR
from sklearn.metrics import classification_report
from imblearn.pipeline import Pipeline
from kdd98.transformers import Rescaler
from kdd98.prediction import Kdd98ProfitEstimator

## Learning binary classifier 

In [30]:
# --- Classification stage -------------------------------------------------
# Pipeline order: rescale the features, oversample with BorderlineSMOTE,
# then fit the MLP.
mlp_scaler = Rescaler(transformer="ptrans")
mlp_sampler = BorderlineSMOTE(random_state=Config.get("random_seed"))
mlp_classifier = MLPClassifier(hidden_layer_sizes=(50, 10),
                               alpha=0.5622,
                               learning_rate_init=0.0842,
                               early_stopping=True,
                               random_state=Config.get("random_seed"))

classifier = Pipeline(steps=[("scaler", mlp_scaler),
                             ("sampler", mlp_sampler),
                             ("classifier", mlp_classifier)])

# --- Regression stage -----------------------------------------------------
# NOTE(review): `degree` is only used by SVR's polynomial kernel; no kernel is
# given here, so with the default kernel it presumably has no effect - confirm.
regressor = SVR(C=72, degree=12, gamma=3.91)

Assess classifier generalisation error

In [None]:
# The pipeline must be fitted before it can predict, and nothing earlier in the
# notebook fits `classifier`. Fit it on the learning set (binary target
# TARGET_B) first, then report precision/recall on the test set.
# NOTE(review): plain arrays (.values) are passed to match the predict call;
# confirm that Rescaler accepts arrays in fit as well.
classifier.fit(learning["data"].values, learning["targets"].TARGET_B.values)
print(classification_report(test["targets"].TARGET_B.values,
                            classifier.predict(test["data"].values)))

Assess the profit estimate of the combined classifier and regressor

In [31]:
# Combine the classification pipeline and the SVR regressor into one profit
# estimator (project class from kdd98.prediction).
pe = Kdd98ProfitEstimator(classifier, regressor)

In [32]:
# Fit the profit estimator on the learning set. The feature DataFrame and the
# full targets frame are passed as-is (not .values).
pe.fit(learning["data"], learning["targets"])

In [33]:
# Profit estimate on the learning data itself (in-sample); the features are
# passed as a plain array here.
profit_learning = pe.predict(learning["data"].values)

In [42]:
# Show the profit estimate computed for the learning set.
print(profit_learning)

10939.111421708292


In [43]:
# Profit estimate on the test data, using the estimator fitted on the
# learning set.
profit_test = pe.predict(test["data"].values)

In [44]:
# Show the profit estimate computed for the test set.
print(profit_test)

10981.505897106652


In [40]:
# Reference value for the test set: TARGET_D summed over all rows after
# subtracting 0.68 from each one.
# NOTE(review): 0.68 looks like a fixed per-row cost - confirm its meaning and
# consider naming the constant.
(test["targets"].TARGET_D.values - 0.68).sum()

10560.080000000013