In [69]:
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [70]:
%run ./common_init.ipynb

In [71]:
%autoreload 2
import numpy as np
from sklearn.impute import SimpleImputer
import pickle

# Load custom code
import kdd98.data_handler as dh
from kdd98.config import Config
from kdd98.transformers import *

from fancyimpute import KNN, IterativeImputer, BiScaler

In [72]:
# Output directory for every figure written by this notebook.
# NOTE(review): `pathlib` and `figure_output` are not defined in the visible cells;
# presumably they are provided by `%run ./common_init.ipynb` - confirm.
IMAGES_PATH = pathlib.Path(figure_output / "complete_analysis")
# Create the directory (and any missing parents); an existing directory is fine.
IMAGES_PATH.mkdir(parents=True, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension=("pdf", "png"), resolution=300):
    """Save the current matplotlib figure under IMAGES_PATH, once per requested format.

    Parameters
    ----------
    fig_id : str
        File name without extension; each file is written as ``<fig_id>.<extension>``.
    tight_layout : bool, default True
        If true, ``plt.tight_layout()`` is called before saving.
    fig_extension : str or iterable of str, default ("pdf", "png")
        One format or several formats to write. A bare string is treated as a
        single format instead of being iterated character by character.
    resolution : int, default 300
        Passed to ``plt.savefig`` as ``dpi``.

    Returns
    -------
    None
    """
    # A lone string such as "png" would otherwise be iterated as "p", "n", "g".
    if isinstance(fig_extension, str):
        fig_extension = (fig_extension,)
    if tight_layout:
        plt.tight_layout()
    # Plain loop: the saves are side effects, so no throwaway list is built.
    for extension in fig_extension:
        plt.savefig(
            pathlib.Path(IMAGES_PATH, fig_id + "." + extension),
            format=extension,
            dpi=resolution,
            transparent=True,
            bbox_inches='tight',
        )

# Loading data

In [73]:
# One data provider per raw file: the learning file first, then the validation file.
learning_provider, test_provider = (
    dh.KDD98DataProvider(file_name)
    for file_name in ("cup98LRN.txt", "cup98VAL.txt")
)

In [14]:
# Pull `all_relevant_data` from each provider; later cells index the results
# with the "data" and "targets" keys.
learning, test = (
    learning_provider.all_relevant_data,
    test_provider.all_relevant_data,
)

Check that we have the same features in both sets

In [7]:
# Symmetric difference: lists every feature that appears in only ONE of the two
# sets. A one-directional difference would miss columns that exist only in the
# test set. An empty set means both sets have exactly the same features.
set(learning["data"].columns) ^ set(test["data"].columns)

set()

Data set dimensions

In [8]:
# Dimensions of the learning feature table.
learning["data"].shape

(95412, 58)

In [21]:
# Dimensions of the test feature table (compare the column count with the learning set).
test["data"].shape

(96367, 58)

# Predictions

In [9]:
from imblearn.over_sampling import BorderlineSMOTE
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVR, SVC
from sklearn.metrics import classification_report
from imblearn.pipeline import Pipeline
from kdd98.transformers import Rescaler
from kdd98.prediction import Kdd98ProfitEstimator

## Learning the binary classifier

In [10]:
# --- MLP branch: rescale -> oversample -> neural network ----------------------
mlp_scaler = Rescaler(transformer="ptrans")
mlp_sampler = BorderlineSMOTE(random_state=Config.get("random_seed"))
mlp_classifier = MLPClassifier(
    hidden_layer_sizes=(50, 10),
    alpha=0.5622,
    learning_rate_init=0.0842,
    early_stopping=True,
    random_state=Config.get("random_seed"),
)

# --- SVC branch: configured here but not wired into the active pipeline -------
svc_scaler = Rescaler(transformer="ptrans")
svc_classifier = SVC(
    kernel="poly",
    degree=3,
    coef0=0.174,
    gamma="auto",
    C=35,
    class_weight="balanced",
    probability=True,
    random_state=Config.get("random_seed"),
)

# Active classifier: the MLP pipeline.
classifier = Pipeline(steps=[
    ("scaler", mlp_scaler),
    ("sampler", mlp_sampler),
    ("classifier", mlp_classifier),
])

# Alternative kept for reference - swap in to use the SVC branch instead:
# classifier = Pipeline(steps=[
#     ("scaler", svc_scaler),
#     ("classifier", svc_classifier),
# ])

# NOTE(review): no kernel is given, so the recorded repr shows kernel='rbf';
# scikit-learn documents `degree` as used only by the 'poly' kernel, so
# degree=12 presumably has no effect here - confirm the intended kernel.
regressor = SVR(C=72, degree=12, gamma=3.91)

In [11]:
# Fit the classification pipeline on the learning features against the TARGET_B column.
classifier.fit(
    learning["data"].values,
    learning["targets"]["TARGET_B"].values,
)

Pipeline(memory=None,
     steps=[('scaler', Rescaler(transformer='ptrans')), ('sampler', BorderlineSMOTE(k_neighbors=5, kind='borderline-1', m_neighbors=10, n_jobs=1,
        random_state=42, sampling_strategy='auto')), ('classifier', MLPClassifier(activation='relu', alpha=0.5622, batch_size='auto', beta_1=0.9,
       beta_2...=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False))])

Assess classifier generalisation error

In [23]:
# Summary statistics of the TARGET_B column in the test targets.
test["targets"]["TARGET_B"].describe()

count    96367.000000
mean         0.050567
std          0.219113
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          1.000000
Name: TARGET_B, dtype: float64

In [24]:
# True labels (cast to int) versus the pipeline's predictions on the test set.
test_labels_true = test["targets"].TARGET_B.astype("int").values
test_labels_pred = classifier.predict(test["data"].values)
print(classification_report(test_labels_true, test_labels_pred))

              precision    recall  f1-score   support

           0       0.96      0.59      0.73     91494
           1       0.07      0.56      0.12      4873

   micro avg       0.59      0.59      0.59     96367
   macro avg       0.51      0.58      0.42     96367
weighted avg       0.92      0.59      0.70     96367



Assess profit estimator performance (classifier and regressor combined)

In [25]:
# Combine the classifier pipeline and the SVR regressor into one profit estimator.
pe = Kdd98ProfitEstimator(classifier=classifier, regressor=regressor)

In [26]:
# Fit the combined estimator on the learning features and the full targets table.
# NOTE(review): `classifier` was already fitted on its own in an earlier cell;
# presumably it is refitted here - confirm against Kdd98ProfitEstimator.fit.
pe.fit(learning["data"], learning["targets"])

Kdd98ProfitEstimator(classifier=Pipeline(memory=None,
     steps=[('scaler', Rescaler(transformer='ptrans')), ('sampler', BorderlineSMOTE(k_neighbors=5, kind='borderline-1', m_neighbors=10, n_jobs=1,
        random_state=42, sampling_strategy='auto')), ('classifier', MLPClassifier(activation='relu', alpha=0.5622, batch_size='auto', beta_1=0.9,
       beta_2...=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False))]),
           regressor=SVR(C=72, cache_size=200, coef0=0.0, degree=12, epsilon=0.1, gamma=3.91,
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False))

In [27]:
# Evaluate on the learning set. The result is indexed as [0] and [1] in the next cell.
# NOTE(review): the targets are passed to predict(); presumably they are used to
# compute the profit figure - confirm against Kdd98ProfitEstimator.predict.
profit_learning = pe.predict(learning["data"], learning["targets"])

In [28]:
# Line 1: sum over the first element of the result; line 2: its second element.
print(sum(profit_learning[0]), profit_learning[1], sep="\n")

1307
34201.369999999995


In [29]:
# Same evaluation on the test set. The result is indexed as [0] and [1] in the next cell.
profit_test = pe.predict(test["data"], test["targets"])

In [30]:
# Line 1: sum over the first element of the result; line 2: its second element.
# NOTE(review): the recorded run prints 96349 here versus 1307 on the learning
# set - presumably worth checking why almost every test example is counted.
print(sum(profit_test[0]), profit_test[1], sep="\n")

96349
10572.320000000014


In [33]:
# Column dtypes and non-null counts of the test feature table.
test["data"].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 96367 entries, 188946 to 123497
Data columns (total 58 columns):
AGE                      96367 non-null float64
PEPSTRFL                 96367 non-null float64
HV1                      96367 non-null float64
HV2                      96367 non-null float64
HVP1                     96367 non-null float64
HVP2                     96367 non-null float64
HVP3                     96367 non-null float64
HVP4                     96367 non-null float64
HVP5                     96367 non-null float64
RP1                      96367 non-null float64
DMA                      96367 non-null float64
IC2                      96367 non-null float64
IC3                      96367 non-null float64
IC4                      96367 non-null float64
IC5                      96367 non-null float64
HHAS3                    96367 non-null float64
EC7                      96367 non-null float64
POBC2                    96367 non-null float64
CARDPROM             

In [31]:
# Sum of (TARGET_D - 0.68) over all test rows with a positive TARGET_D.
# NOTE(review): 0.68 is presumably the per-mailing unit cost - confirm and
# consider moving it to a named constant.
target_d_test = test["targets"].TARGET_D.values
np.sum(target_d_test[target_d_test > 0.0] - 0.68)

72775.99999999999

# Complete Pipeline

In [None]:
# Building blocks for the end-to-end pipeline.
# NOTE(review): MedianImputer and AllRelevantFeatureFilter are not imported by name
# in the visible cells; presumably they come from `from kdd98.transformers import *`.
imputer = MedianImputer()
extractor = AllRelevantFeatureFilter()
# NOTE(review): this reuses the same `classifier` and `regressor` objects already
# passed to `pe`; confirm whether Kdd98ProfitEstimator clones them before fitting.
predictor = Kdd98ProfitEstimator(classifier, regressor)

In [None]:
# Chain imputation, feature extraction and profit estimation into one pipeline.
# Note that `predictions` holds the Pipeline object itself, not predicted values.
predictions = Pipeline([
    ("imputer", imputer),
    ("extractor", extractor),
    ("predictor", predictor)
])

In [None]:
# Fit the end-to-end pipeline on the learning features and the full targets table.
# NOTE(review): `learning["data"]` comes from `all_relevant_data`; presumably it is
# already imputed and filtered - confirm whether raw data should be fed in instead.
predictions.fit(
    learning["data"],
    learning["targets"],
)