# 02. Train Model

ToDo:
    * bin numeric features
    * try other categorical encodings
    * replace the 999 placeholder with the mean or median

Done:
    * new feature pdays == 999 (did not improve the score)
    * test all classifier algorithms

Changelog:
    * separate the transformer from the classification pipeline
    * use GridSearchCV
    * use cross-validation

In [None]:
# Run identifier; it is used as the file name of the generated submission CSV.
VERSION = '180703_v01'

In [None]:
# all imports
import sys
sys.path.insert(0, '../../src/')

import pandas as pd
import os
import transformers
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import make_union
from transformers import (ModelTransformer, DataFrameColumnExtractor, ToDictTransformer)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn import cross_validation as cv
from sklearn import metrics
from sklearn.svm import SVC
from sklearn.preprocessing import PolynomialFeatures
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.lda import LDA
from sklearn.qda import QDA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

#evaluators

### Read data

In [None]:
# Location of the raw competition data and the two CSV files read from it.
DATA_DIR = '../../data/raw/'
TRAIN_DATASET_PATH, TEST_DATASET_PATH = (
    os.path.join(DATA_DIR, csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [None]:
# Load the training and test CSVs (in that order) as UTF-8 encoded frames.
df_train, df_test = (
    pd.read_csv(csv_path, encoding='utf-8')
    for csv_path in (TRAIN_DATASET_PATH, TEST_DATASET_PATH)
)

### Train/Test Data

In [None]:
# Target column first, then the feature frame: everything except the target
# and the row identifier.
y = df_train['y']
X = df_train.drop(labels=['y', 'id'], axis='columns')

In [None]:
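# Disabled experiment: oversample the positive class (y == 1) by appending its
# rows to the training split twice.
# X_train_yes = X_train[y_train == 1]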
# y_train_yes = y_train[y_train == 1]

# X_train = pd.concat([X_train, X_train_yes])
# y_train = pd.concat([y_train, y_train_yes])

# X_train = pd.concat([X_train, X_train_yes])
# y_train = pd.concat([y_train, y_train_yes])

### Define model

In [None]:
# Columns that are dict-vectorized (one-hot encoded) by the transformer.
CATEGORICAL_FEATURES = [
    'job', 'marital', 'education', 'default', 'housing',
    'loan', 'contact', 'poutcome', 'month', 'day_of_week',
]

# Columns fed to the scaler unchanged. 'campaign' and 'previous' are left out
# on purpose: they are listed in TO_APPLY_RECIPROCAL instead.
NUMERIC_FEATURES = [
    'age', 'pdays',
    'emp.var.rate', 'cons.price.idx', 'cons.conf.idx',
    'euribor3m', 'nr.employed',
]

# Per-transformation column lists; the empty ones are placeholders for
# transformations that are currently not applied to any column.
TO_APPLY_LOG = []
TO_APPLY_CUBE_ROOT = []
TO_APPLY_RECIPROCAL = ['campaign', 'previous']
TO_BIN = []

# Derived column lists.
TRAINING_FEATURES = NUMERIC_FEATURES + CATEGORICAL_FEATURES
ALL_COLUMNS = TRAINING_FEATURES + ['id', 'y']

In [None]:
# Feature-engineering pipeline, assembled bottom-up from named branches.

# Numeric columns that are passed through as-is.
raw_numeric = make_pipeline(
    DataFrameColumnExtractor(NUMERIC_FEATURES),
)

# Count-like columns run through sum_1 and then apply_reciprocal
# (presumably 1 / (x + 1) -- confirm against src/transformers).
reciprocal_numeric = make_pipeline(
    DataFrameColumnExtractor(TO_APPLY_RECIPROCAL),
    FunctionTransformer(transformers.sum_1),
    FunctionTransformer(transformers.apply_reciprocal),
)

# Both numeric groups are concatenated and standardized together.
numeric_branch = make_pipeline(
    make_union(raw_numeric, reciprocal_numeric),
    StandardScaler(),
)

# Categorical columns are turned into row dicts and dict-vectorized into a
# dense matrix.
categorical_branch = make_pipeline(
    DataFrameColumnExtractor(CATEGORICAL_FEATURES),
    ToDictTransformer(),
    DictVectorizer(sparse=False),
)

# Disabled branch (kept for reference):
#     make_pipeline(
#         DataFrameColumnExtractor(['pdays']),
#         FunctionTransformer(transformers.extract_not_prev_contacted)
#     ),
all_features = make_union(numeric_branch, categorical_branch)

# Keep the 55 columns with the highest ANOVA F-score.
# Disabled follow-up step: PolynomialFeatures(interaction_only=True)
transformer = make_pipeline(
    make_pipeline(
        all_features,
        SelectKBest(f_classif, k=55),
    )
)

# DataFrameColumnExtractor(to_bin),
# FunctionTransformer(transformers.bin_data, kw_args={'columns': to_bin, 'bins': bins}),

In [None]:
# Members of the ensemble; seeds are fixed wherever the estimator accepts one.
ensemble_members = [
    ('gbc', GradientBoostingClassifier(random_state=42)),
    ('xgb', XGBClassifier(random_state=42)),
    ('lr', LogisticRegression(random_state=42)),
    ('rf', RandomForestClassifier(random_state=42)),
    ('gnb', GaussianNB()),
    ('mlp', MLPClassifier(random_state=42,
                          solver='lbfgs',
                          alpha=1e-5,
                          hidden_layer_sizes=(5, 2))),
]

# Single-step pipeline wrapping a soft-voting classifier over the members.
predictor = Pipeline(steps=[
    ('voting', VotingClassifier(estimators=ensemble_members, voting='soft')),
])

In [None]:
# End-to-end model: feature transformation followed by the voting predictor.
pipeline_steps = [
    ('transformer', transformer),
    ('predictor', predictor),
]
pipeline = Pipeline(steps=pipeline_steps)

### Cross validation

In [None]:
# 5-fold cross-validation of the full pipeline, scored with weighted F1.
scores = cross_val_score(
    estimator=pipeline,
    X=X,
    y=y,
    scoring='f1_weighted',
    cv=5,
)

In [None]:
# Average of the per-fold scores (shown as the cell output).
scores.mean()

In [None]:
# NOTE(review): looks like a mean CV score pasted from an earlier run and kept
# for reference -- confirm which version it belongs to.
0.8869241113147683

### Evaluate model

In [None]:
# Hold out 25% of the labelled data for a one-off evaluation of the pipeline.
# train_test_split is taken from sklearn.model_selection, the module this
# notebook already uses for cross_val_score and GridSearchCV, instead of the
# deprecated sklearn.cross_validation alias `cv`.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=42)

In [None]:
# Fit on the training split, then score the held-out split: hard labels first,
# class probabilities second.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)
y_pred_proba = pipeline.predict_proba(X_test)

In [None]:
# Per-class report followed by three headline numbers for the held-out split.
print(metrics.classification_report(y_test, y_pred))

# (label, scoring function, extra keyword arguments); each metric is computed
# and printed in turn. Only the F1 score uses weighted averaging.
headline_metrics = (
    ("Precision: ", metrics.precision_score, {}),
    ("Recall: ", metrics.recall_score, {}),
    ("F1 Score: ", metrics.f1_score, {'average': 'weighted'}),
)
for label, scorer, extra_kwargs in headline_metrics:
    print(label + str(scorer(y_test, y_pred, **extra_kwargs)))

### HyperParameter Tuning

### Generate submission file

In [None]:
# Features of the competition test set (row identifier removed).
# Note: this rebinds X_test, replacing the held-out evaluation split.
X_test = df_test.drop(labels=['id'], axis='columns')

In [None]:
# Refit on all labelled data, then predict labels for the competition test set.
pred_submission = pipeline.fit(X, y).predict(X_test)

In [None]:
# Row identifiers for the submission come from the test set itself.
# df_submission does not exist yet at this point (it is built from
# ids_submission when the file is saved), so reading the ids from it raised
# NameError on a fresh top-to-bottom run.
ids_submission = df_test['id']

In [None]:
# Write the predictions to <submission dir>/<VERSION>.csv without the index.
SUBMISSION_DATA_DIR = '../../data/submission/'
submission_file_name = VERSION+'.csv'
SUBMISSION_FILE_PATH = os.path.join(SUBMISSION_DATA_DIR, submission_file_name)

submission_columns = {'id':ids_submission, 'y':pred_submission}
df_submission = pd.DataFrame(submission_columns)
df_submission.to_csv(SUBMISSION_FILE_PATH, index=False)

In [None]:
# Final sanity check: print a summary of the submission frame.
df_submission.info()