# 02. Train Model

In [None]:
# Identifier for this training run; the submission cell at the end of the
# notebook embeds it in the name of the generated CSV file.
VERSION = '180612_v2'

In [25]:
# all imports
import sys
sys.path.insert(0, '../../src/')

import pandas as pd
import os
import transformers
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import make_union
from transformers import (ModelTransformer, DataFrameColumnExtractor, ToDictTransformer)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn import cross_validation as cv
from sklearn import metrics

#evaluators
#gridcv

### Read data

In [26]:
# Directory holding the raw CSV files, relative to this notebook.
DATA_DIR = '../../data/raw/'

# Both dataset paths are derived from DATA_DIR in one pass.
TRAIN_DATASET_PATH, TEST_DATASET_PATH = (
    os.path.join(DATA_DIR, csv_name)
    for csv_name in ('train.csv', 'test.csv')
)

In [27]:
# Load the labelled training set and the unlabelled test set, in that order.
df_train, df_test = (
    pd.read_csv(csv_path, encoding='utf-8')
    for csv_path in (TRAIN_DATASET_PATH, TEST_DATASET_PATH)
)

### Train/Test Data

In [28]:
# Feature matrix: every column except the target ('y') and the row id.
X = df_train.drop(labels=['y', 'id'], axis='columns')

# Target vector, kept in its original dtype (no boolean cast is applied).
y = df_train.loc[:, 'y']

In [29]:
# Hold out 25% of the labelled rows for validation; the fixed seed keeps the
# split identical across re-runs.
X_train, X_test, y_train, y_test = cv.train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [30]:
# X_test = df_test.drop(['id'], axis=1)

### Train model

In [31]:
# Columns that the feature pipeline one-hot encodes (via ToDictTransformer +
# DictVectorizer).
CATEGORICAL_FEATURES = [
    'job',
    'marital',
    'education',
    'default',
    'housing',
    'loan',
    'contact',
    'poutcome',
    'month',
    'day_of_week'
]

# Numeric columns that are fed to the scaler without any extra transform.
NUMERIC_FEATURES = [
    'age',
    'pdays',
    'emp.var.rate',
    'cons.price.idx',
    'cons.conf.idx',
    'euribor3m',
    'nr.employed'
]

# Numeric columns reserved for a log transform (none at the moment).
TO_APPLY_LOG = [
]

# Numeric columns reserved for a cube-root transform (none at the moment).
TO_APPLY_CUBE_ROOT = [

]

# Numeric columns that go through the reciprocal branch of the pipeline.
TO_APPLY_RECIPROCAL = [
    'campaign',
    'previous'

]

# Columns reserved for binning (none at the moment).
TO_BIN = [

]

# Every column the model consumes. The original prefix (numeric, then
# categorical) is preserved; the transformed-feature lists are appended so
# that columns such as 'campaign' and 'previous', which the pipeline extracts,
# are no longer missing from this list.
TRAINING_FEATURES = (
    NUMERIC_FEATURES
    + CATEGORICAL_FEATURES
    + TO_APPLY_LOG
    + TO_APPLY_CUBE_ROOT
    + TO_APPLY_RECIPROCAL
    + TO_BIN
)

# Training features plus the row identifier and the target column.
ALL_COLUMNS = TRAINING_FEATURES + ['id', 'y']

In [34]:
# Branch 1: plain numeric columns, selected as-is. No imputation step is
# active in this branch.
numeric_branch = make_pipeline(
    DataFrameColumnExtractor(NUMERIC_FEATURES),
)

# Branch 2: columns listed in TO_APPLY_RECIPROCAL, passed through the project
# helpers sum_1 and then apply_reciprocal (presumably "add one, then take
# 1/x" -- confirm against src/transformers). No imputation step is active.
reciprocal_branch = make_pipeline(
    DataFrameColumnExtractor(TO_APPLY_RECIPROCAL),
    FunctionTransformer(transformers.sum_1),
    FunctionTransformer(transformers.apply_reciprocal),
)

# Both numeric branches are concatenated and standardised together.
scaled_numeric_branch = make_pipeline(
    make_union(numeric_branch, reciprocal_branch),
    StandardScaler(),
)

# Categorical columns are converted to dicts and vectorised into a dense
# array (sparse=False).
categorical_branch = make_pipeline(
    DataFrameColumnExtractor(CATEGORICAL_FEATURES),
    ToDictTransformer(),
    DictVectorizer(sparse=False),
)

# Final feature matrix: scaled numeric block followed by the categorical block.
transformer_pipeline = make_pipeline(
    make_union(scaled_numeric_branch, categorical_branch)
)

In [35]:
# The predictor stage is a single gradient-boosting classifier with a fixed
# seed. An earlier stacking experiment (LogisticRegression wrapped in
# ModelTransformer and unioned with all columns) is not part of this pipeline.
gradient_boosting = GradientBoostingClassifier(max_depth=4, random_state=42)
predictor_pipeline = make_pipeline(gradient_boosting)

In [36]:
# End-to-end model: feature engineering followed by the classifier.
pipeline = Pipeline(steps=[
    ('features', transformer_pipeline),
    ('predictor', predictor_pipeline),
])

In [37]:
# Fit on the 75% training split only, so the held-out rows stay unseen for
# the evaluation below.
pipeline.fit(X_train, y=y_train)

Pipeline(steps=[('features', Pipeline(steps=[('featureunion', FeatureUnion(n_jobs=1,
       transformer_list=[('pipeline-1', Pipeline(steps=[('featureunion', FeatureUnion(n_jobs=1,
       transformer_list=[('pipeline-1', Pipeline(steps=[('dataframecolumnextractor', DataFrameColumnExtractor(columns=['age', 'p...100, presort='auto', random_state=42,
              subsample=1.0, verbose=0, warm_start=False))]))])

### Evaluate model

In [38]:
# Hard class predictions for the held-out split.
y_pred = pipeline.predict(X=X_test)
# Probability estimates for the same rows.
y_pred_proba = pipeline.predict_proba(X=X_test)

In [39]:
# Full per-class breakdown first.
print(metrics.classification_report(y_test, y_pred))

# Headline scores; sep="" keeps the printed text identical to plain string
# concatenation of label and value.
print("Precision: ", metrics.precision_score(y_test, y_pred), sep="")
print("Recall: ", metrics.recall_score(y_test, y_pred), sep="")
print("F1 Score: ", metrics.f1_score(y_test, y_pred, average='binary'), sep="")

             precision    recall  f1-score   support

          0       0.92      0.99      0.95      8077
          1       0.67      0.25      0.37       985

avg / total       0.89      0.91      0.89      9062

Precision: 0.6738544474393531
Recall: 0.25380710659898476
F1 Score: 0.3687315634218289


### Generate submission file

In [40]:
# Same raw-data locations as in the "Read data" section, redefined here so the
# submission section can be run on its own.
DATA_DIR = '../../data/raw/'

TRAIN_DATASET_PATH, TEST_DATASET_PATH = (
    os.path.join(DATA_DIR, csv_name)
    for csv_name in ('train.csv', 'test.csv')
)

In [41]:
# Reload both CSVs from disk so the refit below starts from the raw files.
df_train, df_test = (
    pd.read_csv(csv_path, encoding='utf-8')
    for csv_path in (TRAIN_DATASET_PATH, TEST_DATASET_PATH)
)

In [42]:
# Refit inputs: the complete labelled training set (features and target).
X = df_train.drop(labels=['y', 'id'], axis='columns')
y = df_train.loc[:, 'y']

# NOTE: this rebinds X_test from the validation split to the unlabelled test
# set. Re-running the evaluation cells after this point would pair these rows
# with the old y_test.
X_test = df_test.drop(labels=['id'], axis='columns')

In [43]:
# Refit the same pipeline on all labelled rows before predicting the
# submission set.
pipeline.fit(X, y=y)

KeyboardInterrupt: 

In [None]:
# Class predictions for the unlabelled test set (X_test was rebound to it in
# the submission section above).
pred_submission = pipeline.predict(X=X_test)

In [None]:
# Row identifiers of the test set; paired with the predictions in the
# submission frame.
ids_submission = df_test['id']

In [None]:
# Save submission file
# The file name embeds VERSION, e.g. 'submission_<VERSION>.csv'. The previous
# expression had its quotes misplaced around VERSION and was a syntax error.
SUBMISSION_DATA_DIR = '../../data/'
SUBMISSION_FILE_PATH = os.path.join(SUBMISSION_DATA_DIR, 'submission_' + VERSION + '.csv')

# Two-column frame (id, predicted y), written without the pandas index.
df_submission = pd.DataFrame({'id': ids_submission, 'y': pred_submission})
df_submission.to_csv(SUBMISSION_FILE_PATH, index=False)

In [None]:
# Sanity check on the submission frame: column names, dtypes and non-null counts.
df_submission.info()