# 02. Train Model

In [1]:
# Run identifier; it becomes the submission file name (VERSION + '.csv').
VERSION = '180703_v01'

In [2]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings raised by
# the libraries imported below -- consider narrowing it by category.
warnings.filterwarnings('ignore')

In [3]:
# all imports
import os
import sys

# Make the modules in ../../src/ importable. This must run before the
# `transformers` imports below (presumably the project-local module, since it
# exposes DataFrameColumnExtractor & co. -- confirm against ../../src/).
sys.path.insert(0, '../../src/')

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import make_union
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

# Legacy alias kept for backward compatibility: sklearn.cross_validation was
# deprecated in scikit-learn 0.18 and removed in 0.20, so tolerate its absence.
try:
    from sklearn import cross_validation as cv
except ImportError:
    cv = None

import transformers
from transformers import (ModelTransformer, DataFrameColumnExtractor, ToDictTransformer)



### Read data

In [4]:
# Location of the raw CSV files, relative to this notebook.
DATA_DIR = '../../data/raw/'

# Build both dataset paths from the same directory in one pass.
TRAIN_DATASET_PATH, TEST_DATASET_PATH = (
    os.path.join(DATA_DIR, filename) for filename in ('train.csv', 'test.csv')
)

In [5]:
# Read the training and test CSVs with identical parser settings.
df_train, df_test = (
    pd.read_csv(csv_path, encoding='utf-8')
    for csv_path in (TRAIN_DATASET_PATH, TEST_DATASET_PATH)
)

### Features and Target

In [6]:
# Target column first, then every remaining column except the row identifier.
y = df_train.loc[:, 'y']
X = df_train.drop(labels=['y', 'id'], axis='columns')

### Define model

In [7]:
# Columns extracted as categorical inputs by the transformer pipeline.
CATEGORICAL_FEATURES = [
    'job',
    'marital',
    'education',
    'default',
    'housing',
    'loan',
    'contact',
    'poutcome',
    'month',
    'day_of_week'
]

# Columns extracted as numeric inputs and passed straight to the scaler.
NUMERIC_FEATURES = [
    'age',
    'pdays',
    'emp.var.rate',
    'cons.price.idx',
    'cons.conf.idx',
    'euribor3m',
    'nr.employed'
]

# Numeric columns that go through the sum_1 / apply_reciprocal steps of the
# transformer pipeline before scaling.
TO_APPLY_RECIPROCAL = [
    'campaign',
    'previous'
]

# Every column the transformer pipeline actually extracts. The reciprocal
# columns were previously missing from this list even though the model
# consumes them, which made TRAINING_FEATURES / ALL_COLUMNS understate the
# real inputs.
TRAINING_FEATURES = NUMERIC_FEATURES + TO_APPLY_RECIPROCAL + CATEGORICAL_FEATURES
ALL_COLUMNS = TRAINING_FEATURES + ['id', 'y']

In [8]:
# Branch 1: plain numeric columns, selected as-is.
numeric_branch = make_pipeline(
    DataFrameColumnExtractor(NUMERIC_FEATURES),
)

# Branch 2: count-like columns run through two project helpers in sequence.
# NOTE(review): by their names these presumably add 1 and then take 1/x --
# confirm against the local `transformers` module.
reciprocal_branch = make_pipeline(
    DataFrameColumnExtractor(TO_APPLY_RECIPROCAL),
    FunctionTransformer(transformers.sum_1),
    FunctionTransformer(transformers.apply_reciprocal),
)

# Both numeric branches are concatenated and then standardised together.
scaled_numeric_branch = make_pipeline(
    make_union(numeric_branch, reciprocal_branch),
    StandardScaler(),
)

# Categorical columns: rows -> dicts -> dense vectorised matrix.
categorical_branch = make_pipeline(
    DataFrameColumnExtractor(CATEGORICAL_FEATURES),
    ToDictTransformer(),
    DictVectorizer(sparse=False)
)

# Join numeric and categorical blocks, then keep the 55 best-scoring columns
# according to f_classif. The nesting (pipeline inside pipeline) is kept
# exactly as before so parameter paths of the steps do not change.
transformer = make_pipeline(
    make_pipeline(
        make_union(scaled_numeric_branch, categorical_branch),
        SelectKBest(f_classif, k=55),
    )
)

In [9]:
# Base learners for the ensemble; every estimator that is given a seed here
# uses 42 so runs are repeatable.
voting_estimators = [
    ('gbc', GradientBoostingClassifier(random_state=42)),
    ('xgb', XGBClassifier(random_state=42)),
    ('lr', LogisticRegression(random_state=42)),
    ('rf', RandomForestClassifier(random_state=42)),
    ('gnb', GaussianNB()),
    ('mlp', MLPClassifier(random_state=42, solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2))),
]

# Combine the base learners with voting='soft'.
soft_voter = VotingClassifier(estimators=voting_estimators, voting='soft')

# Single-step pipeline so the ensemble can be plugged in as the 'predictor' stage.
predictor = Pipeline([('voting', soft_voter)])

In [10]:
# End-to-end model: feature transformation followed by the voting ensemble.
pipeline = Pipeline(
    steps=[
        ('transformer', transformer),
        ('predictor', predictor),
    ]
)

### Cross validation

In [11]:
# 5-fold cross-validation of the full pipeline, scored with weighted F1.
scores = cross_val_score(
    estimator=pipeline,
    X=X,
    y=y,
    scoring='f1_weighted',
    cv=5,
)

  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


In [12]:
# Average of the per-fold weighted-F1 scores from the cross-validation above.
scores.mean()

0.8869241113147683

### Classification Report

In [13]:
# Hold out 25% of the labelled rows for the classification report below.
# train_test_split is taken from sklearn.model_selection: the legacy
# sklearn.cross_validation module it used to be called through was deprecated
# in scikit-learn 0.18 and removed in 0.20.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [14]:
# Fit on the training split, then score the hold-out split with both hard
# labels and class probabilities (fit returns the fitted pipeline itself).
y_pred = pipeline.fit(X_train, y_train).predict(X_test)
y_pred_proba = pipeline.predict_proba(X_test)

  if diff:


In [15]:
# Per-class breakdown on the hold-out split.
print(metrics.classification_report(y_test, y_pred))

# Headline numbers. Precision and recall use the scorer's default averaging,
# while F1 is explicitly weighted across classes, so the three values are not
# directly comparable with each other.
headline_metrics = [
    ("Precision: ", metrics.precision_score(y_test, y_pred)),
    ("Recall: ", metrics.recall_score(y_test, y_pred)),
    ("F1 Score: ", metrics.f1_score(y_test, y_pred, average='weighted')),
]
for metric_label, metric_value in headline_metrics:
    print(metric_label + str(metric_value))

             precision    recall  f1-score   support

          0       0.92      0.97      0.95      8077
          1       0.61      0.35      0.44       985

avg / total       0.89      0.91      0.89      9062

Precision: 0.6125
Recall: 0.34822335025380713
F1 Score: 0.8933858876924368


### Hyperparameter Tuning

_TODO: no tuning is performed in this version; all estimators use the settings defined above._

### Generate submission file

In [16]:
X_test = df_test.drop(['id'], axis=1)

In [17]:
pipeline.fit(X, y)
pred_submission = pipeline.predict(X_test)

  if diff:


In [18]:
# Row identifiers that accompany each prediction in the submission file.
ids_submission = df_test.loc[:, 'id']

In [19]:
# Save submission file
SUBMISSION_DATA_DIR = '../../data/submission/'
SUBMISSION_FILE_PATH = os.path.join(SUBMISSION_DATA_DIR, VERSION+'.csv')

# to_csv does not create missing directories; make sure the target exists so
# a fresh checkout does not fail here after the full refit has already run.
if not os.path.isdir(SUBMISSION_DATA_DIR):
    os.makedirs(SUBMISSION_DATA_DIR)

# Column order is pinned explicitly rather than left to dict ordering.
df_submission = pd.DataFrame(
    {'id': ids_submission, 'y': pred_submission},
    columns=['id', 'y'],
)
df_submission.to_csv(SUBMISSION_FILE_PATH, index=False)

In [20]:
# Sanity-check the submission frame: row count, column names, dtypes and nulls.
df_submission.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9062 entries, 0 to 9061
Data columns (total 2 columns):
id    9062 non-null int64
y     9062 non-null int64
dtypes: int64(2)
memory usage: 141.7 KB
