# 02. Train Model

In [2]:
# Run identifier; it becomes the file name of the submission CSV written at the
# end of the notebook (VERSION + '.csv').
VERSION = '180612_v1'

In [3]:
# all imports
import os
import sys

# Must run before the `transformers` imports below so that the module under
# ../../src/ is the one that gets resolved.
sys.path.insert(0, '../../src/')

import numpy as np
import pandas as pd
# Legacy alias, kept for existing `cv.` references. The module was deprecated in
# scikit-learn 0.18 and removed in 0.20; drop this line once nothing uses `cv`.
from sklearn import cross_validation as cv
from sklearn import metrics
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import make_union
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler

try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18 has no model_selection module
    from sklearn.cross_validation import train_test_split

import transformers
from transformers import (ModelTransformer, DataFrameColumnExtractor, ToDictTransformer)

#evaluators
#gridcv



### Read data

In [4]:
# Folder holding the raw CSV files, relative to this notebook.
DATA_DIR = '../../data/raw/'

# Full paths of the labelled training set and the unlabelled test set.
TRAIN_DATASET_PATH, TEST_DATASET_PATH = (
    os.path.join(DATA_DIR, file_name)
    for file_name in ('train.csv', 'test.csv')
)

In [5]:
# Load both raw CSVs as UTF-8: df_train from train.csv, df_test from test.csv.
df_train, df_test = (
    pd.read_csv(csv_path, encoding='utf-8')
    for csv_path in (TRAIN_DATASET_PATH, TEST_DATASET_PATH)
)

### Train/Test Data

In [6]:
# Feature matrix: every training column except the target 'y' and the row 'id'.
X = df_train.drop(labels=['y', 'id'], axis=1)

# Target column, left in its original dtype (a cast to bool was tried and is disabled).
y = df_train.loc[:, 'y']

In [7]:
# Hold out 25% of the labelled rows for local evaluation; the fixed seed keeps
# the split reproducible between runs.
# `train_test_split` is imported in the imports cell from sklearn.model_selection
# (the sklearn.cross_validation module was removed in scikit-learn 0.20).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [8]:
# X_test = df_test.drop(['id'], axis=1)

### Train model

In [9]:
# Columns routed through the dict-vectorizer branch of the feature pipeline.
CATEGORICAL_FEATURES = [
    'job', 'marital', 'education', 'default', 'housing',
    'loan', 'contact', 'poutcome', 'month', 'day_of_week',
]

# Columns routed through the scaling branch of the feature pipeline.
NUMERIC_FEATURES = [
    'age', 'campaign', 'pdays', 'previous',
    'emp.var.rate', 'cons.price.idx', 'cons.conf.idx',
    'euribor3m', 'nr.employed',
]

# Per-column transform lists; all are empty and none is referenced by the
# cells visible in this notebook.
TO_APPLY_LOG = []
TO_APPLY_CUBE_ROOT = []
TO_APPLY_RECIPROCAL = []
TO_BIN = []

# Numeric columns first, then categorical ones.
TRAINING_FEATURES = NUMERIC_FEATURES + CATEGORICAL_FEATURES
# Training features plus the identifier and target columns.
ALL_COLUMNS = TRAINING_FEATURES + ['id', 'y']

In [10]:
# Numeric branch: the NUMERIC_FEATURES columns (presumably selected by the
# project-local DataFrameColumnExtractor - confirm in src/transformers) followed
# by standard scaling. The median-imputation step
# (Imputer(strategy="median", axis=0)) is currently disabled.
numeric_branch = make_pipeline(
    make_union(
        make_pipeline(
            DataFrameColumnExtractor(NUMERIC_FEATURES),
        )
    ),
    StandardScaler(),
)

# Categorical branch: the CATEGORICAL_FEATURES columns are passed through the
# project-local ToDictTransformer and then expanded by a dense DictVectorizer.
categorical_branch = make_pipeline(
    DataFrameColumnExtractor(CATEGORICAL_FEATURES),
    ToDictTransformer(),
    DictVectorizer(sparse=False),
)

# Both branches are combined by one union, wrapped in a single-step pipeline.
transformer_pipeline = make_pipeline(
    make_union(numeric_branch, categorical_branch)
)

In [11]:
# Final estimator: gradient boosting with depth-4 trees and a fixed seed.
# A disabled variant put a union of ModelTransformer(LogisticRegression(random_state=42))
# and FunctionTransformer(transformers.all_columns) in front of the classifier.
gradient_boosting = GradientBoostingClassifier(max_depth=4, random_state=42)
predictor_pipeline = make_pipeline(gradient_boosting)

In [12]:
# End-to-end model: feature preparation first, then the classifier.
pipeline = Pipeline(
    steps=[
        ('features', transformer_pipeline),
        ('predictor', predictor_pipeline),
    ]
)

In [13]:
# Fit on the 75% training split only; the held-out 25% is used for evaluation.
pipeline.fit(X_train, y_train)

Pipeline(steps=[('features', Pipeline(steps=[('featureunion', FeatureUnion(n_jobs=1,
       transformer_list=[('pipeline-1', Pipeline(steps=[('featureunion', FeatureUnion(n_jobs=1,
       transformer_list=[('pipeline', Pipeline(steps=[('dataframecolumnextractor', DataFrameColumnExtractor(columns=['age', 'cam...100, presort='auto', random_state=42,
              subsample=1.0, verbose=0, warm_start=False))]))])

### Evaluate model

In [14]:
# Hard class predictions for the hold-out rows; these feed the metrics below.
y_pred = pipeline.predict(X_test)
# Class probabilities for the same rows (not referenced by any other visible cell).
y_pred_proba = pipeline.predict_proba(X_test)

In [15]:
# Per-class precision / recall / F1 table for the hold-out split.
print(metrics.classification_report(y_test, y_pred))

# Headline scores, printed one per line.
for label, scorer in (
    ("Precision: ", metrics.precision_score),
    ("Recall: ", metrics.recall_score),
    ("F1 Score: ", metrics.f1_score),
):
    print(label + str(scorer(y_test, y_pred)))

             precision    recall  f1-score   support

          0       0.92      0.98      0.95      8077
          1       0.67      0.26      0.37       985

avg / total       0.89      0.91      0.89      9062

Precision: 0.6737967914438503
Recall: 0.25583756345177666
F1 Score: 0.3708609271523179


### Generate submission file

In [16]:
# Same values as the path constants defined under "Read data".
DATA_DIR = '../../data/raw/'
TRAIN_DATASET_PATH, TEST_DATASET_PATH = (
    os.path.join(DATA_DIR, file_name)
    for file_name in ('train.csv', 'test.csv')
)

In [17]:
# Reload both raw CSVs as UTF-8 (same files as read under "Read data").
df_train, df_test = (
    pd.read_csv(csv_path, encoding='utf-8')
    for csv_path in (TRAIN_DATASET_PATH, TEST_DATASET_PATH)
)

In [18]:
# All labelled rows: features without the target 'y' and the row 'id', plus the target.
X = df_train.drop(labels=['y', 'id'], axis=1)
y = df_train.loc[:, 'y']

# Features of the unlabelled submission rows.
# NOTE(review): this rebinds X_test, which earlier held the hold-out split; re-running
# the evaluation cells after this one would pair these rows with the old y_test.
X_test = df_test.drop(labels=['id'], axis=1)

In [19]:
# Refit the same pipeline object on all labelled rows; this replaces the fit
# obtained on the training split during evaluation.
pipeline.fit(X, y)

Pipeline(steps=[('features', Pipeline(steps=[('featureunion', FeatureUnion(n_jobs=1,
       transformer_list=[('pipeline-1', Pipeline(steps=[('featureunion', FeatureUnion(n_jobs=1,
       transformer_list=[('pipeline', Pipeline(steps=[('dataframecolumnextractor', DataFrameColumnExtractor(columns=['age', 'cam...100, presort='auto', random_state=42,
              subsample=1.0, verbose=0, warm_start=False))]))])

In [20]:
# Hard class predictions for the rows of test.csv (X_test now holds the submission features).
pred_submission = pipeline.predict(X_test)

In [21]:
# Row identifiers from test.csv, paired with the predictions in the submission frame.
ids_submission = df_test['id']

In [27]:
# Save submission file
SUBMISSION_DATA_DIR = '../../data/submission/'
SUBMISSION_FILE_PATH = os.path.join(SUBMISSION_DATA_DIR, VERSION+'.csv')

# to_csv does not create missing folders, so make sure the target directory
# exists; exist_ok=True makes this a no-op on re-runs.
os.makedirs(SUBMISSION_DATA_DIR, exist_ok=True)

# One row per test id with its predicted class; the column order is pinned
# explicitly so it does not depend on dict ordering.
df_submission = pd.DataFrame(
    {'id': ids_submission, 'y': pred_submission},
    columns=['id', 'y'],
)
df_submission.to_csv(SUBMISSION_FILE_PATH, index=False)

In [28]:
# Sanity check of the submission frame: row count, columns, null counts and dtypes.
df_submission.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9062 entries, 0 to 9061
Data columns (total 2 columns):
id    9062 non-null int64
y     9062 non-null int64
dtypes: int64(2)
memory usage: 141.7 KB
