# Model Training

At this stage, a model will be developed by using the processed data. GridSearch requires lots of sources; therefore, this notebook should be run separately on a compute-optimized cluster.

### Outline
    - Load Models from S3
    - Train Benchmark Models (Dummy, Logreg, KNN) and also Selected Models (DecisionTree, RandomForest) with default parameters.
    - Train Selected Models with different hyperparameters by using GridSearchCV with 5 Folds (Avoid Overfitting)
    - Upload models to S3

In [3]:
import pickle
import pandas as pd

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import GridSearchCV

## Load Data

In [4]:
train = pd.read_csv("../data/processed/train.csv", index_col=0)

In [5]:
train.head()

Unnamed: 0,recurrence,target,time,amount,total_trans_amount,total_trans_count,avg_trans_amount,time_after_last_trans,web,mobile,...,age_66_71,age_72_77,age_78_83,age_84_101,income_100k_120k,income_30k_40k,income_40k_50k,income_50k_60k,income_60k_75k,income_75k_100k
1615981b0a114e42bc1b0f65b2ac4c8b-2298d6c36e964ae4a3e7e9706d1fb8c2-0,0.0,0,0.484211,0.028179,0.061282,0.107143,0.021072,0.094737,1,1,...,0,0,0,0,0,0,0,0,1,0
7c04f1c0d5764d878c4301062c245ecd-5a8bc65990b245e5a138643cd4eb9837-1,0.25,1,0.126316,0.013745,0.038207,0.071429,0.017515,0.157895,0,1,...,0,0,0,0,0,0,1,0,0,0
8348766a2f6846f195b63ae4a0ea899b-3f207df678b143eea3cee63160fa8bed-0,0.0,1,0.305263,0.015762,0.026258,0.035714,0.018066,0.4,1,1,...,0,0,0,0,0,0,0,1,0,0
28be7b9a77ee43569a5f7773127fc47b-4d5c57ea9a6940dd891ad53e9dbe8da0-0,0.0,1,0.284211,0.004182,0.004926,0.071429,0.002228,0.0,1,1,...,0,0,0,0,0,0,0,1,0,0
2481f1fcfbcb4b288e5a03af02d95373-3f207df678b143eea3cee63160fa8bed-0,0.0,1,0.378947,0.029986,0.045313,0.035714,0.031194,0.326316,1,1,...,0,0,0,0,0,0,0,0,0,1


In [6]:
train.shape

(44260, 48)

In [7]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 44260 entries, 1615981b0a114e42bc1b0f65b2ac4c8b-2298d6c36e964ae4a3e7e9706d1fb8c2-0 to 48f7bc851d79485aba255b24325f4649-2298d6c36e964ae4a3e7e9706d1fb8c2-0
Data columns (total 48 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   recurrence             44260 non-null  float64
 1   target                 44260 non-null  int64  
 2   time                   44260 non-null  float64
 3   amount                 44260 non-null  float64
 4   total_trans_amount     44260 non-null  float64
 5   total_trans_count      44260 non-null  float64
 6   avg_trans_amount       44260 non-null  float64
 7   time_after_last_trans  44260 non-null  float64
 8   web                    44260 non-null  int64  
 9   mobile                 44260 non-null  int64  
 10  social                 44260 non-null  int64  
 11  was_null_profile       44260 non-null  int64  
 12  oft_bogo               44260 non-nu

In [11]:
feat_cols = [
    'recurrence', 'time', 'amount', 'total_trans_amount', 'total_trans_count', 
    'avg_trans_amount', 'time_after_last_trans', 'web', 'mobile', 'social', 
    'was_null_profile', 'oft_bogo', 'oft_discount', 'dfc_5', 'dfc_10', 'delta_3', 
    'delta_4', 'delta_5', 'delta_7', 'delta_10', 'rw_0', 'rw_2', 'rw_3', 'rw_5', 'rw_10', 
    'member_new', 'member_old', 'gen_F', 'gen_O', 'age_18_23', 'age_24_29', 'age_30_35', 
    'age_36_41', 'age_42_47', 'age_48_53', 'age_53_59', 'age_60_65', 'age_66_71', 'age_72_77', 
    'age_78_83', 'age_84_101', 'income_30k_40k', 'income_40k_50k', 'income_50k_60k', 
    'income_60k_75k', 'income_75k_100k', 'income_100k_120k'
]

target_cols = ["target"]

## Train Models with Default Parameters

In [12]:
X_train, y_train = train[feat_cols].values, train[target_cols].values.ravel()

#### Dummy

In [13]:
dummy = DummyClassifier()
dummy.fit(X_train, y_train)

DummyClassifier()

#### Logistic Regression

In [14]:
logreg = LogisticRegression(max_iter=10000)
logreg.fit(X_train, y_train)

LogisticRegression(max_iter=10000)

In [15]:
y_train

array([0, 1, 1, ..., 0, 2, 1])

#### KNN

In [16]:
knn = KNeighborsClassifier(algorithm='kd_tree')
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='kd_tree')

#### Decision Tree

In [17]:
dtree = DecisionTreeClassifier()
dtree.fit(X_train, y_train)

DecisionTreeClassifier()

#### Random Forest

In [18]:
rforest = RandomForestClassifier()
rforest.fit(X_train, y_train)

RandomForestClassifier()

### HyperParameter Tuning

#### Decision Tree

In [19]:
param_grid = {"criterion": ["gini", "entropy"],
              "min_samples_split": [2, 5, 10, 50, 100],
              "min_samples_leaf": [5, 10, 20, 50, 100],
              "max_features": ["auto", "sqrt", 5, 10, 20, 50]}

dgrid = GridSearchCV(estimator=dtree, 
                     param_grid=param_grid,
                     cv=5,
                     verbose=10,
                     scoring='f1_weighted')

dgrid.fit(X_train, y_train)

Fitting 5 folds for each of 300 candidates, totalling 1500 fits
[CV 1/5; 1/300] START criterion=gini, max_features=auto, min_samples_leaf=5, min_samples_split=2
[CV 1/5; 1/300] END criterion=gini, max_features=auto, min_samples_leaf=5, min_samples_split=2;, score=0.719 total time=   0.1s
[CV 2/5; 1/300] START criterion=gini, max_features=auto, min_samples_leaf=5, min_samples_split=2
[CV 2/5; 1/300] END criterion=gini, max_features=auto, min_samples_leaf=5, min_samples_split=2;, score=0.726 total time=   0.1s
[CV 3/5; 1/300] START criterion=gini, max_features=auto, min_samples_leaf=5, min_samples_split=2
[CV 3/5; 1/300] END criterion=gini, max_features=auto, min_samples_leaf=5, min_samples_split=2;, score=0.723 total time=   0.1s
[CV 4/5; 1/300] START criterion=gini, max_features=auto, min_samples_leaf=5, min_samples_split=2
[CV 4/5; 1/300] END criterion=gini, max_features=auto, min_samples_leaf=5, min_samples_split=2;, score=0.711 total time=   0.1s
[CV 5/5; 1/300] START criterion=gini

250 fits failed out of a total of 1500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
250 fits failed with the following error:
Traceback (most recent call last):
  File "/home/jovyan/venv/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/jovyan/venv/lib/python3.9/site-packages/sklearn/tree/_classes.py", line 937, in fit
    super().fit(
  File "/home/jovyan/venv/lib/python3.9/site-packages/sklearn/tree/_classes.py", line 308, in fit
    raise ValueError("max_features must be in (0, n_features]")
ValueError: max_features must be in (0, n_features]

 0.73168002 0.73255822 0.73659536 0.74433263 0.74245939 0.74164264
 0.74213432 

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_features': ['auto', 'sqrt', 5, 10, 20, 50],
                         'min_samples_leaf': [5, 10, 20, 50, 100],
                         'min_samples_split': [2, 5, 10, 50, 100]},
             scoring='f1_weighted', verbose=10)

In [20]:
dtree_hyp = dgrid.best_estimator_
dtree_hyp.fit(X_train, y_train)

DecisionTreeClassifier(max_features=20, min_samples_leaf=100,
                       min_samples_split=50)

#### Random Forest

In [21]:
param_grid = {
    "n_estimators": [10, 50, 100],
    "criterion": ["gini", "entropy"],
    "min_samples_split": [5, 10, 100],
    "min_samples_leaf": [5, 10, 100],
    "max_features": ["sqrt", 10, 50]
}

rgrid = GridSearchCV(estimator=rforest, 
                     param_grid=param_grid,
                     cv=5,
                     verbose=10,
                     scoring='f1_weighted')

rgrid.fit(X_train, y_train)

Fitting 5 folds for each of 162 candidates, totalling 810 fits
[CV 1/5; 1/162] START criterion=gini, max_features=sqrt, min_samples_leaf=5, min_samples_split=5, n_estimators=10
[CV 1/5; 1/162] END criterion=gini, max_features=sqrt, min_samples_leaf=5, min_samples_split=5, n_estimators=10;, score=0.762 total time=   0.6s
[CV 2/5; 1/162] START criterion=gini, max_features=sqrt, min_samples_leaf=5, min_samples_split=5, n_estimators=10
[CV 2/5; 1/162] END criterion=gini, max_features=sqrt, min_samples_leaf=5, min_samples_split=5, n_estimators=10;, score=0.754 total time=   0.5s
[CV 3/5; 1/162] START criterion=gini, max_features=sqrt, min_samples_leaf=5, min_samples_split=5, n_estimators=10
[CV 3/5; 1/162] END criterion=gini, max_features=sqrt, min_samples_leaf=5, min_samples_split=5, n_estimators=10;, score=0.765 total time=   0.4s
[CV 4/5; 1/162] START criterion=gini, max_features=sqrt, min_samples_leaf=5, min_samples_split=5, n_estimators=10
[CV 4/5; 1/162] END criterion=gini, max_featur

270 fits failed out of a total of 810.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
270 fits failed with the following error:
Traceback (most recent call last):
  File "/home/jovyan/venv/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/jovyan/venv/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 442, in fit
    trees = Parallel(
  File "/home/jovyan/venv/lib/python3.9/site-packages/joblib/parallel.py", line 1043, in __call__
    if self.dispatch_one_batch(iterator):
  File "/home/jovyan/venv/lib/python3.9/site-packages/joblib/parallel.py", line 861, in dispatch_one_batch
    self._dispatch(tasks)
  File "/home/

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_features': ['sqrt', 10, 50],
                         'min_samples_leaf': [5, 10, 100],
                         'min_samples_split': [5, 10, 100],
                         'n_estimators': [10, 50, 100]},
             scoring='f1_weighted', verbose=10)

In [22]:
rforest_hyp = rgrid.best_estimator_
rforest_hyp.fit(X_train, y_train)

RandomForestClassifier(criterion='entropy', max_features=10, min_samples_leaf=5,
                       min_samples_split=5)

#### Save Models

In [23]:
pickle.dump(dummy, open("../model/dummy.pkl", "wb"))
pickle.dump(logreg, open("../model/logreg.pkl", "wb"))
pickle.dump(knn, open("../model/knn.pkl", "wb"))
pickle.dump(dtree, open("../model/dtree.pkl", "wb"))
pickle.dump(dtree_hyp, open("../model/dtree_hyp.pkl", "wb"))
pickle.dump(rforest, open("../model/rforest.pkl", "wb"))
pickle.dump(rforest_hyp, open("../model/rforest_hyp.pkl", "wb"))

FileNotFoundError: [Errno 2] No such file or directory: '../model/dummy.pkl'

----

### AWS

Load the Data

In [None]:
import boto3
import sagemaker

# session and role
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket = "bucket"

key = "train.csv"
file_path = 's3://{}/{}'.format(bucket, key)
train = pd.read_csv(file_path, index_col=0)

#### Upload Models to S3

In [None]:
bucket = "bucket"
model_dir = 'model'
prefix = "model"

# upload data to S3
sagemaker_session.upload_data(path=model_dir, bucket=bucket, key_prefix=prefix)

#### Check S3 bucket

In [None]:
empty_check = []
for obj in boto3.resource('s3').Bucket(bucket).objects.all():
    empty_check.append(obj.key)
    print(obj.key)

assert len(empty_check) !=0, 'S3 bucket is empty.'