# Import software libraries

In [1]:
import sys                                                  # Read system parameters.
import numpy as np                                          # Work with multi-dimensional arrays.
import pandas as pd                                         # Manipulate and analyze data.
import sklearn                                              # Train and evaluate machine learning models.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, \
                                  MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from collections import Counter                             # Count objects in containers.
import warnings
# Silence every warning so library notices do not clutter the cell outputs.
warnings.filterwarnings('ignore')

# Seed NumPy's global random number generator so results are reproducible.
np.random.seed(1)

# Report the interpreter and library versions this notebook was run with.
versions = [('- Python {}', sys.version),
            ('- NumPy {}', np.__version__),
            ('- pandas {}', pd.__version__),
            ('- scikit-learn {}', sklearn.__version__)]

print('Libraries used in this project:')
for template, version in versions:
    print(template.format(version))

Libraries used in this project:
- Python 3.8.8 (default, Apr 13 2021, 15:08:03) [MSC v.1916 64 bit (AMD64)]
- NumPy 1.19.5
- pandas 1.2.4
- scikit-learn 0.23.2


# Load and preview the data

In [2]:
# Read the prepared users table from its pickle file.
# NOTE: unpickling can execute arbitrary code, so only load files from a
# trusted source.
users_data = pd.read_pickle('users_data_final.pickle')

# Preview the first three rows.
users_data.head(3)

Unnamed: 0,user_id,number_transactions,total_amount_usd,job_management,job_technician,job_entrepreneur,job_blue-collar,job_retired,job_admin.,job_services,...,duration,campaign,pdays,previous,term_deposit,device_desktop,device_tablet,single,age_group_encoded,month_joined
0,9231c446-cb16-4b2b-a7f7-ddfc8b25aaf6,3.0,2143.0,1,0,0,0,0,0,0,...,261,1,-1,0,False,0,0,0,4,8
1,bb92765a-08de-4963-b432-496524b39157,0.0,1369.42,0,1,0,0,0,0,0,...,151,1,-1,0,False,1,0,1,2,7
2,573de577-49ef-42b9-83da-d3cfb817b5c1,2.0,2.0,0,0,1,0,0,0,0,...,76,1,-1,0,False,0,0,0,1,6


# Split the data into train and test sets

In [3]:
# The label to predict is the term_deposit column.
target = users_data.term_deposit
# Every remaining column except the user identifier is used as a feature.
features = users_data.drop(['user_id', 'term_deposit'], axis = 1)

# Pass an explicit seed so that re-running this cell always produces the same
# split, instead of depending on the current state of NumPy's global generator.
X_train, X_test, y_train, y_test = train_test_split(features, target,
                                                    random_state = 1)

# Define an initial pipeline

In [4]:
# Three stages: scale the features, project them onto two principal
# components, then classify with a decision tree.
pipeline_steps = [
    ('scaler', MinMaxScaler()),
    ('reduce_dim', PCA(n_components = 2)),
    ('model', DecisionTreeClassifier()),
]
pipe = Pipeline(pipeline_steps)

# Evaluate the initial pipeline

In [5]:
# Fit all pipeline stages on the training split, then report accuracy on the
# held-out test split.
pipe.fit(X_train, y_train)
test_accuracy = pipe.score(X_test, y_test)
print('Model accuracy on test data:', test_accuracy)

Model accuracy on test data: 0.8110668437361664


In [6]:
# Predict a label for every test row and tally how often each class is predicted.
y_pred = pipe.predict(X_test)
predicted_counts = Counter(y_pred)
print(predicted_counts)

Counter({False: 9935, True: 1360})


In [7]:
# Put the true label, the predicted label, and the features of the first five
# test rows side by side for a quick visual check.
true_labels = y_test.head(5)
sample_features = X_test.head(5)
results = pd.concat([true_labels, sample_features], axis = 1)
results.insert(1, 'term_deposit_pred', y_pred[:5])
results

Unnamed: 0,term_deposit,term_deposit_pred,number_transactions,total_amount_usd,job_management,job_technician,job_entrepreneur,job_blue-collar,job_retired,job_admin.,...,contact_telephone,duration,campaign,pdays,previous,device_desktop,device_tablet,single,age_group_encoded,month_joined
43308,False,True,0.0,1369.42,1,0,0,0,0,0,...,1,129,1,188,2,0,0,0,2,3
32770,False,False,2.0,246.0,0,0,0,1,0,0,...,0,191,1,150,7,0,0,0,1,7
17440,False,False,0.0,1369.42,0,0,0,0,0,0,...,0,780,2,-1,0,0,0,0,2,5
36164,False,False,0.0,1369.42,0,0,0,1,0,0,...,0,114,2,-1,0,1,0,1,2,10
29218,False,False,0.0,1369.42,0,0,0,0,0,0,...,0,111,1,-1,0,0,0,0,2,5


# Tune the pipeline

In [8]:
# Candidate components for each pipeline step. A None entry means the search
# also tries the pipeline without that step.
scalers = [None, StandardScaler(), MinMaxScaler()]

# Seed the PCA candidates: depending on the data shape, PCA's automatic solver
# selection can use a randomized SVD.
pca_dims = [None,
            PCA(n_components = 2, random_state = 1),
            PCA(n_components = 5, random_state = 1)]

# Seed both classifiers, not just the forest. The grid search runs its fits in
# parallel worker processes, where the notebook's global NumPy seed cannot be
# relied on to make an unseeded estimator reproducible.
models = [DecisionTreeClassifier(random_state = 1),
          RandomForestClassifier(random_state = 1)]

# Keys are the pipeline step names; each value lists the alternatives to try.
params = {'scaler': scalers,
          'reduce_dim': pca_dims,
          'model': models}

In [9]:
# Evaluate every combination in the search space with 5-fold stratified
# cross-validation (no shuffling), running the fits on all available cores.
cv_folds = StratifiedKFold(n_splits = 5, shuffle = False)
search = GridSearchCV(pipe, params, n_jobs = -1, verbose = 2, cv = cv_folds)
gs = search.fit(X_train, y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    8.5s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:  1.8min finished


# Evaluate the tuned pipeline

In [10]:
# Score the tuned search object on the held-out test split and show which
# combination of steps won the search.
best_accuracy = gs.score(X_test, y_test)
best_parameters = gs.best_params_
print('Best accuracy score:', best_accuracy)
print('Best parameters:    ', best_parameters)

Best accuracy score: 0.888092076139885
Best parameters:     {'model': RandomForestClassifier(random_state=1), 'reduce_dim': None, 'scaler': MinMaxScaler()}


In [11]:
# Predict the test rows again, this time with the tuned search object, and
# tally how often each class is predicted.
y_pred = gs.predict(X_test)
predicted_counts = Counter(y_pred)
print(predicted_counts)

Counter({False: 10599, True: 696})


In [12]:
# Replace the prediction column of the five-row preview with the tuned
# model's predictions for the same rows.
results = results.assign(term_deposit_pred = y_pred[:5])
results

Unnamed: 0,term_deposit,term_deposit_pred,number_transactions,total_amount_usd,job_management,job_technician,job_entrepreneur,job_blue-collar,job_retired,job_admin.,...,contact_telephone,duration,campaign,pdays,previous,device_desktop,device_tablet,single,age_group_encoded,month_joined
43308,False,False,0.0,1369.42,1,0,0,0,0,0,...,1,129,1,188,2,0,0,0,2,3
32770,False,False,2.0,246.0,0,0,0,1,0,0,...,0,191,1,150,7,0,0,0,1,7
17440,False,False,0.0,1369.42,0,0,0,0,0,0,...,0,780,2,-1,0,0,0,0,2,5
36164,False,False,0.0,1369.42,0,0,0,1,0,0,...,0,114,2,-1,0,1,0,1,2,10
29218,False,False,0.0,1369.42,0,0,0,0,0,0,...,0,111,1,-1,0,0,0,0,2,5


# Test the model generated by the pipeline on new data

In [13]:
# Load the unseen user records that the tuned model will be applied to.
new_data_path = 'new_users_data.csv'
new_data = pd.read_csv(new_data_path)

# Display the loaded table.
new_data

Unnamed: 0,number_transactions,total_amount_usd,job_management,job_technician,job_entrepreneur,job_blue-collar,job_retired,job_admin.,job_services,job_self-employed,...,contact_telephone,duration,campaign,pdays,previous,device_desktop,device_tablet,single,age_group_encoded,month_joined
0,4,237.1,0,0,0,0,1,0,0,0,...,0,76,1,-1,0,0,1,0,4,11
1,2,-43.12,0,0,1,0,0,0,0,0,...,0,145,2,-1,0,0,1,1,2,3
2,1,789.45,0,1,0,0,0,0,0,0,...,0,172,3,-1,0,1,0,0,1,4
3,4,3291.41,0,1,0,0,0,0,0,0,...,0,35,1,-1,0,0,1,1,3,6


In [14]:
# Remove any prediction column left over from an earlier run of this cell.
# Without this, a second run would pass an extra column to predict() and
# insert() would raise because the column already exists.
new_data = new_data.drop(columns = 'term_deposit_pred', errors = 'ignore')

# Predict from the feature columns only.
y_pred = gs.predict(new_data)

# Show the predictions as the first column, next to the inputs.
new_data.insert(0, 'term_deposit_pred', y_pred)
new_data

Unnamed: 0,term_deposit_pred,number_transactions,total_amount_usd,job_management,job_technician,job_entrepreneur,job_blue-collar,job_retired,job_admin.,job_services,...,contact_telephone,duration,campaign,pdays,previous,device_desktop,device_tablet,single,age_group_encoded,month_joined
0,False,4,237.1,0,0,0,0,1,0,0,...,0,76,1,-1,0,0,1,0,4,11
1,False,2,-43.12,0,0,1,0,0,0,0,...,0,145,2,-1,0,0,1,1,2,3
2,False,1,789.45,0,1,0,0,0,0,0,...,0,172,3,-1,0,1,0,0,1,4
3,False,4,3291.41,0,1,0,0,0,0,0,...,0,35,1,-1,0,0,1,1,3,6
