## Building a classification model using Scikit Learn

This is an example notebook on how you would develop a classification model using Scikit Learn. 

If you see below, the model is trained and used for prediction in the same code. 

### Business Problem

To predict wheather a banking customer will subscribe to a term deposit as a result of direct marketing campaigns (phone calls). The dataset if from the UIC Machine Learning Repository.

### Step 1. Importing Packages

In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import cPickle
from sklearn import tree
import json

### Step 2. Importing and Examining the data

In [2]:
data = pd.read_csv('model_data/banking.csv', header = 0)

## Understanding the data
data = data.dropna() ##Dropping NA fields

print(data.shape)
print(list(data.columns))

(41188, 21)
['age', 'job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'emp_var_rate', 'cons_price_idx', 'cons_conf_idx', 'euribor3m', 'nr_employed', 'y']


In [3]:
## Inspecting the data
data.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp_var_rate,cons_price_idx,cons_conf_idx,euribor3m,nr_employed,y
0,44,blue-collar,married,basic.4y,unknown,yes,no,cellular,aug,thu,...,1,999,0,nonexistent,1.4,93.444,-36.1,4.963,5228.1,0
1,53,technician,married,unknown,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-0.1,93.2,-42.0,4.021,5195.8,0
2,28,management,single,university.degree,no,yes,no,cellular,jun,thu,...,3,6,2,success,-1.7,94.055,-39.8,0.729,4991.6,1
3,39,services,married,high.school,no,no,no,cellular,apr,fri,...,2,999,0,nonexistent,-1.8,93.075,-47.1,1.405,5099.1,0
4,55,retired,married,basic.4y,no,yes,no,cellular,aug,fri,...,1,3,1,success,-2.9,92.201,-31.4,0.869,5076.2,1


### Step 3. Feature Engineering

#### Categorical Encoding 

In [4]:
categorical_variable_cols = ['job','marital','education','default','housing','loan','contact','month','day_of_week','poutcome']

for var in categorical_variable_cols:
    categ_list = 'var_' + var
    categ_list = pd.get_dummies(data[var],prefix = var)
    data1=data.join(categ_list)
    data = data1

## Creating feilds to keep
data_vars = data.columns.values.tolist()
to_keep=[i for i in data_vars if i not in categorical_variable_cols]

In [5]:
to_keep

['age',
 'duration',
 'campaign',
 'pdays',
 'previous',
 'emp_var_rate',
 'cons_price_idx',
 'cons_conf_idx',
 'euribor3m',
 'nr_employed',
 'y',
 'job_admin.',
 'job_blue-collar',
 'job_entrepreneur',
 'job_housemaid',
 'job_management',
 'job_retired',
 'job_self-employed',
 'job_services',
 'job_student',
 'job_technician',
 'job_unemployed',
 'job_unknown',
 'marital_divorced',
 'marital_married',
 'marital_single',
 'marital_unknown',
 'education_basic.4y',
 'education_basic.6y',
 'education_basic.9y',
 'education_high.school',
 'education_illiterate',
 'education_professional.course',
 'education_university.degree',
 'education_unknown',
 'default_no',
 'default_unknown',
 'default_yes',
 'housing_no',
 'housing_unknown',
 'housing_yes',
 'loan_no',
 'loan_unknown',
 'loan_yes',
 'contact_cellular',
 'contact_telephone',
 'month_apr',
 'month_aug',
 'month_dec',
 'month_jul',
 'month_jun',
 'month_mar',
 'month_may',
 'month_nov',
 'month_oct',
 'month_sep',
 'day_of_week_fri',


In [6]:
data_final = data[to_keep]
print("Fields - after categorical encoding:")
print(data_final.columns.values)

Fields - after categorical encoding:
['age' 'duration' 'campaign' 'pdays' 'previous' 'emp_var_rate'
 'cons_price_idx' 'cons_conf_idx' 'euribor3m' 'nr_employed' 'y'
 'job_admin.' 'job_blue-collar' 'job_entrepreneur' 'job_housemaid'
 'job_management' 'job_retired' 'job_self-employed' 'job_services'
 'job_student' 'job_technician' 'job_unemployed' 'job_unknown'
 'marital_divorced' 'marital_married' 'marital_single' 'marital_unknown'
 'education_basic.4y' 'education_basic.6y' 'education_basic.9y'
 'education_high.school' 'education_illiterate'
 'education_professional.course' 'education_university.degree'
 'education_unknown' 'default_no' 'default_unknown' 'default_yes'
 'housing_no' 'housing_unknown' 'housing_yes' 'loan_no' 'loan_unknown'
 'loan_yes' 'contact_cellular' 'contact_telephone' 'month_apr' 'month_aug'
 'month_dec' 'month_jul' 'month_jun' 'month_mar' 'month_may' 'month_nov'
 'month_oct' 'month_sep' 'day_of_week_fri' 'day_of_week_mon'
 'day_of_week_thu' 'day_of_week_tue' 'day_of_

In [7]:
data_final.head()

Unnamed: 0,age,duration,campaign,pdays,previous,emp_var_rate,cons_price_idx,cons_conf_idx,euribor3m,nr_employed,...,month_oct,month_sep,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success
0,44,210,1,999,0,1.4,93.444,-36.1,4.963,5228.1,...,0,0,0,0,1,0,0,0,1,0
1,53,138,1,999,0,-0.1,93.2,-42.0,4.021,5195.8,...,0,0,1,0,0,0,0,0,1,0
2,28,339,3,6,2,-1.7,94.055,-39.8,0.729,4991.6,...,0,0,0,0,1,0,0,0,0,1
3,39,185,2,999,0,-1.8,93.075,-47.1,1.405,5099.1,...,0,0,1,0,0,0,0,0,1,0
4,55,137,1,3,1,-2.9,92.201,-31.4,0.869,5076.2,...,0,0,1,0,0,0,0,0,0,1


#### Keeping some data aside for making predictions on new data

In [8]:
np.random.seed(123)

subset_size = 20
new_data_indices = np.random.choice(data_final.index,subset_size, replace=False)
data_new = data_final.loc[new_data_indices]

data_new.to_csv('new_banking_data.csv',index=False)
data = data_final.drop(new_data_indices)

print("{} records selected for final model prediction".format(data_new.shape[0]))

20 records selected for final model prediction


### Step 4. Feature Selection

Not all variables will have a significant impact on prediction. We will use RFE to eliminate features that do not have impact on the final target variable:

In [9]:
data_final_vars = data_final.columns.values.tolist()
y=['y']
X = [i for i in data_final_vars if i not in y]


In [11]:
from sklearn import datasets
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression()

rfe = RFE(logreg,18)

rfe = rfe.fit(data_final[X],data_final[y])

Selected_Features_Support = list(rfe.support_)
Selected_Features_rank = list(rfe.ranking_)

print(Selected_Features_Support)
print(Selected_Features_rank)

[False, False, False, False, True, False, False, False, True, False, False, True, False, False, False, True, False, True, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, True, True, True, True, False, False, True, True, True, False, False, False, True, False, False, False, True, True, True]
[39, 36, 16, 44, 1, 19, 23, 25, 1, 32, 15, 1, 9, 42, 43, 1, 29, 1, 1, 33, 28, 41, 5, 7, 8, 45, 21, 10, 14, 20, 40, 35, 11, 2, 1, 4, 46, 26, 17, 37, 24, 22, 18, 6, 1, 1, 1, 1, 30, 27, 1, 1, 1, 34, 13, 12, 1, 38, 31, 3, 1, 1, 1]


In [12]:
columns_final = ["previous", "euribor3m", "job_blue-collar", "job_retired", "job_services", "job_student", "default_no", 
      "month_aug", "month_dec", "month_jul", "month_nov", "month_oct", "month_sep", "day_of_week_fri", "day_of_week_wed", 
      "poutcome_failure", "poutcome_nonexistent", "poutcome_success"] 
col_dict = {}
col_dict['final_features'] = columns_final

with open('final_features.json', 'w') as fp:
    json.dump(col_dict, fp)

X = data_final[columns_final]
y = data_final['y']

### Step 5. Splitting Data into Train and Test datasets

In [13]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y, test_size = 0.3, random_state=0)

### Step 6. Fitting Decision Tree Classifier

In [14]:
clf = tree.DecisionTreeClassifier(max_leaf_nodes=None)
clf = clf.fit(X_train, y_train)

### Step 7. Quick Model Testing Method: Using Local Predictor

In [15]:
y_pred = clf.predict(X_test)
print("First 20 predictions (for demonstration): ",y_pred[:20])

('First 20 predictions (for demonstration): ', array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]))


### Step 8. Model Evaluation & Performance Measurement

In [16]:
print('Accuracy of Decision Tree on test set: {:2f}%\n'.format(clf.score(X_test,y_test)*100))
print("Confusion Matrix: \n")

from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(y_test, y_pred)
print(confusion_matrix,"\n")
print("Precision, Recall and F1 score:")
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

Accuracy of Decision Tree on test set: 89.196407%

Confusion Matrix: 

(array([[10703,   278],
       [ 1057,   319]]), '\n')
Precision, Recall and F1 score:
              precision    recall  f1-score   support

           0       0.91      0.97      0.94     10981
           1       0.53      0.23      0.32      1376

   micro avg       0.89      0.89      0.89     12357
   macro avg       0.72      0.60      0.63     12357
weighted avg       0.87      0.89      0.87     12357



### Step 9. Exporting the model for future predictions

Training and local predictions is always not a feasible solution, especially if you want to re-use the model on new data that the model hasn't seen before. For this, we will be exporting the model as a compressed pickle file.

In [18]:
modelname = 'finalized_dec_tree_model.pkl'
cPickle.dump(clf, open(modelname, 'wb'))

### Step 10. Exporting train and test data to S3 for future SageMaker models

In [19]:
from io import StringIO # python3; python2: BytesIO 
import boto3
import s3fs
train_s3_path = 'atl-gkrishna-ml/aws-mlops-lunch-and-learn/part1-sagemaker/sklearn-byom/data/training/banking_training.csv'
testing_s3_path = 'atl-gkrishna-ml/aws-mlops-lunch-and-learn/part1-sagemaker/sklearn-byom/data/testing/banking_testing.csv'
newdata_s3_path = 'atl-gkrishna-ml/aws-mlops-lunch-and-learn/part1-sagemaker/sklearn-byom/data/newdata/banking_newdata.csv'


def upload_to_s3(df,path):
    bytes_to_write = df.to_csv(None).encode()
    fs = s3fs.S3FileSystem()
    with fs.open('s3://{}'.format(path), 'wb') as f:
        f.write(bytes_to_write)

In [20]:
data_train = pd.concat([X_train, y_train], axis=1, sort=False)
data_test = pd.concat([X_test, y_test], axis=1, sort=False)

In [21]:
upload_to_s3(data_train,train_s3_path)
upload_to_s3(data_test,testing_s3_path)
upload_to_s3(data_new,newdata_s3_path)

### Step 11. Uploading model artifact to S3 (for Demo 4)

In [22]:
import tarfile
tar = tarfile.open("sklearn_dec_tree.tar.gz", "w:gz")
for name in ["finalized_dec_tree_model.pkl"]:
    tar.add(name)
tar.close()

In [23]:
import boto3
from botocore.exceptions import ClientError
import logging

def upload_file(file_name, bucket, object_name):
    """Upload a file to an S3 bucket

    :param file_name: File to upload
    :param bucket: Bucket to upload to
    :param object_name: S3 object name. If not specified then file_name is used
    :return: True if file was uploaded, else False
    """

    # If S3 object_name was not specified, use file_name
    if object_name is None:
        object_name = file_name

    # Upload the file
    s3_client = boto3.client('s3')
    try:
        response = s3_client.upload_file(file_name, bucket, object_name)
        print("Uploaded")
    except ClientError as e:
        logging.error(e)
        return False
    return True

In [24]:
object_name = 'aws-mlops-lunch-and-learn/part1-sagemaker/sklearn-pretrained/model-artifact/{}'.format('sklearn_dec_tree.tar.gz')
upload_file('sklearn_dec_tree.tar.gz', 'atl-gkrishna-ml', object_name)

Uploaded


True