# KPMG Virtual Internship

# Task 2: Data Insights

### ---- 7 Model Development ----

In [1]:
#import libraries
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import sklearn as sk

#my info
__author__ = 'Cici Du'
__email__ = 'ciciechodu@gmail.com'

#### Load the dataset

In [2]:
merged_df = pd.read_csv('merged_df.csv')

In [123]:
merged_df.head()

Unnamed: 0,customer_id,gender,past_3_years_bike_related_purchases,job_title,job_industry_category,wealth_segment,owns_car,tenure,age,postcode,state,property_valuation,target
0,1,Female,93,Executive Secretary,Health,Mass Customer,Yes,11.0,65,2016,NSW,10,1
1,2,Male,81,Administrative Officer,Financial Services,Mass Customer,Yes,16.0,38,2153,NSW,10,0
2,4,Male,33,,IT,Mass Customer,No,7.0,57,4211,QLD,9,0
3,5,Female,56,Senior Editor,,Affluent Customer,Yes,8.0,41,2448,NSW,4,1
4,6,Male,35,,Retail,High Net Worth,Yes,13.0,52,3216,VIC,9,0


In [55]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3411 entries, 0 to 3410
Data columns (total 13 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   customer_id                          3411 non-null   int64  
 1   gender                               3411 non-null   object 
 2   past_3_years_bike_related_purchases  3411 non-null   int64  
 3   job_title                            2996 non-null   object 
 4   job_industry_category                2851 non-null   object 
 5   wealth_segment                       3411 non-null   object 
 6   owns_car                             3411 non-null   object 
 7   tenure                               3411 non-null   float64
 8   age                                  3411 non-null   int64  
 9   postcode                             3411 non-null   int64  
 10  state                                3411 non-null   object 
 11  property_valuation            

#### Feature Engineering

Based on the assumption that job title and post code impact whether a customer is a target customer, we create new features to capture this relationship. 

In [56]:
merged_df.postcode.value_counts()

2153    28
2170    27
2155    26
2145    26
2770    22
        ..
3793     1
3786     1
3240     1
2535     1
2449     1
Name: postcode, Length: 829, dtype: int64

In [3]:
postcode_df = merged_df.groupby('postcode').agg({'customer_id' : 'count',
                                                 'target': sum})
postcode_df.head()

Unnamed: 0_level_0,customer_id,target
postcode,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,7,2
2007,2,1
2008,1,1
2009,4,4
2010,12,4


In [4]:
postcode_df.columns = ['no_customers','no_target_customers']
postcode_df['percentage'] = postcode_df.no_target_customers/postcode_df.no_customers * 100
postcode_df.sort_values(by=['percentage'], inplace=True, ascending=False)
postcode_df.head()

Unnamed: 0_level_0,no_customers,no_target_customers,percentage
postcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4883,1,1,100.0
3084,3,3,100.0
4270,1,1,100.0
2327,1,1,100.0
2333,3,3,100.0


In [5]:
postcode_df['percentage'].describe()

count    829.000000
mean      50.733024
std       34.588339
min        0.000000
25%       25.000000
50%       50.000000
75%       75.000000
max      100.000000
Name: percentage, dtype: float64

In [6]:
(postcode_df['percentage'] >= 75).sum()

229

In [7]:
#generate a label for postcodes with equal to or more than 75% target customers
def target_postcode(row):
    if row['percentage'] >= 75:
        postcode_target = 1
    else:
        postcode_target = 0
    return postcode_target
postcode_df['postcode_target'] = postcode_df.apply(target_postcode, axis = 1)
postcode_df.head()

Unnamed: 0_level_0,no_customers,no_target_customers,percentage,postcode_target
postcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
4883,1,1,100.0,1
3084,3,3,100.0,1
4270,1,1,100.0,1
2327,1,1,100.0,1
2333,3,3,100.0,1


In [8]:
postcode_df = postcode_df.iloc[:,3]
postcode_df.head()

postcode
4883    1
3084    1
4270    1
2327    1
2333    1
Name: postcode_target, dtype: int64

In [9]:
#merge with the currect dataset
merged_df = pd.merge(merged_df, postcode_df, on = 'postcode')
merged_df.head()

Unnamed: 0,customer_id,gender,past_3_years_bike_related_purchases,job_title,job_industry_category,wealth_segment,owns_car,tenure,age,postcode,state,property_valuation,target,postcode_target
0,1,Female,93,Executive Secretary,Health,Mass Customer,Yes,11.0,65,2016,NSW,10,1,0
1,1059,Female,68,Account Representative III,Entertainment,High Net Worth,Yes,6.0,50,2016,NSW,11,0,0
2,2172,Male,6,Information Systems Manager,Health,Affluent Customer,Yes,13.0,39,2016,NSW,10,0,0
3,2380,Female,44,Technical Writer,Retail,Mass Customer,Yes,15.0,62,2016,NSW,10,1,0
4,2768,Male,86,Executive Secretary,Manufacturing,Mass Customer,No,3.0,23,2016,NSW,12,1,0


In [10]:
merged_df.job_title.value_counts()

Business Systems Development Analyst    38
Tax Accountant                          36
Social Worker                           36
Legal Assistant                         35
Associate Professor                     35
                                        ..
Research Assistant III                   2
Health Coach III                         2
Health Coach I                           2
Geologist II                             2
Developer I                              1
Name: job_title, Length: 195, dtype: int64

In [11]:
jobtitle_df = merged_df.groupby('job_title').agg({'customer_id' : 'count',
                                                 'target': sum})
jobtitle_df.head()

Unnamed: 0_level_0,customer_id,target
job_title,Unnamed: 1_level_1,Unnamed: 2_level_1
Account Coordinator,25,11
Account Executive,29,17
Account Representative I,9,3
Account Representative II,4,1
Account Representative III,6,4


In [12]:
jobtitle_df.columns = ['no_customers','no_target_customers']
jobtitle_df['percentage'] = jobtitle_df.no_target_customers/jobtitle_df.no_customers * 100
jobtitle_df.sort_values(by=['percentage'], inplace=True, ascending=False)
jobtitle_df.head()

Unnamed: 0_level_0,no_customers,no_target_customers,percentage
job_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Web Developer IV,3,3,100.0
Programmer Analyst IV,3,3,100.0
Staff Accountant I,3,3,100.0
Database Administrator I,4,4,100.0
Administrative Assistant II,4,4,100.0


In [13]:
jobtitle_df['percentage'].describe()

count    195.000000
mean      50.498980
std       17.366441
min        0.000000
25%       40.000000
50%       50.000000
75%       60.000000
max      100.000000
Name: percentage, dtype: float64

In [14]:
(jobtitle_df['percentage'] >= 50).sum()

105

In [15]:
#generate a variable to label job titles with equal to or more than 50% of target customers
#The threshold is set after comparing different results
def target_jobtitle(row):
    if row['percentage'] >= 50:
        jobtitle_target = 1
    else:
        jobtitle_target = 0
    return jobtitle_target
jobtitle_df['jobtitle_target'] = jobtitle_df.apply(target_jobtitle, axis = 1)
jobtitle_df.head()

Unnamed: 0_level_0,no_customers,no_target_customers,percentage,jobtitle_target
job_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Web Developer IV,3,3,100.0,1
Programmer Analyst IV,3,3,100.0,1
Staff Accountant I,3,3,100.0,1
Database Administrator I,4,4,100.0,1
Administrative Assistant II,4,4,100.0,1


In [16]:
jobtitle_df = jobtitle_df.iloc[:,3]
jobtitle_df.head()

job_title
Web Developer IV               1
Programmer Analyst IV          1
Staff Accountant I             1
Database Administrator I       1
Administrative Assistant II    1
Name: jobtitle_target, dtype: int64

In [17]:
#merge with the currect dataset
merged_df = pd.merge(merged_df, jobtitle_df, on = 'job_title')
merged_df.head()

Unnamed: 0,customer_id,gender,past_3_years_bike_related_purchases,job_title,job_industry_category,wealth_segment,owns_car,tenure,age,postcode,state,property_valuation,target,postcode_target,jobtitle_target
0,1,Female,93,Executive Secretary,Health,Mass Customer,Yes,11.0,65,2016,NSW,10,1,0,1
1,2768,Male,86,Executive Secretary,Manufacturing,Mass Customer,No,3.0,23,2016,NSW,12,1,0,1
2,1415,Female,68,Executive Secretary,Property,High Net Worth,Yes,3.0,20,2774,NSW,8,0,0,1
3,1032,Female,36,Executive Secretary,,Mass Customer,Yes,18.0,40,3145,VIC,11,1,0,1
4,2310,Male,12,Executive Secretary,,High Net Worth,Yes,11.0,48,2021,NSW,12,1,0,1


In [18]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2996 entries, 0 to 2995
Data columns (total 15 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   customer_id                          2996 non-null   int64  
 1   gender                               2996 non-null   object 
 2   past_3_years_bike_related_purchases  2996 non-null   int64  
 3   job_title                            2996 non-null   object 
 4   job_industry_category                2523 non-null   object 
 5   wealth_segment                       2996 non-null   object 
 6   owns_car                             2996 non-null   object 
 7   tenure                               2996 non-null   float64
 8   age                                  2996 non-null   int64  
 9   postcode                             2996 non-null   int64  
 10  state                                2996 non-null   object 
 11  property_valuation            

#### Define variables

In [19]:
cat_var = ['gender', 'job_industry_category','wealth_segment','owns_car','state']
num_var = ['past_3_years_bike_related_purchases','tenure','age','property_valuation','postcode_target', 'jobtitle_target']
target_var = 'target'

#### Encode variables

In [20]:
#encode categorical variables
def dummy_encode(df,cat_var,num_var):
    cat_df = pd.get_dummies(df[cat_var], drop_first=True)
    num_df = df[num_var]
    return pd.concat([num_df,cat_df,], axis=1)
encoded_df = dummy_encode(merged_df,cat_var, num_var)
encoded_df.head()

Unnamed: 0,past_3_years_bike_related_purchases,tenure,age,property_valuation,postcode_target,jobtitle_target,gender_Male,job_industry_category_Entertainment,job_industry_category_Financial Services,job_industry_category_Health,job_industry_category_IT,job_industry_category_Manufacturing,job_industry_category_Property,job_industry_category_Retail,job_industry_category_Telecommunications,wealth_segment_High Net Worth,wealth_segment_Mass Customer,owns_car_Yes,state_QLD,state_VIC
0,93,11.0,65,10,0,1,0,0,0,1,0,0,0,0,0,0,1,1,0,0
1,86,3.0,23,12,0,1,1,0,0,0,0,1,0,0,0,0,1,0,0,0
2,68,3.0,20,8,0,1,0,0,0,0,0,0,1,0,0,1,0,1,0,0
3,36,18.0,40,11,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,1
4,12,11.0,48,12,0,1,1,0,0,0,0,0,0,0,0,1,0,1,0,0


In [21]:
encoded_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2996 entries, 0 to 2995
Data columns (total 20 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   past_3_years_bike_related_purchases       2996 non-null   int64  
 1   tenure                                    2996 non-null   float64
 2   age                                       2996 non-null   int64  
 3   property_valuation                        2996 non-null   int64  
 4   postcode_target                           2996 non-null   int64  
 5   jobtitle_target                           2996 non-null   int64  
 6   gender_Male                               2996 non-null   uint8  
 7   job_industry_category_Entertainment       2996 non-null   uint8  
 8   job_industry_category_Financial Services  2996 non-null   uint8  
 9   job_industry_category_Health              2996 non-null   uint8  
 10  job_industry_category_IT            

In [36]:
X = encoded_df
y = merged_df[target_var]

In [23]:
#check and see if the dataset is balanced
y.value_counts()

0    1501
1    1495
Name: target, dtype: int64

#### Split the dataset into the training set and the test set

In [24]:
from sklearn import model_selection
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, random_state = 42)

#### Feature scaling

In [25]:
sc = sk.preprocessing.StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

#### Model 1: Random Forest Classifier

In [26]:
from sklearn.ensemble import RandomForestClassifier
rf_classifier = RandomForestClassifier(random_state=42, n_estimators=100)
rf_classifier.fit(X_train,y_train)
y_pred = rf_classifier.predict(X_test)

In [140]:
def show_result(y_test,y_pred):
    cm = sk.metrics.confusion_matrix(y_test, y_pred)
    print(cm)
    print("Accuracy: {:.2f} %".format((sk.metrics.accuracy_score(y_test, y_pred))*100))
    print("Recall: {:.2f} %".format((sk.metrics.recall_score(y_test, y_pred))*100))
show_result(y_test, y_pred)

[[261  99]
 [178 211]]
Accuracy: 63.02 %
Recall: 54.24 %


In [141]:
#optimize for recall under the assumption that it's less costly to wrongly target a customer than to miss a good customer
from sklearn.model_selection import GridSearchCV
parameters = [{'max_depth':[3,4,5,6,7,8,9,10,15,20], 
               'max_features':['auto', None, 'sqrt', 'log2', 0.8, 0.9], 
               'min_samples_leaf':[100,110,120,130,140,150,160]}]
grid_search = GridSearchCV(estimator = rf_classifier, 
                           param_grid = parameters,
                           scoring = 'recall',
                           cv = 5,
                           n_jobs = -1) 
grid_search.fit(X_train, y_train)
best_recall = grid_search.best_score_
best_parameters = grid_search.best_params_
print("Best Recall: {:.2f} %".format(best_recall * 100))    
print("Best Parameters:", best_parameters)

Best Recall: 68.99 %
Best Parameters: {'max_depth': 5, 'max_features': 'auto', 'min_samples_leaf': 140}


#### Model 2: Kernel SVM

In [27]:
from sklearn.svm import SVC
svc_classifier = SVC(random_state=42)
svc_classifier.fit(X_train, y_train)

SVC(random_state=42)

In [143]:
y_pred = svc_classifier.predict(X_test)

In [144]:
def show_result(y_test,y_pred):
    cm = sk.metrics.confusion_matrix(y_test, y_pred)
    print(cm)
    print("Accuracy: {:.2f} %".format((sk.metrics.accuracy_score(y_test, y_pred))*100))
    print("Recall: {:.2f} %".format((sk.metrics.recall_score(y_test, y_pred))*100))
show_result(y_test, y_pred)

[[270  90]
 [194 195]]
Accuracy: 62.08 %
Recall: 50.13 %


#### Apply PCA

In [145]:
from sklearn.decomposition import PCA
pca = PCA(n_components = 14)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

#### Apply grid search to find the optimal hyperparameters

In [147]:
#optimize recall as we assume the cost of wrongly targeting a customer is less than missing a good customer
from sklearn.model_selection import GridSearchCV
parameters = [{'C':[0.25, 0.5, 0.75, 1], 'kernel':['rbf', 'sigmoid','poly'], 'gamma':[0.3, 0.6, 0.9]},
             {'kernel' : ['linear'], 'C':[0.25, 0.5, 0.75, 1]}]
grid_search = GridSearchCV(estimator = svc_classifier, 
                           param_grid = parameters,
                           scoring = 'recall',
                           cv = 5,
                           n_jobs = -1) 
grid_search.fit(X_train_pca, y_train)
best_recall = grid_search.best_score_
best_parameters = grid_search.best_params_
print("Best Recall: {:.2f} %".format(best_recall * 100))    
print("Best Parameters:", best_parameters)

Best Recall: 55.06 %
Best Parameters: {'C': 1, 'gamma': 0.3, 'kernel': 'rbf'}


#### Select the best model

In [37]:
rf_classifier_final = RandomForestClassifier(random_state=42, 
                                             n_estimators=100, 
                                             max_depth = 5, 
                                             max_features = 'auto', 
                                             min_samples_leaf = 140)
#retrain on the entire dataset
X = sc.transform(X)
rf_classifier_final.fit(X,y)
y_pred = rf_classifier.predict(X)

In [39]:
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = rf_classifier_final, X = X, y = y)
recalls = cross_val_score(estimator = rf_classifier_final, scoring = 'recall', X = X, y = y)
precisions = cross_val_score(estimator = rf_classifier_final, scoring = 'precision', X = X, y = y)
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Recall: {:.2f} %".format(recalls.mean()*100))
print("Precision: {:.2f} %".format(precisions.mean()*100))

Accuracy: 63.29 %
Recall: 69.57 %
Precision: 62.20 %


### ---- 8 Model Deployment ----

In [30]:
dataset = pd.read_csv('cleaned_newcust_df.csv')

In [153]:
dataset.head()

Unnamed: 0,gender,past_3_years_bike_related_purchases,DOB,job_title,job_industry_category,wealth_segment,deceased_indicator,owns_car,tenure,address,postcode,state,property_valuation
0,Male,86,1957-07-12,General Manager,Manufacturing,Mass Customer,N,Yes,14,45 Shopko Center,4500,QLD,6
1,Male,69,1970-03-22,Structural Engineer,Property,Mass Customer,N,No,16,14 Mccormick Park,2113,NSW,11
2,Female,10,1974-08-28,Senior Cost Accountant,Financial Services,Affluent Customer,N,No,10,5 Colorado Crossing,3505,VIC,5
3,Female,64,1979-01-28,Account Representative III,Manufacturing,Affluent Customer,N,Yes,5,207 Annamark Plaza,4814,QLD,1
4,Female,34,1965-09-21,Financial Analyst,Financial Services,Affluent Customer,N,No,19,115 Montana Place,2093,NSW,9


In [154]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
 #   Column                               Non-Null Count  Dtype 
---  ------                               --------------  ----- 
 0   gender                               1000 non-null   object
 1   past_3_years_bike_related_purchases  1000 non-null   int64 
 2   DOB                                  983 non-null    object
 3   job_title                            894 non-null    object
 4   job_industry_category                835 non-null    object
 5   wealth_segment                       1000 non-null   object
 6   deceased_indicator                   1000 non-null   object
 7   owns_car                             1000 non-null   object
 8   tenure                               1000 non-null   int64 
 9   address                              1000 non-null   object
 10  postcode                             1000 non-null   int64 
 11  state                                1000 no

In [31]:
def preprocess_dataset(dataset):
    '''convert DOB to age'''
    dataset['DOB'] = pd.to_datetime(dataset.DOB, format='%Y-%m-%d')
    dataset['age'] = 2018 - dataset['DOB'].dt.year
    '''add postcode label'''
    dataset = pd.merge(dataset, postcode_df, on = 'postcode', how = 'left')
    '''add job title label'''
    dataset = pd.merge(dataset, jobtitle_df, on = 'job_title', how = 'left')
    '''encode categorical variables'''
    encoded_dataset = dummy_encode(dataset,cat_var, num_var)
    return encoded_dataset
cl_dataset = preprocess_dataset(dataset)
cl_dataset.head()

Unnamed: 0,past_3_years_bike_related_purchases,tenure,age,property_valuation,postcode_target,jobtitle_target,gender_Male,job_industry_category_Entertainment,job_industry_category_Financial Services,job_industry_category_Health,job_industry_category_IT,job_industry_category_Manufacturing,job_industry_category_Property,job_industry_category_Retail,job_industry_category_Telecommunications,wealth_segment_High Net Worth,wealth_segment_Mass Customer,owns_car_Yes,state_QLD,state_VIC
0,86,14,61.0,6,0.0,1.0,1,0,0,0,0,1,0,0,0,0,1,1,1,0
1,69,16,48.0,11,0.0,0.0,1,0,0,0,0,0,1,0,0,0,1,0,0,0
2,10,10,44.0,5,0.0,0.0,0,0,1,0,0,0,0,0,0,0,0,0,0,1
3,64,5,39.0,1,0.0,1.0,0,0,0,0,0,1,0,0,0,0,0,1,1,0
4,34,19,53.0,9,0.0,1.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0


In [44]:
cl_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 0 to 999
Data columns (total 20 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   past_3_years_bike_related_purchases       1000 non-null   int64  
 1   tenure                                    1000 non-null   int64  
 2   age                                       983 non-null    float64
 3   property_valuation                        1000 non-null   int64  
 4   postcode_target                           925 non-null    float64
 5   jobtitle_target                           894 non-null    float64
 6   gender_Male                               1000 non-null   uint8  
 7   job_industry_category_Entertainment       1000 non-null   uint8  
 8   job_industry_category_Financial Services  1000 non-null   uint8  
 9   job_industry_category_Health              1000 non-null   uint8  
 10  job_industry_category_IT             

#### Impute missing values with KNN

In [32]:
from sklearn.impute import KNNImputer
imputer = KNNImputer()
imputed_df = cl_dataset.copy(deep = True)
imputed_df.iloc[:,:] = np.round(imputer.fit_transform(imputed_df))

In [157]:
imputed_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 0 to 999
Data columns (total 20 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   past_3_years_bike_related_purchases       1000 non-null   float64
 1   tenure                                    1000 non-null   float64
 2   age                                       1000 non-null   float64
 3   property_valuation                        1000 non-null   float64
 4   postcode_target                           1000 non-null   float64
 5   jobtitle_target                           1000 non-null   float64
 6   gender_Male                               1000 non-null   float64
 7   job_industry_category_Entertainment       1000 non-null   float64
 8   job_industry_category_Financial Services  1000 non-null   float64
 9   job_industry_category_Health              1000 non-null   float64
 10  job_industry_category_IT             

In [33]:
X_result = sc.transform(imputed_df)

In [34]:
y_pred_result = rf_classifier_final.predict(X_result)
result_dataset = dataset.copy(deep = True)
result_dataset['predicted_result'] = y_pred_result
result_dataset.head()

Unnamed: 0,gender,past_3_years_bike_related_purchases,DOB,job_title,job_industry_category,wealth_segment,deceased_indicator,owns_car,tenure,address,postcode,state,property_valuation,age,predicted_result
0,Male,86,1957-07-12,General Manager,Manufacturing,Mass Customer,N,Yes,14,45 Shopko Center,4500,QLD,6,61.0,1
1,Male,69,1970-03-22,Structural Engineer,Property,Mass Customer,N,No,16,14 Mccormick Park,2113,NSW,11,48.0,0
2,Female,10,1974-08-28,Senior Cost Accountant,Financial Services,Affluent Customer,N,No,10,5 Colorado Crossing,3505,VIC,5,44.0,0
3,Female,64,1979-01-28,Account Representative III,Manufacturing,Affluent Customer,N,Yes,5,207 Annamark Plaza,4814,QLD,1,39.0,1
4,Female,34,1965-09-21,Financial Analyst,Financial Services,Affluent Customer,N,No,19,115 Montana Place,2093,NSW,9,53.0,1


In [35]:
result_dataset.predicted_result.value_counts()

1    612
0    388
Name: predicted_result, dtype: int64

In [51]:
result_dataset.to_csv('result_dataset.csv', index = False)