In [7]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Data Preprocessing

In [8]:
# Load the bank-marketing data set; the file uses ';' as its field separator.
df = pd.read_csv(
    '/kaggle/input/bank-marketing/bank-additional-full.csv',
    sep=';',
)
# Preview the first rows.
df.head()

In [9]:
# Check missing values: fraction of missing entries in each column.
# `isnull().mean()` divides the number of nulls by the total number of rows.
# Dividing by `df.count()` (the number of non-null entries) would instead give
# the ratio of missing to present values, and inf/NaN for an all-null column.
df.isnull().mean()

In [115]:
# Inspect how the target variable `y` is distributed.
target = df['y']
target_counts = target.value_counts()

# 1. Share of each class among the non-null target values
print(target_counts / target.count())

# 2. Bar chart of the number of rows in each class
target_counts.plot.bar()

In [11]:
# Split the predictors (every column except the target `y`) by dtype:
# `char` holds the object-typed (character) columns, `dec` holds all the others.
predictors = df.drop(columns='y')
char = predictors.select_dtypes(include="object")
dec = predictors.select_dtypes(exclude="object")

In [12]:
# Pairwise correlation between the numerical variables.
correlation_matrix = dec.corr()
correlation_matrix

nr.employed, euribor3m, emp.var.rate and cons.price.idx are highly correlated variables. We will need to be very careful with these variables during the modeling step.

In [18]:
# Summary statistics of the numerical variables, one row per variable.

# 1. Selected percentiles, with readable column labels
quantile_labels = {
    0.01: '1% percentile',
    0.25: '25% percentile',
    0.5: 'median',
    0.75: '75% percentile',
    0.99: '99% percentile',
}
summary = dec.quantile(list(quantile_labels)).T
summary.columns = list(quantile_labels.values())

# 2. Append the mean, maximum and minimum of every variable
for stat in ('mean', 'max', 'min'):
    summary[stat] = getattr(dec, stat)()

summary

The 25th percentile of pdays is already 999, the value used for customers who were not previously contacted, so most customers were not contacted before. We will consider using a categorical variable that indicates whether the customer was contacted before or not.

In [19]:
# Check the unique values of each categorical (object-typed) variable.
# The blank separator line is printed only for the columns that are reported,
# so the numerical columns no longer add stray empty lines to the output.

for col in df.columns:
    if df[col].dtype == 'object':
        print()
        print(f'Name of Column is: {col} and unique values are: {df[col].unique()}')

There are some 'unknown' values. Since those are categorical variables, we can treat them as individual classes.

In [21]:
# Add a "pdays_char" column to df that indicates whether the customer was
# contacted before: 1 when pdays is not 999, otherwise 0.
# The comparison is evaluated on the whole column at once instead of calling a
# Python lambda once per row through `df.apply(..., axis=1)`; the resulting
# values are the same.

df['pdays_char'] = (df['pdays'] != 999).astype('int64')

In [43]:
# Create the modeling dataset.
# - `df_model`: the predictors, with the unused columns and the target removed
#   and the categorical columns one-hot encoded (first level of each dropped).
# - `df_y`: the target only, encoded as 1 for 'yes' and 0 for any other value.
# NOTE(review): `duration` stays in the predictors; it presumably is only known
# once the call has ended - confirm that this suits the intended use of the model.

vars_to_drop = ['pdays', 'month', 'day_of_week']
df_clean = df.drop(columns=vars_to_drop)
df_model = pd.get_dummies(df_clean.drop(columns='y'), drop_first=True)
df_y = df['y'].eq('yes').astype('int64')
df_y

In [44]:
# Normalize the data: subtract each column's mean, then divide by the column's
# standard deviation (pandas' default sample standard deviation).
# A constant column has a standard deviation of 0, which would turn the whole
# column into NaN (0 / 0) and propagate NaN into the models; such columns are
# divided by 1 instead, so they simply become all zeros.
# NOTE(review): the mean and standard deviation are computed on the full data
# set, before the train/test split, so the test rows influence the scaling -
# consider computing them on the training rows only.

column_std = df_model.std().replace(0, 1)
df_model = (df_model - df_model.mean()) / column_std

## Modeling

In [45]:
# Create the train/test split: 33% of the rows are held out for testing, and
# the random state is fixed so the split is the same on every run.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df_model,
    df_y,
    test_size=0.33,
    random_state=42,
)

In [46]:
# Show the (rows, columns) shape of the training and the testing predictors.
for split in (X_train, X_test):
    print(split.shape)

### Logistic regression model - Lasso
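Since we previously identified some highly correlated variables, we introduce L1 regularization here to deal with multicollinearity. Note that the model fitted below, `linear_model.Lasso`, is an L1-penalised linear regression on the 0/1 target rather than a logistic regression; its predictions are turned into class labels by thresholding at 0.5.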

In [48]:
# Fit the L1-regularised model on the training data and print its coefficients.
# NOTE(review): `linear_model.Lasso` is a linear regression with an L1 penalty,
# not a logistic regression; it is fitted here directly on the 0/1 target.
from sklearn import linear_model

clf = linear_model.Lasso(alpha=0.1).fit(X_train, y_train)
print(clf.coef_)

In [51]:
# Pair every feature name with its fitted coefficient: the frame is built with
# names and coefficients as two rows, then transposed to one row per feature.
new = pd.DataFrame([X_train.columns, clf.coef_])
new.transpose()

In [54]:
# Predict on both the training set and the testing set. The model's continuous
# output is turned into a boolean class label by thresholding it at 0.5.

y_train_predict, y_test_predict = (
    clf.predict(features) > 0.5 for features in (X_train, X_test)
)

In [56]:
# Macro-averaged precision, first for the training set and then for the
# testing set.
from sklearn.metrics import precision_score

for y_true, y_pred in ((y_train, y_train_predict), (y_test, y_test_predict)):
    print(precision_score(y_true, y_pred, average='macro'))

### Decision tree model

In [96]:
# Build the decision tree model: a tree limited to depth 3, with a fixed random
# state, fitted on the training data. Then print the importance of each feature.
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(max_depth=3, random_state=0).fit(X_train, y_train)
print(clf.feature_importances_)

In [97]:
# Pair every feature name with its importance in the fitted tree: the frame is
# built with names and importances as two rows, then transposed to one row per
# feature.
new = pd.DataFrame([X_train.columns, clf.feature_importances_])
new.transpose()

In [98]:
# Predict class labels for the training and the testing set with the fitted
# tree, then print the training-set predictions.
y_train_predict, y_test_predict = (
    clf.predict(features) for features in (X_train, X_test)
)
print(y_train_predict)

In [99]:
# Report precision, recall and f1 (all macro-averaged) and accuracy, each for
# the training set and then for the testing set.
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

macro = {'average': 'macro'}
# One row per printed line: label, metric, true labels, predictions, options.
report_rows = [
    ('This is the precision of the training dataset:', precision_score, y_train, y_train_predict, macro),
    ('This is the precision of the testing dataset:', precision_score, y_test, y_test_predict, macro),
    ('This is the recall of the training dataset:', recall_score, y_train, y_train_predict, macro),
    ('This is the recall of the testing dataset:', recall_score, y_test, y_test_predict, macro),
    ('This is the f1 of the training dataset:', f1_score, y_train, y_train_predict, macro),
    ('This is the f1 of the testing dataset:', f1_score, y_test, y_test_predict, macro),
    ('This is the accuracy of the training dataset:', accuracy_score, y_train, y_train_predict, {}),
    ('This is the accuracy of the testing dataset:', accuracy_score, y_test, y_test_predict, {}),
]
for label, metric, y_true, y_pred, options in report_rows:
    print(label, metric(y_true, y_pred, **options))

In [100]:
# Print the confusion matrix for the training set and for the testing set.
from sklearn.metrics import confusion_matrix

for label, y_true, y_pred in (
    ('This is the confusion matrix of the training dataset:', y_train, y_train_predict),
    ('This is the confusion matrix of the testing dataset:', y_test, y_test_predict),
):
    print(label, confusion_matrix(y_true, y_pred))

In [101]:
import matplotlib.pyplot as plt

# `plot_confusion_matrix` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `ConfusionMatrixDisplay.from_estimator` is its replacement and takes the same
# leading (estimator, X, y) arguments. The old name is kept as an alias so that
# any later cell calling `plot_confusion_matrix` keeps working, and the original
# function is still imported on releases that predate `from_estimator`.
try:
    from sklearn.metrics import ConfusionMatrixDisplay
    plot_confusion_matrix = ConfusionMatrixDisplay.from_estimator
except (ImportError, AttributeError):
    from sklearn.metrics import plot_confusion_matrix

# Confusion matrix of the fitted classifier on the training data.
plot_confusion_matrix(clf, X_train, y_train)
plt.show()

In [102]:
# Confusion matrix of the fitted classifier on the testing data.
# `plot_confusion_matrix` was removed from scikit-learn in 1.2, so the plotting
# function is resolved here: `ConfusionMatrixDisplay.from_estimator` when it is
# available, otherwise the original `plot_confusion_matrix`. Both take the same
# leading (estimator, X, y) arguments.
try:
    from sklearn.metrics import ConfusionMatrixDisplay
    plot_confusion_matrix = ConfusionMatrixDisplay.from_estimator
except (ImportError, AttributeError):
    from sklearn.metrics import plot_confusion_matrix

plot_confusion_matrix(clf, X_test, y_test)
plt.show()

## Feature analysis

In [103]:
# Plot some variables with large variable importance.
# Mean `duration` for each target class. The column is selected before
# averaging so that only `duration` is aggregated; `df.groupby('y').mean()`
# also tries to average the text columns, which raises a TypeError on
# pandas >= 2.0.
df.groupby('y')['duration'].mean().plot.bar()

In [113]:
# Rows whose normalized `nr.employed` is at most -1.099, followed by the mean
# raw `nr.employed` of those rows for each target class.
# NOTE(review): -1.099 is presumably the decision tree's split threshold on the
# normalized scale - confirm.
# The column is selected before averaging because `groupby('y').mean()` on the
# whole frame also tries to average the text columns, which raises a TypeError
# on pandas >= 2.0.
df1 = df[df_model['nr.employed'] <= -1.099]
df1.groupby('y')['nr.employed'].mean().plot.bar()

In [114]:
# Rows whose normalized `nr.employed` is above -1.099, followed by the mean raw
# `nr.employed` of those rows for each target class.
# NOTE(review): -1.099 is presumably the decision tree's split threshold on the
# normalized scale - confirm.
# The column is selected before averaging because `groupby('y').mean()` on the
# whole frame also tries to average the text columns, which raises a TypeError
# on pandas >= 2.0.
df2 = df[df_model['nr.employed'] > -1.099]
df2.groupby('y')['nr.employed'].mean().plot.bar()

In [109]:
# Draw the fitted decision tree.
# Feature and class names are passed so the nodes show the column names and the
# predicted class instead of positional labels such as X[3]. The figure size is
# reduced from 100x100 inches, which produced an extremely large image for a
# tree that is only three levels deep.
from sklearn import tree

plt.figure(figsize=(20, 10))
tree.plot_tree(
    clf,
    feature_names=list(X_train.columns),
    class_names=['no', 'yes'],  # ascending class order: 0 (not 'yes'), 1 ('yes')
    filled=True,
)
plt.show()

## Conclusion

In [None]:
# Draw conclusion based on identified important variables

Based on our model output, the duration of the last contact is the most important variable affecting whether clients will subscribe to the bank term deposit product. Our plot shows a positive relationship between duration and the subscription rate: the longer the last contact lasted, the more likely the client was to subscribe. Even though the decision tree model considers the number of employees an important variable, we do not see any correlation between it and the subscription rate. We also considered the logistic regression model; however, its poor performance indicates that logistic regression may not be a good fit for the data set, and the conclusions drawn from it may not be reliable. Overall, our recommendation is to keep calls longer when selling the product to clients.