# Predicting customer subscription using Bank data

This model is being built to predict if a customer will subscribe to a bank account using bank data

In [1]:
# Import libraries
import pandas as pd

In [2]:
# Load the data
banking_df = pd.read_csv('bank-additional-full.csv', sep = ';')

In [3]:
banking_df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [4]:
banking_df.dtypes.value_counts()

object     11
int64       5
float64     5
Name: count, dtype: int64

In [5]:
banking_df.shape

(41188, 21)

In [6]:
banking_df.isnull().sum()

age               0
job               0
marital           0
education         0
default           0
housing           0
loan              0
contact           0
month             0
day_of_week       0
duration          0
campaign          0
pdays             0
previous          0
poutcome          0
emp.var.rate      0
cons.price.idx    0
cons.conf.idx     0
euribor3m         0
nr.employed       0
y                 0
dtype: int64

In [7]:
banking_df['y'].value_counts()

y
no     36548
yes     4640
Name: count, dtype: int64

In [8]:
banking_df.describe()

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
count,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0
mean,40.02406,258.28501,2.567593,962.475454,0.172963,0.081886,93.575664,-40.5026,3.621291,5167.035911
std,10.42125,259.279249,2.770014,186.910907,0.494901,1.57096,0.57884,4.628198,1.734447,72.251528
min,17.0,0.0,1.0,0.0,0.0,-3.4,92.201,-50.8,0.634,4963.6
25%,32.0,102.0,1.0,999.0,0.0,-1.8,93.075,-42.7,1.344,5099.1
50%,38.0,180.0,2.0,999.0,0.0,1.1,93.749,-41.8,4.857,5191.0
75%,47.0,319.0,3.0,999.0,0.0,1.4,93.994,-36.4,4.961,5228.1
max,98.0,4918.0,56.0,999.0,7.0,1.4,94.767,-26.9,5.045,5228.1


In [9]:
banking_df["y"] = banking_df["y"].apply(lambda x: 1 if x=="yes" else 0)

In [10]:
train_df = banking_df.sample(frac=0.85, random_state=417)
test_df = banking_df.drop(train_df.index)


In [11]:
print(train_df["y"].value_counts(normalize=True))
print(test_df["y"].value_counts(normalize=True))

y
0    0.887889
1    0.112111
Name: proportion, dtype: float64
y
0    0.884267
1    0.115733
Name: proportion, dtype: float64


In [12]:
X_train = train_df.drop("y", axis=1)
y_train = train_df["y"]

In [13]:
X_test = test_df.drop("y", axis=1)
y_test = test_df["y"]

## k-NN for one feature

Now that we have our training and test sets, we can implement our algorithm!

Before we begin, we need to select a distance metric to calculate the distance between observations.

In [14]:
# Calculate the Euclidean distance between the single_test_input and every observation in X_train for the given feature. Save the distances in a new column, distance, in X_train.
def knn(feature, single_test_input, k):
    X_train['distance'] = abs(X_train[feature] - single_test_input[feature])
# For the k rows in distance with the smallest distance values, identify the most common label for the same rows in y_train. Save the label to the variable prediction
    prediction = y_train[X_train["distance"].nsmallest(n=k).index].mode()[0]
    return prediction

In [15]:
# Call the function, knn() with the following arguments: feature = "age". For single_test_input, select a random observation from X_test.k = 3.
model_prediction = knn("age", X_test.iloc[417], 3)
print(f"Predicted label: {model_prediction}")

Predicted label: 0


In [16]:
# Print the true label (y_test) corresponding to the single_test_input used above.
print(f"Actual label: {y_test.iloc[417]}")

Actual label: 0


Yes, the prediction and actual label are the same, 0

### Using knn on the test set

#### Using Age

Use knn() for every row in X_test. Store the predictions in a new column, age_predicted_y, in X_test. Use the same arguments for knn() as before:
feature = "age", 
 k = 3.

In [17]:
X_test["age_predicted_y"] = X_test.apply(lambda x: knn("age", x, 3), axis=1)

In [18]:
# Testing the models accuracy
model_accuracy = (X_test["age_predicted_y"] == y_test).value_counts(normalize=True)[True]*100

In [19]:
# Print out the models accuracy
print(f"Accuracy of model trained on the column 'age': {model_accuracy:.2f}%")

Accuracy of model trained on the column 'age': 83.09%


We can see the model has a fairly good accuracy when using age to predict

#### Using Campaign

Use knn() for every row in X_test. Store the predictions in a new column, campaign_predicted_y, in X_test. Use the same arguments for knn() as before: feature = "campaign", k = 3.

In [20]:
X_test["campaign_predicted_y"] = X_test.apply(lambda x: knn("campaign", x, 3), axis=1)

In [21]:
# Testing the models accuracy
model_accuracy1 = (X_test["campaign_predicted_y"] == y_test).value_counts(normalize=True)[True]*100

In [22]:
# Print out the models accuracy
print(f"Accuracy of model trained on the column 'campaign': {model_accuracy:.2f}%")

Accuracy of model trained on the column 'campaign': 83.09%


We can see that Campaign is slightly better than age and also a good predictor

### Using categorical values

First of all we have to use one-hot encoding on our categorical variables before we can use them for our model

#### Marital

In [23]:
# Create a copy of our data
banking_df_copy = banking_df.copy()

In [24]:
# Using pandas get dummies for our encoding
banking_df_copy = pd.get_dummies(data = banking_df_copy, columns = ['marital'], drop_first = True)

In [25]:
banking_df_copy

Unnamed: 0,age,job,education,default,housing,loan,contact,month,day_of_week,duration,...,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y,marital_married,marital_single,marital_unknown
0,56,housemaid,basic.4y,no,no,no,telephone,may,mon,261,...,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0,True,False,False
1,57,services,high.school,unknown,no,no,telephone,may,mon,149,...,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0,True,False,False
2,37,services,high.school,no,yes,no,telephone,may,mon,226,...,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0,True,False,False
3,40,admin.,basic.6y,no,no,no,telephone,may,mon,151,...,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0,True,False,False
4,56,services,high.school,no,no,yes,telephone,may,mon,307,...,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,73,retired,professional.course,no,yes,no,cellular,nov,fri,334,...,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,1,True,False,False
41184,46,blue-collar,professional.course,no,no,no,cellular,nov,fri,383,...,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,0,True,False,False
41185,56,retired,university.degree,no,yes,no,cellular,nov,fri,189,...,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,0,True,False,False
41186,44,technician,professional.course,no,no,no,cellular,nov,fri,442,...,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,1,True,False,False


We are going to use age, campaign, marital_married and marital_single for our model

In [26]:
train_df = banking_df_copy.sample(frac=0.85, random_state=417)
test_df = banking_df_copy.drop(train_df.index)

In [27]:
X_train = train_df.drop("y", axis=1)
y_train = train_df["y"]

In [28]:
X_test = test_df.drop("y", axis=1)
y_test = test_df["y"]

Create a function knn() that has the following parameters: features, single_test_input, k


Inside the function: Calculate the Euclidean distance between the single_test_input and every observation in X_train for the given features. Save the distances in a new column, distance, in X_train.

In [29]:
def knn(features, single_test_input, k):
    squared_distance = 0
    for feature in features:
        if isinstance(single_test_input[feature], pd.Series):
            squared_distance += (X_train[feature].astype(int) - single_test_input[feature].astype(int))**2
        else:
            squared_distance += (X_train[feature].astype(int) - single_test_input[feature])**2
    X_train["distance"] = squared_distance**0.5
#For the k rows in distance with the smallest distance values, identify the most common label for the same rows in y_train. Save the label to the variable prediction. Return prediction.
    prediction = y_train[X_train["distance"].nsmallest(n=k).index].mode()[0]
    return prediction


In [30]:
model_prediction2 = knn(["age", "campaign", "marital_married", "marital_single"], X_test.iloc[417], 3)
print(f"Predicted label: {model_prediction2}")
print(f"Actual label: {y_test.iloc[417]}")


Predicted label: 0
Actual label: 0


In [31]:
X_test["predicted_y"] = X_test.apply(lambda x: knn(["age", "campaign", "marital_married", "marital_single"], x, 3), axis=1)

In [32]:
model_accuracy = (X_test["predicted_y"] == y_test).value_counts(normalize=True)[True]*100
print(f"Accuracy of the model: {model_accuracy:.2f}%")

Accuracy of the model: 85.34%


#### Scaling our Age and Campaign columns

In [33]:
features = ["age", "campaign"]

In [34]:
for feature in features:
    X_train[feature] = (X_train[feature] - X_train[feature].min())/(X_train[feature].max() - X_train[feature].min())
    X_test[feature] = (X_test[feature] - X_train[feature].min())/(X_train[feature].max() - X_train[feature].min())
    

In [35]:
X_test["predicted_y"] = X_test.apply(lambda x: knn(["age", "campaign", "marital_married", "marital_single"], x, 3), axis=1)


In [36]:
model_accuracy = (X_test["predicted_y"] == y_test).value_counts(normalize=True)[True]*100

In [37]:
print(f"Accuracy of the model: {model_accuracy:.2f}%")

Accuracy of the model: 88.43%


Their was a slight improvement in accuracy after Scaling our data