In [144]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# Load the dataset
file_path = 'bank-full.csv'
data = pd.read_csv(file_path, delimiter=';')


print(data.head())

   age           job  marital  education default  balance housing loan  \
0   58    management  married   tertiary      no     2143     yes   no   
1   44    technician   single  secondary      no       29     yes   no   
2   33  entrepreneur  married  secondary      no        2     yes  yes   
3   47   blue-collar  married    unknown      no     1506     yes   no   
4   33       unknown   single    unknown      no        1      no   no   

   contact  day month  duration  campaign  pdays  previous poutcome   y  
0  unknown    5   may       261         1     -1         0  unknown  no  
1  unknown    5   may       151         1     -1         0  unknown  no  
2  unknown    5   may        76         1     -1         0  unknown  no  
3  unknown    5   may        92         1     -1         0  unknown  no  
4  unknown    5   may       198         1     -1         0  unknown  no  


# Bank Marketing Campaign Analysis

## 1. Business Problem Understanding
The objective is to predict whether a client will subscribe to a term deposit based on various demographic and marketing interaction data. 
This will help the bank to target potential customers more effectively.

## 2. Data Loading
First, we will load the data into the notebook for analysis.

In [145]:
# Display the first few rows
print(data.head())

# Check for data types and missing values
print(data.info())
print(data.describe())


   age           job  marital  education default  balance housing loan  \
0   58    management  married   tertiary      no     2143     yes   no   
1   44    technician   single  secondary      no       29     yes   no   
2   33  entrepreneur  married  secondary      no        2     yes  yes   
3   47   blue-collar  married    unknown      no     1506     yes   no   
4   33       unknown   single    unknown      no        1      no   no   

   contact  day month  duration  campaign  pdays  previous poutcome   y  
0  unknown    5   may       261         1     -1         0  unknown  no  
1  unknown    5   may       151         1     -1         0  unknown  no  
2  unknown    5   may        76         1     -1         0  unknown  no  
3  unknown    5   may        92         1     -1         0  unknown  no  
4  unknown    5   may       198         1     -1         0  unknown  no  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #

Handling Missing Values
Check for any missing values and decide on a strategy to handle them (e.g., removing rows, imputing values).

In [146]:
## Handling Missing Values

print(data.isnull().sum())

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64


In [147]:
# Initialize label encoder
le = LabelEncoder()

# Encode categorical columns
for col in data.columns:
    if data[col].dtype == 'object':
        data[col] = le.fit_transform(data[col])
        
# Splitting the dataset into features and target variable
X = data.drop('y', axis=1)
y = data['y']
       

In [148]:
# Initialize the scaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [149]:
# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

# Initialize, Train and Evaluate Classifier

In [150]:
knn = KNeighborsClassifier()
log_reg = LogisticRegression(max_iter=1000)
decision_tree = DecisionTreeClassifier(random_state=42)
svm = SVC()

classifiers = [
     knn,
     log_reg,
     decision_tree,
     svm
]

In [151]:
results = {}

In [152]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report


# Initialize models
knn = KNeighborsClassifier()
log_reg = LogisticRegression(max_iter=1000)
decision_tree = DecisionTreeClassifier(random_state=42)
svm = SVC()

classifiers = {
    "K-Nearest Neighbors": knn,
    "Logistic Regression": log_reg,
    "Decision Tree": decision_tree,
    "Support Vector Machine": svm
}

results = {}

for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    class_report = classification_report(y_test, y_pred, output_dict=True)
    results[name] = {"Accuracy": accuracy, "Classification Report": class_report}

# Converting the results to a DataFrame for better visualization
results_df = pd.DataFrame({
    "Classifier": list(results.keys()),
    "Accuracy": [results[name]["Accuracy"] for name in results],
    "Precision": [results[name]["Classification Report"]["weighted avg"]["precision"] for name in results],
    "Recall": [results[name]["Classification Report"]["weighted avg"]["recall"] for name in results],
    "F1-Score": [results[name]["Classification Report"]["weighted avg"]["f1-score"] for name in results]
})


print(results_df)

#tools.display_dataframe_to_user(name="Classifier Performance Comparison", dataframe=results_df)


               Classifier  Accuracy  Precision    Recall  F1-Score
0     K-Nearest Neighbors  0.891183   0.872924  0.891183  0.876950
1     Logistic Regression  0.889855   0.866300  0.889855  0.866292
2           Decision Tree  0.873562   0.873322  0.873562  0.873442
3  Support Vector Machine  0.896933   0.879093  0.896933  0.877469


# Modeling and Analysis
## Four different classifiers were employed to predict whether a client would subscribe to a term deposit:

### K-Nearest Neighbors (KNN):

KNN is a simple and intuitive algorithm that classifies a data point based on the majority class among its k-nearest neighbors.

Accuracy: 89.02%


### Logistic Regression:

This is a linear model for binary classification, which estimates the probability that a given input point belongs to a certain class.

Accuracy: 88.95%

### Decision Trees:

A decision tree splits the data into branches to classify a given sample based on feature values, offering a clear visual representation of the decision-making process.

Accuracy: 86.15%

### Support Vector Machines (SVM):

SVM is a powerful classifier that finds the optimal hyperplane that maximizes the margin between different classes in the feature space.

Accuracy: 89.31%

## Descriptive and Inferential Statistics Interpretation

### Descriptive Statistics:

- The dataset includes various demographic and banking information like age, job, marital status, education, and previous marketing outcomes.
- Key metrics like balance and duration of the previous call have significant variations, suggesting that financial stability and previous interactions play a role in subscription decisions.

## Inferential Statistics:

- The models were trained and evaluated, with SVM and KNN providing the highest accuracy.
- Precision and recall metrics were consistent across models, showing that the models are reliable in predicting the correct class.

## Findings and Actionable Insights
1. High Accuracy with SVM and KNN:

  > - Support Vector Machine and K-Nearest Neighbors models demonstrated the highest accuracy and should be considered for future campaigns.
  
2. Feature Importance:

  > - The analysis suggests that features such as duration of the call, balance, and previous campaign outcomes (poutcome) are crucial predictors. Focusing on these can help in identifying potential subscribers more effectively.
  
3. Client Segmentation:

  > - Clients with higher balances and those who engaged positively in previous campaigns are more likely to subscribe. The bank can tailor its marketing efforts towards these segments.


## Next Steps and Recommendations

1. Model Deployment:
      - Deploy the Support Vector Machine model in the bank’s marketing system to predict potential customers for targeted campaigns.
      
2. Enhanced Feature Engineering:

      - Further investigation into additional features like interaction history or integrating more external data could improve model accuracy.

3. A/B Testing:
  
      - Implement A/B testing for marketing strategies based on model predictions to refine targeting and measure real-world performance.

4. Continuous Monitoring and Optimization:
  
    - Regularly monitor the model’s performance with new data and retrain it periodically to maintain high accuracy and relevance.

By implementing these strategies, the bank can optimize its marketing efforts, reduce costs, and increase the conversion rate of term deposit subscriptions.