Bank Marketing  
  The data is related with direct marketing campaigns of a Portuguese banking institution. 
   The marketing campaigns were based on phone calls. Often, more than one contact to the same client was required, 
   in order to access if the product (bank term deposit) would be (or not) subscribed.  
   
   This dataset is public available for research. The details are described in [Moro et al., 2011]. 
  Please include this citation if you plan to use this database:

  [Moro et al., 2011] S. Moro, R. Laureano and P. Cortez. Using Data Mining for Bank Direct Marketing: An Application of the CRISP-DM Methodology. 
  In P. Novais et al. (Eds.), Proceedings of the European Simulation and Modelling Conference - ESM'2011, pp. 117-121, Guimarães, Portugal, October, 2011. EUROSIS.

  Available at: [pdf] http://hdl.handle.net/1822/14838
                [bib] http://www3.dsi.uminho.pt/pcortez/bib/2011-esm-1.txt

# Import the libraries

In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt


# Read the bank data

In [2]:
bank_data = pd.read_csv("bank-full.csv", sep=";")

#### Check the data

In [3]:
bank_data.shape

(45211, 17)

In [4]:
bank_data.columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'y'],
      dtype='object')

In [5]:
bank_data.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [6]:
bank_data.tail()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
45206,51,technician,married,tertiary,no,825,no,no,cellular,17,nov,977,3,-1,0,unknown,yes
45207,71,retired,divorced,primary,no,1729,no,no,cellular,17,nov,456,2,-1,0,unknown,yes
45208,72,retired,married,secondary,no,5715,no,no,cellular,17,nov,1127,5,184,3,success,yes
45209,57,blue-collar,married,secondary,no,668,no,no,telephone,17,nov,508,4,-1,0,unknown,no
45210,37,entrepreneur,married,secondary,no,2971,no,no,cellular,17,nov,361,2,188,11,other,no


In [7]:
bank_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  y          45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB


In [8]:
print(f'Any NaN values? {bank_data.isna().values.any()}')

Any NaN values? False


In [9]:
print(f'Any duplicates? {bank_data.duplicated().values.any()}')

Any duplicates? False


In [10]:
bank_data.isna().sum()

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [11]:
bank_data.isnull().sum()

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [12]:
bank_data['marital'].value_counts()

married     27214
single      12790
divorced     5207
Name: marital, dtype: int64

In [13]:
bank_data['job'].value_counts()

blue-collar      9732
management       9458
technician       7597
admin.           5171
services         4154
retired          2264
self-employed    1579
entrepreneur     1487
unemployed       1303
housemaid        1240
student           938
unknown           288
Name: job, dtype: int64

# Preprocess the data

In [14]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
bank_data['job'] = le.fit_transform(bank_data['job'])
bank_data.value_counts()

age  job  marital   education  default  balance  housing  loan  contact    day  month  duration  campaign  pdays  previous  poutcome  y  
18   8    single    primary    no       608      no       no    cellular   12   aug    267       1         -1     0         unknown   yes    1
45   4    married   tertiary   no       0        no       no    cellular   20   aug    71        2         -1     0         unknown   no     1
          divorced  tertiary   no       54       no       yes   cellular   21   jul    34        3         -1     0         unknown   no     1
                                        220      yes      no    unknown    18   jun    89        4         -1     0         unknown   no     1
                                        335      no       no    cellular   5    aug    297       1         -1     0         unknown   no     1
                                                                                                                                            ..
35  

In [15]:
bank_data['marital'] = bank_data.marital.map({'single': 0, 'divorced': 1, 'married' : 2}) 
bank_data['education'] = bank_data.education.map({'unknown': 0, 'primary' : 1, 'secondary': 2, 'tertiary': 3}) 
bank_data['default'] = bank_data.default.map({'no': 0, 'yes' : 1}) 
bank_data['housing'] = bank_data.housing.map({'no': 0, 'yes' : 1}) 
bank_data['loan'] = bank_data.loan.map({'no': 0, 'yes' : 1})
bank_data['contact'] = bank_data.contact.map({'unknown': 0, 'telephone' : 1, 'cellular': 2}) 
bank_data['y'] = bank_data.y.map({'no': 0, 'yes' : 1}) 

## Drop features from other attributes

In [16]:
bank_data = bank_data.drop(['day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome'], axis=1)

In [17]:
bank_data.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,y
0,58,4,2,3,0,2143,1,0,0,0
1,44,9,0,2,0,29,1,0,0,0
2,33,2,2,2,0,2,1,1,0,0
3,47,1,2,0,0,1506,1,0,0,0
4,33,11,0,0,0,1,0,0,0,0


In [18]:
bank_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   age        45211 non-null  int64
 1   job        45211 non-null  int32
 2   marital    45211 non-null  int64
 3   education  45211 non-null  int64
 4   default    45211 non-null  int64
 5   balance    45211 non-null  int64
 6   housing    45211 non-null  int64
 7   loan       45211 non-null  int64
 8   contact    45211 non-null  int64
 9   y          45211 non-null  int64
dtypes: int32(1), int64(9)
memory usage: 3.3 MB


### Save the processed data

In [20]:
bank_data.to_csv("bank_data_processed.csv", index=False)