In [252]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from mlxtend.frequent_patterns import apriori,association_rules

In [253]:
bank_data = pd.read_csv('bankdata_csv_all.csv')
bank_data.head()

Unnamed: 0,id,age,sex,region,income,married,children,car,save_act,current_act,mortgage,pep
0,ID12101,48,FEMALE,INNER_CITY,17546.0,NO,1,NO,NO,NO,NO,YES
1,ID12102,40,MALE,TOWN,30085.1,YES,3,YES,NO,YES,YES,NO
2,ID12103,51,FEMALE,INNER_CITY,16575.4,YES,0,YES,YES,YES,NO,NO
3,ID12104,23,FEMALE,TOWN,20375.4,YES,3,NO,NO,YES,NO,NO
4,ID12105,57,FEMALE,RURAL,50576.3,YES,0,NO,YES,NO,NO,NO


In [254]:
len(bank_data)
#600 records

600

In [255]:
#Check for null
bank_data.isnull().sum()

id             0
age            0
sex            0
region         0
income         0
married        0
children       0
car            0
save_act       0
current_act    0
mortgage       0
pep            0
dtype: int64

In [256]:
print(bank_data['age'].min())
print(bank_data['age'].max())
#Age varies from 18-67

print(bank_data['income'].min())
print(bank_data['income'].max())
#Incomes varies from 5000$ to 63000$

print(bank_data['children'].min())
print(bank_data['children'].max())
#Children varies from 0 to 3 children

18
67
5014.21
63130.1
0
3


In [257]:
#Discretization of numeric columns - age, income
bank_data['age_cat']=pd.cut(bank_data['age'],
       bins=[0,10,20,30,40,50,60,70,80],
       labels = ["child","teens","twenties","thirties","fourties","fifties","sixties","seventies"])

In [258]:
bank_data['income_cat']=pd.cut(bank_data['income'],
       bins=[0,10000,20000,30000,40000,50000,60000,70000],
       labels = ["<10k","<20k","<30k","<40k","<50k","<60k","<70k"])


In [259]:
data = bank_data.drop(['id','age','income'],axis=1)
data.head()

Unnamed: 0,sex,region,married,children,car,save_act,current_act,mortgage,pep,age_cat,income_cat
0,FEMALE,INNER_CITY,NO,1,NO,NO,NO,NO,YES,fourties,<20k
1,MALE,TOWN,YES,3,YES,NO,YES,YES,NO,thirties,<40k
2,FEMALE,INNER_CITY,YES,0,YES,YES,YES,NO,NO,fifties,<20k
3,FEMALE,TOWN,YES,3,NO,NO,YES,NO,NO,twenties,<30k
4,FEMALE,RURAL,YES,0,NO,YES,NO,NO,NO,fifties,<60k


In [260]:
#Factorization of all columns
data['sex']=pd.Categorical(data['sex'])
data['region']=pd.Categorical(data['region'])
data['married']=pd.Categorical(data['married'])
data['children']=pd.Categorical(data['children'])
data['car']=pd.Categorical(data['car'])
data['save_act']=pd.Categorical(data['save_act'])
data['current_act']=pd.Categorical(data['current_act'])
data['mortgage']=pd.Categorical(data['mortgage'])
data['pep']=pd.Categorical(data['pep'])

#All dtypes are categories

In [261]:
#One hot encoding to create sparse matrix
data_dummies = pd.get_dummies(data)
list(data_dummies.columns)

['sex_FEMALE',
 'sex_MALE',
 'region_INNER_CITY',
 'region_RURAL',
 'region_SUBURBAN',
 'region_TOWN',
 'married_NO',
 'married_YES',
 'children_0',
 'children_1',
 'children_2',
 'children_3',
 'car_NO',
 'car_YES',
 'save_act_NO',
 'save_act_YES',
 'current_act_NO',
 'current_act_YES',
 'mortgage_NO',
 'mortgage_YES',
 'pep_NO',
 'pep_YES',
 'age_cat_child',
 'age_cat_teens',
 'age_cat_twenties',
 'age_cat_thirties',
 'age_cat_fourties',
 'age_cat_fifties',
 'age_cat_sixties',
 'age_cat_seventies',
 'income_cat_<10k',
 'income_cat_<20k',
 'income_cat_<30k',
 'income_cat_<40k',
 'income_cat_<50k',
 'income_cat_<60k',
 'income_cat_<70k']

In [262]:
data_dummies.head()

Unnamed: 0,sex_FEMALE,sex_MALE,region_INNER_CITY,region_RURAL,region_SUBURBAN,region_TOWN,married_NO,married_YES,children_0,children_1,...,age_cat_fifties,age_cat_sixties,age_cat_seventies,income_cat_<10k,income_cat_<20k,income_cat_<30k,income_cat_<40k,income_cat_<50k,income_cat_<60k,income_cat_<70k
0,1,0,1,0,0,0,1,0,0,1,...,0,0,0,0,1,0,0,0,0,0
1,0,1,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
2,1,0,1,0,0,0,0,1,1,0,...,1,0,0,0,1,0,0,0,0,0
3,1,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
4,1,0,0,1,0,0,0,1,1,0,...,1,0,0,0,0,0,0,0,1,0


In [268]:
# Building the model
frq_items = apriori(data_dummies, min_support = 0.3, use_colnames = True,verbose=1)
frq_items['length'] = frq_items['itemsets'].apply(lambda x: len(x))
print(frq_items[(frq_items['length'] ==2) &(frq_items['support'] >= 0.1)])


Processing 240 combinations | Sampling itemset size 2Processing 288 combinations | Sampling itemset size 3Processing 4 combinations | Sampling itemset size 4
     support                              itemsets  length
16  0.325000             (sex_FEMALE, married_YES)       2
17  0.343333            (sex_FEMALE, save_act_YES)       2
18  0.383333         (current_act_YES, sex_FEMALE)       2
19  0.341667             (mortgage_NO, sex_FEMALE)       2
20  0.335000               (married_YES, sex_MALE)       2
21  0.346667              (save_act_YES, sex_MALE)       2
22  0.375000           (current_act_YES, sex_MALE)       2
23  0.310000               (mortgage_NO, sex_MALE)       2
24  0.341667  (region_INNER_CITY, current_act_YES)       2
25  0.300000             (married_YES, children_0)       2
26  0.336667                 (car_NO, married_YES)       2
27  0.323333                (married_YES, car_YES)       2
28  0.461667           (married_YES, save_act_YES)       2
29  0.488333 



In [264]:
# Collecting the inferred rules in a dataframe
rules = association_rules(frq_items, metric ="lift",min_threshold = 0.5)
rules = rules.sort_values(['support', 'lift'], ascending =[True, True])
print(rules.head())

                    antecedents    consequents  antecedent support  \
18                (married_YES)   (children_0)            0.660000   
19                 (children_0)  (married_YES)            0.438333   
60                (mortgage_NO)      (pep_YES)            0.651667   
61                    (pep_YES)  (mortgage_NO)            0.456667   
70  (married_YES, save_act_YES)  (mortgage_NO)            0.461667   

    consequent support   support  confidence      lift  leverage  conviction  
18            0.438333  0.300000    0.454545  1.036986  0.010700    1.029722  
19            0.660000  0.300000    0.684411  1.036986  0.010700    1.077349  
60            0.456667  0.303333    0.465473  1.019284  0.005739    1.016475  
61            0.651667  0.303333    0.664234  1.019284  0.005739    1.037428  
70            0.651667  0.306667    0.664260  1.019325  0.005814    1.037509  
