In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import pgmpy

### Data importing

In [2]:
# Read the insurance claim CSV (it carries both raw and grouped columns,
# as the preview below shows) from the notebook-relative datasets folder.
insurance_claim_data = pd.read_csv(
    filepath_or_buffer='./datasets/insurance_claim_cat.csv'
)

# Preview the first five rows to check that the file parsed as expected.
insurance_claim_data.head(n=5)

Unnamed: 0,age,age_group,sex,bmi_group,bmi,children,smoker,charges_group,charges,insuranceclaim,northeast,northwest,southeast,southwest
0,19,Young_Adult,1,Overweight,27.9,0,1,Very_High,16884.924,1,0,0,0,1
1,18,Young_Adult,0,Obese,33.77,1,0,Low,1725.5523,1,0,0,1,0
2,28,Young_Adult,0,Obese,33.0,3,0,Average,4449.462,0,0,0,1,0
3,33,Middle_Aged,0,Normal,22.705,0,0,Very_High,21984.47061,0,0,1,0,0
4,32,Middle_Aged,0,Overweight,28.88,0,0,Below_Average,3866.8552,1,0,1,0,0


### Data wrangling/cleaning

In [3]:
# Remove the raw numeric columns; the grouped versions (age_group, bmi_group,
# charges_group) stay in the frame and are what the model uses later.
# `columns=` already selects the column axis, so no separate `axis` is needed.
# NOTE: this rebinds the same name, so re-running the cell on the trimmed frame
# raises KeyError because the columns no longer exist.
insurance_claim_data = insurance_claim_data.drop(columns=['age', 'charges', 'bmi'])

# Preview the trimmed frame.
insurance_claim_data.head(n=5)

Unnamed: 0,age_group,sex,bmi_group,children,smoker,charges_group,insuranceclaim,northeast,northwest,southeast,southwest
0,Young_Adult,1,Overweight,0,1,Very_High,1,0,0,0,1
1,Young_Adult,0,Obese,1,0,Low,1,0,0,1,0
2,Young_Adult,0,Obese,3,0,Average,0,0,0,1,0
3,Middle_Aged,0,Normal,0,0,Very_High,0,0,1,0,0
4,Middle_Aged,0,Overweight,0,0,Below_Average,1,0,1,0,0


In [4]:
# Display the column labels currently held by the frame (cell output is the Index repr).
insurance_claim_data.columns

Index(['age_group', 'sex', 'bmi_group', 'children', 'smoker', 'charges_group',
       'insuranceclaim', 'northeast', 'northwest', 'southeast', 'southwest'],
      dtype='object')

In [5]:
# Integer code for each age band; codes follow the order the labels are listed in.
mapping_age_group = {
    label: code
    for code, label in enumerate(['Young_Adult', 'Middle_Aged', 'Elderly'])
}

In [6]:
# Integer code for each BMI band; codes follow the order the labels are listed in.
mapping_bmi_group = {
    label: code
    for code, label in enumerate(['Underweight', 'Normal', 'Overweight', 'Obese'])
}

In [7]:
# Integer code for each charges band; codes follow the order the labels are listed in.
mapping_charges_group = {
    label: code
    for code, label in enumerate(['Low', 'Below_Average', 'Average', 'Very_High'])
}

In [8]:
# Swap the age band labels for their integer codes from mapping_age_group.
# Values that are not keys of the mapping are left as they are by `replace`.
insurance_claim_data = insurance_claim_data.assign(
    age_group=insurance_claim_data['age_group'].replace(to_replace=mapping_age_group)
)

In [9]:
# Swap the BMI band labels for their integer codes from mapping_bmi_group.
# Values that are not keys of the mapping are left as they are by `replace`.
insurance_claim_data = insurance_claim_data.assign(
    bmi_group=insurance_claim_data['bmi_group'].replace(to_replace=mapping_bmi_group)
)

In [10]:
# Swap the charges band labels for their integer codes from mapping_charges_group.
# Values that are not keys of the mapping are left as they are by `replace`.
insurance_claim_data = insurance_claim_data.assign(
    charges_group=insurance_claim_data['charges_group'].replace(
        to_replace=mapping_charges_group
    )
)

In [11]:
# Preview the first rows to confirm the three *_group columns now hold integer codes.
insurance_claim_data.head()

Unnamed: 0,age_group,sex,bmi_group,children,smoker,charges_group,insuranceclaim,northeast,northwest,southeast,southwest
0,0,1,2,0,1,3,1,0,0,0,1
1,0,0,3,1,0,0,1,0,0,1,0
2,0,0,3,3,0,2,0,0,0,1,0
3,1,0,1,0,0,3,0,0,1,0,0
4,1,0,2,0,0,1,1,0,1,0,0


### Data mining & analysis with pgmpy (Bayesian Network)

In [12]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable. The whole frame is passed, so both X_train and X_test
# still contain the 'insuranceclaim' target column at this point.
X_train, X_test = train_test_split(
    insurance_claim_data,
    test_size=0.2,
    random_state=123,
)

In [13]:
from pgmpy.models import NaiveBayes

# Columns used as features; 'insuranceclaim' is the variable they all depend on.
predictor_columns = [
    'age_group', 'sex', 'bmi_group', 'children', 'smoker', 'charges_group',
    'northeast', 'northwest', 'southeast', 'southwest',
]

model = NaiveBayes(
    feature_vars=predictor_columns,
    dependent_var='insuranceclaim',
)

# Estimate the CPDs from the training split (which still includes the target column).
model.fit(X_train)

In [14]:
# List the fitted CPDs: one for the target and one per feature conditioned on the target
# (see the recorded output below).
model.get_cpds()

[<TabularCPD representing P(age_group:3 | insuranceclaim:2) at 0x1326ae520>,
 <TabularCPD representing P(bmi_group:4 | insuranceclaim:2) at 0x1326b9490>,
 <TabularCPD representing P(charges_group:4 | insuranceclaim:2) at 0x1326b9130>,
 <TabularCPD representing P(children:6 | insuranceclaim:2) at 0x1326b94c0>,
 <TabularCPD representing P(insuranceclaim:2) at 0x1326b94f0>,
 <TabularCPD representing P(northeast:2 | insuranceclaim:2) at 0x1330eefd0>,
 <TabularCPD representing P(northwest:2 | insuranceclaim:2) at 0x1330eed00>,
 <TabularCPD representing P(sex:2 | insuranceclaim:2) at 0x1330eea60>,
 <TabularCPD representing P(smoker:2 | insuranceclaim:2) at 0x1330eefa0>,
 <TabularCPD representing P(southeast:2 | insuranceclaim:2) at 0x133115e80>,
 <TabularCPD representing P(southwest:2 | insuranceclaim:2) at 0x133115dc0>]

In [15]:
# Print each directed edge as a (parent, child) tuple, one per line.
for parent, child in model.edges():
    print((parent, child))

('insuranceclaim', 'charges_group')
('insuranceclaim', 'bmi_group')
('insuranceclaim', 'age_group')
('insuranceclaim', 'children')
('insuranceclaim', 'smoker')
('insuranceclaim', 'southwest')
('insuranceclaim', 'northwest')
('insuranceclaim', 'sex')
('insuranceclaim', 'northeast')
('insuranceclaim', 'southeast')


In [16]:
# Show the local independence assertion for the 'age_group' node.
# NOTE(review): the recorded output lists age_group among its own independent
# variables; presumably a display quirk of this pgmpy version, so confirm before citing it.
model.local_independencies('age_group')

(age_group ⟂ charges_group, bmi_group, age_group, children, smoker, southwest, northwest, sex, northeast, southeast | insuranceclaim)

In [17]:
# Show the local independence assertion for the target node 'insuranceclaim'.
# The recorded run printed nothing for this node.
model.local_independencies('insuranceclaim')



In [18]:
# Keep the true labels of the held-out rows before the target column is removed from X_test.
y_test = X_test.loc[:, 'insuranceclaim']

In [19]:
# Strip the target so X_test holds only the feature columns passed to predict().
# NOTE: X_test is rebound here, so re-running this cell (or the y_test cell)
# afterwards raises KeyError because the column is gone.
X_test = X_test.drop(columns=['insuranceclaim'])

In [20]:
# Predict the missing 'insuranceclaim' column for the held-out feature rows.
# The recorded run took about 42 s for 199 progress-bar iterations, so this is
# the slow cell of the notebook.
# NOTE(review): presumably one iteration per distinct feature row; confirm against pgmpy.
y_pred_nb = model.predict(X_test)

100%|██████████| 199/199 [00:42<00:00,  4.65it/s]


In [21]:
from sklearn.metrics import accuracy_score

# Share of held-out rows whose predicted class equals the true class, shown to 4 decimals.
print(
    'Model accuracy score: {0:0.4f}'.format(
        accuracy_score(y_true=y_test, y_pred=y_pred_nb)
    )
)

Model accuracy score: 0.7799


In [22]:
from sklearn.metrics import precision_score, recall_score

# Report precision, then recall, on the held-out split.
for label, scorer in (
    ('Precision_score :', precision_score),
    ('Recall_score :', recall_score),
):
    print(label, scorer(y_test, y_pred_nb))

Precision_score : 0.8079470198675497
Recall_score : 0.8026315789473685


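Precision and recall usually trade off against each other: for a given model, moving the decision threshold so that precision decreases generally makes recall increase, and vice versa. Here the two scores are close (about 0.81 and 0.80), so the classifier is roughly balanced between false positives and false negatives.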

In [23]:
from pgmpy.inference import VariableElimination

# Exact-inference engine built on the fitted Naive Bayes network.
infer = VariableElimination(model)

# Marginal distribution of the target with no evidence supplied.
# show_progress=False stops the tqdm elimination bars from being written into
# the cell output, so only the probability table is shown.
ins_claim_dist = infer.query(['insuranceclaim'], show_progress=False)

print(ins_claim_dist)

Finding Elimination Order: :   0%|          | 0/10 [00:00<?, ?it/s]
  0%|          | 0/10 [00:00<?, ?it/s][A
Eliminating: charges_group:   0%|          | 0/10 [00:00<?, ?it/s][A
Eliminating: bmi_group:   0%|          | 0/10 [00:00<?, ?it/s]    [A
Eliminating: age_group:   0%|          | 0/10 [00:00<?, ?it/s][A
Eliminating: children:   0%|          | 0/10 [00:00<?, ?it/s] [A
Eliminating: smoker:   0%|          | 0/10 [00:00<?, ?it/s]  [A
Eliminating: southwest:   0%|          | 0/10 [00:00<?, ?it/s][A
Eliminating: northwest:   0%|          | 0/10 [00:00<?, ?it/s][A
Eliminating: sex:   0%|          | 0/10 [00:00<?, ?it/s]      [A
Eliminating: northeast:   0%|          | 0/10 [00:00<?, ?it/s][A
Eliminating: southeast: 100%|██████████| 10/10 [00:00<00:00, 131.29it/s]

+-------------------+-----------------------+
| insuranceclaim    |   phi(insuranceclaim) |
| insuranceclaim(0) |                0.4103 |
+-------------------+-----------------------+
| insuranceclaim(1) |                0.5897 |
+-------------------+-----------------------+





In [24]:
# P(insuranceclaim | smoker = 0, bmi_group = 2).
# bmi_group code 2 is 'Overweight' in mapping_bmi_group; smoker = 0 is
# presumably "non-smoker" (confirm against the dataset description).
# show_progress=False keeps the tqdm elimination bars out of the cell output.
print(
    infer.query(
        ['insuranceclaim'],
        evidence={'smoker': 0, 'bmi_group': 2},
        show_progress=False,
    )
)


  0%|          | 0/8 [00:00<?, ?it/s][A
Finding Elimination Order: :   0%|          | 0/8 [00:00<?, ?it/s][A

  0%|          | 0/8 [00:00<?, ?it/s][A[A

Eliminating: charges_group:   0%|          | 0/8 [00:00<?, ?it/s][A[A

Eliminating: age_group:   0%|          | 0/8 [00:00<?, ?it/s]    [A[A

Eliminating: children:   0%|          | 0/8 [00:00<?, ?it/s] [A[A

Eliminating: southwest:   0%|          | 0/8 [00:00<?, ?it/s][A[A

Eliminating: northwest:   0%|          | 0/8 [00:00<?, ?it/s][A[A

Eliminating: sex:   0%|          | 0/8 [00:00<?, ?it/s]      [A[A

Eliminating: northeast:   0%|          | 0/8 [00:00<?, ?it/s][A[A

Eliminating: southeast: 100%|██████████| 8/8 [00:00<00:00, 154.71it/s]

+-------------------+-----------------------+
| insuranceclaim    |   phi(insuranceclaim) |
| insuranceclaim(0) |                0.5311 |
+-------------------+-----------------------+
| insuranceclaim(1) |                0.4689 |
+-------------------+-----------------------+





In [25]:
# P(insuranceclaim | smoker = 1, bmi_group = 2).
# bmi_group code 2 is 'Overweight' in mapping_bmi_group; smoker = 1 is
# presumably "smoker" (confirm against the dataset description).
# show_progress=False keeps the tqdm elimination bars out of the cell output.
print(
    infer.query(
        ['insuranceclaim'],
        evidence={'smoker': 1, 'bmi_group': 2},
        show_progress=False,
    )
)



  0%|          | 0/8 [00:00<?, ?it/s][A[A

Finding Elimination Order: :   0%|          | 0/8 [00:00<?, ?it/s][A[A


  0%|          | 0/8 [00:00<?, ?it/s][A[A[A


Eliminating: charges_group:   0%|          | 0/8 [00:00<?, ?it/s][A[A[A


Eliminating: age_group:   0%|          | 0/8 [00:00<?, ?it/s]    [A[A[A


Eliminating: children:   0%|          | 0/8 [00:00<?, ?it/s] [A[A[A


Eliminating: southwest:   0%|          | 0/8 [00:00<?, ?it/s][A[A[A


Eliminating: northwest:   0%|          | 0/8 [00:00<?, ?it/s][A[A[A


Eliminating: sex:   0%|          | 0/8 [00:00<?, ?it/s]      [A[A[A


Eliminating: northeast:   0%|          | 0/8 [00:00<?, ?it/s][A[A[A


Eliminating: southeast: 100%|██████████| 8/8 [00:00<00:00, 120.27it/s]

+-------------------+-----------------------+
| insuranceclaim    |   phi(insuranceclaim) |
| insuranceclaim(0) |                0.1144 |
+-------------------+-----------------------+
| insuranceclaim(1) |                0.8856 |
+-------------------+-----------------------+





In [26]:
# P(insuranceclaim | smoker = 1, age_group = 2).
# age_group code 2 is 'Elderly' in mapping_age_group; smoker = 1 is
# presumably "smoker" (confirm against the dataset description).
# show_progress=False keeps the tqdm elimination bars out of the cell output.
print(
    infer.query(
        ['insuranceclaim'],
        evidence={'smoker': 1, 'age_group': 2},
        show_progress=False,
    )
)




  0%|          | 0/8 [00:00<?, ?it/s][A[A[A


Finding Elimination Order: :   0%|          | 0/8 [00:00<?, ?it/s][A[A[A



  0%|          | 0/8 [00:00<?, ?it/s][A[A[A[A



Eliminating: charges_group:   0%|          | 0/8 [00:00<?, ?it/s][A[A[A[A



Eliminating: bmi_group:   0%|          | 0/8 [00:00<?, ?it/s]    [A[A[A[A



Eliminating: children:   0%|          | 0/8 [00:00<?, ?it/s] [A[A[A[A



Eliminating: southwest:   0%|          | 0/8 [00:00<?, ?it/s][A[A[A[A



Eliminating: northwest:   0%|          | 0/8 [00:00<?, ?it/s][A[A[A[A



Eliminating: sex:   0%|          | 0/8 [00:00<?, ?it/s]      [A[A[A[A



Eliminating: northeast:   0%|          | 0/8 [00:00<?, ?it/s][A[A[A[A



Eliminating: southeast: 100%|██████████| 8/8 [00:00<00:00, 151.84it/s][A

+-------------------+-----------------------+
| insuranceclaim    |   phi(insuranceclaim) |
| insuranceclaim(0) |                0.0571 |
+-------------------+-----------------------+
| insuranceclaim(1) |                0.9429 |
+-------------------+-----------------------+



