In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import pgmpy

from pgmpy.models import BayesianModel
from pgmpy.factors.discrete import TabularCPD 

from sklearn.model_selection import train_test_split

### Data importing

In [2]:
insurance_claim_data = pd.read_csv('./datasets/insurance_claim_cat.csv')

insurance_claim_data.head()

## age
# Young_Adult: [18,35) 
# Middled_Aged: [35,60)
# Elderly: [60,inf)

## bmi
# Underweight: [0,18.6)
# Normal: [18.5, 24.9)
# Overweight: [24.9,30)
# Obese: [30, inf)

## charges: ???
# Low: 0,2000
# Below_Average 2000,8000
# Average 8000,16000
# Very_High

Unnamed: 0,age,age_group,sex,bmi_group,bmi,children,smoker,charges_group,charges,insuranceclaim,northeast,northwest,southeast,southwest
0,19,Young_Adult,1,Overweight,27.9,0,1,Very_High,16884.924,1,0,0,0,1
1,18,Young_Adult,0,Obese,33.77,1,0,Low,1725.5523,1,0,0,1,0
2,28,Young_Adult,0,Obese,33.0,3,0,Average,4449.462,0,0,0,1,0
3,33,Middle_Aged,0,Normal,22.705,0,0,Very_High,21984.47061,0,0,1,0,0
4,32,Middle_Aged,0,Overweight,28.88,0,0,Below_Average,3866.8552,1,0,1,0,0


### Data wrangling/cleaning

In [3]:
insurance_claim_data = insurance_claim_data.drop(columns=['age','charges', 
                                                          'bmi'], axis = 1)

insurance_claim_data.head()

Unnamed: 0,age_group,sex,bmi_group,children,smoker,charges_group,insuranceclaim,northeast,northwest,southeast,southwest
0,Young_Adult,1,Overweight,0,1,Very_High,1,0,0,0,1
1,Young_Adult,0,Obese,1,0,Low,1,0,0,1,0
2,Young_Adult,0,Obese,3,0,Average,0,0,0,1,0
3,Middle_Aged,0,Normal,0,0,Very_High,0,0,1,0,0
4,Middle_Aged,0,Overweight,0,0,Below_Average,1,0,1,0,0


In [4]:
mapping_age_group = {'Young_Adult' : 0, 'Middle_Aged': 1,'Elderly' : 2}

In [5]:
mapping_bmi_group = {'Underweight' : 0, 'Normal': 1,'Overweight' : 2,'Obese': 3}

In [6]:
mapping_charges_group = {'Low' : 0, 'Below_Average': 1,'Average' : 2,'Very_High': 3}

In [7]:
insurance_claim_data['age_group'] = insurance_claim_data['age_group'].replace(mapping_age_group)

In [8]:
insurance_claim_data['bmi_group'] = insurance_claim_data['bmi_group'].replace(mapping_bmi_group)

In [9]:
insurance_claim_data['charges_group'] = insurance_claim_data['charges_group'].replace(mapping_charges_group )

In [10]:
insurance_claim_data.head()

Unnamed: 0,age_group,sex,bmi_group,children,smoker,charges_group,insuranceclaim,northeast,northwest,southeast,southwest
0,0,1,2,0,1,3,1,0,0,0,1
1,0,0,3,1,0,0,1,0,0,1,0
2,0,0,3,3,0,2,0,0,0,1,0
3,1,0,1,0,0,3,0,0,1,0,0
4,1,0,2,0,0,1,1,0,1,0,0


In [11]:
insurance_claim_data.shape

(1338, 11)

In [12]:
insurance_claim_data.columns

Index(['age_group', 'sex', 'bmi_group', 'children', 'smoker', 'charges_group',
       'insuranceclaim', 'northeast', 'northwest', 'southeast', 'southwest'],
      dtype='object')

### Data mining&analyis with pgmpy (Bayesian Network)

In [13]:
model = BayesianModel([('age_group', 'insuranceclaim'),
                       ('sex', 'insuranceclaim'),
                       ('bmi_group', 'insuranceclaim'),
                       ('children', 'insuranceclaim'),
                       ('smoker', 'insuranceclaim'),
                       ('smoker', 'charges_group'),
                       ('bmi_group', 'charges_group'),
                       ('sex', 'charges_group'),
                       ('charges_group', 'insuranceclaim'),
                       ('northeast', 'insuranceclaim'),
                       ('northwest', 'insuranceclaim'),
                       ('southeast', 'insuranceclaim'),
                       ('southwest', 'insuranceclaim'),
]) 

In [14]:
model.get_cpds()

[]

In [15]:
X_train, X_test = train_test_split(insurance_claim_data, test_size = 0.2, random_state = 123)

In [16]:
X_train.head()

Unnamed: 0,age_group,sex,bmi_group,children,smoker,charges_group,insuranceclaim,northeast,northwest,southeast,southwest
67,1,0,2,1,0,2,0,0,1,0,0
736,1,1,3,0,1,3,1,0,0,1,0
310,1,0,2,0,0,2,1,0,0,0,1
963,1,0,1,3,0,2,0,1,0,0,0
680,0,1,0,1,0,1,1,0,0,0,1


In [17]:
X_test.head()

Unnamed: 0,age_group,sex,bmi_group,children,smoker,charges_group,insuranceclaim,northeast,northwest,southeast,southwest
650,1,1,3,2,0,2,0,0,0,1,0
319,1,0,3,1,0,2,0,1,0,0,0
314,0,1,3,0,1,3,1,0,0,0,1
150,1,0,1,1,0,2,0,0,1,0,0
336,2,0,2,0,0,3,0,0,0,1,0


In [18]:
model.fit(X_train)

In [19]:
model.get_cpds()

[<TabularCPD representing P(age_group:3) at 0x13cd015e0>,
 <TabularCPD representing P(bmi_group:4) at 0x13cce7310>,
 <TabularCPD representing P(charges_group:4 | bmi_group:4, sex:2, smoker:2) at 0x13cce72b0>,
 <TabularCPD representing P(children:6) at 0x13ccda610>,
 <TabularCPD representing P(insuranceclaim:2 | age_group:3, bmi_group:4, charges_group:4, children:6, northeast:2, northwest:2, sex:2, smoker:2, southeast:2, southwest:2) at 0x13cd0b910>,
 <TabularCPD representing P(northeast:2) at 0x13cd0bcd0>,
 <TabularCPD representing P(northwest:2) at 0x13cd0bc40>,
 <TabularCPD representing P(sex:2) at 0x13cd0bfd0>,
 <TabularCPD representing P(smoker:2) at 0x145bafd60>,
 <TabularCPD representing P(southeast:2) at 0x145bafe20>,
 <TabularCPD representing P(southwest:2) at 0x145baffa0>]

In [20]:
for edge in model.edges():
    print(edge)

('age_group', 'insuranceclaim')
('sex', 'insuranceclaim')
('sex', 'charges_group')
('bmi_group', 'insuranceclaim')
('bmi_group', 'charges_group')
('children', 'insuranceclaim')
('smoker', 'insuranceclaim')
('smoker', 'charges_group')
('charges_group', 'insuranceclaim')
('northeast', 'insuranceclaim')
('northwest', 'insuranceclaim')
('southeast', 'insuranceclaim')
('southwest', 'insuranceclaim')


In [21]:
y_test = X_test['insuranceclaim']

y_test.head()

650    0
319    0
314    1
150    0
336    0
Name: insuranceclaim, dtype: int64

In [22]:
X_test = X_test.drop('insuranceclaim', axis = 1)

X_test.head()

Unnamed: 0,age_group,sex,bmi_group,children,smoker,charges_group,northeast,northwest,southeast,southwest
650,1,1,3,2,0,2,0,0,1,0
319,1,0,3,1,0,2,1,0,0,0
314,0,1,3,0,1,3,0,0,0,1
150,1,0,1,1,0,2,0,1,0,0
336,2,0,2,0,0,3,0,0,1,0


In [23]:
y_pred_bayes = model.predict(X_test)

100%|██████████| 199/199 [46:17<00:00, 13.96s/it]


In [24]:
from sklearn.metrics import accuracy_score

print('Model accuracy score: {0:0.4f}'. format(accuracy_score(y_test, y_pred_bayes))) ## acc = (TP+TN)/all_test_instances

Model accuracy score: 0.7724


Accuracy can be thought of as a measure of how many predictions are right.

In [25]:
from sklearn.metrics import precision_score, recall_score

print('Precision_score :',precision_score(y_test, y_pred_bayes)) ## precision = TP/(TP+FP)
      
print('Recall_score :',recall_score(y_test, y_pred_bayes)) ## recall = TP/(TP+FN)

Precision_score : 0.8760330578512396
Recall_score : 0.6973684210526315


Precision can be thought of as accuracy for when the model has flagged that a claim will be filed.


Recall can be thought of as the accuracy when an insurance claim is actually filed... model is not so good bc there are too many FNs.

In [26]:
model.get_independencies()

(age_group ⟂ southwest, smoker, northeast, southeast, bmi_group, charges_group, children, sex, northwest)
(age_group ⟂ smoker, northeast, southeast, bmi_group, charges_group, children, sex, northwest | southwest)
(age_group ⟂ southwest, northeast, southeast, bmi_group, charges_group, children, sex, northwest | smoker)
(age_group ⟂ southwest, smoker, southeast, bmi_group, charges_group, children, sex, northwest | northeast)
(age_group ⟂ southwest, smoker, northeast, bmi_group, charges_group, children, sex, northwest | southeast)
(age_group ⟂ southwest, smoker, northeast, southeast, charges_group, children, sex, northwest | bmi_group)
(age_group ⟂ southwest, smoker, northeast, southeast, bmi_group, children, sex, northwest | charges_group)
(age_group ⟂ southwest, smoker, northeast, southeast, bmi_group, charges_group, sex, northwest | children)
(age_group ⟂ southwest, smoker, northeast, southeast, bmi_group, charges_group, children, northwest | sex)
(age_group ⟂ southwest, smoker, northe

In [27]:
model.local_independencies('smoker')

(smoker ⟂ southwest, northeast, southeast, bmi_group, age_group, children, sex, northwest)

In [28]:
model.local_independencies('age_group')

(age_group ⟂ southwest, smoker, northeast, southeast, bmi_group, charges_group, children, sex, northwest)

In [29]:
from pgmpy.inference import VariableElimination

infer = VariableElimination(model)

ins_claim_dist = infer.query(['insuranceclaim'])

print(ins_claim_dist)

Finding Elimination Order: :   0%|          | 0/10 [00:00<?, ?it/s]
  0%|          | 0/10 [00:00<?, ?it/s][A
Finding Elimination Order: : 100%|██████████| 10/10 [00:00<00:00, 426.93it/s]

Eliminating: southwest:  10%|█         | 1/10 [00:03<00:31,  3.46s/it][A
Eliminating: northeast:  10%|█         | 1/10 [00:03<00:31,  3.46s/it][A
Eliminating: northeast:  20%|██        | 2/10 [00:05<00:23,  2.91s/it][A
Eliminating: southeast:  20%|██        | 2/10 [00:05<00:23,  2.91s/it][A
Eliminating: southeast:  30%|███       | 3/10 [00:05<00:15,  2.27s/it][A
Eliminating: charges_group:  30%|███       | 3/10 [00:05<00:15,  2.27s/it][A
Eliminating: charges_group:  40%|████      | 4/10 [00:06<00:09,  1.66s/it][A
Eliminating: smoker:  40%|████      | 4/10 [00:06<00:09,  1.66s/it]       [A
Eliminating: smoker:  50%|█████     | 5/10 [00:06<00:05,  1.19s/it][A
Eliminating: bmi_group:  50%|█████     | 5/10 [00:06<00:05,  1.19s/it][A
Eliminating: age_group:  50%|█████     | 5/10 [00:06<00:05,  1

+-------------------+-----------------------+
| insuranceclaim    |   phi(insuranceclaim) |
| insuranceclaim(0) |                0.4607 |
+-------------------+-----------------------+
| insuranceclaim(1) |                0.5393 |
+-------------------+-----------------------+


In [30]:
print(infer.query(['insuranceclaim'], evidence = {'smoker': 0, 'bmi_group': 2}))

Finding Elimination Order: :   0%|          | 0/8 [00:00<?, ?it/s]
  0%|          | 0/8 [00:00<?, ?it/s][A
Finding Elimination Order: : 100%|██████████| 8/8 [00:00<00:00, 305.31it/s]

Eliminating: southwest:  12%|█▎        | 1/8 [00:00<00:03,  2.18it/s][A
Eliminating: northeast:  12%|█▎        | 1/8 [00:00<00:03,  2.18it/s][A
Eliminating: northeast:  25%|██▌       | 2/8 [00:00<00:02,  2.42it/s][A
Eliminating: southeast:  25%|██▌       | 2/8 [00:00<00:02,  2.42it/s][A
Eliminating: southeast:  38%|███▊      | 3/8 [00:00<00:01,  3.12it/s][A
Eliminating: charges_group:  38%|███▊      | 3/8 [00:00<00:01,  3.12it/s][A
Eliminating: age_group:  38%|███▊      | 3/8 [00:00<00:01,  3.12it/s]    [A
Eliminating: children:  38%|███▊      | 3/8 [00:00<00:01,  3.12it/s] [A
Eliminating: children:  75%|███████▌  | 6/8 [00:00<00:00,  4.26it/s][A
Eliminating: sex:  75%|███████▌  | 6/8 [00:00<00:00,  4.26it/s]     [A
Eliminating: northwest: 100%|██████████| 8/8 [00:01<00:00,  7.83it/s][A

+-------------------+-----------------------+
| insuranceclaim    |   phi(insuranceclaim) |
| insuranceclaim(0) |                0.4941 |
+-------------------+-----------------------+
| insuranceclaim(1) |                0.5059 |
+-------------------+-----------------------+





In [31]:
print(infer.query(['insuranceclaim'], evidence = {'smoker': 1, 'bmi_group': 2}))

Finding Elimination Order: :   0%|          | 0/8 [00:00<?, ?it/s]
  0%|          | 0/8 [00:00<?, ?it/s][A
Finding Elimination Order: : 100%|██████████| 8/8 [00:00<00:00, 419.38it/s]

Eliminating: southwest:  12%|█▎        | 1/8 [00:00<00:03,  1.92it/s][A
Eliminating: northeast:  12%|█▎        | 1/8 [00:00<00:03,  1.92it/s][A
Eliminating: northeast:  25%|██▌       | 2/8 [00:00<00:02,  2.34it/s][A
Eliminating: southeast:  25%|██▌       | 2/8 [00:00<00:02,  2.34it/s][A
Eliminating: southeast:  38%|███▊      | 3/8 [00:00<00:01,  2.94it/s][A
Eliminating: charges_group:  38%|███▊      | 3/8 [00:00<00:01,  2.94it/s][A
Eliminating: age_group:  38%|███▊      | 3/8 [00:00<00:01,  2.94it/s]    [A
Eliminating: children:  38%|███▊      | 3/8 [00:00<00:01,  2.94it/s] [A
Eliminating: sex:  38%|███▊      | 3/8 [00:00<00:01,  2.94it/s]     [A
Eliminating: northwest:  38%|███▊      | 3/8 [00:00<00:01,  2.94it/s][A
Eliminating: northwest: 100%|██████████| 8/8 [00:00<00:00,  8.11it/s][A

+-------------------+-----------------------+
| insuranceclaim    |   phi(insuranceclaim) |
| insuranceclaim(0) |                0.4145 |
+-------------------+-----------------------+
| insuranceclaim(1) |                0.5855 |
+-------------------+-----------------------+





In [32]:
print(infer.query(['insuranceclaim'], evidence = {'smoker': 1, 'children': 3}))

Finding Elimination Order: :   0%|          | 0/8 [00:00<?, ?it/s]
  0%|          | 0/8 [00:00<?, ?it/s][A
Finding Elimination Order: : 100%|██████████| 8/8 [00:00<00:00, 433.16it/s]

Eliminating: southwest:  12%|█▎        | 1/8 [00:00<00:02,  2.44it/s][A
Eliminating: northeast:  12%|█▎        | 1/8 [00:00<00:02,  2.44it/s][A
Eliminating: northeast:  25%|██▌       | 2/8 [00:00<00:01,  3.05it/s][A
Eliminating: southeast:  25%|██▌       | 2/8 [00:00<00:01,  3.05it/s][A
Eliminating: charges_group:  25%|██▌       | 2/8 [00:00<00:01,  3.05it/s][A
Eliminating: charges_group:  50%|█████     | 4/8 [00:00<00:00,  4.05it/s][A
Eliminating: bmi_group:  50%|█████     | 4/8 [00:00<00:00,  4.05it/s]    [A
Eliminating: age_group:  50%|█████     | 4/8 [00:00<00:00,  4.05it/s][A
Eliminating: sex:  50%|█████     | 4/8 [00:00<00:00,  4.05it/s]      [A
Eliminating: northwest: 100%|██████████| 8/8 [00:00<00:00, 11.32it/s][A

+-------------------+-----------------------+
| insuranceclaim    |   phi(insuranceclaim) |
| insuranceclaim(0) |                0.4524 |
+-------------------+-----------------------+
| insuranceclaim(1) |                0.5476 |
+-------------------+-----------------------+



