In [None]:
%pip install pgmpy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pgmpy
  Downloading pgmpy-0.1.21-py3-none-any.whl (1.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m36.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pgmpy
Successfully installed pgmpy-0.1.21


In [None]:
from pgmpy.factors.discrete import TabularCPD
from pgmpy.models import BayesianModel, BayesianNetwork

In [None]:
# Define the network structure by passing the list of directed (parent, child) edges.
# BayesianNetwork is used rather than the deprecated BayesianModel alias, so this
# model is built with the same class the notebook uses later for parameter learning.
model = BayesianNetwork(
    [
        # Credit-behaviour branch.
        ('DebtIncomeRatio', 'PaymentHistory'),
        ('PaymentHistory', 'Age'),
        ('PaymentHistory', 'Reliability'),
        ('Age', 'Reliability'),
        # Wealth branch.
        ('Income', 'Assets'),
        ('Income', 'FutureIncome'),
        ('Assets', 'FutureIncome'),
        # Direct parents of the loan decision.
        ('DebtIncomeRatio', 'BankLoan'),
        ('Reliability', 'BankLoan'),
        ('FutureIncome', 'BankLoan'),
    ]
)



In [None]:
# Conditional probability tables for every node, listed parents-before-children.
# In each `values` matrix a row is one state of `variable` and a column is one
# combination of evidence states (per pgmpy's TabularCPD convention the last
# evidence variable presumably varies fastest -- confirm against the pgmpy docs).

# --- Root nodes: unconditional priors ---
cpd_DebtIncomeRatio = TabularCPD(
    variable='DebtIncomeRatio',
    variable_card=2,
    values=[[0.5], [0.5]],
    state_names={'DebtIncomeRatio': ['Low', 'High']},
)

cpd_Income = TabularCPD(
    variable='Income',
    variable_card=3,
    values=[[0.333], [0.333], [0.334]],
    state_names={'Income': ['High', 'Medium', 'Low']},
)

# --- PaymentHistory | DebtIncomeRatio ---
cpd_PaymentHistory = TabularCPD(
    variable='PaymentHistory',
    variable_card=3,
    values=[
        [0.6, 0.1],
        [0.3, 0.3],
        [0.1, 0.6],
    ],
    evidence=['DebtIncomeRatio'],
    evidence_card=[2],
    state_names={
        'PaymentHistory': ['Excellent', 'Acceptable', 'Unacceptable'],
        'DebtIncomeRatio': ['Low', 'High'],
    },
)

# --- Age | PaymentHistory ---
cpd_Age = TabularCPD(
    variable='Age',
    variable_card=3,
    values=[
        [0.1, 0.333, 0.6],
        [0.3, 0.333, 0.3],
        [0.6, 0.334, 0.1],
    ],
    evidence=['PaymentHistory'],
    evidence_card=[3],
    state_names={
        'Age': ['Between16and25', 'Between26and64', 'Over65'],
        'PaymentHistory': ['Excellent', 'Acceptable', 'Unacceptable'],
    },
)

# --- Reliability | PaymentHistory, Age ---
cpd_Reliability = TabularCPD(
    variable='Reliability',
    variable_card=2,
    values=[
        [0.7, 0.8, 0.9, 0.6, 0.7, 0.8, 0.5, 0.6, 0.7],
        [0.3, 0.2, 0.1, 0.4, 0.3, 0.2, 0.5, 0.4, 0.3],
    ],
    evidence=['PaymentHistory', 'Age'],
    evidence_card=[3, 3],
    state_names={
        'Reliability': ['Reliable', 'Unreliable'],
        'PaymentHistory': ['Excellent', 'Acceptable', 'Unacceptable'],
        'Age': ['Between16and25', 'Between26and64', 'Over65'],
    },
)

# --- Assets | Income ---
cpd_Assets = TabularCPD(
    variable='Assets',
    variable_card=3,
    values=[
        [0.6, 0.3, 0.1],
        [0.3, 0.6, 0.3],
        [0.1, 0.1, 0.6],
    ],
    evidence=['Income'],
    evidence_card=[3],
    state_names={
        'Assets': ['High', 'Medium', 'Low'],
        'Income': ['High', 'Medium', 'Low'],
    },
)

# --- FutureIncome | Assets, Income ---
cpd_FutureIncome = TabularCPD(
    variable='FutureIncome',
    variable_card=2,
    values=[
        [0.8, 0.6, 0.5, 0.7, 0.6, 0.4, 0.6, 0.4, 0.2],
        [0.2, 0.4, 0.5, 0.3, 0.4, 0.6, 0.4, 0.6, 0.8],
    ],
    evidence=['Assets', 'Income'],
    evidence_card=[3, 3],
    state_names={
        'FutureIncome': ['Promising', 'Not_promising'],
        'Assets': ['High', 'Medium', 'Low'],
        'Income': ['High', 'Medium', 'Low'],
    },
)

# --- BankLoan | DebtIncomeRatio, Reliability, FutureIncome ---
cpd_BankLoan = TabularCPD(
    variable='BankLoan',
    variable_card=2,
    values=[
        [0.8, 0.6, 0.6, 0.4, 0.6, 0.4, 0.4, 0.2],
        [0.2, 0.4, 0.4, 0.6, 0.4, 0.6, 0.6, 0.8],
    ],
    evidence=['DebtIncomeRatio', 'Reliability', 'FutureIncome'],
    evidence_card=[2, 2, 2],
    state_names={
        'BankLoan': ['Positive', 'Negative'],
        'DebtIncomeRatio': ['Low', 'High'],
        'Reliability': ['Reliable', 'Unreliable'],
        'FutureIncome': ['Promising', 'Not_promising'],
    },
)

# Attach every CPD to the network.
network_cpds = (
    cpd_DebtIncomeRatio,
    cpd_Income,
    cpd_PaymentHistory,
    cpd_Age,
    cpd_Reliability,
    cpd_Assets,
    cpd_FutureIncome,
    cpd_BankLoan,
)
model.add_cpds(*network_cpds)

# check_model checks the network structure and the CPDs, verifying that the CPDs are
# correctly defined and sum to 1. Left as the last expression so its result is displayed.
model.check_model()

True

In [None]:
# Local independencies of every variable in the network.
# Each node is listed exactly once: a repeated name makes the same independence
# assertion appear twice in the result.
model.local_independencies(
    [
        'DebtIncomeRatio',
        'PaymentHistory',
        'Age',
        'Reliability',
        'Income',
        'Assets',
        'FutureIncome',
        'BankLoan',
    ]
)

(DebtIncomeRatio ⟂ Assets, Income, FutureIncome)
(PaymentHistory ⟂ Assets, Income, FutureIncome | DebtIncomeRatio)
(Age ⟂ FutureIncome, DebtIncomeRatio, Assets, Income | PaymentHistory)
(Reliability ⟂ FutureIncome, DebtIncomeRatio, Assets, Income | PaymentHistory, Age)
(Income ⟂ PaymentHistory, DebtIncomeRatio, Age, Reliability)
(Assets ⟂ PaymentHistory, DebtIncomeRatio, Age, Reliability | Income)
(FutureIncome ⟂ PaymentHistory, DebtIncomeRatio, Age, Reliability | Assets, Income)
(DebtIncomeRatio ⟂ Assets, Income, FutureIncome)
(BankLoan ⟂ PaymentHistory, Assets, Age, Income | DebtIncomeRatio, Reliability, FutureIncome)

In [None]:
# Variables that Income is locally independent of.
income_node = 'Income'
model.local_independencies(income_node)

(Income ⟂ PaymentHistory, DebtIncomeRatio, Age, Reliability)

In [None]:
# Maximum-likelihood learning: load the training cases.
import pandas as pd

# The first CSV column is the saved row index. Reading it as the DataFrame index
# keeps it from showing up as a spurious "Unnamed: 0" data column.
data = pd.read_csv('./50000-cases.csv', index_col=0)
data.head()

Unnamed: 0,DebtIncomeRatio,Income,PaymentHistory,Assets,Age,FutureIncome,Reliability,BankLoan
0,Low,Medium,Excellent,Medium,Between26and64,Promising,Reliable,Negative
1,Low,High,Excellent,High,Over65,Promising,Reliable,Positive
2,Low,Medium,Unacceptable,High,Between26and64,Promising,Unreliable,Positive
3,High,Medium,Unacceptable,Low,Over65,Not_promising,Reliable,Positive
4,High,Medium,Acceptable,Medium,Over65,Promising,Unreliable,Negative


In [None]:
from pgmpy.models import BayesianNetwork

# Build a second network that shares only the edge structure of `model`;
# it starts without CPDs so that they can be estimated from `data`.
structure_edges = model.edges()
model_struct = BayesianNetwork(ebunch=structure_edges)

# Display the nodes of the structure-only network.
model_struct.nodes()

NodeView(('DebtIncomeRatio', 'PaymentHistory', 'BankLoan', 'Age', 'Reliability', 'Income', 'Assets', 'FutureIncome'))

In [None]:
from pgmpy.estimators.MLE import MaximumLikelihoodEstimator
from pgmpy.estimators import BayesianEstimator

# Estimator configurations, fitted in order. Each fit re-estimates the CPDs of
# `model_struct`, and the PaymentHistory table is printed after each one.
fit_configs = (
    # Plain maximum-likelihood estimation.
    {"estimator": MaximumLikelihoodEstimator},
    # Bayesian estimation with a BDeu prior.
    {
        "estimator": BayesianEstimator,
        "prior_type": "BDeu",
        "equivalent_sample_size": 1000,
    },
)

for fit_kwargs in fit_configs:
    model_struct.fit(data=data, **fit_kwargs)
    print(model_struct.get_cpds("PaymentHistory"))

+------------------------------+-----------------------+----------------------+
| DebtIncomeRatio              | DebtIncomeRatio(High) | DebtIncomeRatio(Low) |
+------------------------------+-----------------------+----------------------+
| PaymentHistory(Acceptable)   | 0.2984163708165463    | 0.2973406602222133   |
+------------------------------+-----------------------+----------------------+
| PaymentHistory(Excellent)    | 0.0989668514898879    | 0.6018611367373952   |
+------------------------------+-----------------------+----------------------+
| PaymentHistory(Unacceptable) | 0.6026167776935658    | 0.10079820304039148  |
+------------------------------+-----------------------+----------------------+
+------------------------------+-----------------------+----------------------+
| DebtIncomeRatio              | DebtIncomeRatio(High) | DebtIncomeRatio(Low) |
+------------------------------+-----------------------+----------------------+
| PaymentHistory(Acceptable)   | 0.29909

In [None]:
# 5) Exact inference
from pgmpy.inference import VariableElimination

# Variable elimination runs on the hand-specified `model`, not on `model_struct`.
infer = VariableElimination(model)

# Marginal distribution P(BankLoan), with no evidence.
query_targets = ["BankLoan"]
prob_bank_loan = infer.query(variables=query_targets)
print(prob_bank_loan)


+--------------------+-----------------+
| BankLoan           |   phi(BankLoan) |
| BankLoan(Positive) |          0.5480 |
+--------------------+-----------------+
| BankLoan(Negative) |          0.4520 |
+--------------------+-----------------+




In [None]:
# Compute the conditional probability
# P(BankLoan | Income = Low, Age = Between16and25, PaymentHistory = Excellent, Assets = Low)
observed = {
    'Income': 'Low',
    'Age': 'Between16and25',
    'PaymentHistory': 'Excellent',
    'Assets': 'Low',
}
prop = infer.query(variables=['BankLoan'], evidence=observed)
print(prop)


+--------------------+-----------------+
| BankLoan           |   phi(BankLoan) |
| BankLoan(Positive) |          0.5514 |
+--------------------+-----------------+
| BankLoan(Negative) |          0.4486 |
+--------------------+-----------------+




In [None]:
# Inference by forward sampling

from pgmpy.inference import ApproxInference

infer2 = ApproxInference(model)

# NOTE(review): only 100 samples are drawn and no seed is passed, so this estimate
# presumably varies between runs and can differ noticeably from the exact marginal.
approx_query = {"variables": ["BankLoan"], "n_samples": 100, "joint": True}
prop2 = infer2.query(**approx_query)
print(prop2)

  0%|          | 0/8 [00:00<?, ?it/s]

+--------------------+-----------------+
| BankLoan           |   phi(BankLoan) |
| BankLoan(Negative) |          0.5700 |
+--------------------+-----------------+
| BankLoan(Positive) |          0.4300 |
+--------------------+-----------------+
