<a href="https://colab.research.google.com/github/dineshsingh099/Diabetes_Genetic_Disease_Prediction/blob/main/Diabetes_Genetic_Disease_Prediction_Using_Bayesian_Networks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Diabetes Genetic Disease Prediction Using Bayesian Networks

This notebook aims to leverage Bayesian network models to analyze genetic data and identify potential risks for diabetes. By integrating various genetic and environmental factors, this approach enhances predictive accuracy, facilitating early intervention and personalized healthcare strategies.

In [1]:
!pip install pgmpy

Collecting pgmpy
  Downloading pgmpy-0.1.26-py3-none-any.whl.metadata (9.1 kB)
Downloading pgmpy-0.1.26-py3-none-any.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m23.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pgmpy
Successfully installed pgmpy-0.1.26


## Import the necessary libraries

In [2]:
import pandas as pd
import numpy as np
from pgmpy.models import BayesianNetwork
from pgmpy.factors.discrete import TabularCPD
from pgmpy.inference import VariableElimination
import matplotlib.pyplot as plt
import seaborn as sns

## Read the dataset

In [3]:
# Load the dataset; the relative path is resolved against the notebook's
# current working directory, so the CSV must be present there.
data = pd.read_csv('diabetes_data.csv')

In [4]:
# Preview the first rows to sanity-check column names and value encodings.
data.head()

Unnamed: 0,SNP_TCF7L2,SNP_FTO,SNP_CAPN10,Age,Family_History,BMI,Physical_Activity,Diet_Type,Insulin_Level,Diabetes
0,1,0,1,25,0,22.5,1,1,5.6,0
1,0,1,1,55,1,28.1,0,0,18.2,1
2,1,0,0,45,1,30.0,0,1,15.0,1
3,0,0,1,35,0,24.8,1,0,7.0,0
4,1,1,0,60,1,31.5,0,0,20.0,1


In [5]:
# Report the (rows, columns) shape of the loaded frame.
print("Shape of the dataset:", data.shape)

Shape of the dataset: (10, 10)


In [6]:
# Show per-column dtypes and non-null counts to spot missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   SNP_TCF7L2         10 non-null     int64  
 1   SNP_FTO            10 non-null     int64  
 2   SNP_CAPN10         10 non-null     int64  
 3   Age                10 non-null     int64  
 4   Family_History     10 non-null     int64  
 5   BMI                10 non-null     float64
 6   Physical_Activity  10 non-null     int64  
 7   Diet_Type          10 non-null     int64  
 8   Insulin_Level      10 non-null     float64
 9   Diabetes           10 non-null     int64  
dtypes: float64(2), int64(8)
memory usage: 928.0 bytes


In [7]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
data.describe()

Unnamed: 0,SNP_TCF7L2,SNP_FTO,SNP_CAPN10,Age,Family_History,BMI,Physical_Activity,Diet_Type,Insulin_Level,Diabetes
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
mean,0.6,0.5,0.6,43.4,0.6,28.25,0.5,0.5,14.58,0.5
std,0.516398,0.527046,0.516398,13.866026,0.516398,3.817867,0.527046,0.527046,6.560285,0.527046
min,0.0,0.0,0.0,25.0,0.0,22.5,0.0,0.0,5.6,0.0
25%,0.0,0.0,0.0,31.25,0.0,25.35,0.0,0.0,9.875,0.0
50%,1.0,0.5,1.0,42.5,1.0,28.55,0.5,0.5,13.75,0.5
75%,1.0,1.0,1.0,53.75,1.0,31.125,1.0,1.0,19.55,1.0
max,1.0,1.0,1.0,65.0,1.0,34.0,1.0,1.0,25.0,1.0


In [8]:
# Network structure: every risk factor is a direct parent of `Diabetes`,
# and there are no edges between the risk factors themselves.
model = BayesianNetwork(
    [
        (risk_factor, 'Diabetes')
        for risk_factor in (
            'SNP_TCF7L2',
            'SNP_FTO',
            'SNP_CAPN10',
            'Age',
            'Family_History',
            'BMI',
            'Insulin_Level',
            'Physical_Activity',
            'Diet_Type',
        )
    ]
)

In [9]:
# Prior (marginal) distributions for the nine root nodes. Every node is binary;
# each table is a column vector [[P(state 0)], [P(state 1)]].
# NOTE(review): these probabilities are hard-coded literals; nothing in this
# cell derives them from `data` -- confirm that is intended.

# Genetic markers
cpd_snp_tcf7l2 = TabularCPD('SNP_TCF7L2', 2, [[0.6], [0.4]])
cpd_snp_fto = TabularCPD('SNP_FTO', 2, [[0.7], [0.3]])
cpd_snp_capn10 = TabularCPD('SNP_CAPN10', 2, [[0.8], [0.2]])

# Demographic / clinical factors
cpd_age = TabularCPD('Age', 2, [[0.5], [0.5]])
cpd_family_history = TabularCPD('Family_History', 2, [[0.9], [0.1]])
cpd_bmi = TabularCPD('BMI', 2, [[0.4], [0.6]])
cpd_insulin_level = TabularCPD('Insulin_Level', 2, [[0.7], [0.3]])

# Lifestyle factors
cpd_physical_activity = TabularCPD('Physical_Activity', 2, [[0.6], [0.4]])
cpd_diet_type = TabularCPD('Diet_Type', 2, [[0.8], [0.2]])


In [10]:
# Build the 2 x 512 table P(Diabetes | 9 binary parents) in one re-runnable
# step. Row 0 holds P(Diabetes=0), row 1 holds P(Diabetes=1); each column is
# one joint assignment of the parents.
#
# Column layout: pgmpy's TabularCPD orders columns with the FIRST evidence
# variable varying slowest. For column index i, the k-th parent in the
# evidence list is therefore read from bit (n_parents - 1 - k) of i, not bit k.
#
# Each entry is (parent, multiplier for Diabetes=0, multiplier for Diabetes=1)
# and is applied when that parent is in state 1. The order here must match the
# `evidence` list passed to the Diabetes TabularCPD.
parent_factors = [
    ('SNP_TCF7L2', 0.3, 0.7),
    ('SNP_FTO', 0.4, 0.6),
    ('SNP_CAPN10', 0.5, 0.5),
    ('Age', 0.4, 0.6),
    ('Family_History', 0.2, 0.8),
    ('BMI', 0.5, 0.5),
    ('Insulin_Level', 0.3, 0.7),
    ('Physical_Activity', 0.8, 0.2),
    # TODO(review): no multiplier was ever specified for the ninth parent;
    # (1.0, 1.0) keeps it neutral until a real value is chosen.
    ('Diet_Type', 1.0, 1.0),
]
n_parents = len(parent_factors)
num_combinations = 2 ** n_parents

# Baseline distribution used when every parent is in state 0.
values = np.empty((2, num_combinations))
values[0, :] = 0.9
values[1, :] = 0.1

# Apply each parent's multipliers to exactly the columns where it is in state 1.
column_index = np.arange(num_combinations)
for position, (_parent, no_factor, yes_factor) in enumerate(parent_factors):
    bit = n_parents - 1 - position
    parent_is_on = ((column_index >> bit) & 1).astype(bool)
    values[0, parent_is_on] *= no_factor
    values[1, parent_is_on] *= yes_factor

# Renormalise every column so it is a valid probability distribution.
# All multipliers are strictly positive, so no column total can be zero.
values /= values.sum(axis=0, keepdims=True)


In [14]:
# Conditional distribution of Diabetes given its nine binary parents.
# pgmpy maps the columns of `values` to parent assignments with the first
# entry of `evidence` varying slowest and the last entry varying fastest.
cpd_diabetes = TabularCPD(
    variable='Diabetes',
    variable_card=2,
    values=values,
    evidence=[
        'SNP_TCF7L2',
        'SNP_FTO',
        'SNP_CAPN10',
        'Age',
        'Family_History',
        'BMI',
        'Insulin_Level',
        'Physical_Activity',
        'Diet_Type',
    ],
    evidence_card=[2, 2, 2, 2, 2, 2, 2, 2, 2],
)



In [15]:
# Attach the nine root priors and the Diabetes conditional table to the network.
model.add_cpds(
    cpd_snp_tcf7l2,
    cpd_snp_fto,
    cpd_snp_capn10,
    cpd_age,
    cpd_family_history,
    cpd_bmi,
    cpd_insulin_level,
    cpd_physical_activity,
    cpd_diet_type,
    cpd_diabetes,
)

In [16]:
# Validate the network before running inference: the cell fails here if
# check_model() does not report the structure and attached CPDs as consistent.
assert model.check_model()

In [17]:
# Create a variable-elimination inference engine bound to the validated model.
inference = VariableElimination(model)

In [18]:
# Posterior over Diabetes for one fully observed individual: every parent is
# fixed to a state (0 or 1), so the answer is a single column of the CPD.
query_result = inference.query(
    variables=['Diabetes'],
    evidence=dict(
        SNP_TCF7L2=1,
        SNP_FTO=0,
        SNP_CAPN10=1,
        Age=1,
        Family_History=1,
        BMI=1,
        Insulin_Level=1,
        Physical_Activity=0,
        Diet_Type=1,
    ),
)




In [19]:
# Print a label, then the posterior factor table on the following line.
print("Query Result:", query_result, sep="\n")

Query Result:
+-------------+-----------------+
| Diabetes    |   phi(Diabetes) |
| Diabetes(0) |          0.2160 |
+-------------+-----------------+
| Diabetes(1) |          0.7840 |
+-------------+-----------------+
