<a href="https://colab.research.google.com/github/ghazala-rabbani/ghazala3007/blob/main/Diabetes_Diagnosis_LearningComplexRelationship.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Importing Pandas and Scikit-learn
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# Why Random Forest instead of Logistic Regression this time?
# Because the relationship here is complex rather than linear:
#   - risk rising roughly linearly with one feature (e.g. glucose) -> Logistic Regression
#   - complex interactions between age, BMI, insulin, genetics     -> Random Forest

# Python tip: `data` is a dictionary (key-value pairs). Each key is a column
# name and each value is the list of entries for that column.
# Other common containers are lists [x, y, z] and tuples (a, b, c).
data = dict(
    Pregnancies=[6, 1, 8, 1, 0, 5, 3, 10, 2, 4],
    Glucose=[148, 85, 183, 89, 137, 116, 78, 115, 197, 125],
    BloodPressure=[72, 66, 64, 66, 40, 74, 50, 0, 70, 80],
    BMI=[33.6, 26.6, 23.3, 28.1, 43.1, 25.6, 31.0, 35.3, 30.5, 32.0],
    Age=[50, 31, 32, 21, 33, 30, 26, 29, 53, 45],
    Diabetes=[1, 0, 1, 0, 1, 0, 0, 1, 1, 1],
)

# Turn the column dictionary into a table with one row per record,
# then print the whole (10-row) table.
df = pd.DataFrame(data)
print(df)

In [None]:
# Separate the target from the inputs: 'Diabetes' is the label to predict,
# and every remaining column of df is used as a feature.
y = df['Diabetes']
X = df.drop(columns=['Diabetes'])


In [None]:
# Split the data into training and test sets.
# The dataset is so small that a split is not really needed; it is kept here
# because it is part of the standard workflow.
# stratify=y keeps the proportion of Diabetes = 0 / 1 rows roughly the same in
# both parts. Without it, the random 3-row test set (30% of 10 rows) can end up
# containing a single class, which makes the accuracy, confusion matrix and
# classification report computed later meaningless.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [None]:
# What is the Random Forest algorithm? Many trees, one combined vote.
#   - The forest builds many decision trees; each tree sees different data.
#   - At each split a tree considers only a random subset of the features.
#   - The final prediction pools the votes of all the trees in the forest
#     (10 trees here, as set by n_estimators below).
# When to use it: the relationship is complex, the features are not too many,
# and the data is still tabular (e.g. CSV).
forest_settings = {
    "n_estimators": 10,  # number of trees in the forest
    "random_state": 42,  # fixed seed so repeated runs build the same forest
}
model = RandomForestClassifier(**forest_settings)

model.fit(X_train, y_train)


In [None]:
# Use the fitted forest to predict a Diabetes label for each row of the test features.
y_pred = model.predict(X_test)


In [None]:
# Compare the predictions with the true test labels.
print("Accuracy:", accuracy_score(y_test, y_pred))

# labels=[0, 1] pins the matrix to 2x2 (index 0 = no diabetes, 1 = diabetes)
# even when the tiny test set or the predictions contain only one class;
# otherwise the matrix silently shrinks to 1x1 and is hard to read.
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred, labels=[0, 1]))

# zero_division=0 reports 0 for precision/recall/F1 whose denominator is zero
# (e.g. a class the model never predicted) instead of raising
# UndefinedMetricWarning, which is likely with only a few test rows.
print(
    "\nClassification Report:\n",
    classification_report(y_test, y_pred, zero_division=0),
)


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
# Compute the confusion matrix once, print the raw counts, then draw it.
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Draw on an explicit Axes so the title is attached to this plot directly
# rather than to whichever axes pyplot currently considers active.
fig, ax = plt.subplots()
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap=plt.cm.Blues, ax=ax)
ax.set_title("Confusion Matrix")
plt.show()

In [None]:
# Pair each feature name with the importance score of the fitted forest,
# then print the features ordered from the highest score to the lowest.
importances = pd.Series(data=model.feature_importances_, index=X.columns)
ranked_importances = importances.sort_values(ascending=False)
print(ranked_importances)


In [None]:
# Describe one new patient as a single-row table. The columns are listed in
# the same order as the feature columns the model was trained on.
new_patient = pd.DataFrame([{
    'Pregnancies': 2,
    'Glucose': 140,
    'BloodPressure': 70,
    'BMI': 32.5,
    'Age': 45,
}])

# Predicted label for that single row.
prediction = model.predict(new_patient)[0]
# predict_proba gives one row per sample; column index 1 is taken as the
# probability of label 1 (assumes the model was trained on labels 0 and 1).
probability = model.predict_proba(new_patient)[0, 1]

if prediction == 1:
    verdict = "Yes"
else:
    verdict = "No"

print("Diabetes Prediction:", verdict)
print("Risk Probability:", round(probability * 100, 2), "%")


In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Horizontal bar chart of the forest's feature importances.
# This cell relies on `model` (already fitted) and `X` from the earlier cells.
importances = pd.Series(model.feature_importances_, index=X.columns)

# Sort once, smallest first; in a horizontal bar chart the last entry is drawn
# at the top, so the most important feature ends up as the top bar.
sorted_importances = importances.sort_values(ascending=True)

# One colour per bar, in the same order as the sorted scores:
# orange when the score is above zero, red otherwise.
colors = ['orange' if score > 0 else 'red' for score in sorted_importances]

fig, ax = plt.subplots(figsize=(10, 6))
sorted_importances.plot(kind='barh', color=colors, ax=ax)
ax.set_title("Feature Importance")
ax.set_xlabel("Importance Score")
ax.set_ylabel("Features")
plt.show()