<a href="https://colab.research.google.com/github/ghazala-rabbani/ghazala3007/blob/main/XGBoost_Disease_Diagnosis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Disease diagnosis using XGBoost (with a Random Forest baseline)


import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

# Source data: the Pima Indians Diabetes dataset, used here as an example.
# Point `url` at a different CSV to run the notebook on another medical dataset.
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"

# Column names are supplied by hand because the file is read with header=None.
# The last entry ("Outcome") is the label column.
columns = [
    "Pregnancies",
    "Glucose",
    "BloodPressure",
    "SkinThickness",
    "Insulin",
    "BMI",
    "DiabetesPedigreeFunction",
    "Age",
    "Outcome",
]

# header=None makes pandas treat the first line of the file as data, not as a header.
data = pd.read_csv(url, names=columns, header=None)




In [None]:
# Preview the first 30 rows of the dataset.

# Keep wide frames on a single line whenever a DataFrame is rendered as plain text.
# This is a global pandas option, so it also applies to cells that run after this one.
pd.set_option('display.expand_frame_repr', False)

# A bare last expression uses the notebook's rich table rendering, which is easier
# to read than the plain-text dump produced by print().
data.head(30)


In [None]:
# DataFrame.info() writes its summary (dtypes, non-null counts, memory usage) to
# stdout itself and returns None, so it is called directly: wrapping it in print()
# would append a stray "None" line to the output.
data.info()

In [None]:
# Separate predictors from the label by position: every column except the last is a
# feature, and the last column is the target (Outcome: 0 = No diabetes, 1 = Diabetes).
X, y = data.iloc[:, :-1], data.iloc[:, -1]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Setting up RandomForest and XGBoost

In [None]:
# Random Forest baseline
from sklearn.ensemble import RandomForestClassifier

# Build a 100-tree forest with a fixed seed and train it on the training split.
# fit() returns the estimator itself, so construction and training are chained.
rf_model = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Score the held-out rows: hard class labels, plus the probability of the second
# class (column index 1 of predict_proba's output, i.e. Outcome = 1 here).
rf_y_pred = rf_model.predict(X_test)
rf_class_probabilities = rf_model.predict_proba(X_test)
rf_y_pred_proba = rf_class_probabilities[:, 1]

# Only the first ten values of each are shown to keep the output short.
print("Random Forest Model Predictions:", rf_y_pred[:10])
print("Random Forest Model Prediction Probabilities:", rf_y_pred_proba[:10])

# Fraction of test rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=rf_y_pred)
print("Accuracy:", accuracy)

In [None]:
# Evaluate the Random Forest baseline.
#
# This cell reuses `rf_model`, `rf_y_pred` and `rf_y_pred_proba` from the Random
# Forest training cell instead of fitting an identical model a second time (same
# data, same hyperparameters, same seed would only reproduce the same output).
# It reports the metrics needed to compare the baseline against XGBoost.

# Per-class precision, recall and F1 for the hard label predictions.
print("Random Forest Classification Report:")
print(classification_report(y_test, rf_y_pred))

# ROC AUC is computed from the class-1 probabilities, not from the hard labels.
rf_roc_auc = roc_auc_score(y_test, rf_y_pred_proba)
print("Random Forest ROC AUC Score:", rf_roc_auc)

In [None]:
# XGBoost classifier for the binary diabetes outcome.
# Hyperparameters: logistic objective, 50 boosting rounds, trees of depth at most 3,
# learning rate 0.1; the seed is fixed for reproducibility.
model = xgb.XGBClassifier(
    objective="binary:logistic",
    random_state=42,
    learning_rate=0.1,
    max_depth=3,
    n_estimators=50,
)

# Fit on the training split. Left as the cell's last expression so the notebook
# still displays the fitted estimator.
model.fit(X_train, y_train)



In [None]:
# Peek at the held-out feature rows. Only the head is shown, as a rich table,
# rather than printing the whole test frame as plain text.
X_test.head(10)

In [None]:
# Score the test set with the fitted XGBoost model.
# predict() gives the hard class labels. predict_proba() gives one column per class:
#   column 0 -> probability of class 0 (e.g., No diabetes)
#   column 1 -> probability of class 1 (e.g., Diabetes)
# Only column 1 is kept in y_pred_proba.
y_pred = model.predict(X_test)
class_probabilities = model.predict_proba(X_test)
y_pred_proba = class_probabilities[:, 1]

# Labels first, then the class-1 probabilities, each on its own line.
print(y_pred, y_pred_proba, sep="\n")

In [None]:
# Confusion matrix for the XGBoost predictions on the test set.
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Predictions are recomputed here so this cell has every variable it needs
# without relying on an earlier prediction cell.
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

# Rows of `cm` are the actual classes, columns are the predicted classes.
cm = confusion_matrix(y_test, y_pred)

# The same tick labels are used on both axes.
class_labels = ['No Diabetes', 'Diabetes']

# Draw the annotated heatmap on an explicit 8x6 inch axes.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",  # integer counts in every cell
    cmap="Blues",
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()

In [None]:

# Make predictions on the test set with the XGBoost model: hard class labels,
# and the class-1 probability (column index 1 of predict_proba's output).
y_pred, y_pred_proba = model.predict(X_test), model.predict_proba(X_test)[:, 1]



In [None]:
# Evaluate the XGBoost model on the test set.
# Inputs: `y_test` (true labels), `y_pred` (hard labels) and `y_pred_proba`
# (class-1 probabilities) for the same test rows, plus the fitted `model`.
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt

# Accuracy = (TP + TN) / (TP + TN + FP + FN)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Per-class precision, recall and F1.
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# ROC AUC is threshold-independent, so it is computed from the class-1
# probabilities rather than from the hard labels.
roc_auc = roc_auc_score(y_test, y_pred_proba)
print("ROC AUC Score:", roc_auc)

# F1-score of the positive class (f1_score's default behaviour for binary targets).
f1 = f1_score(y_test, y_pred)
print(f"\nF1-score: {f1:.4f}")

# Feature importance. importance_type="weight" ranks features by how many times
# they are used to split across all trees.
xgb.plot_importance(model, importance_type="weight")
plt.title("Feature Importance")
plt.show()

## Summary:

### Data Analysis Key Findings

*   The "Pima Indians Diabetes Dataset" was successfully loaded into a Pandas DataFrame named `data` from the specified URL: `https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv`.
*   The dataset contains 768 entries (rows) and 9 columns: `Pregnancies`, `Glucose`, `BloodPressure`, `SkinThickness`, `Insulin`, `BMI`, `DiabetesPedigreeFunction`, `Age`, and `Outcome`.
*   All columns have appropriate numerical data types; specifically, `Pregnancies`, `Glucose`, `BloodPressure`, `SkinThickness`, `Insulin`, `Age`, and `Outcome` are `int64`, while `BMI` and `DiabetesPedigreeFunction` are `float64`.
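*   No column contains null (NaN) values: every column reports 768 non-null entries. Note that this check only detects nulls; it does not detect missing measurements that were recorded as a placeholder value such as `0`.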

### Insights or Next Steps

*   The dataset contains no null (NaN) values, so no null handling is required, which streamlines the initial data preparation phase.
*   The next step should involve exploratory data analysis to understand the distribution of variables, identify potential outliers or inconsistencies (e.g., zero values for `BloodPressure` or `Glucose`), and analyze the relationships between features and the `Outcome` variable.
