In [3]:
# ===============================
# Step 1: Data Loading and Preprocessing
# ===============================
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the merged dataset from the Excel workbook.
data = pd.read_excel("Merged.xlsx")

# The "Label" column is the target; every remaining column is a feature.
y = data["Label"]
X = data.drop("Label", axis=1)

# Stratified 80/20 train-test split with a fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# Report the size of the full table and of each split.
print("✅ Data loaded successfully!")
print("data.shape:", data.shape)
print("Training samples:", len(X_train))
print("Test samples:", len(X_test))


✅ Data loaded successfully!
data.shape: (1129, 23)
Training samples: 903
Test samples: 226


In [9]:
# ===============================
# Step 2: Model Training
# ===============================
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, accuracy_score
import joblib

# Logistic Regression (up to 1000 solver iterations), fitted on the training split
log_model = LogisticRegression(max_iter=1000, random_state=42)
log_model.fit(X_train, y_train)

# Random Forest: 200 trees, each limited to depth 6
rf_model = RandomForestClassifier(
    n_estimators=200, random_state=42, max_depth=6
)
rf_model.fit(X_train, y_train)

# XGBoost: 200 boosting rounds, learning rate 0.05, depth 4, log-loss eval metric
# NOTE: `use_label_encoder=False` was removed from this call. The installed
# XGBoost does not use it (this cell's recorded output contains the warning
# 'Parameters: { "use_label_encoder" } are not used.'), so it only produced
# warning noise and had no effect on the fitted model.
xgb_model = XGBClassifier(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=4,
    random_state=42,
    eval_metric="logloss",
)
xgb_model.fit(X_train, y_train)

# ===============================
# Step 3: Evaluate Models
# ===============================
# Display name -> fitted classifier; insertion order is the reporting order.
models = {}
models["Logistic Regression"] = log_model
models["Random Forest"] = rf_model
models["XGBoost"] = xgb_model

print("\n📊 Model Performance:")
for name, model in models.items():
    # Column 1 of predict_proba is the score fed to ROC AUC.
    y_prob = model.predict_proba(X_test)[:, 1]
    y_pred = model.predict(X_test)
    auc = roc_auc_score(y_test, y_prob)
    acc = accuracy_score(y_test, y_pred)
    print(f"{name:20s}  Accuracy: {acc:.3f}  AUC: {auc:.3f}")

# =============================


📊 Model Performance:
Logistic Regression   Accuracy: 0.956  AUC: 0.991
Random Forest         Accuracy: 0.996  AUC: 0.999
XGBoost               Accuracy: 0.996  AUC: 1.000


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [11]:
# ======================================
# IBD Prediction Model Training (No Scaling)
# ======================================
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
import joblib

# -------------------------
# Load and split data
# -------------------------
data = pd.read_excel("Merged.xlsx")

# Target is the "Label" column; all other columns are used as features.
y = data["Label"]
X = data.drop("Label", axis=1)

# Stratified 80/20 split with a fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# -------------------------
# Train models
# -------------------------
# Logistic Regression (up to 1000 solver iterations)
log_model = LogisticRegression(max_iter=1000, random_state=42)
log_model.fit(X_train, y_train)

# Random Forest: 200 trees, each limited to depth 6
rf_model = RandomForestClassifier(
    n_estimators=200, random_state=42, max_depth=6
)
rf_model.fit(X_train, y_train)

# XGBoost: 200 boosting rounds, learning rate 0.05, depth 4, log-loss eval metric
# NOTE: `use_label_encoder=False` was removed from this call. The installed
# XGBoost does not use it (this cell's recorded output contains the warning
# 'Parameters: { "use_label_encoder" } are not used.'), so it only produced
# warning noise and had no effect on the fitted model.
xgb_model = XGBClassifier(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=4,
    random_state=42,
    eval_metric="logloss",
)
xgb_model.fit(X_train, y_train)

# -------------------------
# Evaluate models
# -------------------------
# Display name -> fitted classifier, listed in reporting order.
models = dict(
    [
        ("Logistic Regression", log_model),
        ("Random Forest", rf_model),
        ("XGBoost", xgb_model),
    ]
)

print("\n📊 Model Performance (No Scaling):")
for name, model in models.items():
    # Column 1 of predict_proba is the score fed to ROC AUC.
    y_prob = model.predict_proba(X_test)[:, 1]
    y_pred = model.predict(X_test)
    auc = roc_auc_score(y_test, y_prob)
    acc = accuracy_score(y_test, y_pred)
    print(f"{name:20s}  Accuracy: {acc:.3f}  AUC: {auc:.3f}")

# -------------------------
# Save models
# -------------------------
# Each fitted model is pickled with joblib under a relative file name,
# so the files land in the kernel's working directory.
for _fitted_model, _filename in (
    (log_model, "logistic_model.pkl"),
    (rf_model, "rf_model.pkl"),
    (xgb_model, "xgb_model.pkl"),
):
    joblib.dump(_fitted_model, _filename)

print("\n💾 Models saved successfully (no scaling used)!")


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



📊 Model Performance (No Scaling):
Logistic Regression   Accuracy: 0.956  AUC: 0.991
Random Forest         Accuracy: 0.996  AUC: 0.999
XGBoost               Accuracy: 0.996  AUC: 1.000

💾 Models saved successfully (no scaling used)!


In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
import joblib
import shap

# --------------------------
# Load Dataset
# --------------------------
data = pd.read_excel("Merged.xlsx")
# "Label" is the target column; the rest of the table is the feature matrix.
y = data["Label"]
X = data.drop("Label", axis=1)

# --------------------------
# Train-Test Split
# --------------------------
# Stratified on the target, 20% held out, fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# --------------------------
# Train Models
# --------------------------
# Logistic Regression (up to 1000 solver iterations)
log_model = LogisticRegression(max_iter=1000, random_state=42)
log_model.fit(X_train, y_train)

# Random Forest: 200 trees, each limited to depth 6
rf_model = RandomForestClassifier(n_estimators=200, max_depth=6, random_state=42)
rf_model.fit(X_train, y_train)

# XGBoost: 200 boosting rounds, learning rate 0.05, depth 4, log-loss eval metric
# NOTE: `use_label_encoder=False` was removed from this call. The installed
# XGBoost does not use it (this cell's recorded output contains the warning
# 'Parameters: { "use_label_encoder" } are not used.'), so it only produced
# warning noise and had no effect on the fitted model.
xgb_model = XGBClassifier(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=4,
    random_state=42,
    eval_metric="logloss",
)
xgb_model.fit(X_train, y_train)

# --------------------------
# Evaluate Models
# --------------------------
# Display name -> fitted classifier; insertion order is the reporting order.
models = {}
models["Logistic Regression"] = log_model
models["Random Forest"] = rf_model
models["XGBoost"] = xgb_model

print("\n📊 Model Performance:")
for name, model in models.items():
    # Column 1 of predict_proba is the score fed to ROC AUC.
    y_prob = model.predict_proba(X_test)[:, 1]
    y_pred = model.predict(X_test)
    auc = roc_auc_score(y_test, y_prob)
    acc = accuracy_score(y_test, y_pred)
    print(f"{name:20s} Accuracy: {acc:.3f}  AUC: {auc:.3f}")

# --------------------------
# Compute SHAP values (Random Forest)
# --------------------------
explainer = shap.TreeExplainer(rf_model)
# SHAP values are computed over the training features.
# NOTE(review): whether this returns a list of per-class arrays or a single
# array depends on the installed shap version — confirm before indexing it.
shap_values = explainer.shap_values(X_train)

# Save models and SHAP values
# The training features are pickled alongside the SHAP values computed on them.
for _artifact, _filename in (
    (log_model, "logistic_model.pkl"),
    (rf_model, "rf_model.pkl"),
    (xgb_model, "xgb_model.pkl"),
    (shap_values, "rf_shap_values.pkl"),
    (X_train, "X_train.pkl"),
):
    joblib.dump(_artifact, _filename)

print("\n💾 Models and SHAP values saved successfully!")


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



📊 Model Performance:
Logistic Regression  Accuracy: 0.956  AUC: 0.991
Random Forest        Accuracy: 0.996  AUC: 0.999
XGBoost              Accuracy: 0.996  AUC: 1.000

💾 Models and SHAP values saved successfully!


In [27]:
import os
# Display the kernel's current working directory as this cell's output.
os.getcwd()


'C:\\Users\\Asus\\MTP PROJECT'

In [35]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import joblib

# -------------------------------
# Load dataset
# -------------------------------
data = pd.read_excel("Merged.xlsx")
# "Label" is the target column; everything else forms the feature matrix.
y = data["Label"]
X = data.drop("Label", axis=1)

# -------------------------------
# Train-test split
# -------------------------------
# Stratified on the target, 20% held out, fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# -------------------------------
# Train models
# -------------------------------
# Logistic Regression (up to 1000 solver iterations)
log_model = LogisticRegression(max_iter=1000, random_state=42)
log_model.fit(X_train, y_train)

# Random Forest: 200 trees, each limited to depth 6
rf_model = RandomForestClassifier(n_estimators=200, max_depth=6, random_state=42)
rf_model.fit(X_train, y_train)

# XGBoost: 200 boosting rounds, learning rate 0.05, depth 4, log-loss eval metric
# NOTE: `use_label_encoder=False` was removed from this call. The installed
# XGBoost does not use it (this cell's recorded output contains the warning
# 'Parameters: { "use_label_encoder" } are not used.'), so it only produced
# warning noise and had no effect on the fitted model.
xgb_model = XGBClassifier(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=4,
    eval_metric="logloss",
    random_state=42,
)
xgb_model.fit(X_train, y_train)

# -------------------------------
# Save models & feature names
# -------------------------------
import os

# Write the artifacts into the current working directory instead of a
# hard-coded, machine-specific absolute path. The notebook's recorded
# os.getcwd() output shows the same folder, and the earlier save cells also
# write relative to the working directory. File paths are built with
# os.path.join rather than manual "\\" concatenation so the cell is not tied
# to one machine or operating system.
folder_path = os.getcwd()

joblib.dump(log_model, os.path.join(folder_path, "logistic_model.pkl"))
joblib.dump(rf_model, os.path.join(folder_path, "rf_model.pkl"))
joblib.dump(xgb_model, os.path.join(folder_path, "xgb_model.pkl"))

# Save the exact feature column names, in order (important for SHAP)
feature_names = list(X.columns)
joblib.dump(feature_names, os.path.join(folder_path, "rf_features.pkl"))

print(f"✅ Models and feature names saved in '{folder_path}'")


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


✅ Models and feature names saved in 'C:\Users\Asus\MTP PROJECT'
