In [2]:
import pandas as pd
import numpy as np

In [10]:
data=pd.read_excel('Merged.xlsx')

In [11]:
data.head()

Unnamed: 0,"WHEAT(CHAPATI,ROTI,NAAN,DALIA,RAWA/SOOJI,SEVIYAAN",WHEAT FREE CEREALS,FRUITS,OTHER VEGETABLES,"STARCHY(POTATO,SWEET PATATO,ARBI ETC)",PULSES AND LEGUMES,PREDOMINANT SATURATED FATS,PREDOMINANT UNSATURATED FATS,TRANS FATS,NUTS AND OILSEEDS,...,LOW LACTOSE DAIRY,SWEETEND BEVERAGES,ULTRA PROCESSED FOODS,READT TO EAT PACKAGED SNACKS,SAVORY SNACKS,PROCESSED FOODS,INDIAN SWEET MEATS,FOOD SUPPLEMENTS,ERGOGENIC SUPPLEMENTS,Label
0,4,3,7,11,3,6,0,4,0,1,...,0,3,4,0,4,1,4,0,0,1
1,4,9,10,13,3,5,4,4,0,2,...,2,0,3,6,3,5,3,0,0,1
2,4,2,5,15,2,6,4,0,0,1,...,4,0,0,3,2,4,4,0,0,1
3,4,2,5,10,4,8,0,0,0,1,...,1,2,2,0,0,0,2,0,0,1
4,4,3,6,12,4,4,4,8,1,4,...,4,2,2,4,2,7,3,4,0,1


In [14]:
data.shape

(1129, 23)

In [20]:
from sklearn.model_selection import train_test_split

# 22 features
feature_names = [
    "WHEAT(CHAPATI,ROTI,NAAN,DALIA,RAWA/SOOJI,SEVIYAAN)",
    "WHEAT FREE CEREALS",
    "FRUITS",
    "OTHER VEGETABLES",
    "STARCHY(POTATO,SWEET PATATO,ARBI ETC)",
    "PULSES AND LEGUMES",
    "PREDOMINANT SATURATED FATS",
    "PREDOMINANT UNSATURATED FATS",
    "TRANS FATS",
    "NUTS AND OILSEEDS",
    "LOW LACTOSE DAIRY",
    "SWEETEND BEVERAGES",
    "ULTRA PROCESSED FOODS",
    "READT TO EAT PACKAGED SNACKS",
    "SAVORY SNACKS",
    "PROCESSED FOODS",
    "INDIAN SWEET MEATS",
    "FOOD SUPPLEMENTS",
    "ERGOGENIC SUPPLEMENTS"
]

X = data.drop(columns=['Label'])
y = data['Label']  # Replace 'Target' with your actual label column


In [30]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42,stratify=y)

print(f"Training samples: {X_train.shape[0]}, Test samples: {X_test.shape[0]}")


Training samples: 903, Test samples: 226


In [36]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score, roc_auc_score

# Logistic Regression
log_model = LogisticRegression(max_iter=1000)
log_model.fit(X_train, y_train)

# Random Forest
rf_model = RandomForestClassifier(n_estimators=200, random_state=42)
rf_model.fit(X_train, y_train)

# XGBoost
xgb_model = xgb.XGBClassifier(eval_metric='logloss', random_state=42)
xgb_model.fit(X_train, y_train)


In [38]:
for name, model in zip(["Logistic Regression", "Random Forest", "XGBoost"], 
                       [log_model, rf_model, xgb_model]):
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:,1]
    print(f"{name} - Accuracy: {accuracy_score(y_test, y_pred):.2f}, AUC: {roc_auc_score(y_test, y_prob):.2f}")


Logistic Regression - Accuracy: 0.96, AUC: 0.99
Random Forest - Accuracy: 1.00, AUC: 1.00
XGBoost - Accuracy: 1.00, AUC: 1.00


In [40]:
import joblib

# Save Logistic Regression
joblib.dump(log_model, "logistic_model.pkl")

# Save Random Forest
joblib.dump(rf_model, "rf_model.pkl")

# Save XGBoost
joblib.dump(xgb_model, "xgb_model.pkl")

print("All models saved successfully!")


All models saved successfully!


In [42]:
log_model = joblib.load("logistic_model.pkl")
rf_model = joblib.load("rf_model.pkl")
xgb_model = joblib.load("xgb_model.pkl")


In [46]:
# ======================================================
# Train Logistic, Random Forest, XGBoost + SHAP explainers
# ======================================================

import pandas as pd
import numpy as np
import shap
import pickle
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score, roc_auc_score

# -------------------------

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# -------------------------
# 2️⃣ Train Models
# -------------------------

# Logistic Regression
log_model = LogisticRegression(max_iter=1000, random_state=42)
log_model.fit(X_train, y_train)

# Random Forest
rf_model = RandomForestClassifier(n_estimators=200, random_state=42)
rf_model.fit(X_train, y_train)

# XGBoost
xgb_model = xgb.XGBClassifier(eval_metric='logloss', random_state=42)
xgb_model.fit(X_train, y_train)

# -------------------------
# 3️⃣ Evaluate Models
# -------------------------
for name, model in [('Logistic', log_model), ('RandomForest', rf_model), ('XGBoost', xgb_model)]:
    preds = model.predict(X_test)
    probs = model.predict_proba(X_test)[:, 1]
    acc = accuracy_score(y_test, preds)
    auc = roc_auc_score(y_test, probs)
    print(f"{name}: Accuracy={acc:.3f}, AUC={auc:.3f}")

# -------------------------
# 4️⃣ SHAP Explainability
# -------------------------

# Logistic Regression (KernelExplainer)
explainer_log = shap.Explainer(log_model, X_train)
shap_values_log = explainer_log(X_test)

# Random Forest (TreeExplainer)
explainer_rf = shap.TreeExplainer(rf_model)
shap_values_rf = explainer_rf(X_test)

# XGBoost (TreeExplainer)
explainer_xgb = shap.TreeExplainer(xgb_model)
shap_values_xgb = explainer_xgb(X_test)


# -------------------------
# 6️⃣ Save Models & SHAP Explainers as Pickle Files
# -------------------------
with open("log_model.pkl", "wb") as f:
    pickle.dump(log_model, f)

with open("rf_model.pkl", "wb") as f:
    pickle.dump(rf_model, f)

with open("xgb_model.pkl", "wb") as f:
    pickle.dump(xgb_model, f)

# Save explainers (important: use small background set for compact size)
background = X_train.sample(n=min(100, len(X_train)), random_state=42)

explainer_log_small = shap.Explainer(log_model, background)
explainer_rf_small = shap.TreeExplainer(rf_model)
explainer_xgb_small = shap.TreeExplainer(xgb_model)

with open("explainer_log.pkl", "wb") as f:
    pickle.dump(explainer_log_small, f)

with open("explainer_rf.pkl", "wb") as f:
    pickle.dump(explainer_rf_small, f)

with open("explainer_xgb.pkl", "wb") as f:
    pickle.dump(explainer_xgb_small, f)

print("\n✅ All models and SHAP explainers saved successfully!")


Logistic: Accuracy=0.942, AUC=0.967
RandomForest: Accuracy=0.991, AUC=1.000
XGBoost: Accuracy=0.991, AUC=0.999

✅ All models and SHAP explainers saved successfully!


In [48]:
import pandas as pd
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
import shap

# Assuming X_train, y_train are already prepared

# Train models
log_model = LogisticRegression(max_iter=1000)
log_model.fit(X_train, y_train)

rf_model = RandomForestClassifier(n_estimators=200, random_state=42)
rf_model.fit(X_train, y_train)

xgb_model = xgb.XGBClassifier(eval_metric='logloss', random_state=42)
xgb_model.fit(X_train, y_train)

# --- Create SHAP Explainer using training background ---
rf_explainer = shap.TreeExplainer(rf_model, X_train)

# Save everything
joblib.dump(log_model, "log_model.pkl")
joblib.dump(rf_model, "rf_model.pkl")
joblib.dump(xgb_model, "xgb_model.pkl")
joblib.dump(rf_explainer, "rf_explainer.pkl")

print("✅ Models and explainer saved successfully!")


✅ Models and explainer saved successfully!
