In [12]:
# Leakage sanity check: collect every row label that appears in both the
# training and the test partition. An empty set means the two are disjoint.
# NOTE(review): in the visible part of this notebook X_train / X_test are only
# created in a later cell — confirm this still works after Restart & Run All.
set(X_train.index).intersection(X_test.index)


set()

In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
import joblib


# Read the raw table from disk.
df = pd.read_csv(r"D:\data sets\dataset-uci.csv")

# Predictor columns, one per line. The order here is the column order of X
# and therefore the order of everything derived from X downstream.
features = [
    'Gender',
    'Hyperlipidemia',
    'Alkaline Phosphatase (ALP)',
    'Intracellular Water (ICW)',
    'Alanin Aminotransferaz (ALT)',
    'Vitamin D',
    'C-Reactive Protein (CRP)',
    'Bone Mass (BM)',
    'Aspartat Aminotransferaz (AST)',
    'Extracellular Fluid/Total Body Water (ECF/TBW)',
    'Low Density Lipoprotein (LDL)',
    'Body Mass Index (BMI)',
]

# Design matrix (predictors) and the target column.
X = df.loc[:, features]
y = df.loc[:, 'Gallstone Status']


# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Learn the standardisation parameters from the training rows only, then
# apply that same transformation to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


# Fit the logistic regression on the standardised training rows.
full_model = LogisticRegression(solver='lbfgs', max_iter=1000).fit(X_train_scaled, y_train)


# Hard class predictions for the held-out rows.
y_pred = full_model.predict(X_test_scaled)


# Each heading and its table are printed on separate lines (sep="\n").
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")

# Predicted probabilities are clipped into [1e-10, 1 - 1e-10] so that np.log
# below never receives an exact 0.
proba_floor, proba_ceil = 1e-10, 1 - 1e-10

# Log-likelihood of the fitted model on the test rows.
# (the formula assumes y_test is coded 0/1 — TODO confirm)
test_proba_full = full_model.predict_proba(X_test_scaled)[:, 1]
p_full = np.clip(test_proba_full, proba_floor, proba_ceil)
ll_full = np.sum(y_test * np.log(p_full) + (1 - y_test) * np.log(1 - p_full))


# Baseline model: trained on a single all-zero column, so the coefficient has
# nothing to multiply and only the intercept can shape the prediction.
null_model = LogisticRegression(solver='lbfgs', max_iter=1000)
null_model.fit(np.zeros((y_train.shape[0], 1)), y_train)  # intercept only
null_inputs_test = np.zeros((y_test.shape[0], 1))
p_null = np.clip(null_model.predict_proba(null_inputs_test)[:, 1], proba_floor, proba_ceil)
ll_null = np.sum(y_test * np.log(p_null) + (1 - y_test) * np.log(1 - p_null))


# McFadden's pseudo R²: one minus the ratio of the two log-likelihoods.
mcfadden_r2 = 1 - ll_full / ll_null
print(f"McFadden's R²: {mcfadden_r2:.4f}")


# Persist the fitted classifier and the scaler it was trained with, so both can
# be reloaded together later. The second call stays the last expression of the
# cell, so its return value is still what the notebook displays.
joblib.dump(value=full_model, filename=r"D:\data sets\gallstone_model.pkl")
joblib.dump(value=scaler, filename=r"D:\data sets\scaler.pkl")


Confusion Matrix:
[[29  4]
 [ 8 23]]

Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.88      0.83        33
           1       0.85      0.74      0.79        31

    accuracy                           0.81        64
   macro avg       0.82      0.81      0.81        64
weighted avg       0.82      0.81      0.81        64

McFadden's R²: 0.3458


['D:\\data sets\\scaler.pkl']

In [20]:
# One row per predictor: the fitted coefficient together with the centring and
# scaling parameters the scaler applies to that column (presumably so the model
# can be re-applied outside this notebook — confirm with the CSV's consumer).
coefs = pd.DataFrame({
    'Feature': features,
    # First (and, for a two-class target, only) row of the coefficient matrix.
    'Coefficient': full_model.coef_[0],
    'Mean': scaler.mean_,
    # scale_ is the exact divisor StandardScaler.transform uses. It equals
    # sqrt(var_) for every column with non-zero variance, but for a constant
    # column sklearn substitutes 1.0; exporting sqrt(var_) there would hand a
    # downstream user a 0 divisor that the scaler itself never applies.
    'StdDev': scaler.scale_
})


# Scalar bias term of the fitted model.
intercept = full_model.intercept_[0]


# Write the coefficient table and the intercept to separate CSV files,
# without the pandas row index.
coefs.to_csv(r"D:\data sets\logistic_model_coeffs.csv", index=False)
pd.DataFrame({'Intercept': [intercept]}).to_csv(r"D:\data sets\logistic_model_intercept.csv", index=False)


In [21]:
# Single-row metrics table (metric name / value), written without the row index.
metrics_table = pd.DataFrame([{'Metric': 'McFadden_R2', 'Value': mcfadden_r2}])
metrics_table.to_csv(r"D:\data sets\model_metrics.csv", index=False)

In [23]:
# Restore the classifier and its scaler from the files written earlier in
# this notebook. joblib files are pickles: only load ones you produced yourself.
model = joblib.load(filename=r"D:\data sets\gallstone_model.pkl")
scaler = joblib.load(filename=r"D:\data sets\scaler.pkl")


# Re-read the full dataset from disk.
df = pd.read_csv(r"D:\data sets\dataset-uci.csv")


# Predictor columns, listed one per line. The scaler and model loaded above are
# applied to X positionally, so this order must match the order used at fit time.
features = [
    'Gender',
    'Hyperlipidemia',
    'Alkaline Phosphatase (ALP)',
    'Intracellular Water (ICW)',
    'Alanin Aminotransferaz (ALT)',
    'Vitamin D',
    'C-Reactive Protein (CRP)',
    'Bone Mass (BM)',
    'Aspartat Aminotransferaz (AST)',
    'Extracellular Fluid/Total Body Water (ECF/TBW)',
    'Low Density Lipoprotein (LDL)',
    'Body Mass Index (BMI)',
]

# Predictors and target for every row of the file.
X = df.loc[:, features]
y = df.loc[:, 'Gallstone Status']

# Standardise every row with the reloaded scaler.
X_scaled = scaler.transform(X)

# Positive-class probability per row, clipped away from exact 0 and 1.
positive_proba = model.predict_proba(X_scaled)[:, 1]
p_full = np.clip(positive_proba, 1e-10, 1 - 1e-10)

# Constant baseline: the overall mean of y, repeated once per row.
# NOTE(review): these columns cover all rows of the file and use the mean of
# all of y, whereas the mcfadden_r2 value computed earlier in this notebook used
# only the held-out split — confirm which one the consumer of this CSV expects.
p_null = np.full(p_full.shape, y.mean(), dtype=p_full.dtype)

# One row per observation: actual label, model probability, baseline probability.
mcfadden_inputs = pd.DataFrame({
    'Actual': y,
    'P_Full': p_full,
    'P_Null': p_null,
})
mcfadden_inputs.to_csv(r"D:\data sets\mcfadden_input.csv", index=False)
