In [None]:
# -------------------------------
# Step 1. Imports
# -------------------------------
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
import joblib

# -------------------------------
# Step 2. Load Dataset
# -------------------------------
# The CSV is read from the current working directory.
file_path = "osteoporosisfinal (1).csv"
df = pd.read_csv(file_path)

# -------------------------------
# Step 3. Feature/Target Split
# -------------------------------
# "Osteoporosis" is the label column; "Id" is excluded from the model inputs
# together with the label itself.
target_column = "Osteoporosis"
y = df[target_column]
X = df.drop(columns=["Id", target_column])

# -------------------------------
# Step 4. Train-Test Split
# -------------------------------
# 80/20 split, stratified on the label, with a fixed seed so the same rows
# land in each split on every run.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# -------------------------------
# Step 5. Define Preprocessor
# -------------------------------
# Columns without a natural order -> one-hot encoded.
categorical_nominal = ["Gender", "Race", "MedCondition", "Medications"]

# Columns with an explicit category order; each list gives the order that is
# passed to the ordinal encoder for that column.
categorical_ordinal = {
    "Weight": ["Underweight", "Normal", "Overweight"],
    "CalciumIn": ["Low", "Adequate", "High"],
    "Activity": ["Sedentary", "Moderate", "Active"],
    "Hormone": ["Normal", "Postmenopausal"],
}

# Yes/No columns, encoded with the fixed order ["No", "Yes"].
binary_cols = ["FHistory", "Fractures", "Smoking"]

# NOTE(review): with drop="first", a category ignored via handle_unknown="ignore"
# is presumably encoded as all zeros, i.e. the same as the dropped reference
# level -- confirm this is acceptable for unseen values at prediction time.
nominal_encoder = OneHotEncoder(
    drop="first",
    handle_unknown="ignore",
    sparse_output=False,
)

# Dict insertion order keeps the category lists aligned with the column list
# handed to the transformer below.
ordinal_encoder = OrdinalEncoder(categories=list(categorical_ordinal.values()))

# One ["No", "Yes"] ordering per binary column.
binary_encoder = OrdinalEncoder(categories=[["No", "Yes"] for _ in binary_cols])

age_scaler = StandardScaler()

# Columns not listed in any transformer are dropped from the output.
preprocessor = ColumnTransformer(
    transformers=[
        ("nom", nominal_encoder, categorical_nominal),   # Nominal (One-Hot)
        ("ord", ordinal_encoder, list(categorical_ordinal)),  # Ordinal
        ("bin", binary_encoder, binary_cols),            # Binary (Yes/No)
        ("num", age_scaler, ["Age"]),                    # Numeric
    ],
    remainder="drop",
)

# -------------------------------
# Step 6. Fit Preprocessor
# -------------------------------
# Fit on the training split only. fit_transform() fits and encodes in a single
# pass, so the training data is not encoded a second time by a separate
# transform(X_train) call after fit().
X_train_array = preprocessor.fit_transform(X_train)

# Get transformed feature names.
# The concatenation order below has to mirror the transformer order declared in
# `preprocessor` (nom, ord, bin, num), since these names are applied
# positionally to the transformed matrix.
ohe = preprocessor.named_transformers_["nom"]
ohe_features = ohe.get_feature_names_out(categorical_nominal)
ord_features = list(categorical_ordinal.keys())
bin_features = binary_cols
num_features = ["Age_scaled"]  # You can use 'Age' or 'Age_scaled' as preferred

final_features = list(ohe_features) + ord_features + bin_features + num_features

# Transform to DataFrames, keeping the original row index of each split so the
# processed rows stay aligned with y_train / y_test.
X_train_processed = pd.DataFrame(
    X_train_array,
    columns=final_features,
    index=X_train.index,
)
X_test_processed = pd.DataFrame(
    preprocessor.transform(X_test),
    columns=final_features,
    index=X_test.index,
)

# -------------------------------
# Step 7. Train Model
# -------------------------------
# Baseline logistic regression with balanced class weights, trained on the
# preprocessed training split.
clf = LogisticRegression(class_weight="balanced", max_iter=1000).fit(
    X_train_processed, y_train
)

# -------------------------------
# Step 8. Evaluate Model
# -------------------------------
# Column 1 of predict_proba is used as the score for ROC-AUC
# (presumably the probability of label 1 -- verify against clf.classes_).
y_prob = clf.predict_proba(X_test_processed)[:, 1]
y_pred = clf.predict(X_test_processed)

report_text = classification_report(y_test, y_pred)
auc_value = roc_auc_score(y_test, y_prob)

print(report_text)
print("ROC-AUC:", auc_value)

# -------------------------------
# Step 9. Save Preprocessor + Model
# -------------------------------
# Persist the fitted preprocessor, the trained classifier and the ordered
# feature-name list as three separate files in the working directory.
artifacts_to_save = {
    "preprocessor.pkl": preprocessor,
    "baseline_model.pkl": clf,
    "final_features.pkl": final_features,
}
for artifact_path, artifact in artifacts_to_save.items():
    joblib.dump(artifact, artifact_path)
# -------------------------------
# Step 10. Predict on New Patient
# -------------------------------
# A single raw (un-encoded) patient record, using the same column names as the
# training features; the fitted preprocessor handles all encoding and scaling.
patient_record = {
    "Age": 65,
    "Gender": "Female",
    "Hormone": "Postmenopausal",
    "FHistory": "Yes",
    "Race": "Caucasian",
    "Weight": "Underweight",
    "CalciumIn": "Low",
    "Activity": "Sedentary",
    "Smoking": "Yes",
    "MedCondition": "Rheumatoid Arthritis",
    "Medications": "Corticosteroids",
    "Fractures": "Yes",
}
new_patient = pd.DataFrame([patient_record])

# Preprocess + predict
new_patient_encoded = preprocessor.transform(new_patient)
new_patient_processed = pd.DataFrame(new_patient_encoded, columns=final_features)

predicted_label = clf.predict(new_patient_processed)
predicted_risk = clf.predict_proba(new_patient_processed)[:, 1]

print("Prediction:", predicted_label)
print("Risk Probability:", predicted_risk)


              precision    recall  f1-score   support

           0       0.77      0.88      0.82       196
           1       0.86      0.74      0.79       196

    accuracy                           0.81       392
   macro avg       0.81      0.81      0.81       392
weighted avg       0.81      0.81      0.81       392

ROC-AUC: 0.879399208663057
Prediction: [1]
Risk Probability: [0.9939181]
