In [3]:
# ===================================================================
# Step 1: Setup and Imports
# ===================================================================
import pandas as pd
from datetime import datetime
import os
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import joblib # Added import for saving model

print("Step 1: All libraries imported successfully.")

# ===================================================================
# Step 2: Load Raw Data
# ===================================================================
print("\nStep 2: Loading raw data...")

# --- PATHS CORRECTED ---
# Paths are now relative to the 'notebooks' folder
path_to_raw_data = '../../data_collection/synthea/output/csv/'
path_to_processed_data = '../data/processed/'
# --- END CORRECTION ---

os.makedirs(path_to_processed_data, exist_ok=True)

try:
    patients_df = pd.read_csv(path_to_raw_data + 'patients.csv')
    conditions_df = pd.read_csv(path_to_raw_data + 'conditions.csv')
    observations_df = pd.read_csv(path_to_raw_data + 'observations.csv')
    print("-> Raw data files loaded successfully.")
except FileNotFoundError as e:
    print(f"\nERROR: Could not find raw data files. Checked location: {path_to_raw_data}")
    print(f"Full error: {e}")
    # exit() # Commented out exit() to allow notebook to run partially
except Exception as e:
    print(f"\nAn unexpected error occurred loading data: {e}")
    # exit()

# ===================================================================
# Step 3: Create the Target Variable (Diabetes Label)
# ===================================================================
print("\nStep 3: Creating the target variable (DIABETES)...")

# Ensure dataframes loaded before proceeding
if 'patients_df' in locals() and 'conditions_df' in locals():
    diabetes_conditions = conditions_df[conditions_df['DESCRIPTION'].str.contains('diabetes', case=False) & ~conditions_df['DESCRIPTION'].str.contains('Prediabetes', case=False)]
    patients_with_diabetes = diabetes_conditions['PATIENT'].unique()
    patients_df['DIABETES'] = patients_df['Id'].isin(patients_with_diabetes).astype(int)
    print(f"-> Identified {len(patients_with_diabetes)} patients with a diabetes diagnosis.")
else:
    print("-> Skipping Step 3, raw data not loaded.")

# ===================================================================
# Step 4: Feature Engineering
# ===================================================================
print("\nStep 4: Engineering features for Diabetes prediction...")

def get_latest_observation(df, code, new_column_name):
    temp_df = df[df['CODE'] == code].copy()
    temp_df['DATE'] = pd.to_datetime(temp_df['DATE'])
    temp_df = temp_df.sort_values(by=['PATIENT', 'DATE'])
    latest_obs = temp_df.drop_duplicates(subset='PATIENT', keep='last')
    latest_obs = latest_obs[['PATIENT', 'VALUE']].rename(columns={'VALUE': new_column_name})
    return latest_obs

if 'patients_df' in locals() and 'observations_df' in locals():
    patients_df['BIRTHDATE'] = pd.to_datetime(patients_df['BIRTHDATE'])
    current_year = datetime.now().year
    patients_df['AGE'] = current_year - patients_df['BIRTHDATE'].dt.year
    latest_bmi = get_latest_observation(observations_df, '39156-5', 'BMI')
    latest_systolic = get_latest_observation(observations_df, '8480-6', 'SYSTOLIC_BP')
    latest_diastolic = get_latest_observation(observations_df, '8462-4', 'DIASTOLIC_BP')
    latest_cholesterol = get_latest_observation(observations_df, '2093-3', 'TOTAL_CHOLESTEROL')
    latest_smoking = get_latest_observation(observations_df, '72166-2', 'SMOKING_STATUS')
    latest_smoking['SMOKING_STATUS'] = latest_smoking['SMOKING_STATUS'].replace({
        'Never smoker': 'non-smoker',
        'Former smoker': 'former-smoker',
        'Current every day smoker': 'smoker'
    }).fillna('unknown')
    print("-> All features created.")
else:
    print("-> Skipping Step 4, raw data not loaded.")

# ===================================================================
# Step 5: Combine, Clean, and Save Processed Data
# ===================================================================
print("\nStep 5: Combining and cleaning data...")

if 'patients_df' in locals():
    model_df = patients_df[['Id', 'AGE', 'DIABETES']]
    features_to_merge = [latest_bmi, latest_systolic, latest_diastolic, latest_cholesterol, latest_smoking]
    for feature_df in features_to_merge:
        model_df = pd.merge(model_df, feature_df, left_on='Id', right_on='PATIENT', how='left')
        if 'PATIENT' in model_df.columns:
            model_df = model_df.drop(columns=['PATIENT'])

    numeric_features = ['AGE', 'BMI', 'SYSTOLIC_BP', 'DIASTOLIC_BP', 'TOTAL_CHOLESTEROL']
    categorical_features = ['SMOKING_STATUS']
    feature_columns = numeric_features + categorical_features

    final_df = model_df[feature_columns + ['DIABETES']].copy()

    for col in numeric_features:
        final_df[col] = pd.to_numeric(final_df[col], errors='coerce')
        median_val = final_df[col].median()
        final_df[col] = final_df[col].fillna(median_val)
    for col in categorical_features:
        mode_val = final_df[col].mode()[0]
        final_df[col] = final_df[col].fillna(mode_val)

    output_path = path_to_processed_data + 'cleaned_diabetes_data_v3.csv'
    final_df.to_csv(output_path, index=False)
    print(f"-> Clean data saved to '{output_path}'")
    print(f"-> Shape of final model-ready data: {final_df.shape}")
else:
    print("-> Skipping Step 5, data not processed.")

# ===================================================================
# Step 6: Prepare for Modeling
# ===================================================================
print("\nStep 6: Preprocessing features and splitting data...")

if 'final_df' in locals():
    X = final_df[feature_columns]
    y = final_df['DIABETES']
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numeric_features),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)])

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    print(f"-> Training set size: {X_train.shape[0]} samples")
    print(f"-> Test set size: {X_test.shape[0]} samples")
else:
    print("-> Skipping Step 6, final_df not created.")
    
# ===================================================================
# Step 7: Train and Evaluate Baseline Models
# ===================================================================
print("\nStep 7: Training and evaluating baseline models...")

if 'X_train' in locals():
    try:
        scale_pos_weight = y_train.value_counts()[0] / y_train.value_counts()[1]
    except (ZeroDivisionError, KeyError):
        scale_pos_weight = 1

    models = {
        "Logistic Regression": Pipeline(steps=[('preprocessor', preprocessor), ('classifier', LogisticRegression(random_state=42, class_weight='balanced'))]),
        "Random Forest": Pipeline(steps=[('preprocessor', preprocessor), ('classifier', RandomForestClassifier(random_state=42, class_weight='balanced'))]),
        "XGBoost": Pipeline(steps=[('preprocessor', preprocessor), ('classifier', xgb.XGBClassifier(random_state=42, scale_pos_weight=scale_pos_weight, use_label_encoder=False, eval_metric='logloss'))])
    }

    for name, pipeline in models.items():
        print(f"\n{'='*20} {name} {'='*20}")
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)
        y_pred_proba = pipeline.predict_proba(X_test)[:, 1]
        
        print(f"\nClassification Report:")
        print(classification_report(y_test, y_pred))
        acc_score = accuracy_score(y_test, y_pred)
        print(f"Accuracy Score: {acc_score:.4f}")
        auc_score = roc_auc_score(y_test, y_pred_proba)
        print(f"AUC-ROC Score: {auc_score:.4f}\n")
else:
    print("-> Skipping Step 7, data not split.")
    
# ===================================================================
# Step 8: Hyperparameter Tuning the Best Model (Random Forest)
# ===================================================================
print("\nStep 8: Tuning the Random Forest model for best performance...")

if 'X_train' in locals() and 'models' in locals():
    # Define the parameter grid to search
    param_dist = {
        'classifier__n_estimators': [100, 200, 300, 500],
        'classifier__max_depth': [5, 10, 15, 20, 30, None],
        'classifier__min_samples_split': [2, 5, 10],
        'classifier__min_samples_leaf': [1, 2, 4],
        'classifier__bootstrap': [True, False]
    }

    # Use the Random Forest pipeline
    rf_pipeline = models["Random Forest"]

    # Set up RandomizedSearchCV
    random_search = RandomizedSearchCV(estimator=rf_pipeline, param_distributions=param_dist, n_iter=50,
                                       cv=3, verbose=1, random_state=42, n_jobs=-1, scoring='roc_auc')

    # Fit the search to the training data
    random_search.fit(X_train, y_train)

    print(f"\nBest parameters found: {random_search.best_params_}")

    # Evaluate the best model
    best_rf_model = random_search.best_estimator_
    y_pred_tuned = best_rf_model.predict(X_test)
    y_pred_proba_tuned = best_rf_model.predict_proba(X_test)[:, 1]

    print(f"\n{'='*20} Tuned Random Forest {'='*20}")
    print(f"\nClassification Report:")
    print(classification_report(y_test, y_pred_tuned))
    acc_score_tuned = accuracy_score(y_test, y_pred_tuned)
    print(f"Accuracy Score: {acc_score_tuned:.4f}")
    auc_score_tuned = roc_auc_score(y_test, y_pred_proba_tuned)
    print(f"AUC-ROC Score: {auc_score_tuned:.4f}\n")
else:
    print("-> Skipping Step 8, models not trained.")

# ===================================================================
# Step 9: Save the Final Model to a File
# ===================================================================
print("\nStep 9: Saving the final model...")

if 'random_search' in locals():
    # The best model was found by our random_search in Step 8
    final_model = random_search.best_estimator_

    # --- PATHS CORRECTED ---
    # The directory to save models
    model_directory = '../models/'
    # --- END CORRECTION ---
    
    os.makedirs(model_directory, exist_ok=True)

    # Save the entire pipeline (preprocessor + model) to a file
    model_filename = model_directory + 'diabetes_prediction_model.joblib'
    joblib.dump(final_model, model_filename)

    print(f"\n--- Model saved successfully to {model_filename} ---")
else:
    print("-> Skipping Step 9, model tuning did not run.")

print("--- End of Script ---")

Step 1: All libraries imported successfully.

Step 2: Loading raw data...
-> Raw data files loaded successfully.

Step 3: Creating the target variable (DIABETES)...
-> Identified 169 patients with a diabetes diagnosis.

Step 4: Engineering features for Diabetes prediction...
-> All features created.

Step 5: Combining and cleaning data...
-> Clean data saved to '../data/processed/cleaned_diabetes_data_v3.csv'
-> Shape of final model-ready data: (1171, 7)

Step 6: Preprocessing features and splitting data...
-> Training set size: 936 samples
-> Test set size: 235 samples

Step 7: Training and evaluating baseline models...


Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.69      0.80       201
           1       0.30      0.76      0.43        34

    accuracy                           0.70       235
   macro avg       0.62      0.73      0.61       235
weighted avg       0.85      0.70      0.74       235

Accuracy Score: 0.7

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.89      0.90       201
           1       0.41      0.47      0.44        34

    accuracy                           0.83       235
   macro avg       0.66      0.68      0.67       235
weighted avg       0.84      0.83      0.83       235

Accuracy Score: 0.8255
AUC-ROC Score: 0.8532


Step 8: Tuning the Random Forest model for best performance...
Fitting 3 folds for each of 50 candidates, totalling 150 fits

Best parameters found: {'classifier__n_estimators': 300, 'classifier__min_samples_split': 5, 'classifier__min_samples_leaf': 4, 'classifier__max_depth': 5, 'classifier__bootstrap': True}


Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.68      0.80       201
           1       0.32      0.88      0.47        34

    accuracy                           0.71       235
   macro avg       0.64      0.78      0.63   