In [None]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, InputLayer, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import class_weight
import numpy as np
import joblib

# --- Step 1: Load and Analyze Data ---
# The dataset location is a fixed absolute path; pandas raises if it is absent.
file_name = '/content/preprocessed_diabetes_data.csv'
df = pd.read_csv(file_name)
print(f"Successfully loaded '{file_name}'")

# Debug: report column names, overall shape, and per-column dtypes.
# Joining with sep="\n" emits the same text as one print call per line.
print(
    "\n--- Debug: Dataset Information ---",
    f"Columns: {list(df.columns)}",
    f"Shape: {df.shape}",
    f"Data types:\n{df.dtypes}",
    sep="\n",
)

# --- Data Diagnosis ---
# Tally the rows per target value so any class imbalance is visible before
# the data is split and the model is trained.
print("\n--- Data Diagnosis ---")
target_counts = df['Diabetes_Diagnosis'].value_counts()
print(
    "Distribution of Target Variable ('Diabetes_Diagnosis'):",
    target_counts,
    "----------------------\n",
    sep="\n",
)

# --- Step 2: Prepare Data ---
# Labels come from the 'Diabetes_Diagnosis' column (cast to int); every other
# column is treated as a feature.
y = df['Diabetes_Diagnosis'].astype(int)
X = df.drop(columns='Diabetes_Diagnosis')

# Debug: show which features the model will see and their dtypes.
print(
    f"Features: {list(X.columns)}",
    f"Feature data types:\n{X.dtypes}",
    sep="\n",
)

# Two-stage stratified split. The first call holds out 20% as the test set;
# the second takes 25% of the remaining 80% (i.e. 20% of the total) as the
# validation set, which leaves 60% for training.
X_temp, X_test, y_temp, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=0.25,
    random_state=42,
    stratify=y_temp,
)

print(
    f"Training set size: {X_train.shape[0]}",
    f"Validation set size: {X_val.shape[0]}",
    f"Test set size: {X_test.shape[0]}",
    sep="\n",
)

# --- Step 3: Scale Data ---
# Only these columns are standardized; all other features pass through as-is.
columns_to_scale = ['Age', 'BMI']

# Learn the scaling statistics from the training split alone so that nothing
# about the validation or test rows leaks into preprocessing.
scaler = StandardScaler()
scaler.fit(X_train[columns_to_scale])

# Work on copies so the unscaled splits stay available, then apply the same
# fitted transform to each copy in turn.
X_train_scaled, X_val_scaled, X_test_scaled = X_train.copy(), X_val.copy(), X_test.copy()
for scaled_frame in (X_train_scaled, X_val_scaled, X_test_scaled):
    scaled_frame[columns_to_scale] = scaler.transform(scaled_frame[columns_to_scale])

# --- Step 4: Calculate Class Weights ---
# 'balanced' weighting gives rarer classes a proportionally larger weight.
# compute_class_weight returns one weight per entry of `classes`, in the same
# order, so each weight is keyed by its actual class label rather than by its
# position in the array. (Positional keys are only correct when the labels
# happen to be exactly 0..n-1.)
class_labels = np.unique(y_train)
weights = class_weight.compute_class_weight('balanced', classes=class_labels, y=y_train)
# Plain Python int/float keep the mapping (and the printout) free of NumPy
# scalar types.
class_weights = {int(label): float(weight) for label, weight in zip(class_labels, weights)}
print(f"Calculated Class Weights: {class_weights}")

# --- Step 5: Build and Compile Model ---
# Feed-forward binary classifier: the input width matches the number of
# feature columns, followed by two ReLU hidden layers (32 then 16 units), each
# with 30% dropout, and a single sigmoid output unit.
n_features = X_train_scaled.shape[1]
layer_stack = [InputLayer(input_shape=(n_features,))]
for units in (32, 16):
    layer_stack.append(Dense(units, activation='relu'))
    layer_stack.append(Dropout(0.3))
layer_stack.append(Dense(1, activation='sigmoid'))

model = Sequential(layer_stack)
model.summary()

# Adam with an explicit learning rate of 0.0005; binary cross-entropy pairs
# with the sigmoid output, and accuracy is tracked as the reported metric.
custom_optimizer = Adam(learning_rate=0.0005)
model.compile(
    optimizer=custom_optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# --- Step 6: Train the Model ---
print("\nStarting model training...")

# Stop once val_loss has not improved for 12 epochs and roll back to the best
# weights seen, so the final model is the best-validating one.
early_stopping = EarlyStopping(monitor='val_loss', patience=12, restore_best_weights=True, verbose=1)
# Halve the learning rate after 4 epochs without val_loss improvement. With a
# patience shorter than early stopping's, the optimizer gets several chances
# to refine at a lower rate before training is abandoned.
rlrp = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=4)

# Keras is fed plain float32 arrays rather than pandas objects.
train_features = X_train_scaled.values.astype(np.float32)
train_labels = y_train.values.astype(np.float32)
val_features = X_val_scaled.values.astype(np.float32)
val_labels = y_val.values.astype(np.float32)

history = model.fit(
    train_features, train_labels,
    epochs=150,
    batch_size=32,
    validation_data=(val_features, val_labels),  # Use proper validation set
    # Both callbacks must be registered: a callback that is only constructed
    # and never passed to fit() has no effect on training.
    callbacks=[early_stopping, rlrp],
    class_weight=class_weights,
    verbose=1
)
print("Model training complete.")

# --- Step 7: Evaluate the Final Model ---
# Score the held-out test split, which was not passed to fit(). The model was
# compiled with a single 'accuracy' metric, so evaluate() yields the loss
# followed by the accuracy.
test_features = X_test_scaled.values.astype(np.float32)
test_labels = y_test.values.astype(np.float32)
loss, accuracy = model.evaluate(test_features, test_labels, verbose=0)

# Accuracy is a fraction; it is shown as a percentage.
print(
    "\nFinal Model Evaluation on Test Set:",
    f"  - Test Loss: {loss:.4f}",
    f"  - Test Accuracy: {accuracy * 100:.2f}%",
    sep="\n",
)

# --- Step 8: Save Model, Scaler, Feature Names, and History ---
print("\nSaving all artifacts...")

# Feature order as seen during training (IMPORTANT: persisted so that
# downstream code can present columns in the same order).
feature_names = list(X.columns)

# Trained network.
model.save('diabetes_model.keras')

# Preprocessing artifacts, each written to its own joblib file: the fitted
# scaler, the feature order, and the list of columns the scaler applies to.
for artifact, artifact_path in (
    (scaler, 'scaler.joblib'),
    (feature_names, 'feature_names.joblib'),
    (columns_to_scale, 'scaled_columns.joblib'),
):
    joblib.dump(artifact, artifact_path)

# Per-epoch training metrics, one row per epoch, written without an index
# column.
history_df = pd.DataFrame(history.history)
history_df.to_csv('training_history.csv', index=False)

print(
    "Model, scaler, feature names, and training history saved successfully.",
    f"Feature order: {feature_names}",
    "\nTraining script finished.",
    sep="\n",
)