In [1]:
# ==============================================================================
# STEP 1: LOAD LIBRARIES, DATA, AND FEATURES
# ==============================================================================
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import joblib # Library for saving and loading models

# Read the complete dataset from the CSV file into a DataFrame
file_path = 'kmrl_enhanced_dataset.csv'
df = pd.read_csv(file_path)

# --- Data Cleaning and Type Conversion ---
# The observation date and the three certificate-expiry columns are converted
# with pd.to_datetime so that date arithmetic can be done on them afterwards.
for date_col in (
    'Date',
    'RollingStock_Cert_Expiry',
    'Signal_Cert_Expiry',
    'Telecom_Cert_Expiry',
):
    df[date_col] = pd.to_datetime(df[date_col])

# --- Re-create Engineered Features ---
# Min_Cert_Days_to_Expiry: per row, the smallest whole-day gap between `Date`
# and the three certificate expiry dates (negative once an expiry date lies
# before `Date`).
#
# Each expiry column is subtracted from the `Date` column as a whole (three
# vectorized operations) instead of a row-wise apply that looked up
# df.loc[row.name, 'Date'] in Python for every single row. The result is the
# same, but it scales with the dataset and does not rely on index labels
# being unique.
df['Min_Cert_Days_to_Expiry'] = df[
    ['RollingStock_Cert_Expiry', 'Signal_Cert_Expiry', 'Telecom_Cert_Expiry']
].apply(
    lambda expiry: (expiry - df['Date']).dt.days  # one expiry column at a time
).min(axis=1)

# ==============================================================================
# STEP 2: TRAIN THE MODEL ON THE ENTIRE DATASET
# ==============================================================================
# This time, we train on all the data since our goal is to create the best possible
# model for deployment, not to test its performance.

# Announce the start of the final fit
print("Training the final model on the entire dataset...")

# --- Define Features and Target ---
# Binary label: 1 where JobCard_Priority equals 'High', 0 for everything else.
df['Target'] = df['JobCard_Priority'].eq('High').astype(int)

# Columns of df that are fed to the classifier as inputs.
features = [
    'BrakePad_KM_Since_Change', 'Bogie_KM_Since_Service', 'HVAC_Operating_Hours',
    'Avg_Vibration_Level', 'Max_Brake_Temp_Celsius', 'Min_Cert_Days_to_Expiry',
]
X = df.loc[:, features]
y = df['Target']

# --- Train the Model ---
# Fit on every row of df (no hold-out split here); random_state fixes the seed.
# fit() returns the estimator itself, so the fitted model is bound directly.
final_model = RandomForestClassifier(
    class_weight='balanced',
    random_state=42,
    n_estimators=100,
).fit(X, y)

print("✅ Final model training complete.")

# ==============================================================================
# STEP 3: SAVE THE MODEL TO A FILE
# ==============================================================================
# Destination file for the serialized estimator
model_filename = 'kmrl_risk_model.joblib'

# Write the fitted model to disk with joblib
joblib.dump(value=final_model, filename=model_filename)

# Confirm where the model was written
print(f"\n✅ Model successfully saved to '{model_filename}'")

Training the final model on the entire dataset...
✅ Final model training complete.

✅ Model successfully saved to 'kmrl_risk_model.joblib'
