In [None]:
# 1. Install requirements (only needed in a fresh environment; uncomment to run)
# %pip install pandas numpy matplotlib seaborn scikit-learn

# 2. Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

# Apply seaborn's "whitegrid" theme to every figure drawn in this notebook.
sns.set_theme(style="whitegrid")

In [None]:
# 3. Read the training and current TTE samples from their CSV files
#    (df_train_tte.csv and df_current_tte.csv, resolved against the working directory).
df_train, df_current = (
    pd.read_csv(csv_path) for csv_path in ("df_train_tte.csv", "df_current_tte.csv")
)

# Report (rows, columns) of each frame as a quick sanity check.
print("Training data shape:", df_train.shape)
print("Current data shape:", df_current.shape)

# Preview the first rows of the training frame as the cell output.
df_train.head()

In [None]:
# 4. Split each frame into model inputs and target.
# Both frames are indexed with the same feature list and the same target column,
# so both must contain every name listed here.
features = ["feature1", "feature2", "feature3"]
target = "tte"

X_train, y_train = df_train[features], df_train[target]
X_current, y_current = df_current[features], df_current[target]

In [None]:
# 5. Fit a stand-in model (simulates a previously trained TTE model).
# random_state is fixed so repeated runs build the same forest.
model = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# Store the model's predictions next to the inputs in each frame.
# Both predictions are computed before either column is written.
df_train["predicted_tte"], df_current["predicted_tte"] = (
    model.predict(X_train),
    model.predict(X_current),
)

In [None]:
# 6. Compare prediction distributions
# Overlay the kernel density estimates of the predictions made on the training
# frame and on the current frame, drawn on one shared axes.
fig, ax = plt.subplots(figsize=(10, 5))
sns.kdeplot(df_train["predicted_tte"], label="Train Predictions", fill=True, ax=ax)
sns.kdeplot(df_current["predicted_tte"], label="Current Predictions", fill=True, ax=ax)
ax.set_title("Prediction Distribution Comparison")
ax.set_xlabel("Predicted TTE")
ax.legend()
plt.show()

In [None]:
# 7. Calculate prediction drift metric (RMSE between predicted and true TTE)
# RMSE is computed as sqrt(MSE) instead of passing `squared=False`: that keyword
# was deprecated in scikit-learn 1.4 and removed in 1.6, where it raises a
# TypeError. Taking np.sqrt of the default MSE gives the same value on every
# scikit-learn version.
# NOTE: the training RMSE is in-sample (the model is scored on the same rows it
# was fit on), so it is typically optimistic; the delta below therefore mixes
# the model's generalization gap with any genuine drift.
rmse_train = np.sqrt(mean_squared_error(y_train, df_train["predicted_tte"]))
rmse_current = np.sqrt(mean_squared_error(y_current, df_current["predicted_tte"]))

print(f"RMSE on Training Set: {rmse_train:.2f}")
print(f"RMSE on Current Set: {rmse_current:.2f}")
# Positive delta = larger error on the current data than on the training data.
print(f"Prediction Drift Delta: {rmse_current - rmse_train:.2f}")

In [None]:
# 8. Optional: track drift over time by bucketing the current data by week.
# Requires a "date" column in df_current; it is parsed to datetimes in place.
df_current["date"] = pd.to_datetime(df_current["date"])

# Per-row squared error between the true and the predicted TTE.
df_current["error"] = (df_current["tte"] - df_current["predicted_tte"]).pow(2)

# Average the squared errors within each weekly period -> one MSE value per week.
drift_over_time = (
    df_current["error"]
    .groupby(df_current["date"].dt.to_period("W"))
    .mean()
)

ax = drift_over_time.plot.line(figsize=(10, 4), title="Prediction Drift Over Time")
ax.set_ylabel("Mean Squared Error")
ax.set_xlabel("Week")
plt.show()