#### Step 1: Set Up Databricks MLflow

In [0]:
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report


# Enable MLflow autologging for scikit-learn estimators. This is switched on
# before any model is fit so the training runs later in the notebook are covered.
# NOTE(review): autolog records the fitted model by default (log_models=True),
# and the training cells also call mlflow.sklearn.log_model explicitly, so each
# run may store the model twice — confirm that is intended.
mlflow.sklearn.autolog()



#### Step 2: Simulate a Dataset with Data Leakage

In [None]:

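# Only needed when running outside a Databricks notebook (e.g. via Databricks Connect),
# where `spark` is not predefined. `.profile(...)` is a DatabricksSession builder method:
# from databricks.connect import DatabricksSession
# spark = DatabricksSession.builder.profile("adb-550863152927870").getOrCreate()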


In [0]:
# Base URI of the storage location holding the input file, in the ADLS Gen2
# form abfss://<container>@<account>.dfs.core.windows.net/.
# NOTE(review): assumes the cluster already has access to this storage account — confirm.
source = "abfss://raw@cloudinfrastg.dfs.core.windows.net/"
# File name; concatenated onto `source` to form the full path when the CSV is read.
data = "netflix_titles.csv"
# The file itself is read in the next cell.

In [0]:
import pandas as pd

# Load the Netflix titles CSV: the first row supplies the column names and
# Spark infers the column types from the data.
netflix = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(source + data)
)

# Show the DataFrame in the Databricks table viewer, then return the row count
# as the cell's output.
netflix.display()
netflix.count()

In [None]:
# Simulating customer data.
#
# Builds `df`, a pandas DataFrame of `n_samples` synthetic customers with five
# ordinary features, one deliberately leaked feature and the binary `churn` label.
np.random.seed(42)  # fixed seed so every run produces the same dataset
n_samples = 5000

# Draw the label first: the leaked column below must depend on it, otherwise it
# carries no information about churn and the leakage cannot be observed.
churn = np.random.choice([0, 1], n_samples)  # 1 = churn, 0 = no churn

# LEAKED DATA: purchases made *after* the prediction date. Customers who churn
# barely buy anything afterwards, while retained customers keep buying, so this
# column gives away the label. The ranges overlap at 2 so it is not a perfect
# copy of the target.
total_purchases_next_3_months = np.where(
    churn == 1,
    np.random.randint(0, 3, n_samples),   # churned customers: 0-2 purchases
    np.random.randint(2, 10, n_samples),  # retained customers: 2-9 purchases
)

df = pd.DataFrame({
    "customer_id": range(n_samples),
    "age": np.random.randint(18, 80, n_samples),
    "income": np.random.randint(20000, 150000, n_samples),
    "tenure": np.random.randint(1, 10, n_samples),  # years as a customer
    "logins_last_month": np.random.randint(0, 50, n_samples),
    "num_complaints": np.random.randint(0, 5, n_samples),
    "total_purchases_next_3_months": total_purchases_next_3_months,  # LEAKED DATA
    "churn": churn,  # 1 = churn, 0 = no churn
})




In [None]:
# Deliberately leaky setup: "total_purchases_next_3_months" describes the
# future, yet it is kept among the model inputs here.
target = "churn"
features = [
    "age",
    "income",
    "tenure",
    "logins_last_month",
    "num_complaints",
    "total_purchases_next_3_months",
]

# Inputs and label, taken straight from the simulated frame.
X, y = df[features], df[target]

# Hold out 20% of the rows so overfitting can be detected later.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

In [None]:
# Baseline experiment: train on the feature set that still contains the LEAKED
# column and record the accuracy on both splits.
with mlflow.start_run() as run:
    model = GradientBoostingClassifier(
        n_estimators=100,
        learning_rate=0.1,
        random_state=42,
    )
    model.fit(X_train, y_train)

    # Training split: predict, then score.
    y_train_pred = model.predict(X_train)
    train_acc = accuracy_score(y_train, y_train_pred)

    # Held-out split: predict, then score.
    y_test_pred = model.predict(X_test)
    test_acc = accuracy_score(y_test, y_test_pred)

    print(f"Train Accuracy: {train_acc:.4f}")
    print(f"Test Accuracy: {test_acc:.4f}")

    # Attach both accuracies and the fitted estimator to the active run.
    mlflow.log_metric("train_accuracy", train_acc)
    mlflow.log_metric("test_accuracy", test_acc)
    mlflow.sklearn.log_model(model, "model")


In [None]:
%md
### Fixing the Data Leakage

In [None]:
# Leak-free feature list: the future-looking column
# "total_purchases_next_3_months" is no longer included.
features = [
    "age",
    "income",
    "tenure",
    "logins_last_month",
    "num_complaints",
]

# Rebuild inputs and label from the reduced column list.
X, y = df[features], df[target]

# Same 80/20 split and seed as used for the leaky feature set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Second experiment: same estimator settings, fitted on the current
# (leak-free) train/test split, logged under its own metric and model names.
with mlflow.start_run() as run:
    model = GradientBoostingClassifier(
        random_state=42,
        n_estimators=100,
        learning_rate=0.1,
    )
    model.fit(X_train, y_train)

    # Predictions for both splits, followed by their accuracies.
    y_train_pred, y_test_pred = model.predict(X_train), model.predict(X_test)
    train_acc = accuracy_score(y_train, y_train_pred)
    test_acc = accuracy_score(y_test, y_test_pred)

    print(f"Fixed Model - Train Accuracy: {train_acc:.4f}")
    print(f"Fixed Model - Test Accuracy: {test_acc:.4f}")

    # Record the results and the fitted estimator on this run.
    mlflow.log_metric("train_accuracy_fixed", train_acc)
    mlflow.log_metric("test_accuracy_fixed", test_acc)
    mlflow.sklearn.log_model(model, "fixed_model")
