# main notebook to run

## installations

## imports

In [1]:
import pandas as pd
from sklearn.model_selection import (
    StratifiedShuffleSplit,
    GridSearchCV,
    train_test_split,
    cross_validate,
    cross_val_score,
)
from lightgbm import LGBMClassifier
from sklift.models import SoloModel

# from sklift.viz import plot_qini_curve
from sklift.datasets import fetch_megafon
from sklift.metrics import make_uplift_scorer
import os
import sys
from pathlib import Path
import yaml
from datetime import datetime
import re
from sklift.metrics import qini_auc_score
from sklearn.model_selection import StratifiedKFold
import numpy as np
import pandas as pd
from functools import reduce

from sklift.metrics import qini_auc_score
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
cwd = Path.cwd()
# The repo root is taken to be the parent of the current working directory
# (the same element the previous `([cwd] + parents)[1]` expression selected).
repo_root = cwd.parent

# Ensure repo_root is on sys.path so the `src` package can be imported.
# Guarded so that re-running this cell does not append duplicate entries.
if str(repo_root) not in sys.path:
    sys.path.append(str(repo_root))
from src.process_datasets import create_data, get_web_feats

# NOTE(review): later cells call `get_data`, which is not imported here —
# confirm whether it should come from `src.process_datasets` as well.

# Load the YAML config file (explicit encoding avoids platform-dependent defaults)
with open(repo_root / "config.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

## data handling

### data loading

In [3]:
# ----------------------------
# Load data
# ----------------------------
# Training tables
train_dir = repo_root / "data" / "train"
app_usage = pd.read_csv(train_dir / "app_usage.csv")
web_visits = pd.read_csv(train_dir / "web_visits.csv")
claims = pd.read_csv(train_dir / "claims.csv")
churn_labels = pd.read_csv(train_dir / "churn_labels.csv")

# Test tables (file names carry a `test_` prefix)
test_dir = repo_root / "data" / "test"
test_app_usage = pd.read_csv(test_dir / "test_app_usage.csv")
test_web_visits = pd.read_csv(test_dir / "test_web_visits.csv")
test_claims = pd.read_csv(test_dir / "test_claims.csv")
test_churn_labels = pd.read_csv(test_dir / "test_churn_labels.csv")

In [4]:
# Build the three model inputs for the training tables (presumably features,
# outcome and treatment flag, judging by the names — TODO confirm).
# NOTE(review): `get_data` is neither imported nor defined before this cell
# (the recorded output of this cell is a NameError). A function with this name
# is only defined in a later cell of this notebook; import it or move that
# definition above this cell so the notebook survives Restart & Run All.
x_train, y_train, treatment_train = get_data(
    app_usage, web_visits, claims, churn_labels, day_first_web=True
)

# Same call for the test tables; note that `day_first_web` is False here.
x_test, y_test, treatment_test = get_data(
    test_app_usage, test_web_visits, test_claims, test_churn_labels, day_first_web=False
)

NameError: name 'get_data' is not defined

In [None]:
# Largest value of the `visit_trend` feature in the training frame
x_train["visit_trend"].max()

### data preprocess

### feature selection

## training

## evaluation

In [None]:
# Run identifier used in saved model file names (UTC, e.g. 20250714T120000Z).
# `datetime.utcnow()` is deprecated; build an aware UTC datetime instead.
# The formatted string is unchanged.
from datetime import datetime, timezone

timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")

In [None]:
# x_train = x_train.fillna(0)
# x_test = x_test.fillna(0)
# y_train = y_train.fillna(0)
# y_test = y_test.fillna(0)
# treatment_train = treatment_train.fillna(0)
# treatment_test = treatment_test.fillna(0)

In [None]:
# Preview the first rows only rather than dumping the whole feature frame
x_train.head()

In [None]:
## Overfitting Diagnostic Dashboard



def compare_train_test_performance(
    model, X_tr, X_te, y_tr, y_te, t_tr, t_te, model_name="Model"
):
    """Compare uplift performance on train vs. test data to detect overfitting.

    Scores both splits with ``model.predict``, computes the Qini AUC of each,
    draws the two Qini curves side by side and prints a short report.

    Parameters
    ----------
    model : object
        Fitted model exposing ``predict(X)``; its output is used as the uplift score.
    X_tr, X_te : features of the train / test split.
    y_tr, y_te : outcomes of the train / test split.
    t_tr, t_te : treatment indicators of the train / test split.
    model_name : str
        Label used in plot titles and in the printed report.

    Returns
    -------
    tuple
        ``(qini_train, qini_test, gap)`` with ``gap = qini_train - qini_test``.
    """
    # Imported lazily: the module-level import of this helper is commented out.
    from sklift.viz import plot_qini_curve

    # Predict on both sets
    uplift_train = model.predict(X_tr)
    uplift_test = model.predict(X_te)

    # Calculate Qini AUC
    qini_train = qini_auc_score(y_tr, uplift_train, t_tr)
    qini_test = qini_auc_score(y_te, uplift_test, t_te)

    # Plot side-by-side
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

    plot_qini_curve(y_tr, uplift_train, t_tr, perfect=True, name="Train", ax=ax1)
    ax1.set_title(f"{model_name} - Train (Qini AUC: {qini_train:.4f})")

    plot_qini_curve(y_te, uplift_test, t_te, perfect=True, name="Test", ax=ax2)
    ax2.set_title(f"{model_name} - Test (Qini AUC: {qini_test:.4f})")

    plt.tight_layout()
    plt.show()

    # Calculate overfitting gap
    gap = qini_train - qini_test
    # A train Qini of exactly 0 would make the relative gap undefined;
    # report NaN instead of dividing by zero.
    relative_gap_pct = gap / qini_train * 100 if qini_train != 0 else float("nan")

    print(f"\n{'='*60}")
    print(f"Model: {model_name}")
    print(f"Train Qini AUC: {qini_train:.4f}")
    print(f"Test Qini AUC:  {qini_test:.4f}")
    print(f"Overfitting Gap: {gap:.4f} ({relative_gap_pct:.1f}% relative)")

    if gap > 0.01:
        print("‚ö†Ô∏è  WARNING: Significant overfitting detected!")
    elif gap > 0.005:
        print("‚ö†Ô∏è  Mild overfitting detected")
    else:
        print("‚úÖ Good generalization")
    print(f"{'='*60}\n")

    return qini_train, qini_test, gap




In [None]:
# Inspect the first row of the training features
x_train.iloc[0]

In [None]:
from sklearn.model_selection import ParameterGrid
import numpy as np

# Search space for the LightGBM estimator wrapped by the S-learner.
# Keys carry the `estimator__` prefix so the winning setting can be passed
# straight to `SoloModel.set_params`.
param_grid = {
    "estimator__learning_rate": [0.01, 0.05],  # Lower learning rates
    "estimator__max_depth": [2, 3, 4],  # Much shallower trees
    "estimator__min_child_samples": [100],  # More samples per leaf
    "estimator__reg_alpha": [5.0],  # Stronger L1 regularization
    "estimator__reg_lambda": [5.0],  # Stronger L2 regularization
    "estimator__num_leaves": [7, 15],  # Fewer leaves
    "estimator__subsample": [0.7, 0.8],  # Add row subsampling
    # LightGBM only applies row subsampling when the bagging frequency is > 0;
    # with the default of 0 both `subsample` values would train identical models.
    "estimator__subsample_freq": [1],
    "estimator__colsample_bytree": [0.7, 0.8],  # Add column subsampling
}

# Stratify folds on the joint (treatment, outcome) label so every fold contains
# all four groups the Qini score depends on.
# Assumes both series are 0/1 coded — TODO confirm.
strata = 2 * np.asarray(treatment_train).ravel() + np.asarray(y_train).ravel()
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
best_score = -np.inf
best_params = None

for params in ParameterGrid(param_grid):
    # ParameterGrid yields namespaced keys (estimator__*); strip the prefix once
    # per setting so the values can be passed to the LightGBM constructor.
    est_kwargs = {k.replace("estimator__", ""): v for k, v in params.items()}
    cv_scores = []
    for train_idx, val_idx in cv.split(x_train, strata):
        X_tr, X_val = x_train.iloc[train_idx], x_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
        t_tr, t_val = treatment_train.iloc[train_idx], treatment_train.iloc[val_idx]

        # create a fresh model for each fold and parameter setting
        model = SoloModel(
            estimator=LGBMClassifier(
                random_state=42,
                n_jobs=-1,
                class_weight="balanced",
                **est_kwargs,
                verbose=-1,
            )
        )
        model.fit(X_tr, y_tr, treatment=t_tr)
        uplift_val = model.predict(X_val)
        cv_scores.append(qini_auc_score(y_val, uplift_val, t_val))

    mean_qini = np.mean(cv_scores)
    print(f"Params: {params} | CV Qini AUC: {mean_qini:.4f}")
    if mean_qini > best_score:
        best_score = mean_qini
        best_params = params

print("‚úÖ Best Params:", best_params)
print("‚úÖ Best CV Qini AUC:", best_score)

In [None]:
## Train S-learner with default LightGBM hyper-parameters

# NOTE: the grid-search `best_params` are NOT applied in this cell (the
# `set_params` call below is commented out), so this fits LightGBM defaults.
final_model = SoloModel(
    # `is_unbalance` is the LightGBM parameter name; the previous spelling
    # `is_unbalanced` is not a recognised alias and was ignored.
    estimator=LGBMClassifier(random_state=42, n_jobs=-1, is_unbalance=True)
)
# final_model.set_params(**best_params)

# Train on full training set
final_model.fit(x_train, y_train, treatment=treatment_train)

# Comprehensive evaluation
print("\n" + "=" * 70)
print("FINAL MODEL EVALUATION")
print("=" * 70)

compare_train_test_performance(
    final_model,
    x_train,
    x_test,
    y_train,
    y_test,
    treatment_train,
    treatment_test,
    model_name="Final Regularized S-Learner",
)

# Save model if generalization is acceptable
uplift_train_final = final_model.predict(x_train)
uplift_test_final = final_model.predict(x_test)

qini_train_final = qini_auc_score(y_train, uplift_train_final, treatment_train)
qini_test_final = qini_auc_score(y_test, uplift_test_final, treatment_test)

if (qini_train_final - qini_test_final) < 0.015:  # Accept <0.015 gap
    print("\n‚úÖ Model generalization acceptable - saving model")
    import joblib

    model_path = repo_root / "models" / f"uplift_slearner_{timestamp}.pkl"
    # joblib.dump does not create missing directories
    model_path.parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(final_model, model_path)
    print(f"Saved to: {model_path}")
else:
    print("\n‚ö†Ô∏è  WARNING: Model still overfitting - consider simpler model or more data")

In [None]:
from sklift.models.two_models import TwoModel
from lightgbm import LGBMClassifier

# 1. Define the base estimator passed to the TwoModel wrapper
estimator = LGBMClassifier(
    random_state=42,
    n_jobs=-1,
    # REMINDER: Remove class_weight="balanced" if you used it before
)

# 2. Instantiate the T-Learner wrapper
tm = TwoModel(estimator=estimator)

# 3. Fit the model using your data.
# The frames in this notebook are named `x_train` / `x_test` (lower case);
# the previous `X_train` / `X_test` were never defined.
tm.fit(x_train, y_train, treatment_train)

# 4. Predict the uplift
uplift_preds = tm.predict(x_test)

In [None]:
## Train Final Model with Best Regularization

# Build the estimator with the same fixed settings the grid search used
# (class_weight="balanced", verbose=-1) so that `best_params` is applied to
# the configuration it was selected for.
final_model = SoloModel(
    estimator=LGBMClassifier(
        random_state=42, n_jobs=-1, class_weight="balanced", verbose=-1
    )
)
# best_params keys are namespaced (estimator__*), as set_params expects
final_model.set_params(**best_params)

# Train on full training set
final_model.fit(x_train, y_train, treatment=treatment_train)

# Comprehensive evaluation
print("\n" + "=" * 70)
print("FINAL MODEL EVALUATION")
print("=" * 70)

compare_train_test_performance(
    final_model,
    x_train,
    x_test,
    y_train,
    y_test,
    treatment_train,
    treatment_test,
    model_name="Final Regularized S-Learner",
)

# Save model if generalization is acceptable
uplift_train_final = final_model.predict(x_train)
uplift_test_final = final_model.predict(x_test)

qini_train_final = qini_auc_score(y_train, uplift_train_final, treatment_train)
qini_test_final = qini_auc_score(y_test, uplift_test_final, treatment_test)

if (qini_train_final - qini_test_final) < 0.015:  # Accept <0.015 gap
    print("\n‚úÖ Model generalization acceptable - saving model")
    import joblib

    model_path = repo_root / "models" / f"uplift_slearner_{timestamp}.pkl"
    # joblib.dump does not create missing directories
    model_path.parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(final_model, model_path)
    print(f"Saved to: {model_path}")
else:
    print("\n‚ö†Ô∏è  WARNING: Model still overfitting - consider simpler model or more data")

In [None]:
pip install causalml

In [None]:
from causalml.inference.meta.drlearner import BaseDRLearner

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMRegressor
from causalml.inference.meta import BaseDRLearner
# from causalml.metrics import qini_auc_score
import joblib

# ----------------------------
# 0Ô∏è‚É£ Setup
# ----------------------------
obs_start = pd.to_datetime("2025-07-01")
obs_end = pd.to_datetime("2025-07-14")

# Ensure all member IDs are in features
all_member_ids = churn_labels["member_id"]

# The CSVs are read without date parsing, so convert the timestamp columns
# before using `.dt` or subtracting `obs_start` (a no-op if already datetime).
# Train web visits are parsed day-first, matching the `day_first_web=True`
# passed for the training tables earlier in this notebook.
app_usage["timestamp"] = pd.to_datetime(app_usage["timestamp"], errors="coerce")
web_visits["timestamp"] = pd.to_datetime(
    web_visits["timestamp"], errors="coerce", dayfirst=True
)

# ----------------------------
# 1) App usage features
# ----------------------------
app_total_events = (
    app_usage.groupby("member_id")
    .size()
    .reindex(all_member_ids, fill_value=0)
    .rename("app_total_events")
)
# Members without any app event get obs_start as their "last seen" time
app_last_ts = (
    app_usage.groupby("member_id")["timestamp"]
    .max()
    .reindex(all_member_ids, fill_value=obs_start)
    .rename("app_last_ts")
)

# Weekly trend: last ISO-week count minus first ISO-week count
app_usage["week"] = app_usage["timestamp"].dt.isocalendar().week
weekly_counts = app_usage.groupby(["member_id", "week"]).size().unstack(fill_value=0)
weekly_counts = weekly_counts.reindex(all_member_ids, fill_value=0)
if weekly_counts.shape[1] >= 2:
    weekly_counts["app_trend"] = weekly_counts.iloc[:, -1] - weekly_counts.iloc[:, 0]
else:
    weekly_counts["app_trend"] = 0

# ----------------------------
# 2) Web visits features
# ----------------------------
web_visits[["domain", "category", "page"]] = web_visits["url"].str.extract(
    r"https?://([^/]+)/([^/]+)/(.+)"
)

web_total = (
    web_visits.groupby("member_id")
    .size()
    .reindex(all_member_ids, fill_value=0)
    .rename("web_total")
)
web_chronic = (
    web_visits[web_visits["category"] == "chronic"]
    .groupby("member_id")
    .size()
    .reindex(all_member_ids, fill_value=0)
    .rename("web_chronic")
)
web_last_ts = (
    web_visits.groupby("member_id")["timestamp"]
    .max()
    .reindex(all_member_ids, fill_value=obs_start)
    .rename("web_last_ts")
)

# ----------------------------
# 3) Claims features
# ----------------------------
claims_count = (
    claims.groupby("member_id")
    .size()
    .reindex(all_member_ids, fill_value=0)
    .rename("claims_count")
)
# One count column per 3-character ICD code prefix
icd_prefix = claims["icd_code"].str[:3]
claims_icd_counts = (
    claims.groupby(["member_id", icd_prefix]).size().unstack(fill_value=0)
)
claims_icd_counts = claims_icd_counts.reindex(all_member_ids, fill_value=0)

# ----------------------------
# 4) Signup / recency features
# ----------------------------
signup_date = pd.to_datetime(
    churn_labels.set_index("member_id")["signup_date"]
).reindex(all_member_ids)
days_since_signup = (obs_end - signup_date).dt.days.rename("days_since_signup")

# ----------------------------
# 5) Merge all features
# ----------------------------
features = pd.concat(
    [
        app_total_events,
        app_last_ts,
        weekly_counts["app_trend"],
        web_total,
        web_chronic,
        web_last_ts,
        claims_count,
        claims_icd_counts,
        days_since_signup,
    ],
    axis=1,
)

# Convert timestamps to numeric (seconds elapsed since obs_start)
for col in ["app_last_ts", "web_last_ts"]:
    features[col] = (features[col] - obs_start).dt.total_seconds()

features = features.fillna(0)

# ----------------------------
# 6Ô∏è‚É£ Build propensity features
# ----------------------------
# `features` is indexed by member_id, so the treatment flags must be keyed by
# member_id before reindexing; reindexing the raw series would match on
# whatever index `treatment_train` happens to carry.
# Assumes `treatment_train` is row-aligned with `x_train`, as the earlier
# model fits in this notebook also assume — TODO confirm.
treatment_by_member = pd.Series(
    np.asarray(treatment_train).ravel(), index=x_train["member_id"].to_numpy()
)
# reindex requires a unique source index
treatment_by_member = treatment_by_member[~treatment_by_member.index.duplicated()]
treatment_aligned = treatment_by_member.reindex(features.index).fillna(0)

# NOTE(review): the propensity model is fit and scored on the same rows,
# so the scores are in-sample estimates.
prop_model = RandomForestClassifier(n_estimators=100, random_state=42)
prop_model.fit(features, treatment_aligned.values)
propensity_score = pd.Series(
    prop_model.predict_proba(features)[:, 1],
    index=features.index,
    name="propensity_score",
)

features["propensity_score"] = propensity_score

# ----------------------------
# 7Ô∏è‚É£ Align train/test by member_id
# ----------------------------
# NOTE(review): `features` is built from the training tables only; any test
# member_id missing from its index makes the `.loc` lookup raise a KeyError.
x_train_final = features.loc[x_train["member_id"].to_numpy()].fillna(0)
x_test_final = features.loc[x_test["member_id"].to_numpy()].fillna(0)

# The rows of x_*_final follow the row order of x_train / x_test, so labels and
# treatment flags are attached by position. Reindexing them by member_id would
# match against whatever index those series carry and silently fill 0.
# Assumes y_* / treatment_* are row-aligned with x_* — TODO confirm.
y_train_aligned = (
    pd.Series(np.asarray(y_train).ravel(), index=x_train_final.index)
    .fillna(0)
    .astype("int64")
)

y_test_aligned = (
    pd.Series(np.asarray(y_test).ravel(), index=x_test_final.index)
    .fillna(0)
    .astype("int64")
)

treatment_train_aligned = (
    pd.Series(np.asarray(treatment_train).ravel(), index=x_train_final.index)
    .fillna(0)
    .astype("int64")
)

treatment_test_aligned = (
    pd.Series(np.asarray(treatment_test).ravel(), index=x_test_final.index)
    .fillna(0)
    .astype("int64")
)
# ----------------------------
# 8Ô∏è‚É£ Train DR-Learner
# ----------------------------
dr_model = BaseDRLearner(learner=LGBMRegressor(random_state=42, n_jobs=-1))
dr_model.fit(
    X=x_train_final.values,
    treatment=treatment_train_aligned.values,
    y=y_train_aligned.values,
)

# ----------------------------
# 9) Predict uplift
# ----------------------------
# The learner's predictions are flattened to 1-D because the Qini metric
# sorts a flat score vector (presumably predict returns one column per
# treatment group — confirm against the causalml docs).
uplift_train_final = np.asarray(dr_model.predict(x_train_final.values)).ravel()
uplift_test_final = np.asarray(dr_model.predict(x_test_final.values)).ravel()

# ----------------------------
# 10) Qini evaluation
# ----------------------------
qini_train_final = qini_auc_score(
    y_train_aligned.values, uplift_train_final, treatment_train_aligned.values
)
qini_test_final = qini_auc_score(
    y_test_aligned.values, uplift_test_final, treatment_test_aligned.values
)

print("\n" + "=" * 70)
print("FINAL DR-LEARNER MODEL EVALUATION")
print("=" * 70)
print(f"Qini AUC - Train: {qini_train_final:.4f}")
print(f"Qini AUC - Test:  {qini_test_final:.4f}")

# ----------------------------
# 11) Save model if generalization acceptable
# ----------------------------
gap = qini_train_final - qini_test_final
if gap < 0.015:
    print("\n‚úÖ Model generalization acceptable - saving model")
    model_path = repo_root / "models" / f"uplift_drlearner_{timestamp}.pkl"
    # joblib.dump does not create missing directories
    model_path.parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(dr_model, model_path)
    print(f"Saved to: {model_path}")
else:
    print(
        "\n‚ö†Ô∏è WARNING: Model still overfitting or signal weak - consider simpler learner or more features"
    )

In [None]:
# Quick diagnostics: feature preview, mean of the treatment flags, mean of the outcomes
for summary in (features.head(), treatment_train.mean(), y_train.mean()):
    print(summary)

In [None]:
import pandas as pd
from causalml.inference.meta import BaseDRLearner
from lightgbm import LGBMClassifier
# from causalml.metrics import qini_auc_score
import joblib
from lightgbm import LGBMRegressor
from causalml.inference.meta import BaseDRLearner


# ----------------------------
# Ensure correct data types
# ----------------------------
y_train = y_train.astype("int64")
y_test = y_test.astype("int64")
treatment_train = treatment_train.astype("int64")
treatment_test = treatment_test.astype("int64")

x_train = x_train.fillna(0)
x_test = x_test.fillna(0)

# ----------------------------
# Create DR Learner
# ----------------------------
# Optional: use best_params from grid search
# estimator_params = {k.replace("estimator__", ""): v for k, v in best_params.items()}
dr_model = BaseDRLearner(learner=LGBMRegressor(random_state=42, n_jobs=-1))
# ----------------------------
# Train on full training set
# ----------------------------
# NOTE(review): every column of x_train is used as a feature here, including
# `member_id` if that column is present — confirm that is intended.
dr_model.fit(X=x_train.values, treatment=treatment_train.values, y=y_train.values)

# ----------------------------
# Predictions
# ----------------------------
# The learner's predictions are flattened to 1-D because the Qini metric
# sorts a flat score vector (presumably predict returns one column per
# treatment group — confirm against the causalml docs).
uplift_train_final = np.asarray(dr_model.predict(X=x_train.values)).ravel()
uplift_test_final = np.asarray(dr_model.predict(X=x_test.values)).ravel()

# ----------------------------
# Qini AUC evaluation
# ----------------------------
qini_train_final = qini_auc_score(
    y_train.values, uplift_train_final, treatment_train.values
)
qini_test_final = qini_auc_score(
    y_test.values, uplift_test_final, treatment_test.values
)

print("\n" + "=" * 70)
print("FINAL DR-LEARNER MODEL EVALUATION")
print("=" * 70)
print(f"Qini AUC - Train: {qini_train_final:.4f}")
print(f"Qini AUC - Test:  {qini_test_final:.4f}")

# ----------------------------
# Save model if generalization is acceptable
# ----------------------------
gap = qini_train_final - qini_test_final
if gap < 0.015:  # Acceptable generalization
    print("\n‚úÖ Model generalization acceptable - saving model")
    model_path = repo_root / "models" / f"uplift_drlearner_{timestamp}.pkl"
    # joblib.dump does not create missing directories
    model_path.parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(dr_model, model_path)
    print(f"Saved to: {model_path}")
else:
    print(
        "\n‚ö†Ô∏è WARNING: Model still overfitting - consider simpler estimator or more data"
    )

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from lightgbm import LGBMRegressor
from causalml.inference.meta import BaseDRLearner
# from causalml.metrics import qini_auc_score
import joblib

# ----------------------------
# Settings
# ----------------------------
# Observation window bounds and model-persistence switches
obs_start, obs_end = pd.to_datetime("2025-07-01"), pd.to_datetime("2025-07-14")
SAVE_MODEL = False  # set True to save model
MODEL_PATH = "uplift_drlearner_final.pkl"

# Cast outcomes and treatment flags to int64
y_train, y_test = y_train.astype("int64"), y_test.astype("int64")
treatment_train, treatment_test = (
    treatment_train.astype("int64"),
    treatment_test.astype("int64"),
)

# churn_labels provides the canonical list of members
all_member_ids = churn_labels["member_id"]

# ----------------------------
# 1) Build base feature table (indexed by member_id)
# ----------------------------
# a) app usage aggregates
app_usage["timestamp"] = pd.to_datetime(app_usage["timestamp"], errors="coerce")

# Number of app events per member (0 for members without events)
events_per_member = app_usage.groupby("member_id").size()
app_total_events = events_per_member.reindex(all_member_ids, fill_value=0).rename(
    "app_total_events"
)

# "Last activity" encoding per member: take the max of a pre-computed
# `app_last_ts` column when one with data exists, otherwise fall back to the
# whole seconds between obs_start and the member's latest event timestamp.
has_custom_app_ts = (
    "app_last_ts" in app_usage.columns and app_usage["app_last_ts"].notna().any()
)
if has_custom_app_ts:
    app_last_raw = (
        app_usage.groupby("member_id")["app_last_ts"]
        .max()
        .reindex(all_member_ids, fill_value=0)
        .rename("app_last_raw")
    )
else:
    # Members without a (parseable) timestamp are treated as last seen at obs_start
    latest_app_event = (
        app_usage.groupby("member_id")["timestamp"]
        .max()
        .reindex(all_member_ids, fill_value=pd.NaT)
        .fillna(obs_start)
    )
    app_last_raw = (
        (latest_app_event - obs_start)
        .dt.total_seconds()
        .astype("int64")
        .rename("app_last_raw")
    )

# b) web visits aggregates
# Train web-visit timestamps are parsed day-first, consistent with the
# `day_first_web=True` passed for the training tables earlier in this notebook
# (a no-op when the column is already datetime).
web_visits["timestamp"] = pd.to_datetime(
    web_visits["timestamp"], errors="coerce", dayfirst=True
)
# Split each URL into domain / first path segment (category) / remainder (page)
web_visits[["domain", "category", "page"]] = web_visits["url"].str.extract(
    r"https?://([^/]+)/([^/]+)/(.+)", expand=True
)
web_total = (
    web_visits.groupby("member_id")
    .size()
    .reindex(all_member_ids, fill_value=0)
    .rename("web_total")
)

# web_last_raw: max of a pre-computed `web_last_ts` column when one with data
# exists, otherwise whole seconds between obs_start and the latest visit.
if "web_last_ts" in web_visits.columns and web_visits["web_last_ts"].notna().any():
    web_last_raw = (
        web_visits.groupby("member_id")["web_last_ts"]
        .max()
        .reindex(all_member_ids, fill_value=0)
        .rename("web_last_raw")
    )
else:
    # Members without a (parseable) timestamp are treated as last seen at obs_start
    web_last_raw = (
        web_visits.groupby("member_id")["timestamp"]
        .max()
        .reindex(all_member_ids, fill_value=pd.NaT)
        .fillna(obs_start)
        .apply(lambda ts: int((ts - obs_start).total_seconds()))
        .rename("web_last_raw")
    )

# web chronic counts and unique domains
web_chronic = (
    web_visits[web_visits["category"] == "chronic"]
    .groupby("member_id")
    .size()
    .reindex(all_member_ids, fill_value=0)
    .rename("web_chronic")
)
web_unique_domains = (
    web_visits.groupby("member_id")["domain"]
    .nunique()
    .reindex(all_member_ids, fill_value=0)
    .rename("web_unique_domains")
)

# c) claims aggregates
# Number of claims per member (0 for members without claims)
claims_per_member = claims.groupby("member_id").size()
claims_count = claims_per_member.reindex(all_member_ids, fill_value=0).rename(
    "claims_count"
)

# Per-member counts by 3-character ICD prefix, restricted to the 10 most
# frequent prefixes to keep the feature table narrow.
claims["icd_prefix"] = claims["icd_code"].astype(str).str[:3]
icd_counts = claims.groupby(["member_id", "icd_prefix"]).size().unstack(fill_value=0)
prefix_totals = icd_counts.sum().sort_values(ascending=False)
top_icd_prefixes = prefix_totals.head(10).index.tolist()
claims_icd_counts = icd_counts[top_icd_prefixes].reindex(all_member_ids, fill_value=0)
claims_icd_counts = claims_icd_counts.set_axis(
    [f"icd_{c}" for c in claims_icd_counts.columns], axis=1
)

# Tenure in days at obs_end; missing signup dates fall back to the window length
signup_date = pd.to_datetime(
    churn_labels.set_index("member_id")["signup_date"]
).reindex(all_member_ids)
tenure_days = (obs_end - signup_date).dt.days
days_since_signup = tenure_days.rename("days_since_signup").fillna(
    (obs_end - obs_start).days
)

# e) weekly trend features (app)
# Build week index 1/2 for the observation window (safe even if weeks overlap calendar weeks)
def _add_week_half_counts(events, prefix):
    """Tag events with their half of the observation window and count them per member.

    Returns ``(events_copy, weekly)``. ``events_copy`` is a copy of ``events``
    with ``day_index`` (days since obs_start, clipped to the window) and
    ``week_half`` (1 for day_index 0-6, otherwise 2) added. ``weekly`` is indexed
    by ``all_member_ids`` with columns ``{prefix}_week1``, ``{prefix}_week2`` and
    ``{prefix}_trend`` (week2 minus week1).

    Rows whose timestamp is NaT (unparseable values are coerced to NaT upstream)
    are excluded from the counts; previously they fell through the ``<= 6``
    test and were counted as week-2 events.
    """
    events = events.copy()
    events["day_index"] = (events["timestamp"] - obs_start).dt.days.clip(
        lower=0, upper=(obs_end - obs_start).days
    )
    events["week_half"] = np.where(events["day_index"] <= 6, 1, 2)
    weekly = (
        events[events["day_index"].notna()]
        .groupby(["member_id", "week_half"])
        .size()
        .unstack(fill_value=0)
        # Ensure both halves exist as columns, in order
        .reindex(columns=[1, 2], fill_value=0)
        .reindex(all_member_ids, fill_value=0)
    )
    weekly.columns = [f"{prefix}_week1", f"{prefix}_week2"]
    weekly[f"{prefix}_trend"] = weekly[f"{prefix}_week2"] - weekly[f"{prefix}_week1"]
    return events, weekly


# e) weekly trend features (app)
app_usage, app_weekly = _add_week_half_counts(app_usage, "app")

# f) web weekly trend
web_visits, web_weekly = _add_week_half_counts(web_visits, "web")

# ----------------------------
# 2) Assemble features DataFrame
# ----------------------------
# Column-wise concatenation of all per-member blocks (all share the member_id index)
feature_blocks = [
    app_total_events,
    app_last_raw,
    app_weekly[["app_week1", "app_week2", "app_trend"]],
    web_total,
    web_chronic,
    web_unique_domains,
    web_last_raw,
    web_weekly[["web_week1", "web_week2", "web_trend"]],
    claims_count,
    claims_icd_counts,
    days_since_signup,
]
# Fill any remaining NaNs with 0
features = pd.concat(feature_blocks, axis=1).fillna(0)

# ----------------------------
# 3) Convert the "last activity" encoding into recency
#    Higher raw value is taken to mean more recent; recency = max_raw - raw
# ----------------------------
for raw_col in ("app_last_raw", "web_last_raw"):
    latest = features[raw_col].max()
    # When the maximum is not positive, recency is set to 0 for everyone
    recency = (latest - features[raw_col]).astype(float) if latest > 0 else 0.0
    features[f"{raw_col}_recency"] = recency
    # The raw encoding is replaced by its recency form
    features = features.drop(columns=[raw_col])


# ----------------------------
# 4) Clip extreme values (reduce influence of outliers)
# ----------------------------
def clip_by_train_quantiles(train_df, test_df, lower_q=0.01, upper_q=0.99):
    """Clip every column of both frames to quantile bounds computed on ``train_df``.

    For each column of ``train_df`` the ``lower_q`` / ``upper_q`` quantiles are
    taken from the training data and applied to the same column of both frames.
    The inputs are not modified; clipped copies are returned as
    ``(clipped_train, clipped_test)``.
    """
    # Bounds come from the training frame only
    bounds = {
        name: (train_df[name].quantile(lower_q), train_df[name].quantile(upper_q))
        for name in train_df.columns
    }

    train_out, test_out = train_df.copy(), test_df.copy()
    for name, (floor, ceiling) in bounds.items():
        train_out[name] = train_df[name].clip(floor, ceiling)
        test_out[name] = test_df[name].clip(floor, ceiling)
    return train_out, test_out


# ----------------------------
# 5) Add simple interaction features (helpful for non-random treatment)
# ----------------------------
# Make sure no zeros for multiplication/division issues
# The +1.0 keeps total_activity strictly positive for the division further down
features["total_activity"] = features["app_total_events"] + features["web_total"] + 1.0
features["activity_per_day"] = features["total_activity"] / (
    features["days_since_signup"].clip(lower=1)
)

# ----------------------------
# 6) Fit propensity model (uses every feature column built so far)
# ----------------------------
# `features` is indexed by member_id, so the treatment flags must be keyed by
# member_id before reindexing; reindexing the raw series would match on
# whatever index `treatment_train` happens to carry.
# Assumes `treatment_train` is row-aligned with `x_train` — TODO confirm.
if "member_id" in x_train.columns:
    _train_ids = x_train["member_id"].to_numpy()
else:
    _train_ids = x_train.index.to_numpy()
treatment_by_member = pd.Series(np.asarray(treatment_train).ravel(), index=_train_ids)
# reindex requires a unique source index
treatment_by_member = treatment_by_member[~treatment_by_member.index.duplicated()]
treatment_aligned = (
    treatment_by_member.reindex(features.index).fillna(0).astype("int64")
)
# Save columns used for propensity model
propensity_feature_cols = features.columns.tolist()

# Fit propensity model on these columns
prop_model = RandomForestClassifier(
    n_estimators=200, max_depth=8, random_state=42, n_jobs=-1
)
prop_model.fit(features[propensity_feature_cols], treatment_aligned.values)

# Add propensity score
# NOTE(review): scored on the same rows the model was fit on (in-sample).
features["propensity_score"] = prop_model.predict_proba(
    features[propensity_feature_cols]
)[:, 1]


# Add interaction terms with propensity and activity
features["high_activity_low_prop"] = features["total_activity"] * (
    1 - features["propensity_score"]
)
features["low_activity_high_prop"] = (1.0 / features["total_activity"]) * features[
    "propensity_score"
]

# Final fillna just in case
features = features.fillna(0)


# ----------------------------
# 7) Align train/test by member_id
# ----------------------------
# helper to extract member_id index from x_train/x_test
def member_index_from_X(X):
    """Return the member ids of ``X``.

    Uses the values of the ``member_id`` column (wrapped in a ``pd.Index``) when
    that column exists; otherwise returns ``X.index`` unchanged.
    """
    has_id_column = "member_id" in X.columns
    return pd.Index(X["member_id"].values) if has_id_column else X.index


try:
    train_member_ids = member_index_from_X(x_train)
    test_member_ids = member_index_from_X(x_test)
except AttributeError:
    # Inputs without a `.columns` attribute: fall back to their index
    train_member_ids = x_train.index
    test_member_ids = x_test.index

# Report members that have no row in `features`; they are dropped below
missing_train = set(train_member_ids) - set(features.index)
missing_test = set(test_member_ids) - set(features.index)
if missing_train:
    print(
        f"Warning: {len(missing_train)} train member_ids not in features (they will be dropped). Example: {list(missing_train)[:5]}"
    )
if missing_test:
    print(
        f"Warning: {len(missing_test)} test member_ids not in features (they will be dropped). Example: {list(missing_test)[:5]}"
    )

# Positional masks of the rows that do have features. The same masks are
# applied to labels / treatment so they stay row-aligned with the kept features.
train_keep = np.asarray(train_member_ids.isin(features.index))
test_keep = np.asarray(test_member_ids.isin(features.index))

# intersect to avoid KeyError
train_member_ids = list(train_member_ids[train_keep])
test_member_ids = list(test_member_ids[test_keep])

x_train_final = features.loc[train_member_ids].copy()
x_test_final = features.loc[test_member_ids].copy()

# Attach labels and treatment by position. Reindexing them by member_id would
# match against whatever index those series carry and silently fill 0.
# Assumes y_* / treatment_* are row-aligned with x_* — TODO confirm.
y_train_aligned = (
    pd.Series(np.asarray(y_train).ravel()[train_keep], index=x_train_final.index)
    .fillna(0)
    .astype("int64")
)
y_test_aligned = (
    pd.Series(np.asarray(y_test).ravel()[test_keep], index=x_test_final.index)
    .fillna(0)
    .astype("int64")
)
treatment_train_aligned = (
    pd.Series(
        np.asarray(treatment_train).ravel()[train_keep], index=x_train_final.index
    )
    .fillna(0)
    .astype("int64")
)
treatment_test_aligned = (
    pd.Series(np.asarray(treatment_test).ravel()[test_keep], index=x_test_final.index)
    .fillna(0)
    .astype("int64")
)

# ----------------------------
# 8) Clip using train quantiles to limit outliers
# ----------------------------
x_train_clipped, x_test_clipped = clip_by_train_quantiles(
    x_train_final, x_test_final, 0.01, 0.99
)

# ----------------------------
# 9) Train DR-Learner with LGBMRegressor
# ----------------------------
learner = LGBMRegressor(
    max_depth=3,
    num_leaves=15,
    learning_rate=0.03,
    min_child_samples=150,
    subsample=0.7,
    colsample_bytree=0.7,
    reg_alpha=5,
    reg_lambda=5,
    n_estimators=200,
    random_state=42,
    n_jobs=-1,
)
dr_model = BaseDRLearner(learner=learner)
dr_model.fit(
    X=x_train_clipped.values,
    treatment=treatment_train_aligned.values,
    y=y_train_aligned.values,
)

# ----------------------------
# 10) Predict uplift and evaluate
# ----------------------------
# The learner's predictions are flattened to 1-D because the Qini metric
# sorts a flat score vector (presumably predict returns one column per
# treatment group — confirm against the causalml docs).
uplift_train = np.asarray(dr_model.predict(x_train_clipped.values)).ravel()
uplift_test = np.asarray(dr_model.predict(x_test_clipped.values)).ravel()

qini_train = qini_auc_score(
    y_train_aligned.values, uplift_train, treatment_train_aligned.values
)
qini_test = qini_auc_score(
    y_test_aligned.values, uplift_test, treatment_test_aligned.values
)

print("\nFINAL DR-LEARNER MODEL EVALUATION")
print(f"Train Qini AUC: {qini_train:.4f}")
print(f"Test  Qini AUC: {qini_test:.4f}")
print(f"Treatment rate (train): {treatment_train_aligned.mean():.4f}")
print(f"Outcome rate (train):   {y_train_aligned.mean():.4f}")

# ----------------------------
# 11) Optional: save model
# ----------------------------
if SAVE_MODEL:
    joblib.dump(dr_model, MODEL_PATH)
    print(f"Saved model to {MODEL_PATH}")

# ----------------------------
# 12) Quick feature importances from propensity model (diagnostic)
# ----------------------------
feat_imp = pd.Series(
    prop_model.feature_importances_, index=propensity_feature_cols
).sort_values(ascending=False)

print("\nTop propensity features:")
print(feat_imp.head(15))

In [None]:
# ================================================
# FULL DR-LEARNER UPLIFT PIPELINE
# ================================================

import pandas as pd
import numpy as np
from functools import reduce
from lightgbm import LGBMRegressor
from causalml.inference.meta import BaseDRLearner
# from causalml.metrics import qini_auc_score
from sklearn.ensemble import RandomForestClassifier
import joblib
from pathlib import Path

# ----------------------------
# 0) Paths / constants
# ----------------------------
# repo_root = Path(".")
# Minute-resolution local-time run identifier
run_started = pd.Timestamp.now()
timestamp = run_started.strftime("%Y%m%d_%H%M")

# ----------------------------
# 1) Load raw datasets
# ----------------------------
train_dir = repo_root / "data" / "train"
test_dir = repo_root / "data" / "test"

app_usage = pd.read_csv(train_dir / "app_usage.csv")
web_visits = pd.read_csv(train_dir / "web_visits.csv")
claims = pd.read_csv(train_dir / "claims.csv")
churn_labels = pd.read_csv(train_dir / "churn_labels.csv")

test_app_usage = pd.read_csv(test_dir / "test_app_usage.csv")
test_web_visits = pd.read_csv(test_dir / "test_web_visits.csv")
test_claims = pd.read_csv(test_dir / "test_claims.csv")
test_churn_labels = pd.read_csv(test_dir / "test_churn_labels.csv")


# ----------------------------
# 2) Feature engineering via get_data()
# ----------------------------
def get_data(
    app_usage,
    web_visits,
    claims,
    churn_labels,
    day_first_web,
    reference_date="2025-07-14",
    mid_date="2025-07-07",
):
    """Build the member-level feature matrix, outcome and treatment vectors.

    Each event table is aggregated to one row per ``member_id`` and
    left-joined onto ``churn_labels``, so the result has one row per row of
    ``churn_labels``.

    Parameters
    ----------
    app_usage : pd.DataFrame
        Needs columns ``member_id`` and ``timestamp``.
    web_visits : pd.DataFrame
        Needs columns ``member_id``, ``url`` and ``timestamp``.
    claims : pd.DataFrame
        Needs columns ``member_id``, ``icd_code`` and ``diagnosis_date``.
    churn_labels : pd.DataFrame
        Needs columns ``member_id``, ``signup_date``, ``outreach`` (used as
        the treatment flag) and ``churn`` (used as the outcome).
    day_first_web : bool
        Passed as ``dayfirst`` when parsing ``web_visits["timestamp"]``;
        unparseable values become ``NaT``.
    reference_date : str or pd.Timestamp, default "2025-07-14"
        Date against which ``tenure_days`` is measured.
    mid_date : str or pd.Timestamp, default "2025-07-07"
        Cut-off separating "early" (``<= mid_date``) from "late"
        (``> mid_date``) events for the trend features.

    Returns
    -------
    X : pd.DataFrame
        Feature columns. NOTE(review): ``member_id`` is not excluded and is
        therefore returned as a column of ``X`` — confirm it should be fed to
        the models.
    y : pd.Series
        The ``churn`` column.
    treatment : pd.Series
        The ``outreach`` column.

    Notes
    -----
    The input frames are copied first, so the caller's DataFrames are left
    untouched. Only columns whose name contains "count", "usage", "visits" or
    "claims" are zero-filled here; other feature columns may still hold NaN
    for members with no events in a table.
    """
    # Work on copies: the steps below add columns and re-type existing ones.
    app_usage = app_usage.copy()
    web_visits = web_visits.copy()
    claims = claims.copy()
    churn_labels = churn_labels.copy()

    reference_date = pd.Timestamp(reference_date)
    mid_date = pd.Timestamp(mid_date)

    def _early_late_counts(events, date_col, early_name, late_name, trend_name):
        """Count events per member on/before vs. after ``mid_date``.

        Rows whose date is NaT satisfy neither comparison and are not counted.
        """
        flags = pd.DataFrame(
            {
                "member_id": events["member_id"],
                early_name: (events[date_col] <= mid_date).astype("int64"),
                late_name: (events[date_col] > mid_date).astype("int64"),
            }
        )
        counts = flags.groupby("member_id", as_index=False).sum()
        counts[trend_name] = counts[late_name] - counts[early_name]
        return counts

    # --------- churn labels ---------
    churn_labels["signup_date"] = pd.to_datetime(churn_labels["signup_date"])
    churn_labels["tenure_days"] = (
        reference_date - churn_labels["signup_date"]
    ).dt.days
    churn_labels["treatment"] = churn_labels["outreach"]
    churn_labels["y"] = churn_labels["churn"]

    # --------- app_usage features ---------
    app_usage["timestamp"] = pd.to_datetime(app_usage["timestamp"])
    app_usage_agg = (
        app_usage.groupby("member_id")
        .agg(
            app_sessions=("timestamp", "count"),
            active_days=("timestamp", lambda x: x.dt.date.nunique()),
            first_session=("timestamp", "min"),
            last_session=("timestamp", "max"),
        )
        .reset_index()
    )
    app_trend = _early_late_counts(
        app_usage, "timestamp", "early_usage", "late_usage", "usage_trend"
    )
    app_features = pd.merge(app_usage_agg, app_trend, on="member_id")

    # --------- web_visits features ---------
    web_visits["timestamp"] = pd.to_datetime(
        web_visits["timestamp"], errors="coerce", dayfirst=day_first_web
    )
    # URLs that do not match the pattern yield NaN in all three columns.
    web_visits[["domain", "category", "page"]] = web_visits["url"].str.extract(
        r"https://([^/]+)/([^/]+)/(\d+)"
    )
    web_visits["is_wellco_domain"] = web_visits["domain"].str.contains(
        "wellco", na=False
    )

    web_agg = (
        web_visits.groupby("member_id")
        .agg(
            total_web_visits=("url", "count"),
            unique_domains=("domain", "nunique"),
            unique_categories=("category", "nunique"),
            unique_pages=("page", "nunique"),
            last_visit=("timestamp", "max"),
            wellco_domain_visits=("is_wellco_domain", "sum"),
        )
        .reset_index()
    )
    web_agg["ratio_wellco_domain"] = (
        web_agg["wellco_domain_visits"] / web_agg["total_web_visits"]
    )

    # Fix: the masks are summed (number of True values). Taking `.shape[0]`
    # of a mask returns the group size, which made early == late == total
    # visits and pinned visit_trend at 0.
    web_trend_df = _early_late_counts(
        web_visits, "timestamp", "early_visits", "late_visits", "visit_trend"
    )
    web_features = pd.merge(web_agg, web_trend_df, on="member_id")

    # --------- claims features ---------
    claims["diagnosis_date"] = pd.to_datetime(claims["diagnosis_date"])
    claims["icd_category"] = claims["icd_code"].str[:3]

    priority_icd_codes = [
        "Z71.3",
        "J00",
        "M54.5",
        "I10",
        "E11.9",
        "K21.9",
        "R51",
        "A09",
        "B34.9",
        "H10.9",
    ]
    # One 0/1 indicator column per priority code; dots are not kept in names.
    icd_flag_cols = {
        icd: f"has_icd_{icd.replace('.', '_')}" for icd in priority_icd_codes
    }
    for icd, flag_col in icd_flag_cols.items():
        claims[flag_col] = (claims["icd_code"] == icd).astype(int)

    claims_agg = (
        claims.groupby("member_id")
        .agg(
            total_claims=("icd_code", "count"),
            unique_icd_codes=("icd_code", "nunique"),
            unique_icd_categories=("icd_category", "nunique"),
            last_claim=("diagnosis_date", "max"),
            # "max" of a 0/1 flag == member has at least one such claim.
            **{flag_col: (flag_col, "max") for flag_col in icd_flag_cols.values()},
        )
        .reset_index()
    )
    claims_trend_df = _early_late_counts(
        claims, "diagnosis_date", "early_claims", "late_claims", "claim_trend"
    )
    claims_features = pd.merge(claims_agg, claims_trend_df, on="member_id")

    # --------- merge all features ---------
    # Left joins keep every labelled member, with NaN where a table has no rows.
    dfs = [churn_labels, app_features, web_features, claims_features]
    full_features = reduce(
        lambda left, right: pd.merge(left, right, on="member_id", how="left"), dfs
    )

    # Members without events get 0 for the count-like columns.
    count_cols = [
        c
        for c in full_features.columns
        if any(k in c for k in ["count", "usage", "visits", "claims"])
    ]
    full_features[count_cols] = full_features[count_cols].fillna(0)

    # Raw date columns are dropped from X below, so they need no imputation.
    exclude_cols = {
        "signup_date",
        "churn",
        "outreach",
        "treatment",
        "y",
        "first_session",
        "last_session",
        "last_claim",
        "last_visit",
    }
    feature_cols = [c for c in full_features.columns if c not in exclude_cols]
    X = full_features[feature_cols]
    y = full_features["y"]
    treatment = full_features["treatment"]

    return X, y, treatment


# ----------------------------
# 3) Create train/test features
# ----------------------------
# Web timestamps are parsed day-first for the train files and month-first for
# the test files (see the day_first_web flags below).
x_train, y_train, treatment_train = get_data(
    app_usage, web_visits, claims, churn_labels, day_first_web=True
)
x_test, y_test, treatment_test = get_data(
    test_app_usage, test_web_visits, test_claims, test_churn_labels, day_first_web=False
)

# ----------------------------
# 4) DR-Learner pipeline
# ----------------------------
# Remaining NaNs (members with no rows in some event table) become 0.
x_train = x_train.fillna(0)
x_test = x_test.fillna(0)

# Ensure numeric types
y_train = y_train.astype("int64")
y_test = y_test.astype("int64")
treatment_train = treatment_train.astype("int64")
treatment_test = treatment_test.astype("int64")

# Propensity score model: P(treatment = 1 | features).
# NOTE(review): the train scores are predicted on the same rows the forest
# was fitted on, and get_data() leaves member_id among the features — confirm
# both are intended.
prop_model = RandomForestClassifier(
    n_estimators=200, max_depth=8, random_state=42, n_jobs=-1
)
prop_model.fit(x_train, treatment_train)

# Both score vectors are computed before either frame gets the extra column,
# so the forest always sees exactly the columns it was trained on.
propensity_train = prop_model.predict_proba(x_train)[:, 1]
propensity_test = prop_model.predict_proba(x_test)[:, 1]
x_train["propensity_score"] = propensity_train
x_test["propensity_score"] = propensity_test

# Train DR-Learner (the propensity score is supplied as an ordinary feature).
dr_model = BaseDRLearner(learner=LGBMRegressor(random_state=42, n_jobs=-1))
dr_model.fit(X=x_train.values, treatment=treatment_train.values, y=y_train.values)

# Predict uplift.
# causalml meta-learners return one column per treatment group, i.e. an
# (n_samples, 1) array here; the Qini metric ranks a 1-D score vector, so the
# predictions are flattened before evaluation.
uplift_train = np.ravel(dr_model.predict(x_train.values))
uplift_test = np.ravel(dr_model.predict(x_test.values))

# Evaluate
qini_train = qini_auc_score(y_train.values, uplift_train, treatment_train.values)
qini_test = qini_auc_score(y_test.values, uplift_test, treatment_test.values)

print(f"Train Qini AUC: {qini_train:.4f}")
print(f"Test  Qini AUC: {qini_test:.4f}")

In [None]:
# Score the test set with the fitted DR-Learner
uplift_test = dr_model.predict(X=x_test.values)

# Keep the scores next to the features in a separate frame; x_test is not modified
x_test_with_uplift = x_test.assign(uplift=uplift_test)

# Ten members with the highest predicted uplift
ranked_by_uplift = x_test_with_uplift.sort_values(by="uplift", ascending=False)
print(ranked_by_uplift.head(10))

# Raw predicted uplift for every test member
print(x_test_with_uplift["uplift"])

In [None]:
# Summary statistics of the predicted uplift on the test set
x_test_with_uplift.loc[:, "uplift"].describe()

In [None]:
# Bound the predicted uplift to [-0.99, 0.99] in a separate column
x_test_with_uplift["uplift_clipped"] = x_test_with_uplift["uplift"].clip(
    lower=-0.99, upper=0.99
)

In [None]:
import matplotlib.pyplot as plt

# Histogram of the clipped uplift scores (50 bins)
fig, ax = plt.subplots()
ax.hist(x_test_with_uplift["uplift_clipped"], bins=50)
ax.set_title("Uplift distribution")
plt.show()

In [None]:
# Number of members to keep, ranked by clipped uplift
n = 1000
top_members = x_test_with_uplift.nlargest(n=n, columns="uplift_clipped")

# Show the score column for the first 300 of them
print(top_members.loc[:, ["uplift_clipped"]].head(300))

In [None]:
# Size of the recommended group
n = 1000

# Refresh the unclipped scores, then keep the n highest-scoring members.
# NOTE(review): get_data() sets y = churn, so a large positive effect would
# mean outreach raises churn; confirm that the largest (rather than the
# smallest) scores are the ones to recommend.
x_test_with_uplift["uplift"] = dr_model.predict(x_test.values)
recommended = x_test_with_uplift.nlargest(n=n, columns="uplift").copy()

In [None]:
# Merge test labels
recommended["y"] = y_test.loc[recommended.index]
recommended["treatment"] = treatment_test.loc[recommended.index]

# Split by treatment in recommended group
treated = recommended[recommended["treatment"] == 1]
control = recommended[recommended["treatment"] == 0]

# Empirical uplift = difference in outcome between treated vs control
uplift_empirical = treated["y"].mean() - control["y"].mean()
print("Empirical uplift for recommended group:", uplift_empirical)

In [None]:
# Baseline for comparison: the same treated-minus-control difference in mean
# outcome, computed over the whole test set
overall_treated = y_test.loc[treatment_test.eq(1)].mean()
overall_control = y_test.loc[treatment_test.eq(0)].mean()
overall_uplift = overall_treated - overall_control

print("Overall uplift in full test set:", overall_uplift)

In [None]:
import matplotlib.pyplot as plt

# Overlay the score distribution of the recommended group on that of all test members
fig, ax = plt.subplots()
ax.hist(recommended["uplift"], bins=30, alpha=0.7, label="Recommended")
ax.hist(x_test_with_uplift["uplift"], bins=30, alpha=0.3, label="All test")
ax.legend()
ax.set_title("Uplift distribution: recommended vs all test")
plt.show()