# Uplift Modeling — Estimating Incremental Impact per Action

This notebook develops **uplift (treatment effect) models** for each sales action in the Sales Next Best Action (NBA) system.  
It builds directly on insights from:
- `01_exploration_sales_nba.ipynb` (Exploratory Data Analysis)
- `02_model_baseline_win_prob.ipynb` (Baseline Win Propensity Model)

## Objectives
1. Estimate **incremental win probability** for each action type using a T-Learner approach.  
2. Convert these uplift estimates into **expected incremental value (EV)** by combining uplift with account-level ACV and action cost (EV = uplift × ACV − action cost).  
3. Validate results against observed lifts from the EDA.

## Key Deliverables
- Trained uplift models for each action (`GradientBoostingClassifier`-based T-Learners)  
- Uplift performance metrics (Qini / AUC)  
- `nba_recommendations_raw` table with predicted uplift and expected value  
- Model artifacts saved under `data/artifacts/`

**High-Level Workflow**
- train_sales_nba → per-action T-Learners → uplift & expected value → recommendations


After this notebook, we will operationalize the recommendations in the **Decisioning** layer.


In [2]:
# ----------------------------------------------------
# Imports & Environment Setup
# ----------------------------------------------------

# Core libraries
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Modeling
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score

# Utility
import joblib
from tqdm import tqdm

# Project modules
from nba.warehouse.duckdb_client import connect
from nba.config import settings

# Visualization style
# `sns.set` is only an alias of `sns.set_theme`, which seaborn documents as the
# preferred interface; the keyword arguments are passed through unchanged.
sns.set_theme(style="whitegrid", palette="muted", font_scale=1.1)
plt.rcParams["figure.figsize"] = (7, 5)  # default figure size (width, height)

# Confirm project paths so a misconfigured environment is visible before any I/O
print(f"Project root: {settings.project_root}")
print(f"Database path: {settings.db_path}")
print(f"Artifacts dir: {settings.artifacts_dir}")

Project root: /Users/emcknight/nba-sales
Database path: /Users/emcknight/nba-sales/data/warehouse.duckdb
Artifacts dir: /Users/emcknight/nba-sales/data/artifacts


In [4]:
# Connect to DuckDB and load training data
con = connect()
df = con.execute("SELECT * FROM train_sales_nba").fetchdf()

print(f"Dataset loaded: {df.shape[0]:,} rows × {df.shape[1]} columns")

# Sanity check for key columns.
# The list covers every column the later cells of this notebook read directly,
# including `employees` (used as a model feature) and `action_cost` (used in the
# expected-value formula), so a schema problem is reported here rather than
# surfacing later as a KeyError in the middle of training.
expected_cols = [
    "account_id", "segment", "acv_potential", "intent_affinity",
    "trial_events_30", "trial_users_30", "touches_30", "responses_30",
    "treated", "action_type", "won",
    "employees", "action_cost",
]

missing = [c for c in expected_cols if c not in df.columns]
if missing:
    print("⚠️ Missing expected columns:", missing)
else:
    print("✅ All expected columns present.")

display(df.head())

Dataset loaded: 5,000 rows × 30 columns
✅ All expected columns present.


Unnamed: 0,account_id,segment,industry,region,employees,acv_potential,intent_affinity,created_date,web_30,downloads_30,...,close_date,won,acv,p_win_no_action,p_win_observed,realized_revenue,expected_revenue_observed,y_won,y_revenue,action_cost
0,1,MM,SaaS,EMEA,1722,121881,0.688397,2023-11-24,85,2,...,2026-04-01,1,121881.0,0.210591,0.210591,121881.0,25667.06,1,121881.0,0.0
1,2,SMB,EdTech,EMEA,99,3000,0.434754,2024-09-23,60,1,...,2026-04-01,0,3000.0,0.187724,0.187724,0.0,563.17,0,0.0,0.0
2,3,MM,Manufacturing,,1466,57224,0.644223,2023-05-02,94,2,...,2026-04-01,1,57224.0,0.209499,0.209499,57224.0,11988.39,1,57224.0,0.0
3,4,MM,SaaS,,1935,160311,0.6416,2024-01-25,84,3,...,2026-04-01,0,160311.0,0.211789,0.211789,0.0,33952.14,0,0.0,0.0
4,5,SMB,SaaS,,44,3000,0.718388,2025-07-14,94,2,...,2026-04-01,1,3000.0,0.196778,0.196778,3000.0,590.33,1,3000.0,0.0


In [5]:
# Basic cleanup: keep only rows where both the action label and the outcome are known
df = df.dropna(subset=["action_type", "won"]).copy()

# Cast the binary flags to integers and add a log-scaled ACV column
# (log1p reduces the skew of `acv_potential`).
df = df.assign(
    won=lambda frame: frame["won"].astype(int),
    treated=lambda frame: frame["treated"].astype(int),
    log_acv_potential=lambda frame: np.log1p(frame["acv_potential"]),
)

# Confirm unique actions
# NOTE(review): the saved output of this cell lists '0' among the labels —
# presumably the "no action" label of untreated rows; confirm that downstream
# per-action loops skip it.
action_labels = df["action_type"].dropna().unique().tolist()
actions = sorted(action_labels)
print(f"Unique actions ({len(actions)}):", actions)


Unique actions (8): ['0', 'CALL_OUTREACH', 'DEMO_OFFER', 'EMAIL_SEQUENCE', 'EXEC_SPONSOR_OUTREACH', 'LINKEDIN_TOUCH', 'PRICING_CONCESSION', 'TECHNICAL_WORKSHOP']


In [6]:
# ----------------------------------------------------
# Define a T-Learner prototype for one action type
# ----------------------------------------------------

def _holdout_auc(model, X_test: pd.DataFrame, y_test: pd.Series) -> float:
    """Return the hold-out ROC AUC of `model`, or NaN if `y_test` has a single class.

    `roc_auc_score` raises ValueError when only one class is present in the
    labels, which can happen on very small per-action hold-out sets.
    """
    if y_test.nunique() < 2:
        return float("nan")
    return roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])


def train_t_learner_for_action(df: pd.DataFrame, action: str, features: list):
    """
    Train a T-Learner uplift model for a specific action type.

    Two independent gradient-boosting classifiers are fitted on the `won`
    outcome: one on rows whose `action_type` equals `action`, one on rows with
    `treated == 0`. Uplift is the difference of their predicted win
    probabilities, scored on every row of `df`.

    Parameters
    ----------
    df : pd.DataFrame
        Full dataset containing treated/untreated rows. Must provide the
        columns `action_type`, `treated`, `won`, `account_id`,
        `acv_potential`, `action_cost` and every name in `features`.
    action : str
        The action_type value to model (e.g., 'DEMO_OFFER').
    features : list
        List of baseline feature names.

    Returns
    -------
    dict or None
        Dictionary with keys `action`, `model_treat`, `model_ctrl`, `auc_t`,
        `auc_c` and `results` (one row per row of `df` with predicted
        probabilities, uplift and expected value). Returns None when the
        treated or control group is too small to be split and fitted.

    Side effects
    ------------
    Prints progress messages and writes both fitted models to
    `settings.artifacts_dir` (created if it does not exist).
    """
    from pathlib import Path  # local import keeps this cell self-contained

    print(f"\nTraining uplift model for action: {action}")

    # --- 1. Subset data ---
    treated_df = df[df["action_type"] == action]
    control_df = df[df["treated"] == 0]

    # Ensure both groups exist
    if treated_df.empty or control_df.empty:
        print(f"⚠️ Skipping {action}: insufficient treated or control rows.")
        return None

    # A stratified split needs at least 2 rows per outcome class, and the
    # classifier needs both classes; skip instead of crashing mid-run.
    for group_name, group_df in (("treated", treated_df), ("control", control_df)):
        class_counts = group_df["won"].value_counts()
        if len(class_counts) < 2 or class_counts.min() < 2:
            print(
                f"⚠️ Skipping {action}: {group_name} group needs at least "
                "2 rows for each outcome class."
            )
            return None

    # --- 2. Train/test split for each group ---
    X_train_t, X_test_t, y_train_t, y_test_t = train_test_split(
        treated_df[features], treated_df["won"],
        test_size=0.3, random_state=42, stratify=treated_df["won"],
    )
    X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
        control_df[features], control_df["won"],
        test_size=0.3, random_state=42, stratify=control_df["won"],
    )

    # --- 3. Train two separate models (treated & control) ---
    # Identical hyper-parameters so the two arms differ only in their data.
    gb_params = dict(
        n_estimators=200, learning_rate=0.05, max_depth=3, subsample=0.8, random_state=42
    )
    model_treat = GradientBoostingClassifier(**gb_params).fit(X_train_t, y_train_t)
    model_ctrl = GradientBoostingClassifier(**gb_params).fit(X_train_c, y_train_c)

    # --- 4. Evaluate each model on its own hold-out set ---
    auc_t = _holdout_auc(model_treat, X_test_t, y_test_t)
    auc_c = _holdout_auc(model_ctrl, X_test_c, y_test_c)

    print(f"AUC (treated model): {auc_t:.3f} | AUC (control model): {auc_c:.3f}")

    # --- 5. Predict uplift on full dataset ---
    # Note: this scores rows that were also used for training, so these
    # predictions are partly in-sample.
    p_treat = model_treat.predict_proba(df[features])[:, 1]
    p_ctrl = model_ctrl.predict_proba(df[features])[:, 1]
    uplift = p_treat - p_ctrl

    # Cost of taking *this* action, estimated from the rows where it was
    # actually applied. A row's own `action_cost` belongs to whatever action
    # that row received, so it must not be subtracted when scoring `action`
    # for every account.
    # NOTE(review): assumes the cost of an action is roughly constant across
    # accounts — confirm, or replace with a per-action cost table.
    action_cost = treated_df["action_cost"].mean()

    results = pd.DataFrame({
        "account_id": df["account_id"],
        "action_type": action,
        "p_treat": p_treat,
        "p_ctrl": p_ctrl,
        "uplift": uplift,
        "acv": df["acv_potential"],
        "expected_value": uplift * df["acv_potential"] - action_cost
    })

    # --- 6. Save models ---
    artifacts_dir = Path(settings.artifacts_dir)
    artifacts_dir.mkdir(parents=True, exist_ok=True)  # a fresh checkout may lack it
    joblib.dump(model_treat, artifacts_dir / f"uplift_treated_{action}.pkl")
    joblib.dump(model_ctrl, artifacts_dir / f"uplift_control_{action}.pkl")

    return {
        "action": action,
        "model_treat": model_treat,
        "model_ctrl": model_ctrl,
        "auc_t": auc_t,
        "auc_c": auc_c,
        "results": results
    }


In [7]:
# ----------------------------------------------------
# Prototype Run for One Action
# ----------------------------------------------------

# Feature columns passed to both models of the T-Learner
features = [
    "intent_affinity",
    "trial_events_30", "trial_users_30",
    "touches_30", "responses_30",
    "log_acv_potential",
    "employees",
]

# Single-action trial run before looping over every action type
demo_results = train_t_learner_for_action(
    df=df,
    action="DEMO_OFFER",
    features=features,
)



Training uplift model for action: DEMO_OFFER
AUC (treated model): 0.502 | AUC (control model): 0.498
