# Data Generating Process

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Set style
# Seaborn theme first: white grid background with the viridis colour cycle.
sns.set_style("whitegrid")
sns.set_palette('viridis')
# Then the matplotlib overrides: hide the top/right axes spines and use a
# monospace font for all figure text.
plt.rcParams.update({
    'axes.spines.top': False,
    'axes.spines.right': False,
    'font.family': 'monospace',
})

In [None]:
def _min_max_scale(values):
  """Linearly rescale ``values`` onto the interval [0, 1].

  A constant (zero-range) or empty input has no meaningful scale, so it is
  mapped to all zeros instead of dividing by zero and producing NaN.
  """
  values = np.asarray(values)
  if values.size == 0:
    return values.astype(float)
  lowest = values.min()
  value_range = values.max() - lowest
  if value_range == 0:
    return np.zeros(values.shape, dtype=float)
  return (values - lowest) / value_range


def generate_uplift(sample_n=100_000, random_seed=42):
  """Simulate an upsell-marketing dataset with known individual treatment effects.

  Every customer gets two sign-up probabilities: ``y_0_propensity`` (no
  marketing) and ``y_1_propensity`` (with marketing). Treatment is assigned
  twice: once with a probability that depends on the customer features
  (confounded, observational setting) and once with a fair coin (simulated
  randomized controlled trial). A binary outcome is drawn for each assignment.

  Note: this calls ``np.random.seed``, i.e. it reseeds NumPy's global random
  state.

  Args:
    sample_n: Number of customers (rows) to simulate.
    random_seed: Seed passed to ``np.random.seed`` for reproducibility.

  Returns:
    A tuple ``(potential_outcomes_df, observational_df)``. The first frame
    holds all simulated columns, including both potential-outcome
    propensities, the individual treatment effect, the confounded assignment
    propensity and the RCT columns. The second is the same frame with those
    ground-truth and RCT columns dropped.
  """
  # Set random seed for reproducibility
  np.random.seed(random_seed)

  # NOTE: the order of the random draws below is significant -- reordering
  # them changes the generated data for a given seed.

  ###### Confounding features
  # 1 - premium_tier, 0 - free_tier
  streaming_tier_premium = np.random.binomial(n=1, p=0.72, size=sample_n)
  # Retail spend in last 30 days: a spender indicator times a positive amount,
  # plus a fixed bump of 35 for premium-tier customers
  retail_spending_t30 = (
      np.random.binomial(n=1, p=0.18, size=sample_n) *
      np.abs(np.random.normal(loc=220, scale=80, size=sample_n)) +
      35*streaming_tier_premium
  )
  # Indicator for other active subscriptions
  other_subscriptions = np.random.binomial(n=1, p=0.4, size=sample_n)

  ####### Behavioral features
  # Days with playback: exponential draw rounded up, zeroed for the customers
  # whose activity indicator is 0, and capped to the range [0, 30]
  play_days = np.clip(
      np.ceil(np.random.exponential(12, size=sample_n)).astype(int) *
      np.random.binomial(n=1, p=0.3, size=sample_n),
      0, 30)

  # Inactive customers listened to nothing; active ones get 4 songs per play
  # day plus an occasional bonus of 10
  songs_listened = np.where(
      play_days == 0,
      0,
      play_days * 4 + 10*np.random.binomial(n=1, p=0.4, size=sample_n))

  # Min-max scaled features, shared by the assignment propensity and the
  # treated-outcome propensity below
  play_days_scaled = _min_max_scale(play_days)
  songs_listened_scaled = _min_max_scale(songs_listened)
  retail_spending_scaled = _min_max_scale(retail_spending_t30)

  # Treatment assignment propensity function
  confounding_propensity = np.clip(
      0.12 +
      0.5*play_days_scaled +
      0.3*songs_listened_scaled +
      0.2*other_subscriptions -
      0.08*streaming_tier_premium +
      0.4*retail_spending_scaled,
      0.0, 1.0)

  # Assigns treatment under confounding
  upsell_marketing = np.random.binomial(1, confounding_propensity).astype(int)

  # Assigns treatment randomly, simulating a randomized control trial
  upsell_rct = np.random.binomial(1, p=0.5, size=sample_n)

  # Individual treatment effects
  ### Simple y_0 baseline - the more you use the service, the more likely you are to signup without marketing
  y_0_propensity = (play_days/200) + np.random.uniform(low=0.01, high=0.1, size=sample_n)

  ### Change in signup propensity based on customer features
  y_1_propensity = np.clip(
      y_0_propensity +
      0.1*retail_spending_scaled +
      0.08*songs_listened_scaled +
      0.15*play_days_scaled -
      0.03*other_subscriptions,
      0.0, 1.0
  )

  # Actual individual treatment effect for each customer
  ite = y_1_propensity - y_0_propensity

  # Observed outcome under the confounded assignment: each customer's sign-up
  # is drawn from the propensity matching the treatment actually received
  sign_up_p = np.where(upsell_marketing == 1, y_1_propensity, y_0_propensity)
  sign_up = np.random.binomial(1, sign_up_p)

  # Same, under the randomized assignment
  sign_up_p_rct = np.where(upsell_rct == 1, y_1_propensity, y_0_propensity)
  sign_up_rct = np.random.binomial(1, sign_up_p_rct)

  # Create DataFrame
  potential_outcomes_df = pd.DataFrame({
      'converted': sign_up,
      'y_0_propensity': y_0_propensity,
      'y_1_propensity': y_1_propensity,
      'individual_treatment_effect': ite,
      'upsell_marketing': upsell_marketing,
      'upsell_confounding_propensity': confounding_propensity,
      'converted_rct': sign_up_rct,
      'upsell_marketing_rct': upsell_rct,
      'streaming_tier_premium': streaming_tier_premium,
      'play_days': play_days,
      'songs_listened': songs_listened,
      'other_subscriptions': other_subscriptions,
      'retail_spending': retail_spending_t30
  })

  # Observational view: drop everything that would not be observable outside
  # the simulation (ground truth, assignment propensity, RCT columns)
  observational_df = (
      potential_outcomes_df
      .drop(columns=[
          'y_0_propensity', 'y_1_propensity', 'individual_treatment_effect',
          'upsell_confounding_propensity', 'converted_rct', 'upsell_marketing_rct'])
  )

  return potential_outcomes_df, observational_df

In [None]:
# Build the simulated dataset with the default sample size and seed.
# The first frame keeps every generated column; the second is the same data
# with the ground-truth and RCT columns dropped.
potential_outcomes_df, observational_df = generate_uplift()

## Dataset Properties

In [None]:
# Check treatment effect
# Mean of the observed outcome within each (confounded) treatment group.
signup_rate_by_treatment = (
    potential_outcomes_df
    .groupby('upsell_marketing')['converted']
    .mean()
)
print(f"Signup rates by treatment group:\n{signup_rate_by_treatment}")

# Plot distribution
# One bar per treatment group, drawn from the rates computed above.
_, ax = plt.subplots(figsize=(8, 6))
signup_rate_by_treatment.plot(kind='bar', color=['tab:blue', 'tab:green'], ax=ax)
ax.set_title("Signup Rate by Upsell Message Exposure")
ax.set_xlabel("Upsell Marketing (0 = No, 1 = Yes)")
ax.set_ylabel("Converted Rate")
plt.xticks(rotation=0)
plt.show()


In [None]:
# Check lift estimates
# Naive estimate: difference in observed conversion rate between customers who
# received the upsell marketing and those who did not.
converted = potential_outcomes_df['converted']
treated_rate = converted[potential_outcomes_df['upsell_marketing'] == 1].mean()
untreated_rate = converted[potential_outcomes_df['upsell_marketing'] == 0].mean()
biased_lift = treated_rate - untreated_rate

# Ground truth: average of the simulated individual treatment effects.
actual_lift = potential_outcomes_df['individual_treatment_effect'].mean()
print(
    f'Biased Marketing Lift: {biased_lift:.2%}',
    f'Actual Marketing Lift: {actual_lift:.2%}',
    sep='\n'
)

In [None]:
# Save results
# Full simulation output, including every generated column.
potential_outcomes_df.to_pickle('potential_outcomes_df.pkl')

# Observational view: the same rows without the potential-outcome,
# treatment-effect, assignment-propensity and RCT columns.
hidden_columns = [
    'y_0_propensity',
    'y_1_propensity',
    'individual_treatment_effect',
    'upsell_confounding_propensity',
    'converted_rct',
    'upsell_marketing_rct',
]
potential_outcomes_df.drop(columns=hidden_columns).to_pickle('observational_df.pkl')