# Data Analysis and Causal Mechanisms

## Environment Setup and Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import phik

# Global plot styling: seaborn theme and palette first, then matplotlib
# rcParams overrides (applied after the seaborn calls, as in the original order)
sns.set_style("whitegrid")
sns.set_palette('viridis')
plt.rcParams.update({
    'axes.spines.top': False,      # hide the top axis border
    'axes.spines.right': False,    # hide the right axis border
    'font.family': 'monospace',
})

In [None]:
# Load data
# Both DataFrames are read from pickle files under ../data; the paths are
# relative, so they resolve against the notebook's working directory.
# NOTE(review): unpickling can execute arbitrary code — only load these files
# when they come from a trusted source.
potential_outcomes_df = pd.read_pickle('../data/potential_outcomes_df.pkl')
observational_df = pd.read_pickle('../data/observational_df.pkl')

## Causal Mechanisms

<center>
<img
  src="../assets/confounding_bias.png"
  alt="Confounding Relationships"
  style="width:500px;height:auto;"
>
</center>

In [None]:
# Observational data: preview the first rows to see which columns are available
observational_df.head()

### True Causal Effects


In [None]:
# Potential-outcomes data: preview the first rows. Later cells read the
# individual_treatment_effect and *_rct columns from this frame.
potential_outcomes_df.head()

### Dangers of Unadjusted Estimates

<center>
<img
  src="../assets/correlation_causation.png"
  alt="Correlation is not causation"
  style="width:250px;height:auto;"
>
</center>

In [None]:
# Unadjusted comparison: mean signup rate within each upsell_marketing group.
# Computed once and reused for both the printout and the bar chart.
signup_rate_by_treatment = potential_outcomes_df.groupby('upsell_marketing')['amu_signup'].mean()
print(f"Signup rates by treatment group: {signup_rate_by_treatment}")

# Bar chart of the group means, drawn on an explicit Axes instead of relying
# on pyplot's implicit "current figure" state.
fig, ax = plt.subplots(figsize=(8, 6))
signup_rate_by_treatment.plot(kind='bar', color=['tab:blue', 'tab:green'], ax=ax)
ax.set_title("Signup Rate by Upsell Message Exposure")
ax.set_xlabel("Upsell Marketing (0 = No, 1 = Yes)")
ax.set_ylabel("Signup Rate")
ax.tick_params(axis='x', labelrotation=0)  # keep the 0/1 group labels horizontal
plt.show()

In [None]:
# Difference in conversion rates by treatment type.
# This is the raw (unadjusted) gap in mean signup rate between customers who
# were and were not exposed to the upsell message.
is_treated = potential_outcomes_df['upsell_marketing'] == 1
is_control = potential_outcomes_df['upsell_marketing'] == 0
biased_lift = (
    potential_outcomes_df.loc[is_treated, 'amu_signup'].mean()
    - potential_outcomes_df.loc[is_control, 'amu_signup'].mean()
)

# True average treatment effect: mean of the individual treatment effects (ITE)
actual_lift = potential_outcomes_df['individual_treatment_effect'].mean()
print(
    f'Biased Marketing Lift: {biased_lift:.2%}',
    f'Actual Marketing Lift: {actual_lift:.2%}',
    sep='\n'
)

## Exploratory Data Analysis

In [None]:
# Summary statistics: describe() output with the 'count' row removed,
# displayed with a per-column percent / number / currency format.
summary_formats = {
    'amu_signup': '{:.1%}',
    'upsell_marketing': '{:.1%}',
    'streaming_tier_prime': '{:.1%}',
    'play_days': '{:,.2f}',
    'songs_listened': '{:,.2f}',
    'other_subscriptions': '{:.1%}',
    'retail_spending': '${:,.2f}',
}
summary_stats = observational_df.describe().drop('count')
summary_stats.style.format(summary_formats)

In [None]:
# Customer features by treatment: mean of every column within each
# upsell_marketing group, displayed with per-column formats.
treatment_formats = {
    'amu_signup': '{:.1%}',
    'streaming_tier_prime': '{:.1%}',
    'play_days': '{:,.2f}',
    'songs_listened': '{:,.2f}',
    'other_subscriptions': '{:.1%}',
    'retail_spending': '${:,.2f}',
}
feature_means_by_treatment = observational_df.groupby('upsell_marketing', as_index=False).mean()
feature_means_by_treatment.style.format(treatment_formats)

### Randomized Control Trials
RCTs remove confounding bias by making the control (no-upsell) and treatment (upsell) groups similar in their attributes and behavioral profiles.

In [None]:
# Group means under the randomized assignment (upsell_marketing_rct).
# NOTE(review): this table reports 'amu_signup', while the RCT lift cell
# further down uses 'amu_signup_rct' — confirm which outcome is intended here.
rct_columns = [
    'upsell_marketing_rct', 'amu_signup',
    'streaming_tier_prime', 'play_days',
    'songs_listened', 'other_subscriptions',
    'retail_spending',
]
rct_formats = {
    'amu_signup': '{:.1%}',
    'streaming_tier_prime': '{:.1%}',
    'play_days': '{:,.2f}',
    'songs_listened': '{:,.2f}',
    'other_subscriptions': '{:.1%}',
    'retail_spending': '${:,.2f}',
}
rct_group_means = (
    potential_outcomes_df
    .filter(items=rct_columns, axis=1)
    .groupby('upsell_marketing_rct', as_index=False)
    .mean()
)
rct_group_means.style.format(rct_formats)

What do RCTs do to our estimates of the causal effect?
- They remove bias and provide accurate estimates of the true causal effects of treatment!

In [None]:
# Difference in conversion rates by treatment type in an RCT:
# gap in mean amu_signup_rct between the randomized treatment and control groups.
in_rct_treatment = potential_outcomes_df['upsell_marketing_rct'] == 1
in_rct_control = potential_outcomes_df['upsell_marketing_rct'] == 0
rct_lift = (
    potential_outcomes_df.loc[in_rct_treatment, 'amu_signup_rct'].mean()
    - potential_outcomes_df.loc[in_rct_control, 'amu_signup_rct'].mean()
)

# True average treatment effect: mean of the individual treatment effects (ITE).
# Recomputed here so this cell does not depend on the earlier lift cell.
actual_lift = potential_outcomes_df['individual_treatment_effect'].mean()
print(
    f'RCT Marketing Lift: {rct_lift:.2%}',
    f'Actual Marketing Lift: {actual_lift:.2%}',
    sep='\n'
)