# A/B Testing Analysis on E-Commerce Dataset
This notebook performs EDA, statistical testing, and uplift modeling using the enhanced Kaggle-style dataset.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the enhanced e-commerce CSV into a DataFrame and preview its first rows
DATA_PATH = '../data/kaggle_ecommerce_data_enhanced.csv'
df = pd.read_csv(DATA_PATH)
df.head()

## Exploratory Data Analysis

In [None]:
# Summary statistics for the whole frame; include='all' asks describe() to
# report on non-numeric columns as well as numeric ones.
df.describe(include='all')

In [None]:
# Mean of the 'converted' column within each experiment group
conversion_rates = df['converted'].groupby(df['group']).mean()
print(conversion_rates)

# Bar chart of the same per-group mean (seaborn aggregates 'converted' by 'group')
rate_ax = sns.barplot(x='group', y='converted', data=df)
rate_ax.set_title("Conversion Rate by Group")
plt.show()

## Hypothesis Testing (A/B)

In [None]:
from scipy.stats import ttest_ind

# Split the outcome column by experiment arm using boolean row masks
is_control = df['group'] == 'control'
is_treatment = df['group'] == 'treatment'
control = df.loc[is_control, 'converted']
treatment = df.loc[is_treatment, 'converted']

# Independent two-sample t-test; treatment is passed first, so a positive
# statistic means the treatment mean is above the control mean.
test_result = ttest_ind(treatment, control)
t_stat, p_val = test_result
print(f"T-statistic: {t_stat}, P-value: {p_val}")


## Uplift Modeling with EconML

In [None]:
from econml.dr import DRLearner
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder

# Fixed seed: the random forest and the learner's internal sample splitting are
# stochastic, so without it the uplift estimates change on every run.
RANDOM_STATE = 42

# One-hot encode categorical variables. dtype=float keeps the design matrix
# purely numeric; recent pandas versions emit bool dummies by default, which
# mixes bool and float columns in X.
categorical_cols = ["location", "device", "gender"]
df_model = pd.get_dummies(
    df, columns=categorical_cols, drop_first=True, dtype=float
)

# Define features: the numeric covariates plus every generated dummy column
numeric_features = ['time_spent', 'age', 'days_since_last_visit', 'pages_viewed']
dummy_prefixes = tuple(f"{col}_" for col in categorical_cols)
dummy_features = [col for col in df_model.columns if col.startswith(dummy_prefixes)]
features = numeric_features + dummy_features

X = df_model[features]
T = df_model['group'].map({'control': 0, 'treatment': 1})
Y = df_model['converted']

# .map() turns any label other than 'control'/'treatment' into NaN; fail here
# with a clear message instead of deep inside the learner's fit.
if T.isna().any():
    unexpected = sorted(df_model.loc[T.isna(), 'group'].astype(str).unique())
    raise ValueError(f"Unexpected values in 'group' column: {unexpected}")

# Outcome model and propensity model for the doubly robust learner.
# max_iter is raised above the default of 100 because the features are
# unscaled, which can stop the solver before it converges.
model_y = RandomForestRegressor(random_state=RANDOM_STATE)
model_t = LogisticRegression(max_iter=1000)
learner = DRLearner(
    model_regression=model_y,
    model_propensity=model_t,
    random_state=RANDOM_STATE,
)
learner.fit(Y, T, X=X)

# Per-row estimated effect of treatment vs. control, stored next to the features
df_model['uplift'] = learner.effect(X)
df_model[['uplift']].describe()

In [None]:
# Histogram (30 bins) of the per-row uplift estimates with a KDE curve overlaid
uplift_values = df_model['uplift']
uplift_ax = sns.histplot(uplift_values, bins=30, kde=True)
uplift_ax.set_title("Estimated Treatment Effect (Uplift Distribution)")
plt.show()