In [128]:
import pandas as pd
import numpy as np
import uuid
import statsmodels.api as sm

In [None]:
# Simulate a marketing-message dataset with a known logistic conversion model
# and write it to 'marketing_simulation_with_categories.csv'.
# The order of the random draws below is significant: with the fixed seed,
# reordering them would change every simulated value.
np.random.seed(0)

n_rows = 1500

category_names = [
    'Electronics', 'Home Appliances', 'Clothing and Apparel', 'Footwear',
    'Beauty and Personal Care', 'Furniture', 'Toys and Games',
    'Sports and Outdoor Equipment', 'Automotive', 'Groceries and Food',
    'Books and Stationery', 'Health and Wellness'
]
# Unnormalised sampling weights, one per entry of category_names (same order);
# they are divided by their sum before being used as probabilities.
category_weights = np.array([0.8, 0.6, 0.95, 0.85, 1.05, 0.75, 0.55, 0.65, 1.15, 0.7, 1.0, 0.9])

# Binary message attributes: 50% of rows carry a coupon, 30% are holiday related.
data = pd.DataFrame({
    'has_coupon': np.random.choice([0, 1], size=n_rows, p=[0.5, 0.5]),
    'is_holiday_related': np.random.choice([0, 1], size=n_rows, p=[0.7, 0.3])
})

# Percentage discount: 30% of rows have none, the rest are spread evenly
# over the five non-zero levels.
data['discount_ratio'] = np.random.choice(
    [0, 0.2, 0.25, 0.3, 0.35, 0.4],
    size=n_rows,
    p=[0.3, 0.14, 0.14, 0.14, 0.14, 0.14]
)
# Absolute discount, drawn uniformly from the listed amounts.
data['discount_amount'] = np.random.choice([0, 5, 10, 15, 20, 25, 30], size=n_rows)

# A row carries at most one kind of discount: wherever a ratio discount is
# present, the absolute amount is reset to zero.
data.loc[data['discount_ratio'] != 0, 'discount_amount'] = 0

# Assign one category per row and one-hot encode it. dtype=int keeps the
# indicator columns 0/1 on every pandas version (newer releases default to
# bool, which would be written to the CSV as True/False).
categories = np.random.choice(category_names, size=n_rows, p=category_weights / category_weights.sum())
category_data = pd.get_dummies(categories, dtype=int)
data = pd.concat([data, category_data], axis=1)

# True per-category coefficients of the simulated conversion model.
beta_values = {
    'Electronics': 1.2,
    'Home Appliances': 1.0,
    'Clothing and Apparel': 0.9,
    'Footwear': 0.8,
    'Beauty and Personal Care': 0.7,
    'Furniture': 0.2,
    'Toys and Games': 0.2,
    'Sports and Outdoor Equipment': 0.2,
    'Automotive': 0.2,
    'Groceries and Food': 0.2,
    'Books and Stationery': 0.2,
    'Health and Wellness': 0.2
}

# Log odds of conversion; 'is_holiday_related' deliberately has no term here,
# i.e. its true effect in the simulation is zero.
conversion_log_odds = (
    data['has_coupon'] * 0.5 +
    data['discount_ratio'] * 1.9 +
    data['discount_amount'] * 0.4 +
    sum(data[name] * beta_values[name] for name in category_names)
)
# Logistic link, then one Bernoulli draw per row.
conversion_probability = 1 / (1 + np.exp(-conversion_log_odds))
data['Conversion'] = np.random.binomial(1, p=conversion_probability)

# One identifier per simulated row. The count must come from `data` itself;
# no other frame exists yet when this cell runs on a fresh kernel.
uuids = [uuid.uuid4() for _ in range(len(data))]
data.insert(0, 'message_id', uuids)

data.to_csv('marketing_simulation_with_categories.csv', index=False)


In [131]:
# Fit a logistic regression of Conversion on the simulated features.
# `pd` and `sm` are provided by the notebook's import cell at the top.
df = pd.read_csv('marketing_simulation_with_categories.csv')

# Take the target from the frame that was just loaded, so features and target
# come from the same object and the cell does not depend on a variable left
# over from the simulation cell.
y = df['Conversion']

# Remove the target, the row identifier, and one category column
# ('Sports and Outdoor Equipment') which serves as the reference level; with
# an intercept, keeping all one-hot columns would make the design matrix
# perfectly collinear.
features = df.drop(columns=['Conversion', 'message_id', 'Sports and Outdoor Equipment'])

# Cast to float so bool/int indicator columns never reach the model as an
# object array, then prepend the intercept column.
X = sm.add_constant(features.astype(float))

model = sm.Logit(y, X)
# NOTE(review): no `alpha` is passed; per the statsmodels documentation the
# default penalty weight is 0, so this fit is effectively unpenalised —
# confirm that is intended.
result = model.fit_regularized(method='l1')

print(result.summary())


Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.3853952748797677
            Iterations: 91
            Function evaluations: 93
            Gradient evaluations: 91
                           Logit Regression Results                           
Dep. Variable:             Conversion   No. Observations:                 1500
Model:                          Logit   Df Residuals:                     1484
Method:                           MLE   Df Model:                           15
Date:                Mon, 27 May 2024   Pseudo R-squ.:                  0.1300
Time:                        18:02:03   Log-Likelihood:                -578.09
converged:                       True   LL-Null:                       -664.46
Covariance Type:            nonrobust   LLR p-value:                 6.919e-29
                               coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------