In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder as SklearnOneHotEncoder
from statsmodels.regression.linear_model import OLS

In [3]:
class OneHotEncoder(SklearnOneHotEncoder):
    """One-hot encoder whose ``transform`` returns a labelled ``pandas.DataFrame``.

    Thin wrapper around scikit-learn's ``OneHotEncoder``: fitting is delegated
    to the parent class, while ``transform`` densifies the encoded matrix and
    wraps it in a DataFrame whose columns are named ``'<column>_<<category>>'``
    and whose index is taken from the input frame.

    The input ``X`` must be a ``pandas.DataFrame`` because both ``X.columns``
    and ``X.index`` are read.

    NOTE(review): column names are generated for every entry of
    ``categories_``; if the parent is configured to drop categories
    (``drop=...``) the encoded matrix will have fewer columns than names --
    confirm before using that option.
    """

    def __init__(self, **kwargs):
        """Forward all keyword arguments to the scikit-learn encoder."""
        super(OneHotEncoder, self).__init__(**kwargs)
        # Set to True by ``fit``.
        self.fit_flag = False

    def fit(self, X, y=None, **kwargs):
        """Fit the parent encoder on ``X`` and mark this instance as fitted.

        ``y`` and ``**kwargs`` are accepted and ignored so that the encoder can
        be called as ``fit(X, y)``, which is how scikit-learn pipelines invoke
        their steps.

        Returns whatever the parent ``fit`` returns.
        """
        out = super().fit(X)
        self.fit_flag = True
        return out

    def transform(self, X, **kwargs):
        """Encode ``X`` and return the result as a dense, labelled DataFrame.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame with the categorical columns the encoder was fitted on.

        Returns
        -------
        pandas.DataFrame
            One column per (input column, category) pair, indexed like ``X``.
        """
        encoded = super(OneHotEncoder, self).transform(X)
        # The parent returns a scipy sparse matrix by default, but a plain
        # ndarray when it is configured for dense output; only the former has
        # ``toarray``.
        if hasattr(encoded, 'toarray'):
            encoded = encoded.toarray()
        new_columns = self.get_new_columns(X=X)
        return pd.DataFrame(encoded, columns=new_columns, index=X.index)

    def fit_transform(self, X, y=None, **kwargs):
        """Fit on ``X`` and return the encoded DataFrame for the same ``X``.

        ``y`` and ``**kwargs`` are accepted for call-signature compatibility
        and ignored.
        """
        self.fit(X)
        return self.transform(X)

    def get_new_columns(self, X):
        """Build output column names from ``X.columns`` and ``categories_``.

        For the i-th column of ``X`` one name ``f'{column}_<{category}>'`` is
        produced for each entry of ``self.categories_[i]``, in order.
        """
        return [
            f'{column}_<{category}>'
            for i, column in enumerate(X.columns)
            for category in self.categories_[i]
        ]

In [4]:
# Promo codes available in the simulation.
promos = ['SALE15', 'TAKE30', 'LUCKY', 'SORRY', 'WINTER', 'SUMMER']

# Relative usage weights, one per promo code, normalised into shares that sum to 1.
promos_rate = [45, 9, 25, 4, 15, 42]
total = sum(promos_rate)
promos_rate = list(weight / total for weight in promos_rate)

# Per-promo multiplier used when simulating order values
# (the generation cell scales the mean order value by it).
promos_impact = [0.89, 0.77, 0.84, 0.93, 0.84, 0.91]

In [5]:
# Simulation parameters.
SEED = 42                              # fixed seed so a full re-run regenerates the same dataset
aov = 30                               # mean order value of a no-promo order
sigma = 8                              # standard deviation of a no-promo order value
dataset_size = 370000                  # target number of simulated orders
promo_size = int(0.41 * dataset_size)  # number of orders allotted to promo codes
free_delivery_share = 0.12             # probability that an order has a delivery discount
surge_increment_share = 0.03           # probability that an order has a surge increment
first_order_id = 768977643             # order ids are consecutive integers starting here

# Seed NumPy's global generator: it drives np.random.normal / np.random.binomial
# below, and DataFrame.sample falls back to it when no random_state is passed.
np.random.seed(SEED)

# Baseline orders placed without a promo code.
n_no_promo = dataset_size - promo_size
frames = [
    pd.DataFrame({
        # astype truncates toward zero, like int() on each value.
        'gmv': np.random.normal(aov, sigma, n_no_promo).astype('int64'),
        'title': ['no_promo'] * n_no_promo,
    })
]

# One frame per promo code: the mean is scaled by the promo's impact and the
# spread is divided by it. Frames are collected in a list and concatenated
# once, instead of re-copying a growing DataFrame on every iteration.
for promo, rate, impact in zip(promos, promos_rate, promos_impact):
    n_orders = int(promo_size * rate)
    frames.append(
        pd.DataFrame({
            'gmv': np.random.normal(impact * aov, sigma / impact, n_orders).astype('int64'),
            'title': [promo] * n_orders,
        })
    )

df = (
    pd.concat(frames)
        .sample(frac=1, replace=False)   # shuffle the rows
        .reset_index(drop=True)
        .assign(
            delivery_discount = lambda x: np.random.binomial(1, free_delivery_share, x.shape[0]),
            surge_increment = lambda x: np.random.binomial(1, surge_increment_share, x.shape[0]),
            order_id = lambda x: np.arange(first_order_id, first_order_id + x.shape[0]),
            # Add 2 when there is no delivery discount and 1 when a surge increment applies.
            gmv = lambda x: x['gmv'] + 2 * (1 - x['delivery_discount']) + 1 * x['surge_increment']
        )
        .query('gmv > 0')                # drop orders with a non-positive value
)

df.to_csv('multiple_promo_df.csv')

In [6]:
# Preview the first five simulated orders.
df.head(n=5)

Unnamed: 0,gmv,title,delivery_discount,surge_increment,order_id
0,44,no_promo,0,0,768977643
1,31,no_promo,0,0,768977644
2,33,no_promo,0,0,768977645
3,12,TAKE30,0,0,768977646
4,39,no_promo,0,0,768977647


In [8]:
# Share of orders placed without a promo code.
len(df.query('title == "no_promo"')) / len(df)

0.5904031893802725

In [7]:
# Instantiate the DataFrame-returning OneHotEncoder subclass defined above,
# passing no options to the scikit-learn parent.
encoder = OneHotEncoder()

In [9]:
# Preview the one-hot encoding of the promo title (first three rows).
encoder.fit_transform(df[['title']]).head(n=3)

Unnamed: 0,title_<LUCKY>,title_<SALE15>,title_<SORRY>,title_<SUMMER>,title_<TAKE30>,title_<WINTER>,title_<no_promo>
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [11]:
# Response variable: order value.
Y = df['gmv']

# Design matrix: promo dummies with 'no_promo' dropped as the reference level,
# a constant column of ones named `aov`, and the two order-level flags.
X = (
    encoder
        .fit_transform(df[['title']])
        .drop(columns='title_<no_promo>')
        .assign(
            aov=1,
            delivery_discount=df['delivery_discount'],
            surge_increment=df['surge_increment'],
        )
)

In [12]:
# Preview the first three rows of the design matrix.
X.head(n=3)

Unnamed: 0,title_<LUCKY>,title_<SALE15>,title_<SORRY>,title_<SUMMER>,title_<TAKE30>,title_<WINTER>,aov,delivery_discount,surge_increment
0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0


In [46]:
# Fit ordinary least squares of order value (Y) on the design matrix (X).
estimator = OLS(endog=Y, exog=X).fit()

In [47]:
# Print the plain-text regression summary of the fitted model.
print(estimator.summary())

                            OLS Regression Results                            
Dep. Variable:                    gmv   R-squared:                       0.058
Model:                            OLS   Adj. R-squared:                  0.058
Method:                 Least Squares   F-statistic:                     2844.
Date:                Tue, 19 Jul 2022   Prob (F-statistic):               0.00
Time:                        09:25:10   Log-Likelihood:            -1.3137e+06
No. Observations:              369705   AIC:                         2.627e+06
Df Residuals:                  369696   BIC:                         2.628e+06
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
title_<LUCKY>        -4.7214      0.05