In [1]:
import pandas as pd
from statsmodels.regression.linear_model import OLS
from statsmodels.tools.tools import add_constant
from sklearn.preprocessing import StandardScaler

In [2]:
FILE = 'clean.parq.gzip'

In [3]:
data = pd.read_parquet(FILE)
data.head()

Unnamed: 0,spend_1m_baby,spend_2m_baby,spend_3m_baby,spend_4m_baby,spend_5m_baby,spend_6m_baby,spend_1m_clothes,spend_2m_clothes,spend_3m_clothes,spend_4m_clothes,...,spend_6m_eletronic,sales,discount,profit,age,gender,cust_state,tenure,sales_prediction_bins,sales_prediction
0,0,0,0,0,0,0.0,0,0.0,0.0,0.0,...,0.0,3368.64,125.0,40.06,40,1.0,MA,6,8,1351.024765
1,0,0,0,0,0,0.0,0,0.0,0.0,0.0,...,0.0,2133.1,75.0,29.52,36,0.0,MG,10,4,1035.580387
2,0,0,0,0,0,0.0,0,0.0,0.0,0.0,...,0.0,2001.62,50.0,48.08,34,0.0,RJ,7,3,992.401825
3,0,0,0,0,0,0.0,0,0.0,0.0,0.0,...,0.0,1461.96,10.0,61.64,31,0.0,BA,7,1,919.720735
4,0,0,0,0,0,0.0,0,0.0,0.0,0.0,...,0.0,2743.72,100.0,34.44,32,1.0,PB,6,6,1176.485681


In [4]:
mod_1 = OLS(data['sales'], add_constant(data['discount']))
fit_1 = mod_1.fit()
fit_1.summary()

0,1,2,3
Dep. Variable:,sales,R-squared:,0.809
Model:,OLS,Adj. R-squared:,0.809
Method:,Least Squares,F-statistic:,63410.0
Date:,"Thu, 01 Dec 2022",Prob (F-statistic):,0.0
Time:,07:29:08,Log-Likelihood:,-113630.0
No. Observations:,15000,AIC:,227300.0
Df Residuals:,14998,BIC:,227300.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,804.5246,7.790,103.274,0.000,789.255,819.794
discount,21.9248,0.087,251.815,0.000,21.754,22.095

0,1,2,3
Omnibus:,11175.308,Durbin-Watson:,2.007
Prob(Omnibus):,0.0,Jarque-Bera (JB):,516097.915
Skew:,3.118,Prob(JB):,0.0
Kurtosis:,31.051,Cond. No.,181.0


In [5]:
binned = pd.qcut(data['discount'], 4, labels=['low', 'med', 'high', 'ultra'])
mod_2 = OLS(data['sales'], pd.get_dummies(binned, drop_first=False))
fit_2 = mod_2.fit()
fit_2.summary()

0,1,2,3
Dep. Variable:,sales,R-squared:,0.763
Model:,OLS,Adj. R-squared:,0.763
Method:,Least Squares,F-statistic:,16080.0
Date:,"Thu, 01 Dec 2022",Prob (F-statistic):,0.0
Time:,07:29:08,Log-Likelihood:,-115240.0
No. Observations:,15000,AIC:,230500.0
Df Residuals:,14996,BIC:,230500.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
low,1359.8713,8.359,162.688,0.000,1343.487,1376.256
med,2067.8880,8.269,250.069,0.000,2051.679,2084.097
high,2933.7307,8.867,330.856,0.000,2916.350,2951.111
ultra,3890.6437,8.880,438.148,0.000,3873.238,3908.049

0,1,2,3
Omnibus:,8888.304,Durbin-Watson:,2.005
Prob(Omnibus):,0.0,Jarque-Bera (JB):,246487.257
Skew:,2.351,Prob(JB):,0.0
Kurtosis:,22.295,Cond. No.,1.07


In [6]:
mod_3 = OLS(data['profit'], add_constant(data['discount']))
fit_3 = mod_3.fit()
fit_3.summary()

0,1,2,3
Dep. Variable:,profit,R-squared:,0.02
Model:,OLS,Adj. R-squared:,0.02
Method:,Least Squares,F-statistic:,303.4
Date:,"Thu, 01 Dec 2022",Prob (F-statistic):,2.7099999999999998e-67
Time:,07:29:08,Log-Likelihood:,-68395.0
No. Observations:,15000,AIC:,136800.0
Df Residuals:,14998,BIC:,136800.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,39.4217,0.382,103.274,0.000,38.673,40.170
discount,0.0743,0.004,17.419,0.000,0.066,0.083

0,1,2,3
Omnibus:,11175.278,Durbin-Watson:,2.007
Prob(Omnibus):,0.0,Jarque-Bera (JB):,516086.235
Skew:,3.118,Prob(JB):,0.0
Kurtosis:,31.051,Cond. No.,181.0


In [7]:
# binned = pd.qcut(data['discount'], 4, labels=['low', 'med', 'high', 'ultra'])
binned = pd.qcut(data['discount'], 5, labels=[f'q_{x}' for x in range(1, 6)])
mod_4 = OLS(data['profit'], pd.get_dummies(binned, drop_first=False))
fit_4 = mod_4.fit()
fit_4.summary()

0,1,2,3
Dep. Variable:,profit,R-squared:,0.072
Model:,OLS,Adj. R-squared:,0.071
Method:,Least Squares,F-statistic:,289.6
Date:,"Thu, 01 Dec 2022",Prob (F-statistic):,2.95e-240
Time:,07:29:08,Log-Likelihood:,-67987.0
No. Observations:,15000,AIC:,136000.0
Df Residuals:,14995,BIC:,136000.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
q_1,45.4085,0.381,119.135,0.000,44.661,46.156
q_2,40.5099,0.410,98.908,0.000,39.707,41.313
q_3,36.4883,0.412,88.527,0.000,35.680,37.296
q_4,50.9924,0.411,123.986,0.000,50.186,51.799
q_5,53.9582,0.448,120.364,0.000,53.080,54.837

0,1,2,3
Omnibus:,11453.738,Durbin-Watson:,2.015
Prob(Omnibus):,0.0,Jarque-Bera (JB):,608443.097
Skew:,3.191,Prob(JB):,0.0
Kurtosis:,33.541,Cond. No.,1.18


In [8]:
sc = StandardScaler()
stdzd = pd.DataFrame(
    sc.fit_transform(data.loc[:, ['age', 'sales_prediction']]),
    columns=sc.get_feature_names_out()
)

In [9]:
mod_5 = OLS(data['discount'], add_constant(stdzd))
fit_5 = mod_5.fit()
fit_5.summary()

0,1,2,3
Dep. Variable:,discount,R-squared:,0.611
Model:,OLS,Adj. R-squared:,0.611
Method:,Least Squares,F-statistic:,11760.0
Date:,"Thu, 01 Dec 2022",Prob (F-statistic):,0.0
Time:,07:29:08,Log-Likelihood:,-71057.0
No. Observations:,15000,AIC:,142100.0
Df Residuals:,14997,BIC:,142100.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,77.7643,0.225,344.912,0.000,77.322,78.206
age,13.1879,0.236,55.864,0.000,12.725,13.651
sales_prediction,28.2966,0.236,119.864,0.000,27.834,28.759

0,1,2,3
Omnibus:,6838.726,Durbin-Watson:,2.013
Prob(Omnibus):,0.0,Jarque-Bera (JB):,148686.803
Skew:,-1.683,Prob(JB):,0.0
Kurtosis:,18.052,Cond. No.,1.36


In [10]:
sc.inverse_transform(fit_5.params[1:].values.reshape(1, -1))

array([[ 114.07749571, 9047.68647854]])