In [1]:
!pip install causalinference

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting causalinference
  Downloading CausalInference-0.1.3-py3-none-any.whl (51 kB)
[K     |████████████████████████████████| 51 kB 5.9 MB/s 
[?25hInstalling collected packages: causalinference
Successfully installed causalinference-0.1.3


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from causalinference import CausalModel
import warnings
# NOTE(review): this silences every warning for the rest of the session, including
# any raised by pandas in the data-preparation cells below; consider narrowing it.
warnings.filterwarnings('ignore')

In [3]:
# Load the discount/sales data from a hard-coded absolute path and preview it.
# NOTE(review): the path looks like a Colab Google Drive mount — confirm the drive
# is mounted (or point this at a configurable data directory) before running.
data = pd.read_csv("/content/drive/MyDrive/Discount_Sales Project/discount.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,price,total,age,Region,Discount_Percent,customer_duration,season,status
0,29,140.0,140.0,65.0,Midwest,0.0,10.0,Fall,Lower_Status
1,72,379.8,379.8,71.0,Midwest,0.0,4.0,Winter,Lower_Status
2,158,175.0,175.0,71.0,South,0.0,35.0,Fall,Lower_Status
3,165,119.9,119.9,64.0,South,0.0,15.0,Winter,Lower_Status
4,166,140.0,126.0,64.0,South,10.0,15.0,Winter,Lower_Status


In [4]:
# Number of rows at each distinct discount level.
data['Discount_Percent'].value_counts()

0.000000     22025
20.000000     1439
10.000000      694
15.000000      517
5.000000       182
             ...  
15.666614        1
15.666614        1
15.666615        1
30.401338        1
31.321818        1
Name: Discount_Percent, Length: 2020, dtype: int64

In [5]:
# Summary statistics of the discount column.
data['Discount_Percent'].describe()

count    28334.000000
mean         5.052338
std         11.493130
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max         70.000000
Name: Discount_Percent, dtype: float64

### Perform experiment with discount of 10% and 15% compared with 0%

In [6]:
# Control pool: every row that received no discount.
no_discount_mask = data["Discount_Percent"] == 0.00
control_pop = data[no_discount_mask]

In [7]:
# Summary statistics of the numeric columns for the undiscounted rows.
control_pop.describe()

Unnamed: 0.1,Unnamed: 0,price,total,age,Discount_Percent,customer_duration
count,22025.0,22025.0,22025.0,22025.0,22025.0,22025.0
mean,143246.956413,151.013849,229.29113,46.362134,0.0,13.170942
std,85663.882001,160.619888,385.275583,17.01678,0.0,8.449669
min,29.0,0.0,0.0,18.0,0.0,3.0
25%,65501.0,63.0,69.9,32.0,0.0,6.0
50%,154176.0,105.0,125.0,46.0,0.0,11.0
75%,210548.0,189.9,226.525,61.0,0.0,18.0
max,286390.0,4500.1,11900.0,75.0,0.0,42.0


In [8]:
# Treatment arm: rows whose discount is exactly 10%.
treat1 = data[data["Discount_Percent"] == 10.00]

In [9]:
# Treatment arm: rows whose discount is exactly 15%.
treat15 = data[data["Discount_Percent"] == 15.00]

In [10]:
# (rows, columns) of the control pool and of each treatment arm.
tuple(frame.shape for frame in (control_pop, treat1, treat15))

((22025, 9), (694, 9), (517, 9))

### Select 800 samples from the control group

In [11]:
# Down-sample the control pool to 800 rows; the fixed random_state makes the draw repeatable.
# NOTE(review): 800 was presumably chosen to be comparable in size to the treated
# groups (694 and 517 rows) — confirm.
control = control_pop.sample(n=800,random_state=42)

In [12]:
# Create the binary treatment indicator in the 3 dataframes (0 = control, 1 = discounted).
# .assign() returns new frames, so the flag is never written into an object derived
# from `data` by filtering/sampling, which is what triggers pandas' SettingWithCopyWarning.
control = control.assign(treat=0)
treat1 = treat1.assign(treat=1)
treat15 = treat15.assign(treat=1)

In [13]:
# Stack the sampled controls with each treated group, row-wise.
df_10 = pd.concat((control, treat1))
df_15 = pd.concat((control, treat15))

In [14]:
# Remove the raw discount column from both analysis frames.
df_10 = df_10.drop(columns=["Discount_Percent"])
df_15 = df_15.drop(columns=["Discount_Percent"])

In [15]:
# Drop the leftover CSV index column ("Unnamed: ...") by name rather than by position:
# the positional slice removed one more column every time the cell was re-run,
# silently discarding a real covariate on the second execution.
df_10 = df_10.loc[:, ~df_10.columns.str.startswith("Unnamed")]
df_10.head()

Unnamed: 0,price,total,age,Region,customer_duration,season,status,treat
21875,149.7,149.7,52.0,South,10.0,Spring,Higher_Status,0
20314,79.9,79.9,63.0,Northeast,6.0,Spring,Lower_Status,0
18724,51.9,51.9,57.0,South,9.0,Spring,Lower_Status,0
27298,300.0,300.0,37.0,Midwest,16.0,Summer,Lower_Status,0
26570,539.0,539.0,60.0,Northeast,34.0,Summer,Lower_Status,0


In [16]:
# Drop the leftover CSV index column ("Unnamed: ...") by name rather than by position:
# the positional slice removed one more column every time the cell was re-run,
# silently discarding a real covariate on the second execution.
df_15 = df_15.loc[:, ~df_15.columns.str.startswith("Unnamed")]
df_15.head()

Unnamed: 0,price,total,age,Region,customer_duration,season,status,treat
21875,149.7,149.7,52.0,South,10.0,Spring,Higher_Status,0
20314,79.9,79.9,63.0,Northeast,6.0,Spring,Lower_Status,0
18724,51.9,51.9,57.0,South,9.0,Spring,Lower_Status,0
27298,300.0,300.0,37.0,Midwest,16.0,Summer,Lower_Status,0
26570,539.0,539.0,60.0,Northeast,34.0,Summer,Lower_Status,0


### Run experiment for the 10% discount group

In [17]:
# Create causalinference variables: covariates X, treatment flag D, outcome Y.
# dtype=float keeps the dummy columns numeric. pandas >= 2.0 emits bool dummies by
# default, and mixing bool with float columns would turn `.values` into an object array.
covariates_10 = pd.get_dummies(
    df_10.drop(['treat', 'total'], axis=1),
    drop_first=True,
    prefix_sep='*',
    dtype=float,
)
# Column i of X is covariates_10.columns[i]; the model output labels it "X<i>".
X = covariates_10.values
D = df_10['treat'].values
Y = df_10['total'].values

In [18]:
# Initiate the causal model for the 10% discount comparison.
# Arguments are passed positionally: outcome (Y), treatment flag (D), covariates (X).
causal = CausalModel(Y,D,X)

In [19]:
# Raw outcome difference and covariate balance before any trimming.
print(causal.summary_stats)


Summary Statistics

                       Controls (N_c=800)         Treated (N_t=694)             
       Variable         Mean         S.d.         Mean         S.d.     Raw-diff
--------------------------------------------------------------------------------
              Y      224.405      341.812      160.170      175.439      -64.235

                       Controls (N_c=800)         Treated (N_t=694)             
       Variable         Mean         S.d.         Mean         S.d.     Nor-diff
--------------------------------------------------------------------------------
             X0      149.492      214.866      172.206      194.566        0.111
             X1       45.986       16.461       46.589       16.889        0.036
             X2       13.258        8.360       12.994        8.323       -0.032
             X3        0.175        0.380        0.182        0.386        0.017
             X4        0.417        0.493        0.353        0.478       -0.133
      

* There are 800 people in the control group and 694 in the treatment group
* Average outcome of the control group is $224, which is greater than the $160 in the treatment group by $64
* The Nor-diff shows that most of the covariates are not balanced going by the statistics reported.  

### Generate Propensity Score

In [20]:
# Propensity Score Estimation
# The fitted model is read back from `causal.propensity`; the printed table includes
# interaction and quadratic terms (e.g. X0*X8, X0*X0) alongside the linear ones.

causal.est_propensity_s()
print(causal.propensity)


Estimated Parameters of Propensity Score

                    Coef.       S.e.          z      P>|z|      [95% Conf. int.]
--------------------------------------------------------------------------------
     Intercept     -1.130      0.261     -4.329      0.000     -1.642     -0.619
            X7     -2.784      0.372     -7.475      0.000     -3.514     -2.054
            X6     -0.719      0.192     -3.738      0.000     -1.096     -0.342
            X4     -0.287      0.127     -2.266      0.023     -0.535     -0.039
            X0      0.006      0.001      6.101      0.000      0.004      0.008
            X8      1.523      0.226      6.736      0.000      1.080      1.967
            X5      0.263      0.165      1.600      0.110     -0.059      0.586
            X9      0.297      0.167      1.782      0.075     -0.030      0.624
         X0*X8     -0.007      0.001     -7.032      0.000     -0.009     -0.005
         X0*X0     -0.000      0.000     -3.983      0.000     -0.

### Improve the Covariate Balance

In [21]:
# Trim the sample on the estimated propensity score and show the cutoff that was used.
# NOTE(review): presumably units with a score outside [cutoff, 1 - cutoff] are
# dropped — confirm against the CausalInference documentation.
causal.trim_s()
causal.cutoff

0.12692897813341592

In [22]:
# Re-check the raw difference and covariate balance on the trimmed sample.
print(causal.summary_stats)


Summary Statistics

                       Controls (N_c=643)         Treated (N_t=681)             
       Variable         Mean         S.d.         Mean         S.d.     Raw-diff
--------------------------------------------------------------------------------
              Y      208.593      285.145      156.190      167.081      -52.404

                       Controls (N_c=643)         Treated (N_t=681)             
       Variable         Mean         S.d.         Mean         S.d.     Nor-diff
--------------------------------------------------------------------------------
             X0      150.518      156.089      167.674      185.112        0.100
             X1       47.064       16.337       46.504       16.876       -0.034
             X2       13.086        8.285       13.057        8.371       -0.003
             X3        0.174        0.380        0.184        0.387        0.024
             X4        0.420        0.494        0.358        0.480       -0.127
      

In [23]:
# Split the trimmed sample into propensity-score strata and print, per stratum,
# the score range, group sizes, and raw outcome difference.
causal.stratify_s()
print(causal.strata)


Stratification Summary

              Propensity Score         Sample Size     Ave. Propensity   Outcome
   Stratum      Min.      Max.  Controls   Treated  Controls   Treated  Raw-diff
--------------------------------------------------------------------------------
         1     0.130     0.279       134        35     0.218     0.231  -120.445
         2     0.281     0.410       114        50     0.346     0.352  -137.753
         3     0.412     0.506        97        71     0.461     0.467   -50.035
         4     0.506     0.535        41        41     0.519     0.522   -50.670
         5     0.536     0.557        21        60     0.544     0.546   -39.789
         6     0.557     0.636       127       204     0.596     0.593   -16.954
         7     0.637     0.666        69        93     0.650     0.650   -40.839
         8     0.668     0.873        40       127     0.717     0.731    25.879



In [32]:
# Estimate the treatment effect with three estimators (OLS, weighting, matching)
# and print all of the results collected on `causal.estimates`.
# NOTE(review): this cell's execution count (32) is out of sequence with the cells
# around it — restart the kernel and run all cells to confirm these numbers.
# NOTE(review): the strata built above are not used here (no blocking estimator is run).
causal.est_via_ols()
causal.est_via_weighting()
causal.est_via_matching(bias_adj=True)
print(causal.estimates)


Treatment Effect Estimates: OLS

                     Est.       S.e.          z      P>|z|      [95% Conf. int.]
--------------------------------------------------------------------------------
           ATE    -57.180      8.078     -7.078      0.000    -73.013    -41.346
           ATC    -69.751     12.131     -5.750      0.000    -93.527    -45.975
           ATT    -45.310      5.591     -8.104      0.000    -56.268    -34.351

Treatment Effect Estimates: Weighting

                     Est.       S.e.          z      P>|z|      [95% Conf. int.]
--------------------------------------------------------------------------------
           ATE    -57.142      7.961     -7.178      0.000    -72.745    -41.539

Treatment Effect Estimates: Matching

                     Est.       S.e.          z      P>|z|      [95% Conf. int.]
--------------------------------------------------------------------------------
           ATE    -53.647     15.988     -3.355      0.001    -84.985    -22.

The ATE is nearly identical under the OLS (-57.18) and weighting (-57.14) estimators, and slightly smaller in magnitude under matching (-53.65).  The bottom line is that after correcting for imbalances, the estimated effect moved from the raw difference of -64 to an ATE of about -57.  A discount of 10% reduced sales by about $57.

### Run experiment for the 15% discount group

In [25]:
# Create causalinference variables: covariates X1, treatment flag D1, outcome Y1.
# dtype=float keeps the dummy columns numeric. pandas >= 2.0 emits bool dummies by
# default, and mixing bool with float columns would turn `.values` into an object array.
covariates_15 = pd.get_dummies(
    df_15.drop(['treat', 'total'], axis=1),
    drop_first=True,
    prefix_sep='*',
    dtype=float,
)
# Column i of X1 is covariates_15.columns[i]; the model output labels it "X<i>".
X1 = covariates_15.values
D1 = df_15['treat'].values
Y1 = df_15['total'].values

In [26]:
# Initiate the causal model for the 15% discount comparison.
# Arguments are passed positionally: outcome (Y1), treatment flag (D1), covariates (X1).
causal1 = CausalModel(Y1,D1,X1)

In [27]:
# Raw outcome difference and covariate balance before any trimming.
print(causal1.summary_stats)


Summary Statistics

                       Controls (N_c=800)         Treated (N_t=517)             
       Variable         Mean         S.d.         Mean         S.d.     Raw-diff
--------------------------------------------------------------------------------
              Y      224.405      341.812      184.599      170.223      -39.806

                       Controls (N_c=800)         Treated (N_t=517)             
       Variable         Mean         S.d.         Mean         S.d.     Nor-diff
--------------------------------------------------------------------------------
             X0      149.492      214.866      213.031      200.282        0.306
             X1       45.986       16.461       43.395       17.076       -0.155
             X2       13.258        8.360       13.518        8.946        0.030
             X3        0.175        0.380        0.178        0.383        0.008
             X4        0.417        0.493        0.354        0.479       -0.131
      

The raw difference is -39.8, which is smaller in magnitude than the raw difference for the 10% discount (-64.2). The mean outcome is 224.4 in the control group and 184.6 in the treatment group.

### Generate Propensity score 

In [28]:
# Estimate the propensity score for the 15% comparison and print the fitted
# model, which is read back from `causal1.propensity`.
causal1.est_propensity_s()
print(causal1.propensity)


Estimated Parameters of Propensity Score

                    Coef.       S.e.          z      P>|z|      [95% Conf. int.]
--------------------------------------------------------------------------------
     Intercept     -0.767      0.385     -1.990      0.047     -1.522     -0.012
            X6     -2.727      0.364     -7.496      0.000     -3.440     -2.014
            X0      0.004      0.002      2.361      0.018      0.001      0.007
            X1     -0.008      0.007     -1.093      0.274     -0.022      0.006
            X4     -0.277      0.257     -1.078      0.281     -0.779      0.226
            X8      1.781      0.395      4.513      0.000      1.007      2.554
            X3     -0.165      0.173     -0.951      0.341     -0.504      0.175
         X0*X0     -0.000      0.000     -3.461      0.001     -0.000     -0.000
         X1*X8     -0.020      0.008     -2.655      0.008     -0.035     -0.005
         X0*X8     -0.003      0.001     -3.048      0.002     -0.

### Improve the Covariate Balance

In [29]:
# Trim the sample on the estimated propensity score and show the cutoff that was used.
# NOTE(review): presumably units with a score outside [cutoff, 1 - cutoff] are
# dropped — confirm against the CausalInference documentation.
causal1.trim_s()
causal1.cutoff

0.1343873181655615

In [30]:
# Re-check the raw difference and covariate balance on the trimmed sample.
print(causal1.summary_stats)


Summary Statistics

                       Controls (N_c=614)         Treated (N_t=491)             
       Variable         Mean         S.d.         Mean         S.d.     Raw-diff
--------------------------------------------------------------------------------
              Y      197.470      251.043      164.037      127.907      -33.432

                       Controls (N_c=614)         Treated (N_t=491)             
       Variable         Mean         S.d.         Mean         S.d.     Nor-diff
--------------------------------------------------------------------------------
             X0      146.227      132.620      188.913      149.849        0.302
             X1       45.492       16.724       42.558       16.802       -0.175
             X2       13.497        8.449       13.511        8.973        0.002
             X3        0.178        0.382        0.181        0.386        0.010
             X4        0.432        0.496        0.348        0.477       -0.171
      

After trimming, the raw difference shrank in magnitude from -39.8 to -33.4.

In [31]:
# Split the trimmed sample into propensity-score strata and print, per stratum,
# the score range, group sizes, and raw outcome difference.
causal1.stratify_s()
print(causal1.strata)


Stratification Summary

              Propensity Score         Sample Size     Ave. Propensity   Outcome
   Stratum      Min.      Max.  Controls   Treated  Controls   Treated  Raw-diff
--------------------------------------------------------------------------------
         1     0.135     0.294       117        22     0.243     0.245   -12.403
         2     0.295     0.341        94        44     0.319     0.315   -86.804
         3     0.341     0.438       162       114     0.388     0.382   -82.874
         4     0.439     0.544       145       131     0.496     0.495   -39.079
         5     0.544     0.866        96       180     0.616     0.622  -116.410



In [33]:
# Estimate the treatment effect with three estimators (OLS, weighting, matching)
# and print all of the results collected on `causal1.estimates`.
# NOTE(review): the strata built above are not used here (no blocking estimator is run).
causal1.est_via_ols()
causal1.est_via_weighting()
causal1.est_via_matching(bias_adj=True)
print(causal1.estimates)


Treatment Effect Estimates: OLS

                     Est.       S.e.          z      P>|z|      [95% Conf. int.]
--------------------------------------------------------------------------------
           ATE    -69.145      8.107     -8.529      0.000    -85.034    -53.256
           ATC    -69.036      8.320     -8.298      0.000    -85.343    -52.730
           ATT    -69.281      8.238     -8.409      0.000    -85.429    -53.134

Treatment Effect Estimates: Weighting

                     Est.       S.e.          z      P>|z|      [95% Conf. int.]
--------------------------------------------------------------------------------
           ATE    -78.290     11.149     -7.022      0.000   -100.143    -56.438

Treatment Effect Estimates: Matching

                     Est.       S.e.          z      P>|z|      [95% Conf. int.]
--------------------------------------------------------------------------------
           ATE    -65.061     16.439     -3.958      0.000    -97.282    -32.

The ATE is more negative with the 15% discount (about -65 to -78, depending on the estimator) than with the 10% discount (about -54 to -57).