## Step0. Package/Variable Setting

In [119]:
import numpy as np
import pandas as pd
import sklearn.linear_model as linear_model
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler 
from pandas.api.types import is_string_dtype, is_numeric_dtype
from sklearn_pandas import DataFrameMapper

In [44]:
# Notebook-wide pandas display limits: cap rendered rows/columns and the text width.
display_options = {
    'display.max_rows': 50,
    'display.max_columns': 50,
    'display.width': 100,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

##  Step1. Estimation of Propensity Score

### 1-1. Data Processing

In [69]:
# Read the shop profiling export (comma-separated) into a DataFrame.
shop_profile = pd.read_csv(filepath_or_buffer='data/shopProfiling_v0820.csv', sep=',')
# TODO: deco_effect is still a placeholder -- nothing is loaded for it yet.
# deco_effect =

In [70]:
# Row count first, then a two-row preview of the loaded table.
print(shop_profile.shape[0])
shop_profile.head(n=2)

35588


Unnamed: 0,shopid,seller_type,rating_star,display_response_rate,sku,camp_flag,main_category,p30_gmv,p30_order_cnt,follower_count,Decorated
0,65408,NS,4.9,89,745,N,Muslim Fashion,3202.7,51,264.0,0
1,439807,NS,4.17,65,9,N,Groceries & Pets,329.04,6,101.0,0


In [71]:
# Count the missing values in every column.
shop_profile.isna().sum()

shopid                    0
seller_type               0
rating_star               0
display_response_rate     0
sku                       0
camp_flag                 0
main_category             0
p30_gmv                   0
p30_order_cnt             0
follower_count           38
Decorated                 0
dtype: int64

In [72]:
# Replace missing follower counts with 0 by assigning the filled column back.
# Calling fillna(..., inplace=True) on the attribute-accessed Series is chained
# in-place mutation: it warns on pandas 2.x and leaves the frame unchanged when
# Copy-on-Write is enabled, so the explicit assignment is the reliable form.
shop_profile['follower_count'] = shop_profile['follower_count'].fillna(value=0)

In [7]:
# Optional re-check after the fill (uncomment to confirm that no nulls remain):
# shop_profile.isnull().sum()

In [73]:
# Inspect each column's dtype; the next cell groups columns into numeric vs. string based on these.
shop_profile.dtypes

shopid                     int64
seller_type               object
rating_star              float64
display_response_rate      int64
sku                        int64
camp_flag                 object
main_category             object
p30_gmv                  float64
p30_order_cnt              int64
follower_count           float64
Decorated                  int64
dtype: object

In [74]:
# Group the columns by dtype and name the special-purpose columns.
# shop_profile.columns.values
num_cols = list(filter(lambda col: is_numeric_dtype(shop_profile[col]), shop_profile.columns))
str_cols = list(filter(lambda col: is_string_dtype(shop_profile[col]), shop_profile.columns))
drop_cols = ['shopid']    # identifier column, removed before modeling
y_cols = ['Decorated']    # treatment indicator used as the model target

In [75]:
# Preview the values before scaling, for comparison with the preview shown after the scaling cell.
shop_profile.head(2)

Unnamed: 0,shopid,seller_type,rating_star,display_response_rate,sku,camp_flag,main_category,p30_gmv,p30_order_cnt,follower_count,Decorated
0,65408,NS,4.9,89,745,N,Muslim Fashion,3202.7,51,264.0,0
1,439807,NS,4.17,65,9,N,Groceries & Pets,329.04,6,101.0,0


In [77]:
# Standardize the numeric feature columns, overwriting them in shop_profile.
# String columns, the target and the id column are excluded from scaling.
non_scaled_cols = str_cols + y_cols + drop_cols
map_f = [
    ([col_name], StandardScaler())
    for col_name in shop_profile.columns
    if col_name not in non_scaled_cols
]
mapper = DataFrameMapper(map_f)
mapper.fit(shop_profile)
shop_profile[mapper.transformed_names_] = mapper.transform(shop_profile)

In [78]:
# Preview after scaling: the numeric feature columns now hold standardized values.
shop_profile.head(2)

Unnamed: 0,shopid,seller_type,rating_star,display_response_rate,sku,camp_flag,main_category,p30_gmv,p30_order_cnt,follower_count,Decorated
0,65408,NS,0.412848,0.340051,0.301299,N,Muslim Fashion,-0.148874,-0.162638,-0.269229,0
1,439807,NS,-4.337566,-1.812548,-0.403096,N,Groceries & Pets,-0.244146,-0.261135,-0.31522,0


Concept of StandardScaler: after the transformation, each scaled feature has a mean of 0 and a standard deviation of 1.

In [83]:
# One-hot encode the categorical (string) columns after removing the id column.
for_model_df = shop_profile.drop(columns=drop_cols).pipe(pd.get_dummies)

In [85]:
# (rows, columns) of the model table after dummy encoding.
for_model_df.shape

(35588, 35)

### 1-2. Modeling

In [95]:
# Split the model table into covariates X and the treatment indicator y.
# y is squeezed from a one-column DataFrame to a 1-D Series: scikit-learn expects
# a target of shape (n_samples,) and otherwise emits a DataConversionWarning
# ("y = column_or_1d(y, warn=True)") when the model is fitted.
X = for_model_df.drop(columns=y_cols)
y = for_model_df[y_cols].squeeze(axis="columns")

In [93]:
# Display the estimator's default hyper-parameters; this instance is not stored or fitted.
linear_model.LogisticRegression()

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [99]:
# Fit the propensity model (logistic regression of the treatment flag on the covariates).
# fit() returns the estimator itself, so construction and fitting can be chained.
propensity_score_model = linear_model.LogisticRegression(solver="lbfgs").fit(X, y)
propensity_score_model

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [101]:
# Estimated propensity scores: column 1 of predict_proba, i.e. the probability of the
# class listed second in the model's classes_ (presumably Decorated == 1 -- confirm via classes_).
propensity_score_model.predict_proba(X)[:,1]

array([0.01537847, 0.00550019, 0.02089876, ..., 0.01913074, 0.01553651,
       0.00720634])

In [108]:
# accuracy = propensity_score_model.score(X, y)
# print('The accuracy is: ' + str(accuracy *100) + '%')

If enough data is available, I think the propensity model should still be validated like any general ML model.

## Step2. Choose Matching algorithm

In [129]:
# Build the matching table: a copy of the model table with the fitted
# propensity score appended as the last column.
for_causal_df = for_model_df.assign(
    propensity_score=propensity_score_model.predict_proba(X)[:, 1]
)

In [130]:
# Preview the matching table, including the new propensity_score column.
for_causal_df.head(3)

Unnamed: 0,rating_star,display_response_rate,sku,p30_gmv,p30_order_cnt,follower_count,Decorated,seller_type_NS,seller_type_OS,seller_type_PS,...,main_category_Muslim Fashion,main_category_Others,main_category_Sports & Outdoor,main_category_Tickets & Vouchers,main_category_Travel & Luggage,main_category_Watches,main_category_Women Clothes,main_category_Women Shoes,main_category_Women's Bags,propensity_score
0,0.412848,0.340051,0.301299,-0.148874,-0.162638,-0.269229,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0.015378
1,-4.337566,-1.812548,-0.403096,-0.244146,-0.261135,-0.31522,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0.0055
2,0.217626,0.429742,-0.311218,-0.185433,-0.25238,0.010385,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0.020899


In [131]:
# Separate the rows into the treated group (Decorated == 1) and the control group (Decorated == 0).
is_treated = for_causal_df['Decorated'].eq(1)
is_control = for_causal_df['Decorated'].eq(0)
treated = for_causal_df.loc[is_treated]
control = for_causal_df.loc[is_control]

In [132]:
# Sanity check: the treated and control row counts should add up to the full table.
print(for_causal_df.shape, '=' ,treated.shape, control.shape)

(35588, 36) = (1242, 36) (34346, 36)


In [133]:
# 1-nearest-neighbour matching on the propensity score: index the control
# scores, then look up the closest control score for every treated row.
control_scores = control[['propensity_score']].to_numpy()
treated_scores = treated[['propensity_score']].to_numpy()
control_neighbors = NearestNeighbors(n_neighbors=1, algorithm='ball_tree')
control_neighbors.fit(control_scores)
distances, indices = control_neighbors.kneighbors(treated_scores)

In [134]:
# One matched control position per treated row, so this should equal the number of treated rows.
len(indices)

1242

In [137]:
# Row positions (within `control`) of the nearest control row for each treated row.
# NOTE: the same position can appear for several treated rows (see the repeated value
# in the output), so this lookup already behaves like matching with replacement.
indices
# Question: How to conduct the replacement sampling?

array([[26883],
       [29573],
       [19843],
       ...,
       [31253],
       [31253],
       [31253]])

In [136]:
# control.iloc[indices[:, 0]]

## Step3. Check overlap and common support

## Step4. Assessing the Matching Quality

## Step5. Sensitivity Analysis