In [121]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# `Logit` on Orders - A warm-up challenge (~1h)

## Select features

🎯 Let's figure out the impact of `wait_time` and `delay_vs_expected` on very `good/bad reviews`

👉 Using our `orders` training_set, we will run two `multivariate logistic regressions`:
- `logit_one` to predict `dim_is_one_star` 
- `logit_five` to predict `dim_is_five_star`.

 

In [2]:
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt

👉 Import your dataset:

In [127]:
from olist.order import Order
orders = Order().get_training_data(with_distance_seller_customer=True)

In [128]:
orders

Unnamed: 0,order_id,wait_time,expected_wait_time,delay_vs_expected,order_status,dim_is_five_star,dim_is_one_star,review_score,number_of_products,number_of_sellers,price,freight_value,distance_seller_customer
0,e481f51cbdc54678b7cc49136f2d6af7,8.0,15.0,0.0,delivered,0,0,4,1,1,29.99,8.72,18.063837
1,53cdb2fc8bc7dce0b6741e2150273451,13.0,19.0,0.0,delivered,0,0,4,1,1,118.70,22.76,856.292580
2,47770eb9100c2d0c44946d9cf07ec65d,9.0,26.0,0.0,delivered,1,0,5,1,1,159.90,19.22,514.130333
3,949d5b44dbf5de918fe9c16f97b45f8a,13.0,26.0,0.0,delivered,1,0,5,1,1,45.00,27.20,1822.800366
4,ad21c59c0840e6cb83a9ceb5573f8159,2.0,12.0,0.0,delivered,1,0,5,1,1,19.90,8.72,30.174037
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95875,9c5dedf39a927c1b2549525ed64a053c,8.0,18.0,0.0,delivered,1,0,5,1,1,72.00,13.08,69.481037
95876,63943bddc261676b46f01ca7ac2f7bd8,22.0,23.0,0.0,delivered,0,0,4,1,1,174.90,20.10,474.098245
95877,83c1379a015df1e13d02aae0204711ab,24.0,30.0,0.0,delivered,1,0,5,1,1,205.99,65.02,968.051192
95878,11c177c8e97725db2631073c19f07b62,17.0,37.0,0.0,delivered,0,0,2,2,1,359.98,81.18,370.146853


👉 Select in a list which features you want to use:

⚠️ Make sure you are not creating data leakage (i.e. selecting features that are derived from the target)

💡 To figure out the impact of `wait_time` and `delay_vs_expected` we need to control for the impact of other features, include in your list all features that may be relevant

In [129]:
features_list = ["dim_is_one_star", "dim_is_five_star", "wait_time","delay_vs_expected", "price", "distance_seller_customer", "number_of_products", "number_of_sellers"]

🕵🏻 Check the `multi-colinearity` of your features, using the `VIF index`.

* It shouldn't be too high (< 10 preferably) to ensure that we can trust the partial regression coefficents and their associated `p-values` 
* Do not forget to standardize your data ! 
    * A `VIF Analysis` is made by regressing a feature vs. the other features...
    * So you want to `remove the effect of scale` so that your features have an equal importance before running any linear regression!
    
    
📚 <a href="https://www.statisticshowto.com/variance-inflation-factor/">Statistics How To - Variance Inflation Factor</a>

📚  <a href="https://online.stat.psu.edu/stat462/node/180/">PennState - Detecting Multicollinearity Using Variance Inflation Factors</a>

⚖️ Standardizing:

In [130]:
orders_scaled = orders.copy()
orders_scaled = orders_scaled[features_list]

for feature in orders_scaled.columns[2:]:
    mu = orders_scaled[feature].mean()
    sigma = orders_scaled[feature].std()
    orders_scaled[feature] = orders_scaled[feature].apply(lambda x: (x-mu)/sigma)
    
orders_scaled

Unnamed: 0,dim_is_one_star,dim_is_five_star,wait_time,delay_vs_expected,price,distance_seller_customer,number_of_products,number_of_sellers
0,0,0,-0.428002,-0.153335,-0.513802,-0.979475,-0.264595,-0.112544
1,0,0,0.100519,-0.153335,-0.086640,0.429743,-0.264595,-0.112544
2,0,1,-0.322297,-0.153335,0.111748,-0.145495,-0.264595,-0.112544
3,0,1,0.100519,-0.153335,-0.441525,2.054621,-0.264595,-0.112544
4,0,1,-1.062226,-0.153335,-0.562388,-0.959115,-0.264595,-0.112544
...,...,...,...,...,...,...,...,...
95875,0,1,-0.428002,-0.153335,-0.311513,-0.893033,-0.264595,-0.112544
95876,0,0,1.051855,-0.153335,0.183977,-0.212797,-0.264595,-0.112544
95877,0,1,1.263263,-0.153335,0.333684,0.617630,-0.264595,-0.112544
95878,0,0,0.523335,-0.153335,1.075186,-0.387558,1.601605,-0.112544


👉 Run your VIF Analysis to analyze the potential multicolinearities:

In [131]:
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

In [132]:
df = pd.DataFrame()

df["features"] = orders_scaled.columns
df["vif_index"] = [vif(orders_scaled.values, i) for i in range(orders_scaled.shape[1])]
round(df.sort_values(by="vif_index", ascending = False),2)

df

Unnamed: 0,features,vif_index
0,dim_is_one_star,1.133492
1,dim_is_five_star,1.031407
2,wait_time,2.648983
3,delay_vs_expected,2.15181
4,price,1.032502
5,distance_seller_customer,1.327266
6,number_of_products,1.128042
7,number_of_sellers,1.102739


## Logistic Regressions

👉 Fit two `Logistic Regression` models:
- `logit_one` to predict `dim_is_one_star` 
- `logit_five` to predict `dim_is_five_star`.

`Logit 1️⃣`

In [133]:
orders_scaled

Unnamed: 0,dim_is_one_star,dim_is_five_star,wait_time,delay_vs_expected,price,distance_seller_customer,number_of_products,number_of_sellers
0,0,0,-0.428002,-0.153335,-0.513802,-0.979475,-0.264595,-0.112544
1,0,0,0.100519,-0.153335,-0.086640,0.429743,-0.264595,-0.112544
2,0,1,-0.322297,-0.153335,0.111748,-0.145495,-0.264595,-0.112544
3,0,1,0.100519,-0.153335,-0.441525,2.054621,-0.264595,-0.112544
4,0,1,-1.062226,-0.153335,-0.562388,-0.959115,-0.264595,-0.112544
...,...,...,...,...,...,...,...,...
95875,0,1,-0.428002,-0.153335,-0.311513,-0.893033,-0.264595,-0.112544
95876,0,0,1.051855,-0.153335,0.183977,-0.212797,-0.264595,-0.112544
95877,0,1,1.263263,-0.153335,0.333684,0.617630,-0.264595,-0.112544
95878,0,0,0.523335,-0.153335,1.075186,-0.387558,1.601605,-0.112544


In [139]:
logit_one = smf.logit(formula='''dim_is_one_star ~ wait_time + delay_vs_expected + price + 
distance_seller_customer + number_of_products + number_of_sellers''', data=orders_scaled).fit()
logit_one.summary()

Optimization terminated successfully.
         Current function value: 0.273940
         Iterations 7


0,1,2,3
Dep. Variable:,dim_is_one_star,No. Observations:,95872.0
Model:,Logit,Df Residuals:,95865.0
Method:,MLE,Df Model:,6.0
Date:,"Fri, 06 May 2022",Pseudo R-squ.:,0.1436
Time:,12:38:50,Log-Likelihood:,-26263.0
converged:,True,LL-Null:,-30669.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-2.4710,0.013,-190.640,0.000,-2.496,-2.446
wait_time,0.7448,0.017,45.101,0.000,0.712,0.777
delay_vs_expected,0.2124,0.018,11.606,0.000,0.177,0.248
price,0.0425,0.011,4.051,0.000,0.022,0.063
distance_seller_customer,-0.1949,0.013,-14.608,0.000,-0.221,-0.169
number_of_products,0.2407,0.009,26.231,0.000,0.223,0.259
number_of_sellers,0.1785,0.008,22.710,0.000,0.163,0.194


`Logit 5️⃣`

In [135]:
logit_five = smf.logit(formula='''dim_is_five_star ~ wait_time + delay_vs_expected + price + 
distance_seller_customer + number_of_products + number_of_sellers''', data=orders_scaled).fit()
logit_five.summary()

Optimization terminated successfully.
         Current function value: 0.637179
         Iterations 7


0,1,2,3
Dep. Variable:,dim_is_five_star,No. Observations:,95872.0
Model:,Logit,Df Residuals:,95865.0
Method:,MLE,Df Model:,6.0
Date:,"Fri, 06 May 2022",Pseudo R-squ.:,0.05754
Time:,12:36:14,Log-Likelihood:,-61088.0
converged:,True,LL-Null:,-64817.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.3406,0.007,47.577,0.000,0.327,0.355
wait_time,-0.5349,0.012,-46.497,0.000,-0.557,-0.512
delay_vs_expected,-0.4114,0.024,-17.102,0.000,-0.459,-0.364
price,0.0225,0.007,3.189,0.001,0.009,0.036
distance_seller_customer,0.0926,0.008,11.661,0.000,0.077,0.108
number_of_products,-0.1343,0.008,-17.727,0.000,-0.149,-0.119
number_of_sellers,-0.1428,0.008,-18.232,0.000,-0.158,-0.127


💡 It's time to analyse the results of these two logistic regressions:

- Interpret the partial coefficients in your own words.
- Check their statistical significances with `p-values`
- Do you notice any differences between `logit_one` and `logit_five` in terms of coefficient importances?

In [140]:
# Among the following sentences, store the ones that are true in the list below

a = "delay_vs_expected influences five_star ratings even more than one_star ratings"
b = "wait_time influences five_star ratings even more more than one_star"

your_answer = [a]

🧪 __Test your code__

In [141]:
from nbresult import ChallengeResult

result = ChallengeResult('logit',
    answers = your_answer
)
result.write()
print(result.check())

platform linux -- Python 3.8.12, pytest-6.2.5, py-1.11.0, pluggy-1.0.0 -- /home/chongxe1991/.pyenv/versions/lewagon/bin/python3
cachedir: .pytest_cache
rootdir: /home/chongxe1991/code/chongxe1991/data-challenges/04-Decision-Science/04-Logistic-Regression/01-Logit
plugins: anyio-3.4.0
[1mcollecting ... [0mcollected 1 item

tests/test_logit.py::TestLogit::test_question [32mPASSED[0m[32m                     [100%][0m



💯 You can commit your code:

[1;32mgit[39m add tests/logit.pickle

[32mgit[39m commit -m [33m'Completed logit step'[39m

[32mgit[39m push origin master


<details>
    <summary>- <i>Explanations and advanced concepts </i> -</summary>


> _All other thing being equal, the `delay factor` tends to increase the chances of getting stripped of the 5-star even more so than it affect the chances of 1-star reviews. Probably because 1-stars are really targeting bad products themselves, not bad deliveries_
    
❗️ However, to be totally rigorous, we have to be **more careful when comparing coefficients from two different models**, because **they might not be based on similar populations**!
    We have 2 sub-populations here: (people who gave 1-stars; and people who gave 5-stars) and they may exhibit intrinsically different behavior patterns. It may well be that "happy-people" (who tends to give 5-stars easily) are less sensitive as "grumpy-people" (who shoot 1-stars like Lucky-Luke), when it comes to "delay", or "price"...

</details>


## Logistic vs. Linear ?

👉 Compare:
- the regression coefficients obtained from the `Logistic Regression `
- with the regression coefficients obtained through a `Linear Regression` 
- on `review_score`, using the same features. 

⚠️ Check that both sets of coefficients  tell  "the same story".

> YOUR ANSWER HERE

In [151]:
features = ["review_score", "wait_time","delay_vs_expected", "price", "distance_seller_customer", 
       "number_of_products", "number_of_sellers"]

orders_scaled_1 = orders.copy()
orders_scaled_1 = orders_scaled_1[features]

for feature in orders_scaled_1.columns[1:]:
    mu = orders_scaled_1[feature].mean()
    sigma = orders_scaled_1[feature].std()
    orders_scaled_1[feature] = orders_scaled_1[feature].apply(lambda x: (x-mu)/sigma)
    
orders_scaled_1

Unnamed: 0,review_score,wait_time,delay_vs_expected,price,distance_seller_customer,number_of_products,number_of_sellers
0,4,-0.428002,-0.153335,-0.513802,-0.979475,-0.264595,-0.112544
1,4,0.100519,-0.153335,-0.086640,0.429743,-0.264595,-0.112544
2,5,-0.322297,-0.153335,0.111748,-0.145495,-0.264595,-0.112544
3,5,0.100519,-0.153335,-0.441525,2.054621,-0.264595,-0.112544
4,5,-1.062226,-0.153335,-0.562388,-0.959115,-0.264595,-0.112544
...,...,...,...,...,...,...,...
95875,5,-0.428002,-0.153335,-0.311513,-0.893033,-0.264595,-0.112544
95876,4,1.051855,-0.153335,0.183977,-0.212797,-0.264595,-0.112544
95877,5,1.263263,-0.153335,0.333684,0.617630,-0.264595,-0.112544
95878,2,0.523335,-0.153335,1.075186,-0.387558,1.601605,-0.112544


In [152]:
model = smf.ols(formula = '''review_score ~ wait_time + delay_vs_expected + price + 
distance_seller_customer + number_of_products + number_of_sellers''', data = orders_scaled_1)
model = model.fit()
model.summary()

0,1,2,3
Dep. Variable:,review_score,R-squared:,0.144
Model:,OLS,Adj. R-squared:,0.144
Method:,Least Squares,F-statistic:,2698.0
Date:,"Fri, 06 May 2022",Prob (F-statistic):,0.0
Time:,12:53:12,Log-Likelihood:,-152610.0
No. Observations:,95872,AIC:,305200.0
Df Residuals:,95865,BIC:,305300.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,4.1555,0.004,1082.409,0.000,4.148,4.163
wait_time,-0.4534,0.006,-74.213,0.000,-0.465,-0.441
delay_vs_expected,-0.0344,0.006,-6.117,0.000,-0.045,-0.023
price,-0.0014,0.004,-0.368,0.713,-0.009,0.006
distance_seller_customer,0.1021,0.004,23.129,0.000,0.093,0.111
number_of_products,-0.1280,0.004,-31.546,0.000,-0.136,-0.120
number_of_sellers,-0.1317,0.004,-32.800,0.000,-0.140,-0.124

0,1,2,3
Omnibus:,18653.258,Durbin-Watson:,2.009
Prob(Omnibus):,0.0,Jarque-Bera (JB):,36496.248
Skew:,-1.186,Prob(JB):,0.0
Kurtosis:,4.874,Cond. No.,2.84


🏁 Congratulations! 

💾 Don't forget to commit and push your `logit.ipynb` notebook !