In [None]:
%load_ext autoreload
%autoreload 2

# `Logit` on Orders - A warm-up challenge (~1h)

## Select features

🎯 Let's figure out the impact of `wait_time` and `delay_vs_expected` on very `good/bad reviews`

👉 Using our `orders` training_set, we will run two `multivariate logistic regressions`:
- `logit_one` to predict `dim_is_one_star` 
- `logit_five` to predict `dim_is_five_star`.

 

In [1]:
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt

👉 Import your dataset:

In [2]:
# Project-local loader for the orders training set.
from olist.order import Order
# Build the training DataFrame used by every model below.
# NOTE(review): `with_distance_seller_customer=True` presumably adds the
# `distance_seller_customer` column used later as a feature — confirm in olist/order.py.
orders = Order().get_training_data(with_distance_seller_customer=True)

👉 Select in a list which features you want to use:

⚠️ Make sure you are not creating data leakage (i.e. selecting features that are derived from the target)

💡 To figure out the impact of `wait_time` and `delay_vs_expected` we need to control for the impact of other features, include in your list all features that may be relevant

In [7]:
# Candidate columns considered for the regressions.
candidates_all = [
    'order_id',
    'wait_time',
    'expected_wait_time',
    'delay_vs_expected',
    'order_status',
    'dim_is_five_star',
    'dim_is_one_star',
    'review_score',
    'number_of_products',
    'number_of_sellers',
    'price',
    'freight_value',
    'distance_seller_customer',
]

# `review_score` and both `dim_is_*` flags are the targets (or derive from them):
# using them as regressors would be data leakage. `order_id` and `order_status`
# are not used as regressors either.
not_regressors = {
    'order_id',
    'order_status',
    'review_score',
    'dim_is_five_star',
    'dim_is_one_star',
}
# Candidates that are simply not selected as controls in this analysis.
not_selected = {'expected_wait_time', 'number_of_sellers'}

# Keep the remaining candidates, preserving their original order.
excluded = not_regressors | not_selected
features = [column for column in candidates_all if column not in excluded]
features

['wait_time',
 'delay_vs_expected',
 'number_of_products',
 'price',
 'freight_value',
 'distance_seller_customer']

🕵🏻 Check the `multicollinearity` of your features, using the `VIF index`.

* It shouldn't be too high (< 10 preferably) to ensure that we can trust the partial regression coefficients and their associated `p-values` 
* Do not forget to standardize your data ! 
    * A `VIF Analysis` is made by regressing a feature vs. the other features...
    * So you want to `remove the effect of scale` so that your features have an equal importance before running any linear regression!
    
    
📚 <a href="https://www.statisticshowto.com/variance-inflation-factor/">Statistics How To - Variance Inflation Factor</a>

📚  <a href="https://online.stat.psu.edu/stat462/node/180/">PennState - Detecting Multicollinearity Using Variance Inflation Factors</a>

⚖️ Standardizing:

In [17]:
# Standardize (z-score) every selected feature so that all regressors are on the
# same scale before running the VIF analysis.
#
# The columns are taken from `features` (single source of truth) rather than from
# a second hard-coded list, and the transformation is vectorized: the column-wise
# mean and standard deviation are broadcast over the frame instead of applying a
# Python lambda to every element. The arithmetic returns a new DataFrame, so
# `orders` itself is left untouched.
feature_values = orders[features]
orders_new = (feature_values - feature_values.mean()) / feature_values.std()

orders_new.head(3)

Unnamed: 0,wait_time,delay_vs_expected,number_of_products,price,freight_value,distance_seller_customer
0,-0.431192,-0.161781,-0.264595,-0.513802,-0.652038,-0.979475
1,0.134174,-0.161781,-0.264595,-0.08664,0.000467,0.429743
2,-0.329907,-0.161781,-0.264595,0.111748,-0.164053,-0.145495


👉 Run your VIF Analysis to analyze the potential multicollinearities:

In [18]:
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

# One VIF per standardized feature: column `col_idx` of the design matrix is
# regressed on all the other columns.
design_matrix = orders_new.values
df = pd.DataFrame({
    "features": orders_new.columns,
    "vif_index": [
        vif(design_matrix, col_idx) for col_idx in range(design_matrix.shape[1])
    ],
})

# Most collinear features first, rounded for display.
df.sort_values(by="vif_index", ascending=False).round(2)


Unnamed: 0,features,vif_index
0,wait_time,2.62
1,delay_vs_expected,2.21
4,freight_value,1.67
5,distance_seller_customer,1.44
2,number_of_products,1.28
3,price,1.21


## Logistic Regressions

👉 Fit two `Logistic Regression` models:
- `logit_one` to predict `dim_is_one_star` 
- `logit_five` to predict `dim_is_five_star`.

`Logit 1️⃣`

In [34]:
# Target column as a Series. The formula API below reads the target directly
# from `orders`, so this variable is not needed for the fit itself.
Y1 = orders['dim_is_one_star']

# Build the right-hand side from `features` so that both logits always use
# exactly the selected regressors (no second hand-typed list to keep in sync).
string_features = ' + '.join(features)
formula1 = 'dim_is_one_star ~ ' + string_features
formula5 = 'dim_is_five_star ~ ' + string_features

# Multivariate logistic regression for one-star reviews.
model1 = smf.logit(formula=formula1, data=orders).fit()

# Only the last expression of a cell is rendered: the summary table already
# contains the coefficients, so no separate `model1.params` line is needed.
model1.summary()


Optimization terminated successfully.
         Current function value: 0.276012
         Iterations 7


0,1,2,3
Dep. Variable:,dim_is_one_star,No. Observations:,95872.0
Model:,Logit,Df Residuals:,95865.0
Method:,MLE,Df Model:,6.0
Date:,"Thu, 05 May 2022",Pseudo R-squ.:,0.1372
Time:,04:13:31,Log-Likelihood:,-26462.0
converged:,True,LL-Null:,-30669.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-3.8843,0.033,-116.003,0.000,-3.950,-3.819
wait_time,0.0727,0.002,40.906,0.000,0.069,0.076
delay_vs_expected,0.0574,0.004,14.461,0.000,0.050,0.065
number_of_products,0.5632,0.019,30.067,0.000,0.526,0.600
price,0.0002,5.41e-05,4.372,0.000,0.000,0.000
freight_value,-0.0009,0.001,-1.502,0.133,-0.002,0.000
distance_seller_customer,-0.0003,2.3e-05,-12.547,0.000,-0.000,-0.000


`Logit 5️⃣`

In [35]:
# Multivariate logistic regression for five-star reviews, using `formula5`
# (same regressors as the one-star model, different target).
model5 = smf.logit(formula=formula5, data=orders).fit()

# Only the last expression of a cell is rendered: the summary table already
# contains the coefficients, so no separate `model5.params` line is needed.
model5.summary()


Optimization terminated successfully.
         Current function value: 0.638779
         Iterations 7


0,1,2,3
Dep. Variable:,dim_is_five_star,No. Observations:,95872.0
Model:,Logit,Df Residuals:,95865.0
Method:,MLE,Df Model:,6.0
Date:,"Thu, 05 May 2022",Pseudo R-squ.:,0.05517
Time:,04:13:42,Log-Likelihood:,-61241.0
converged:,True,LL-Null:,-64817.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,1.3629,0.022,62.992,0.000,1.320,1.405
wait_time,-0.0541,0.001,-43.921,0.000,-0.057,-0.052
delay_vs_expected,-0.0941,0.005,-18.643,0.000,-0.104,-0.084
number_of_products,-0.3319,0.015,-21.622,0.000,-0.362,-0.302
price,0.0001,3.69e-05,2.807,0.005,3.13e-05,0.000
freight_value,0.0001,0.000,0.283,0.778,-0.001,0.001
distance_seller_customer,0.0001,1.39e-05,10.238,0.000,0.000,0.000


💡 It's time to analyse the results of these two logistic regressions:

- Interpret the partial coefficients in your own words.
- Check their statistical significances with `p-values`
- Do you notice any differences between `logit_one` and `logit_five` in terms of coefficient importances?

In [36]:



# Among the following sentences, store the ones that are true in the list below

a = "delay_vs_expected influences five_star ratings even more than one_star ratings"
b = "wait_time influences five_star ratings even more more than one_star"

your_answer = [a]
# `a` is true: the delay_vs_expected coefficient is larger in magnitude in the
# five-star model (-0.0941) than in the one-star model (0.0574).
# `b` is false: the wait_time coefficient is larger in magnitude in the
# one-star model (0.0727) than in the five-star model (-0.0541).

🧪 __Test your code__

In [37]:
# Challenge test harness.
from nbresult import ChallengeResult

# Package the selected answers under the 'logit' challenge name.
result = ChallengeResult('logit',
    answers = your_answer
)
# NOTE(review): presumably persists the answers to tests/logit.pickle (the file
# the commit instructions below ask to `git add`) — confirm in nbresult.
result.write()
# Run the check and print its report (the output below is a pytest session).
print(result.check())

platform linux -- Python 3.8.12, pytest-6.2.5, py-1.11.0, pluggy-1.0.0 -- /home/cherif/.pyenv/versions/lewagon/bin/python3
cachedir: .pytest_cache
rootdir: /home/cherif/code/cherifbenham/data-challenges/04-Decision-Science/04-Logistic-Regression/01-Logit
plugins: anyio-3.4.0
[1mcollecting ... [0mcollected 1 item

tests/test_logit.py::TestLogit::test_question [32mPASSED[0m[32m                     [100%][0m



💯 You can commit your code:

[1;32mgit[39m add tests/logit.pickle

[32mgit[39m commit -m [33m'Completed logit step'[39m

[32mgit[39m push origin master


<details>
    <summary>- <i>Explanations and advanced concepts </i> -</summary>


> _All other things being equal, the `delay factor` tends to increase the chances of getting stripped of the 5-star even more so than it affects the chances of 1-star reviews. Probably because 1-stars are really targeting bad products themselves, not bad deliveries_
    
❗️ However, to be totally rigorous, we have to be **more careful when comparing coefficients from two different models**, because **they might not be based on similar populations**!
    We have 2 sub-populations here: (people who gave 1-stars; and people who gave 5-stars) and they may exhibit intrinsically different behavior patterns. It may well be that "happy-people" (who tend to give 5-stars easily) are less sensitive than "grumpy-people" (who shoot 1-stars like Lucky-Luke), when it comes to "delay", or "price"...

</details>


## Logistic vs. Linear ?

👉 Compare:
- the regression coefficients obtained from the `Logistic Regression `
- with the regression coefficients obtained through a `Linear Regression` 
- on `review_score`, using the same features. 

⚠️ Check that both sets of coefficients  tell  "the same story".

> YOUR ANSWER HERE

In [47]:
# Linear counterparts of the logits: OLS fitted on the same binary targets so
# that the coefficients can be compared with the logistic ones.
# NOTE: the targets here are the dim_is_* flags (not `review_score`), and
# `freight_value` is not part of this specification.
ols_rhs = ' + '.join([
    'wait_time',
    'delay_vs_expected',
    'number_of_products',
    'price',
    'distance_seller_customer',
])

model_ols_1, model_ols_5 = (
    smf.ols(formula=f'{target} ~ {ols_rhs}', data=orders).fit()
    for target in ('dim_is_one_star', 'dim_is_five_star')
)


🏁 Congratulations! 

💾 Don't forget to commit and push your `logit.ipynb` notebook !

In [46]:
# Logits refitted without `freight_value`, i.e. with the same regressors as the
# OLS models, so that the two families of coefficients are comparable.
logit_rhs = 'wait_time + delay_vs_expected + number_of_products + price  + distance_seller_customer'

model_logit_1 = smf.logit(
    formula='dim_is_one_star ~ ' + logit_rhs,
    data=orders,
).fit()

model_logit_5 = smf.logit(
    formula='dim_is_five_star ~ ' + logit_rhs,
    data=orders,
).fit()

Optimization terminated successfully.
         Current function value: 0.276025
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.638779
         Iterations 7


In [48]:
# Render the OLS results table for the one-star target (last expression of the cell).
model_ols_1.summary()

0,1,2,3
Dep. Variable:,dim_is_one_star,R-squared:,0.122
Model:,OLS,Adj. R-squared:,0.122
Method:,Least Squares,F-statistic:,2665.0
Date:,"Thu, 05 May 2022",Prob (F-statistic):,0.0
Time:,04:32:14,Log-Likelihood:,-13355.0
No. Observations:,95872,AIC:,26720.0
Df Residuals:,95866,BIC:,26780.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.0689,0.003,-26.080,0.000,-0.074,-0.064
wait_time,0.0079,0.000,51.847,0.000,0.008,0.008
delay_vs_expected,0.0072,0.000,25.013,0.000,0.007,0.008
number_of_products,0.0681,0.002,40.141,0.000,0.065,0.071
price,2.121e-05,4.39e-06,4.826,0.000,1.26e-05,2.98e-05
distance_seller_customer,-3.159e-05,1.74e-06,-18.153,0.000,-3.5e-05,-2.82e-05

0,1,2,3
Omnibus:,45078.307,Durbin-Watson:,2.004
Prob(Omnibus):,0.0,Jarque-Bera (JB):,259571.5
Skew:,2.244,Prob(JB):,0.0
Kurtosis:,9.696,Cond. No.,2810.0


In [49]:
# Render the logit results table for the one-star target, to read side by side
# with the OLS table.
model_logit_1.summary()

0,1,2,3
Dep. Variable:,dim_is_one_star,No. Observations:,95872.0
Model:,Logit,Df Residuals:,95866.0
Method:,MLE,Df Model:,5.0
Date:,"Thu, 05 May 2022",Pseudo R-squ.:,0.1371
Time:,04:32:35,Log-Likelihood:,-26463.0
converged:,True,LL-Null:,-30669.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-3.8759,0.033,-117.376,0.000,-3.941,-3.811
wait_time,0.0724,0.002,40.915,0.000,0.069,0.076
delay_vs_expected,0.0577,0.004,14.556,0.000,0.050,0.065
number_of_products,0.5489,0.016,33.920,0.000,0.517,0.581
price,0.0002,5.05e-05,4.088,0.000,0.000,0.000
distance_seller_customer,-0.0003,2.23e-05,-13.306,0.000,-0.000,-0.000


In [None]:
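# The logit coefficients are larger in absolute value, but they are not on the same
# scale as the OLS ones: logit coefficients are changes in log-odds, whereas OLS
# coefficients on a 0/1 target are changes in probability. Their magnitudes therefore
# cannot be compared directly.
# What can be compared is sign and significance, and both models tell the same story
# for one-star reviews: wait_time, delay_vs_expected, number_of_products and price
# have positive coefficients, distance_seller_customer a negative one, all with
# p-values below 0.001.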