# Statistical modelling of coffee desk data

## Import and read data from csv file

In [30]:
!pip install statsmodels

Collecting statsmodels
  Downloading statsmodels-0.12.2-cp39-none-win_amd64.whl (9.4 MB)
Collecting patsy>=0.5
  Using cached patsy-0.5.1-py2.py3-none-any.whl (231 kB)
Installing collected packages: patsy, statsmodels
Successfully installed patsy-0.5.1 statsmodels-0.12.2


In [31]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import statsmodels.formula.api as smf

In [153]:
# Load the coffee-desk dataset. A forward slash is used as the path separator:
# the previous 'data\coffee...' literal depended on '\c' not being a recognised
# escape sequence and only resolved as a path on Windows.
# The 'idx' column is dropped so it is not treated as a predictor later on.
coffee_df = pd.read_csv('data/coffee_desk_dataset_ead.csv').drop(columns='idx')
coffee_df

Unnamed: 0,process,brewing method,roast,grind,origin,price_per_kg,arabica (%),Pure arabica,roast_brew,Washed,Natural,Fermented/macerated (traditional),Fermented/macerated (closed tank),process_general,region of origin
0,Monsooning,drip (alternative brewing methods),light,beans,Laos,52.22,100,True,light_drip (alternative brewing methods),False,False,False,False,Monsooning,Asia
1,Natural,drip (alternative brewing methods),medium,beans,Brazylia,31.92,100,True,medium_drip (alternative brewing methods),False,True,False,False,Natural,Latam
2,Natural,drip (alternative brewing methods),light,beans,Etiopia,39.20,100,True,light_drip (alternative brewing methods),False,True,False,False,Natural,Africa
3,Washed,drip (alternative brewing methods),light,beans,Etiopia,39.20,100,True,light_drip (alternative brewing methods),True,False,False,False,Washed,Africa
4,Natural,drip (alternative brewing methods),dark,beans,Indonezja,35.20,100,True,dark_drip (alternative brewing methods),False,True,False,False,Natural,Asia
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
857,Rum Aged,drip (alternative brewing methods),light,beans,Gwatemala,73.33,100,True,light_drip (alternative brewing methods),False,False,False,False,Fermented,Latam
858,Natural,espresso,light,beans,Panama,50.00,30,False,light_espresso,False,True,False,False,Natural,Latam
859,Pulped natural,drip (alternative brewing methods),light,beans,Nikaragua,36.00,100,True,light_drip (alternative brewing methods),True,True,False,False,Hybrid,Latam
860,Washed,drip (alternative brewing methods),light,beans,Gwatemala,25.00,100,True,light_drip (alternative brewing methods),True,False,False,False,Washed,Latam


In [181]:
# Give every column an identifier-style name (no spaces, slashes or parentheses);
# the model formulas below refer to the columns by these names.
coffee_df.rename(
    columns={
        'brewing method': 'brewing_method',
        'arabica (%)': 'percentage_of_arabica',
        'Pure arabica': 'pure_arabica',
        'region of origin': 'region_of_origin',
        'Fermented/macerated (traditional)': 'Fermented_traditional',
        'Fermented/macerated (closed tank)': 'Fermented_closedtank',
    },
    inplace=True,
)

## Train-val-test split

In [182]:
# Separate the regression target from the predictors.
y_df = coffee_df['price_per_kg']  # target variable
X_df = coffee_df.drop(columns='price_per_kg')  # predictors: every remaining column

In [183]:
# Hold out 10% of the rows as the test set. A fixed integer seed makes the
# shuffle reproducible between runs. The seed was previously passed as `True`;
# bool is an int subclass, so that corresponded to seed 1, which is now spelled
# out explicitly to keep the same split.
X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, test_size=0.1, random_state=1)

In [184]:
# Re-join target and predictors of the training split into one frame
# (target first); the formula-based models below read all variables from it.
train_df = pd.concat(objs=[y_train, X_train], axis='columns')
train_df.head()

Unnamed: 0,price_per_kg,process,brewing_method,roast,grind,origin,percentage_of_arabica,pure_arabica,roast_brew,Washed,Natural,Fermented_traditional,Fermented_closedtank,process_general,region_of_origin
482,25.8,Natural,espresso,medium,beans,Brazylia,100,True,medium_espresso,False,True,False,False,Natural,Latam
426,21.6,Washed,"drip, espresso",medium,beans,Kolumbia,100,True,"medium_drip, espresso",True,False,False,False,Washed,Latam
836,25.2,Washed,espresso,medium,beans,Kolumbia,100,True,medium_espresso,True,False,False,False,Washed,Latam
364,52.8,Washed,drip (alternative brewing methods),light,beans,Etiopia,100,True,light_drip (alternative brewing methods),True,False,False,False,Washed,Africa
50,29.2,Washed,drip (alternative brewing methods),light,beans,Kolumbia,100,True,light_drip (alternative brewing methods),True,False,False,False,Washed,Latam


## Initial stats modelling

## Forward stepwise selection
1. Begins with a model that contains no variables (called the Null Model)
2. Then starts adding the most significant variables one after the other
3. Until a pre-specified stopping rule is reached or until all the variables under consideration are included in the model

Steps:
* determine the most significant variable at each step, i.e. the one that:
    - has the smallest p-value
    - provides the highest increase in R^2
    - provides the highest drop in the model's RSS (Residual Sum of Squares)
* choose a stopping rule

In [185]:
# Null model: intercept only, no predictors. Baseline for the forward selection.
model_0 = smf.ols(
    data=train_df,
    formula="price_per_kg ~ 1",
).fit()
print(model_0.summary())

                            OLS Regression Results                            
Dep. Variable:           price_per_kg   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                       nan
Date:                Mon, 31 May 2021   Prob (F-statistic):                nan
Time:                        20:31:33   Log-Likelihood:                -3485.1
No. Observations:                 775   AIC:                             6972.
Df Residuals:                     774   BIC:                             6977.
Df Model:                           0                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     36.8732      0.780     47.246      0.0

In [175]:
model_0.ssr # sum of squared residuals (SSR) of the intercept-only model

365375.29442787095

In [173]:
# Forward-selection candidate: process (categorical).
# Recorded fit: R-squared 0.187 using 23 model degrees of freedom,
# Prob (F-statistic) 5.78e-22.
process_model = smf.ols(
    data=train_df,
    formula="price_per_kg ~ C(process)",
).fit()
print(process_model.summary())

                            OLS Regression Results                            
Dep. Variable:           price_per_kg   R-squared:                       0.187
Model:                            OLS   Adj. R-squared:                  0.162
Method:                 Least Squares   F-statistic:                     7.486
Date:                Mon, 31 May 2021   Prob (F-statistic):           5.78e-22
Time:                        20:25:09   Log-Likelihood:                -3405.1
No. Observations:                 775   AIC:                             6858.
Df Residuals:                     751   BIC:                             6970.
Df Model:                          23                                         
Covariance Type:            nonrobust                                         
                                                    coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------------

In [174]:
# Sum of squared residuals of the process-only model, to compare with model_0.ssr.
process_model.ssr

297228.4546231441

In [186]:
# Forward-selection candidate: process_general (categorical).
# TODO: low p-value (recorded Prob (F-statistic) 9.61e-06) -> potentially good
# variable, although the recorded R-squared is only 0.048.
model = smf.ols(
    data=train_df,
    formula="price_per_kg ~ C(process_general)",
).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:           price_per_kg   R-squared:                       0.048
Model:                            OLS   Adj. R-squared:                  0.038
Method:                 Least Squares   F-statistic:                     4.775
Date:                Mon, 31 May 2021   Prob (F-statistic):           9.61e-06
Time:                        20:31:52   Log-Likelihood:                -3466.2
No. Observations:                 775   AIC:                             6950.
Df Residuals:                     766   BIC:                             6992.
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                                         coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------------------
Inte

In [164]:
# Forward-selection candidate: brewing_method (categorical).
# TODO: very low p-value (recorded Prob (F-statistic) 3.54e-60, R-squared 0.299)
# -> potentially good variable.
model = smf.ols(
    data=train_df,
    formula="price_per_kg ~ C(brewing_method)",
).fit()
model.summary()

0,1,2,3
Dep. Variable:,price_per_kg,R-squared:,0.299
Model:,OLS,Adj. R-squared:,0.297
Method:,Least Squares,F-statistic:,164.3
Date:,"Mon, 31 May 2021",Prob (F-statistic):,3.54e-60
Time:,20:17:07,Log-Likelihood:,-3347.6
No. Observations:,775,AIC:,6701.0
Df Residuals:,772,BIC:,6715.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,48.4343,0.985,49.160,0.000,46.500,50.368
"C(brewing_method)[T.drip, espresso]",-1.9765,2.450,-0.807,0.420,-6.785,2.832
C(brewing_method)[T.espresso],-24.0583,1.369,-17.569,0.000,-26.746,-21.370

0,1,2,3
Omnibus:,492.479,Durbin-Watson:,2.006
Prob(Omnibus):,0.0,Jarque-Bera (JB):,5124.659
Skew:,2.781,Prob(JB):,0.0
Kurtosis:,14.303,Cond. No.,4.4


In [163]:
# Forward-selection candidate: roast (categorical).
# TODO: very low p-value (recorded Prob (F-statistic) 5.32e-51, R-squared 0.259)
# -> potentially good variable.
model = smf.ols(
    data=train_df,
    formula="price_per_kg ~ C(roast)",
).fit()
model.summary()

0,1,2,3
Dep. Variable:,price_per_kg,R-squared:,0.259
Model:,OLS,Adj. R-squared:,0.257
Method:,Least Squares,F-statistic:,135.0
Date:,"Mon, 31 May 2021",Prob (F-statistic):,5.32e-51
Time:,20:16:56,Log-Likelihood:,-3368.8
No. Observations:,775,AIC:,6744.0
Df Residuals:,772,BIC:,6758.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,22.9194,1.392,16.467,0.000,20.187,25.652
C(roast)[T.light],25.9239,1.719,15.077,0.000,22.548,29.299
C(roast)[T.medium],7.5853,1.828,4.151,0.000,3.998,11.173

0,1,2,3
Omnibus:,504.475,Durbin-Watson:,2.007
Prob(Omnibus):,0.0,Jarque-Bera (JB):,5487.48
Skew:,2.856,Prob(JB):,0.0
Kurtosis:,14.718,Cond. No.,4.42


In [165]:
# Forward-selection candidate: grind (categorical).
# TODO: low p-value (recorded Prob (F-statistic) 0.00525) -> potentially good
# variable, but the recorded R-squared is only 0.01.
model = smf.ols(
    data=train_df,
    formula="price_per_kg ~ C(grind)",
).fit()
model.summary()

0,1,2,3
Dep. Variable:,price_per_kg,R-squared:,0.01
Model:,OLS,Adj. R-squared:,0.009
Method:,Least Squares,F-statistic:,7.836
Date:,"Mon, 31 May 2021",Prob (F-statistic):,0.00525
Time:,20:17:35,Log-Likelihood:,-3481.1
No. Observations:,775,AIC:,6966.0
Df Residuals:,773,BIC:,6976.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,36.1612,0.818,44.229,0.000,34.556,37.766
C(grind)[T.ground],7.3572,2.628,2.799,0.005,2.198,12.516

0,1,2,3
Omnibus:,333.151,Durbin-Watson:,2.025
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1651.09
Skew:,1.935,Prob(JB):,0.0
Kurtosis:,9.013,Cond. No.,3.42


In [166]:
# Forward-selection candidate: origin (categorical).
# TODO: low p-value (recorded Prob (F-statistic) 1.83e-18) -> potentially good
# variable, BUT it has high granularity: the recorded fit spends 53 model degrees
# of freedom and adjusted R-squared (0.176) is well below R-squared (0.232).
model = smf.ols(
    data=train_df,
    formula="price_per_kg ~ C(origin)",
).fit()
model.summary()

0,1,2,3
Dep. Variable:,price_per_kg,R-squared:,0.232
Model:,OLS,Adj. R-squared:,0.176
Method:,Least Squares,F-statistic:,4.116
Date:,"Mon, 31 May 2021",Prob (F-statistic):,1.83e-18
Time:,20:17:45,Log-Likelihood:,-3382.6
No. Observations:,775,AIC:,6873.0
Df Residuals:,721,BIC:,7124.0
Df Model:,53,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,53.7320,8.821,6.091,0.000,36.414,71.050
C(origin)[T.Brazylia],-21.4141,9.041,-2.369,0.018,-39.164,-3.664
"C(origin)[T.Brazylia, Burundi, Peru]",-21.7320,21.607,-1.006,0.315,-64.152,20.688
"C(origin)[T.Brazylia, Etiopia]",-25.1320,14.404,-1.745,0.081,-53.412,3.148
"C(origin)[T.Brazylia, Gwatemala]",-30.3891,11.549,-2.631,0.009,-53.063,-7.715
"C(origin)[T.Brazylia, Gwatemala, Etiopia]",-31.0170,13.231,-2.344,0.019,-56.994,-5.040
"C(origin)[T.Brazylia, Gwatemala, Kostaryka]",-16.1320,21.607,-0.747,0.456,-58.552,26.288
"C(origin)[T.Brazylia, Honduras]",-32.5820,13.231,-2.462,0.014,-58.559,-6.605
"C(origin)[T.Brazylia, Indie]",-31.3491,9.576,-3.274,0.001,-50.150,-12.549

0,1,2,3
Omnibus:,482.338,Durbin-Watson:,1.975
Prob(Omnibus):,0.0,Jarque-Bera (JB):,4890.185
Skew:,2.713,Prob(JB):,0.0
Kurtosis:,14.045,Cond. No.,95.6


In [169]:
# Forward-selection candidate: percentage_of_arabica (numeric).
# TODO: the slope is significant in the recorded output (P>|t| 0.000,
# Prob (F-statistic) 5.26e-15, R-squared 0.076) -> potentially good variable.
# The high p-value (0.964) in the coefficient table belongs to the intercept,
# not to this predictor.
model = smf.ols(
    data=train_df,
    formula="price_per_kg ~ percentage_of_arabica",
).fit()
model.summary()

0,1,2,3
Dep. Variable:,price_per_kg,R-squared:,0.076
Model:,OLS,Adj. R-squared:,0.075
Method:,Least Squares,F-statistic:,63.69
Date:,"Mon, 31 May 2021",Prob (F-statistic):,5.26e-15
Time:,20:18:12,Log-Likelihood:,-3454.4
No. Observations:,775,AIC:,6913.0
Df Residuals:,773,BIC:,6922.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.2154,4.708,-0.046,0.964,-9.457,9.026
percentage_of_arabica,0.3911,0.049,7.980,0.000,0.295,0.487

0,1,2,3
Omnibus:,401.352,Durbin-Watson:,1.998
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2539.501
Skew:,2.315,Prob(JB):,0.0
Kurtosis:,10.564,Cond. No.,603.0


In [170]:
# Forward-selection candidate: pure_arabica (boolean flag, treated as categorical).
# TODO: low p-value (recorded Prob (F-statistic) 5.56e-21, R-squared 0.108)
# -> potentially good variable.
model = smf.ols(
    data=train_df,
    formula="price_per_kg ~ C(pure_arabica)",
).fit()
model.summary()

0,1,2,3
Dep. Variable:,price_per_kg,R-squared:,0.108
Model:,OLS,Adj. R-squared:,0.107
Method:,Least Squares,F-statistic:,93.63
Date:,"Mon, 31 May 2021",Prob (F-statistic):,5.5600000000000006e-21
Time:,20:18:22,Log-Likelihood:,-3440.8
No. Observations:,775,AIC:,6886.0
Df Residuals:,773,BIC:,6895.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,19.3253,1.958,9.871,0.000,15.482,23.168
C(pure_arabica)[T.True],20.4506,2.113,9.676,0.000,16.302,24.599

0,1,2,3
Omnibus:,410.648,Durbin-Watson:,2.02
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2773.148
Skew:,2.353,Prob(JB):,0.0
Kurtosis:,10.983,Cond. No.,5.13


In [171]:
# Forward-selection candidate: roast_brew (categorical roast/brewing combination).
# TODO: the per-level p-values in the recorded coefficient table are high
# (0.153 to 0.807), but they only compare each level with the reference level,
# whose intercept has a large standard error (18.124). The overall fit is strong:
# recorded Prob (F-statistic) 2.64e-57, R-squared 0.311 -> potentially good variable.
model = smf.ols(
    data=train_df,
    formula="price_per_kg ~ C(roast_brew)",
).fit()
model.summary()

0,1,2,3
Dep. Variable:,price_per_kg,R-squared:,0.311
Model:,OLS,Adj. R-squared:,0.304
Method:,Least Squares,F-statistic:,43.29
Date:,"Mon, 31 May 2021",Prob (F-statistic):,2.64e-57
Time:,20:18:30,Log-Likelihood:,-3340.5
No. Observations:,775,AIC:,6699.0
Df Residuals:,766,BIC:,6741.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,35.2000,18.124,1.942,0.052,-0.379,70.779
"C(roast_brew)[T.dark_drip, espresso]",-5.4200,22.198,-0.244,0.807,-48.995,38.155
C(roast_brew)[T.dark_espresso],-12.4266,18.175,-0.684,0.494,-48.105,23.252
C(roast_brew)[T.light_drip (alternative brewing methods)],13.3377,18.151,0.735,0.463,-22.295,48.970
"C(roast_brew)[T.light_drip, espresso]",27.4912,19.224,1.430,0.153,-10.246,65.228
C(roast_brew)[T.light_espresso],9.3000,22.198,0.419,0.675,-34.275,52.875
C(roast_brew)[T.medium_drip (alternative brewing methods)],10.1900,19.376,0.526,0.599,-27.846,48.226
"C(roast_brew)[T.medium_drip, espresso]",9.5343,18.285,0.521,0.602,-26.361,45.430
C(roast_brew)[T.medium_espresso],-9.5137,18.173,-0.524,0.601,-45.188,26.160

0,1,2,3
Omnibus:,500.363,Durbin-Watson:,1.989
Prob(Omnibus):,0.0,Jarque-Bera (JB):,5459.844
Skew:,2.822,Prob(JB):,0.0
Kurtosis:,14.714,Cond. No.,96.6


In [140]:
# Forward-selection candidate: Washed (boolean flag, treated as categorical).
# TODO: high p-value in the recorded output (Prob (F-statistic) 0.331,
# R-squared 0.001) -> not a good variable on its own.
model = smf.ols(
    data=train_df,
    formula="price_per_kg ~ C(Washed)",
).fit()
model.summary()

0,1,2,3
Dep. Variable:,price_per_kg,R-squared:,0.001
Model:,OLS,Adj. R-squared:,-0.0
Method:,Least Squares,F-statistic:,0.9477
Date:,"Mon, 31 May 2021",Prob (F-statistic):,0.331
Time:,20:07:35,Log-Likelihood:,-3484.6
No. Observations:,775,AIC:,6973.0
Df Residuals:,773,BIC:,6982.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,37.7486,1.191,31.704,0.000,35.411,40.086
C(Washed)[T.True],-1.5349,1.577,-0.974,0.331,-4.630,1.560

0,1,2,3
Omnibus:,366.146,Durbin-Watson:,2.024
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2029.449
Skew:,2.118,Prob(JB):,0.0
Kurtosis:,9.701,Cond. No.,2.82


In [139]:
# Forward-selection candidate: Natural (boolean flag, treated as categorical).
# TODO: p-value below 0.05 (recorded Prob (F-statistic) 0.0153) -> potentially
# good variable, but the recorded R-squared is only 0.008.
model = smf.ols(
    data=train_df,
    formula="price_per_kg ~ C(Natural)",
).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:           price_per_kg   R-squared:                       0.008
Model:                            OLS   Adj. R-squared:                  0.006
Method:                 Least Squares   F-statistic:                     5.906
Date:                Mon, 31 May 2021   Prob (F-statistic):             0.0153
Time:                        20:07:22   Log-Likelihood:                -3482.1
No. Observations:                 775   AIC:                             6968.
Df Residuals:                     773   BIC:                             6978.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
Intercept             38.5316      1

In [136]:
# Forward-selection candidate: Fermented_closedtank (boolean flag, treated as categorical).
# TODO: p-value above 0.05 in the recorded output (Prob (F-statistic) 0.140,
# R-squared 0.003) -> not a good variable on its own.
model = smf.ols(
    data=train_df,
    formula="price_per_kg ~ C(Fermented_closedtank)",
).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:           price_per_kg   R-squared:                       0.003
Model:                            OLS   Adj. R-squared:                  0.002
Method:                 Least Squares   F-statistic:                     2.179
Date:                Mon, 31 May 2021   Prob (F-statistic):              0.140
Time:                        20:06:48   Log-Likelihood:                -3484.0
No. Observations:                 775   AIC:                             6972.
Df Residuals:                     773   BIC:                             6981.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                                      coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------------
Intercept 

In [138]:
# Forward-selection candidate: Fermented_traditional (boolean flag, treated as categorical).
# TODO: low p-value (recorded Prob (F-statistic) 2.94e-05) -> potentially good
# variable, although the recorded R-squared is only 0.022.
model = smf.ols(
    data=train_df,
    formula="price_per_kg ~ C(Fermented_traditional)",
).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:           price_per_kg   R-squared:                       0.022
Model:                            OLS   Adj. R-squared:                  0.021
Method:                 Least Squares   F-statistic:                     17.67
Date:                Mon, 31 May 2021   Prob (F-statistic):           2.94e-05
Time:                        20:07:14   Log-Likelihood:                -3476.3
No. Observations:                 775   AIC:                             6957.
Df Residuals:                     773   BIC:                             6966.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                                       coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------------------
Intercep

In [132]:
# Forward-selection candidate: region_of_origin (categorical).
# TODO: low p-value (recorded Prob (F-statistic) 3.86e-17, R-squared 0.102)
# -> potentially good variable.
model = smf.ols(
    data=train_df,
    formula="price_per_kg ~ C(region_of_origin)",
).fit()
model.summary()

0,1,2,3
Dep. Variable:,price_per_kg,R-squared:,0.102
Model:,OLS,Adj. R-squared:,0.098
Method:,Least Squares,F-statistic:,21.9
Date:,"Mon, 31 May 2021",Prob (F-statistic):,3.86e-17
Time:,20:05:53,Log-Likelihood:,-3443.3
No. Observations:,775,AIC:,6897.0
Df Residuals:,770,BIC:,6920.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,43.5291,1.526,28.529,0.000,40.534,46.524
C(region_of_origin)[T.Asia],-9.9995,4.255,-2.350,0.019,-18.353,-1.646
C(region_of_origin)[T.Europe],-21.2125,8.564,-2.477,0.013,-38.023,-4.402
C(region_of_origin)[T.Latam],-4.5782,1.830,-2.502,0.013,-8.170,-0.987
C(region_of_origin)[T.Mixed],-20.1944,2.313,-8.731,0.000,-24.735,-15.654

0,1,2,3
Omnibus:,415.685,Durbin-Watson:,2.024
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3075.27
Skew:,2.349,Prob(JB):,0.0
Kurtosis:,11.554,Cond. No.,13.6


## Backward stepwise selection

In [191]:
# Backward selection, step 1: start from the model containing every predictor.
# All predictors are wrapped in C(...) as categoricals except the numeric
# percentage_of_arabica.
# NOTE(review): the recorded output reports Df Model 87 while the p-value listing
# has 108 entries, so the design matrix looks rank-deficient (several predictors
# are derived from one another, e.g. roast_brew, process_general) -- confirm.
model_step1 = smf.ols(
    data=train_df,
    formula=(
        "price_per_kg ~ C(process) + C(brewing_method) + C(roast) + C(grind) + C(origin)"
        " + percentage_of_arabica + C(pure_arabica) + C(roast_brew)"
        " + C(Washed) + C(Natural) + C(Fermented_traditional) + C(Fermented_closedtank)"
        " + C(process_general) + C(region_of_origin)"
    ),
).fit()
print(model_step1.summary())



                            OLS Regression Results                            
Dep. Variable:           price_per_kg   R-squared:                       0.543
Model:                            OLS   Adj. R-squared:                  0.486
Method:                 Least Squares   F-statistic:                     9.401
Date:                Mon, 31 May 2021   Prob (F-statistic):           5.61e-72
Time:                        20:33:41   Log-Likelihood:                -3181.2
No. Observations:                 775   AIC:                             6538.
Df Residuals:                     687   BIC:                             6948.
Df Model:                          87                                         
Covariance Type:            nonrobust                                         
                                                                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------

In [193]:
# Coefficient p-values of the full model, least significant first.
model_step1.pvalues.sort_values(ascending=False)

C(origin)[T.Mjanma]                            9.960423e-01
C(Fermented_traditional)[T.True]               9.722023e-01
C(process_general)[T.Honey]                    9.645785e-01
C(origin)[T.Salwador, Nikaragua]               9.380207e-01
C(origin)[T.Brazylia, Gwatemala, Kostaryka]    9.380207e-01
                                                   ...     
C(origin)[T.Brazylia, Peru, Wietnam]           1.284246e-03
C(origin)[T.Brazylia, Kolumbia]                2.427431e-04
C(roast_brew)[T.light_drip, espresso]          1.460044e-04
C(process)[T.Carbonic Maceration]              1.933903e-07
C(grind)[T.ground]                             1.154379e-24
Length: 108, dtype: float64

In [194]:
# Backward selection, step 2: same as step 1 with C(origin) removed.
# Author's note: the intercept p-value decreased by 0.052 compared with step 1
# (cannot be checked against the truncated recorded output).
model_step2 = smf.ols(
    data=train_df,
    formula=(
        "price_per_kg ~ C(process) + C(brewing_method) + C(roast) + C(grind)"
        " + percentage_of_arabica + C(pure_arabica) + C(roast_brew)"
        " + C(Washed) + C(Natural) + C(Fermented_traditional) + C(Fermented_closedtank)"
        " + C(process_general) + C(region_of_origin)"
    ),
).fit()
print(model_step2.summary())

                            OLS Regression Results                            
Dep. Variable:           price_per_kg   R-squared:                       0.464
Model:                            OLS   Adj. R-squared:                  0.437
Method:                 Least Squares   F-statistic:                     16.79
Date:                Mon, 31 May 2021   Prob (F-statistic):           6.88e-76
Time:                        20:37:34   Log-Likelihood:                -3243.2
No. Observations:                 775   AIC:                             6564.
Df Residuals:                     736   BIC:                             6746.
Df Model:                          38                                         
Covariance Type:            nonrobust                                         
                                                                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------

In [195]:
# Coefficient p-values after step 2, least significant first.
model_step2.pvalues.sort_values(ascending=False)

C(roast)[T.medium]                                            9.968788e-01
C(process)[T.Experimental]                                    9.567508e-01
C(process_general)[T.Experimental]                            9.567508e-01
C(process)[T.Yellow Bourbon]                                  9.543915e-01
C(brewing_method)[T.drip, espresso]                           9.374646e-01
C(process)[T.Fermentacja kontrolowana]                        9.049498e-01
C(process_general)[T.Fermented]                               8.837454e-01
C(process)[T.Semi-Carbonic Maceration]                        8.729577e-01
C(process_general)[T.Hybrid]                                  8.095145e-01
C(process)[T.Semi-washed]                                     8.064379e-01
C(process_general)[T.Semi-washed]                             8.064379e-01
C(Fermented_traditional)[T.True]                              7.247505e-01
C(roast_brew)[T.medium_drip (alternative brewing methods)]    6.945736e-01
C(region_of_origin)[T.Eur

In [198]:
# Backward selection, step 3 (variant A): step 2 with C(roast) removed.
# The recorded summary statistics are identical to step 2 (R-squared 0.464,
# Df Model 38), i.e. dropping roast did not change the fit.
model_step3A = smf.ols(
    data=train_df,
    formula=(
        "price_per_kg ~ C(process) + C(brewing_method) + C(grind)"
        " + percentage_of_arabica + C(pure_arabica) + C(roast_brew)"
        " + C(Washed) + C(Natural) + C(Fermented_traditional) + C(Fermented_closedtank)"
        " + C(process_general) + C(region_of_origin)"
    ),
).fit()
print(model_step3A.summary())

                            OLS Regression Results                            
Dep. Variable:           price_per_kg   R-squared:                       0.464
Model:                            OLS   Adj. R-squared:                  0.437
Method:                 Least Squares   F-statistic:                     16.79
Date:                Mon, 31 May 2021   Prob (F-statistic):           6.88e-76
Time:                        20:40:33   Log-Likelihood:                -3243.2
No. Observations:                 775   AIC:                             6564.
Df Residuals:                     736   BIC:                             6746.
Df Model:                          38                                         
Covariance Type:            nonrobust                                         
                                                                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------

In [199]:
# Backward selection, step 3 (variant B): step 2 with C(process) removed
# (C(roast) is kept).
# Author's note: "a significant further decrease in the pvalue".
# NOTE(review): the recorded Prob (F-statistic) is 3.21e-75 versus 6.88e-76 for
# step 2, so the note cannot refer to the overall F-test -- clarify which p-value
# is meant.
model_step3B = smf.ols(
    data=train_df,
    formula=(
        "price_per_kg ~ C(brewing_method) + C(roast) + C(grind)"
        " + percentage_of_arabica + C(pure_arabica) + C(roast_brew)"
        " + C(Washed) + C(Natural) + C(Fermented_traditional) + C(Fermented_closedtank)"
        " + C(process_general) + C(region_of_origin)"
    ),
).fit()
print(model_step3B.summary())

                            OLS Regression Results                            
Dep. Variable:           price_per_kg   R-squared:                       0.436
Model:                            OLS   Adj. R-squared:                  0.416
Method:                 Least Squares   F-statistic:                     21.43
Date:                Mon, 31 May 2021   Prob (F-statistic):           3.21e-75
Time:                        20:40:41   Log-Likelihood:                -3262.8
No. Observations:                 775   AIC:                             6582.
Df Residuals:                     747   BIC:                             6712.
Df Model:                          27                                         
Covariance Type:            nonrobust                                         
                                                                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------

In [200]:
# Coefficient p-values after step 3B, least significant first.
model_step3B.pvalues.sort_values(ascending=False)

C(brewing_method)[T.drip, espresso]                           9.949604e-01
C(roast)[T.medium]                                            9.616497e-01
C(process_general)[T.Hybrid]                                  9.302522e-01
C(roast_brew)[T.medium_drip (alternative brewing methods)]    7.245097e-01
C(region_of_origin)[T.Europe]                                 5.521340e-01
C(process_general)[T.Natural]                                 5.258522e-01
C(process_general)[T.Honey]                                   5.182149e-01
C(roast_brew)[T.medium_drip, espresso]                        5.091440e-01
C(process_general)[T.Washed]                                  4.840029e-01
C(brewing_method)[T.espresso]                                 4.422022e-01
C(roast_brew)[T.light_drip (alternative brewing methods)]     3.845217e-01
C(roast_brew)[T.dark_drip, espresso]                          2.728699e-01
C(region_of_origin)[T.Latam]                                  2.720754e-01
C(roast_brew)[T.dark_espr

In [208]:
# Backward selection, step 4: step 3B with percentage_of_arabica removed.
# Recorded fit: R-squared 0.434 (0.436 in step 3B), Prob (F-statistic) 2.85e-75.
# Author's note: "a significant further decrease in the pvalue" -- TODO state
# which p-value is meant.
model_step4 = smf.ols(
    data=train_df,
    formula=(
        "price_per_kg ~ C(brewing_method) + C(roast) + C(grind)"
        " + C(pure_arabica) + C(roast_brew)"
        " + C(Washed) + C(Natural) + C(Fermented_traditional) + C(Fermented_closedtank)"
        " + C(process_general) + C(region_of_origin)"
    ),
).fit()
print(model_step4.summary())

                            OLS Regression Results                            
Dep. Variable:           price_per_kg   R-squared:                       0.434
Model:                            OLS   Adj. R-squared:                  0.415
Method:                 Least Squares   F-statistic:                     22.08
Date:                Mon, 31 May 2021   Prob (F-statistic):           2.85e-75
Time:                        20:43:03   Log-Likelihood:                -3264.4
No. Observations:                 775   AIC:                             6583.
Df Residuals:                     748   BIC:                             6708.
Df Model:                          26                                         
Covariance Type:            nonrobust                                         
                                                                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------

In [214]:
# Backward selection, step 5: step 4 with C(roast_brew) removed.
# Recorded fit: R-squared 0.429, Prob (F-statistic) 1.27e-76.
# Author's note: "a significant further decrease in the pvalue" -- TODO state
# which p-value is meant.
model_step5 = smf.ols(
    data=train_df,
    formula=(
        "price_per_kg ~ C(brewing_method) + C(roast) + C(grind)"
        " + C(pure_arabica)+ C(Washed) + C(Natural)"
        " + C(Fermented_traditional) + C(Fermented_closedtank)"
        " + C(process_general) + C(region_of_origin)"
    ),
).fit()
print(model_step5.summary())

                            OLS Regression Results                            
Dep. Variable:           price_per_kg   R-squared:                       0.429
Model:                            OLS   Adj. R-squared:                  0.412
Method:                 Least Squares   F-statistic:                     25.68
Date:                Mon, 31 May 2021   Prob (F-statistic):           1.27e-76
Time:                        20:44:10   Log-Likelihood:                -3267.9
No. Observations:                 775   AIC:                             6582.
Df Residuals:                     752   BIC:                             6689.
Df Model:                          22                                         
Covariance Type:            nonrobust                                         
                                          coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------
In

In [223]:
# Backward selection, next step: the previous model with C(process_general) removed.
# NOTE(review): this re-uses the name model_step5 and therefore overwrites the
# model fitted in the previous cell; consider a distinct name (e.g. model_step6).
# Recorded fit: R-squared 0.418, Df Model 14, Prob (F-statistic) 1.07e-79.
model_step5 = smf.ols(
    data=train_df,
    formula=(
        "price_per_kg ~ C(brewing_method) + C(roast) + C(grind)"
        " + C(pure_arabica) + C(Washed) + C(Natural)"
        " + C(Fermented_traditional) + C(Fermented_closedtank)"
        " + C(region_of_origin)"
    ),
).fit()
print(model_step5.summary())

                            OLS Regression Results                            
Dep. Variable:           price_per_kg   R-squared:                       0.418
Model:                            OLS   Adj. R-squared:                  0.407
Method:                 Least Squares   F-statistic:                     39.00
Date:                Mon, 31 May 2021   Prob (F-statistic):           1.07e-79
Time:                        20:45:54   Log-Likelihood:                -3275.3
No. Observations:                 775   AIC:                             6581.
Df Residuals:                     760   BIC:                             6650.
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
                                          coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------
In

In [188]:
# Pairwise correlations between the numeric and boolean columns of the training
# frame. The columns are selected explicitly because train_df also holds string
# columns, and DataFrame.corr() no longer drops those silently in pandas >= 2.0.
train_df.select_dtypes(include=['number', 'bool']).corr()

Unnamed: 0,price_per_kg,percentage_of_arabica,pure_arabica,Washed,Natural,Fermented_traditional,Fermented_closedtank
price_per_kg,1.0,0.275893,0.328694,-0.034993,-0.087074,0.149486,0.053013
percentage_of_arabica,0.275893,1.0,0.829326,0.164142,-0.187585,0.059221,0.033315
pure_arabica,0.328694,0.829326,1.0,0.237032,-0.225001,0.096783,0.025451
Washed,-0.034993,0.164142,0.237032,1.0,-0.605709,0.145402,0.091098
Natural,-0.087074,-0.187585,-0.225001,-0.605709,1.0,0.107534,-0.258827
Fermented_traditional,0.149486,0.059221,0.096783,0.145402,0.107534,1.0,-0.09155
Fermented_closedtank,0.053013,0.033315,0.025451,0.091098,-0.258827,-0.09155,1.0


In [None]:
# NOTE(review): disabled plotting experiment, kept for reference.
# Before re-enabling: `model` is whichever model was fitted last under that name,
# and every model in this notebook is fitted on train_df, so its fittedvalues do
# not cover all rows of coffee_df. The "(million sq km)" labels look copied from
# another notebook -- confirm and adjust.
# import plotly.graph_objects as go

# coffee_df['arabica_fitted'] = model.fittedvalues

# fig = go.Figure()

# fig.add_trace(go.Scatter(
#     x=coffee_df["percentage_of_arabica"], y=coffee_df["price_per_kg"], name="percentage_of_arabica vs price_per_kg (million sq km)", mode="markers"))
# fig.add_trace(go.Scatter(
#     x=coffee_df["percentage_of_arabica"], y=coffee_df["arabica_fitted"], name="Fitted Regression Line"))
# fig.update_layout(title="Regression line of percentage_of_arabica vs price_per_kg (million sq km)", xaxis_title="percentage_of_arabica",
#     yaxis_title="price_per_kg", height=800, width=950)
# fig.show()

In [190]:
# NOTE(review): disabled correlation heatmap. `train_df_no_idx` is not defined in
# any visible cell of this notebook; it would have to be replaced (presumably by
# train_df) before this can run.
# import plotly.express as px

# px.imshow(train_df_no_idx.corr(), color_continuous_scale='Agsunset', title="Correlation between coffee variables")

In [None]:
# NOTE(review): disabled leftover. The formula is the full model_step1 formula
# (it still contains C(origin)); re-enabling it would overwrite model_step2.
# model_step2 = smf.ols(formula="price_per_kg ~ C(process) + C(brewing_method) + C(roast) + C(grind) + C(origin) + percentage_of_arabica + C(pure_arabica) + C(roast_brew) + C(Washed) + C(Natural) + C(Fermented_traditional) + C(Fermented_closedtank) + C(process_general) + C(region_of_origin)", data=train_df).fit()
# print(model_step2.summary())