In [2]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [8]:
def fit_mxied_effect_regression_model(independent=None, dependent=None,
                                      random_intercept=True, random_slope=True, scaled=True,
                                      data=None):
    """Fit a linear mixed-effects regression grouped by ``reviewerID`` and print it.

    Every predictor and the outcome are z-scored into ``<name>_scaled`` columns.
    The z-scores use the mean/std of all non-missing values of each column,
    computed *before* incomplete rows are dropped. Rows with any missing value
    are then removed once, the model formula is printed, the model is fitted
    and its summary is printed.

    Parameters
    ----------
    independent : list of str or str
        Predictor column name(s). The fixed-effects part (and the random
        slopes, when requested) always use the z-scored predictors.
    dependent : str
        Outcome column name.
    random_intercept : bool, default True
        Include a per-``reviewerID`` random intercept.
    random_slope : bool, default True
        Include a per-``reviewerID`` random slope for every predictor.
        With ``random_intercept=False`` the random-effects formula becomes
        ``~ 0 + ...`` (slopes only).
    scaled : bool, default True
        Controls the outcome only: if true, the z-scored outcome is modelled,
        otherwise the raw outcome. Predictors are standardized either way.
    data : pandas.DataFrame, optional
        Frame to read the columns from. Defaults to the notebook-level ``df``.

    Returns
    -------
    None
        The formula and the fitted model summary are printed.

    Raises
    ------
    ValueError
        If ``independent`` or ``dependent`` is missing or ``independent`` is empty.

    Notes
    -----
    ``smf.mixedlm`` cannot be built without ``groups``, so when neither a
    random intercept nor a random slope is requested the model is fitted with
    ``smf.ols`` on the same formula and rows instead.
    """
    if independent is None or dependent is None:
        raise ValueError("both `independent` and `dependent` must be provided")
    # Accept a single column name as a convenience; never mutate the caller's list.
    predictors = [independent] if isinstance(independent, str) else list(independent)
    if not predictors:
        raise ValueError("`independent` must contain at least one column name")

    # Fall back to the notebook-level frame only when no explicit frame is given.
    source = df if data is None else data

    model_columns = predictors + [dependent, "reviewerID"]
    test_df = source[model_columns].copy()

    # Standardize every model variable (not the grouping key).
    for name in model_columns:
        if name != "reviewerID":
            column = test_df[name]
            test_df[name + "_scaled"] = (column - column.mean()) / column.std()

    # Listwise deletion, done once and shared by the design matrix and `groups`.
    model_df = test_df.dropna()

    scaled_terms = " + ".join(name + "_scaled" for name in predictors)
    response = f"{dependent}_scaled" if scaled else dependent
    formula = f"{response} ~  {scaled_terms}"
    print(formula)

    if random_intercept or random_slope:
        if random_intercept and random_slope:
            re_formula = f"~ {scaled_terms}"
        elif random_slope:
            # Random slopes without a random intercept.
            re_formula = f"~ 0 + {scaled_terms}"
        else:
            # None is the mixedlm default: a random intercept per group.
            re_formula = None
        md = smf.mixedlm(formula,
                         model_df,
                         groups=model_df["reviewerID"],
                         re_formula=re_formula)
    else:
        # No random effects requested: plain pooled regression.
        md = smf.ols(formula, model_df)

    mdf = md.fit()
    print(mdf.summary())

In [5]:
# Load the dataset that every model below is fitted on;
# fit_mxied_effect_regression_model reads this module-level `df`.
df = pd.read_csv(filepath_or_buffer="../s1/s1_stats_test_final.csv")

# Testing bivariate relationships

$$
Rating_{i,j} \sim \beta_{0} + \beta_{1} * Number\_of\_past\_readings_{i,j} + \gamma_{0,j} + \gamma_{1,j} * Number\_of\_past\_readings_{i,j} + \epsilon_{i,j}
$$

In [9]:
# Standardized `overall` regressed on standardized `order`, with a random
# intercept and a random `order` slope for each reviewerID.
fit_mxied_effect_regression_model(
    independent=["order"],
    dependent="overall",
    random_intercept=True,
    random_slope=True,
    scaled=True,
)

overall_scaled ~  order_scaled
              Mixed Linear Model Regression Results
Model:              MixedLM   Dependent Variable:   overall_scaled
No. Observations:   2083630   Method:               REML          
No. Groups:         35478     Scale:                0.6875        
Min. group size:    21        Log-Likelihood:       -2632042.3127 
Max. group size:    298       Converged:            Yes           
Mean group size:    58.7                                          
------------------------------------------------------------------
                         Coef. Std.Err.   z    P>|z| [0.025 0.975]
------------------------------------------------------------------
Intercept                0.017    0.003  5.383 0.000  0.011  0.024
order_scaled             0.052    0.002 25.362 0.000  0.048  0.056
Group Var                0.327    0.003                           
Group x order_scaled Cov 0.027    0.002                           
order_scaled Var         0.044    0.001       

In [10]:
# Standardized `avg_rating` regressed on standardized `order`, with a random
# intercept and a random `order` slope for each reviewerID.
fit_mxied_effect_regression_model(
    independent=["order"],
    dependent="avg_rating",
    random_intercept=True,
    random_slope=True,
    scaled=True,
)

avg_rating_scaled ~  order_scaled
              Mixed Linear Model Regression Results
Model:             MixedLM  Dependent Variable:  avg_rating_scaled
No. Observations:  2083630  Method:              REML             
No. Groups:        35478    Scale:               0.8888           
Min. group size:   21       Log-Likelihood:      -2874377.7958    
Max. group size:   298      Converged:           Yes              
Mean group size:   58.7                                           
------------------------------------------------------------------
                         Coef. Std.Err.   z    P>|z| [0.025 0.975]
------------------------------------------------------------------
Intercept                0.036    0.002 17.521 0.000  0.032  0.041
order_scaled             0.114    0.002 60.481 0.000  0.110  0.117
Group Var                0.112    0.001                           
Group x order_scaled Cov 0.005    0.001                           
order_scaled Var         0.019    0.001    

$$
Step\_size_{i,j} \sim \beta_{0} + \beta_{1} * Number\_of\_past\_readings_{i,j} + \gamma_{0,j} + \gamma_{1,j} * Number\_of\_past\_readings_{i,j} + \epsilon_{i,j}
$$

In [11]:
# Standardized `step_size_after` regressed on standardized `order`, with a
# random intercept and a random `order` slope for each reviewerID.
fit_mxied_effect_regression_model(
    independent=["order"],
    dependent="step_size_after",
    random_intercept=True,
    random_slope=True,
    scaled=True,
)

step_size_after_scaled ~  order_scaled
               Mixed Linear Model Regression Results
Model:            MixedLM Dependent Variable: step_size_after_scaled
No. Observations: 2048152 Method:             REML                  
No. Groups:       35478   Scale:              0.8931                
Min. group size:  20      Log-Likelihood:     -2829205.6124         
Max. group size:  297     Converged:          Yes                   
Mean group size:  57.7                                              
--------------------------------------------------------------------
                          Coef.  Std.Err.   z    P>|z| [0.025 0.975]
--------------------------------------------------------------------
Intercept                  0.004    0.002  2.236 0.025  0.001  0.008
order_scaled              -0.009    0.002 -5.289 0.000 -0.013 -0.006
Group Var                  0.100    0.001                           
Group x order_scaled Cov   0.002    0.001                           
order_scale

$$
Step\_size_{i,j} \sim \beta_{0} + \beta_{1} * Previous\_rating_{i,j} + \gamma_{0,j} + \gamma_{1,j} * Previous\_rating_{i,j} + \epsilon_{i,j}
$$

In [12]:
# Standardized `step_size_after` regressed on standardized `overall`, with a
# random intercept and a random `overall` slope for each reviewerID.
fit_mxied_effect_regression_model(
    independent=["overall"],
    dependent="step_size_after",
    random_intercept=True,
    random_slope=True,
    scaled=True,
)

step_size_after_scaled ~  overall_scaled




                Mixed Linear Model Regression Results
Model:              MixedLM Dependent Variable: step_size_after_scaled
No. Observations:   2048152 Method:             REML                  
No. Groups:         35478   Scale:              0.8968                
Min. group size:    20      Log-Likelihood:     -2829410.6689         
Max. group size:    297     Converged:          Yes                   
Mean group size:    57.7                                              
----------------------------------------------------------------------
                           Coef.  Std.Err.    z    P>|z| [0.025 0.975]
----------------------------------------------------------------------
Intercept                   0.012    0.002   6.415 0.000  0.008  0.015
overall_scaled             -0.040    0.001 -47.310 0.000 -0.042 -0.039
Group Var                   0.098    0.001                            
Group x overall_scaled Cov  0.007    0.000                            
overall_scaled Var     

$$
Step\_size_{i,j} \sim \beta_{0} + \beta_{1} * Rating\_variance_{i,j} + \gamma_{0,j} + \gamma_{1,j} * Rating\_variance_{i,j} + \epsilon_{i,j}
$$

In [13]:
# Standardized `step_size_after` regressed on standardized `rating_dispersion`,
# with a random intercept and a random `rating_dispersion` slope for each
# reviewerID.
fit_mxied_effect_regression_model(
    independent=["rating_dispersion"],
    dependent="step_size_after",
    random_intercept=True,
    random_slope=True,
    scaled=True,
)

step_size_after_scaled ~  rating_dispersion_scaled




                    Mixed Linear Model Regression Results
Model:                MixedLM    Dependent Variable:    step_size_after_scaled
No. Observations:     2027407    Method:                REML                  
No. Groups:           35478      Scale:                 0.8886                
Min. group size:      4          Log-Likelihood:        -2815038.8817         
Max. group size:      296        Converged:             No                    
Mean group size:      57.1                                                    
------------------------------------------------------------------------------
                                     Coef. Std.Err.   z    P>|z| [0.025 0.975]
------------------------------------------------------------------------------
Intercept                            0.010    0.001  6.813 0.000  0.007  0.013
rating_dispersion_scaled             0.026    0.002 12.550 0.000  0.022  0.030
Group Var                            0.054    0.000                      

$$
Step\_size_{i,j} \sim \beta_{0} + \beta_{1} * Log\_Rating\_count_{i,j} + \gamma_{0,j} + \gamma_{1,j} * Log\_Rating\_count_{i,j} + \epsilon_{i,j}
$$

In [27]:
# Log of the `review_count` found on each reviewer's next row (shift(-1) within
# reviewerID). The 0.001 offset keeps the log finite when that count is 0.
# The last row of every reviewer has no next row, so its value is NaN.
df["review_count_log_after"] = np.log(
    df.groupby("reviewerID")["review_count"].shift(-1).add(0.001)
)

In [28]:
# Standardized `step_size_after` regressed on standardized
# `review_count_log_after`, with a random intercept and a random
# `review_count_log_after` slope for each reviewerID.
fit_mxied_effect_regression_model(
    independent=["review_count_log_after"],
    dependent="step_size_after",
    random_intercept=True,
    random_slope=True,
    scaled=True,
)

step_size_after_scaled ~  review_count_log_after_scaled




                        Mixed Linear Model Regression Results
Model:                   MixedLM      Dependent Variable:      step_size_after_scaled
No. Observations:        2048152      Method:                  REML                  
No. Groups:              35478        Scale:                   0.8916                
Min. group size:         20           Log-Likelihood:          -2826892.2009         
Max. group size:         297          Converged:               Yes                   
Mean group size:         57.7                                                        
-------------------------------------------------------------------------------------
                                          Coef.  Std.Err.    z    P>|z| [0.025 0.975]
-------------------------------------------------------------------------------------
Intercept                                  0.011    0.002   5.971 0.000  0.007  0.014
review_count_log_after_scaled             -0.053    0.001 -52.610 0.000 -0.055

$$
\begin{aligned}
Step\_size_{i,j} \sim{} & \beta_{0} + \beta_{1} * Number\_of\_past\_readings_{i,j}  + \beta_{2} * Previous\_rating_{i,j} \\
& + \beta_{3} * Rating\_variance_{i,j} + \beta_{4} * Log\_Rating\_count_{i,j} \\
& + \gamma_{0,j} + \gamma_{1,j} * Number\_of\_past\_readings_{i,j} + \gamma_{2,j} * Previous\_rating_{i,j}\\
& + \gamma_{3,j} * Rating\_variance_{i,j} + \gamma_{4,j} * Log\_Rating\_count_{i,j} \\
& + \epsilon_{i,j}
\end{aligned}
$$

In [29]:
# Full model: standardized `step_size_after` regressed on all four standardized
# predictors, with a random intercept and a random slope per predictor for
# each reviewerID.
fit_mxied_effect_regression_model(
    independent=["order", "overall", "rating_dispersion", "review_count_log_after"],
    dependent="step_size_after",
    random_intercept=True,
    random_slope=True,
    scaled=True,
)

step_size_after_scaled ~  order_scaled + overall_scaled + rating_dispersion_scaled + review_count_log_after_scaled




                                 Mixed Linear Model Regression Results
Model:                        MixedLM             Dependent Variable:             step_size_after_scaled
No. Observations:             2027407             Method:                         REML                  
No. Groups:                   35478               Scale:                          0.8777                
Min. group size:              4                   Log-Likelihood:                 -2844426.4288         
Max. group size:              296                 Converged:                      No                    
Mean group size:              57.1                                                                      
--------------------------------------------------------------------------------------------------------
                                                             Coef.  Std.Err.    z    P>|z| [0.025 0.975]
-----------------------------------------------------------------------------------------

  sdf[0:self.k_fe, 1] = np.sqrt(np.diag(self.cov_params()[0:self.k_fe]))
