In [47]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.nn.functional import softmax
import matplotlib.pyplot as plt
import scipy.stats as stats
import seaborn as sns
import numpy as np
import random
import math
from sklearn.datasets import make_blobs, make_moons
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
%matplotlib inline

In [3]:
sns.set_theme()

In [5]:
import os

In [6]:
os.getcwd()

'/Users/kilianwan/GitHub/ada-2024-project-pdfromscratch/src/workspaces'

In [7]:
df_ba_beers     = pd.read_csv('../data/beer_advocate/beers.csv')
df_ba_users     = pd.read_csv('../data/beer_advocate/users.csv')
df_ba_ratings   = pd.read_csv('../data/beer_advocate/ratings.csv')

In [15]:
df_ba_ratings.columns

Index(['user_id', 'beer_id', 'brewery_id', 'date', 'review', 'rating',
       'overall', 'aroma', 'appearance', 'palate', 'taste', 'text',
       'user_past_ratings_count', 'user_past_ratings_average',
       'beer_past_ratings_count', 'beer_past_ratings_average',
       'beer_global_style', 'user_beer_style_past_ratings_count',
       'user_beer_style_past_ratings_average', 'gini_impurity'],
      dtype='object')

In [16]:
df_ba_ratings.head()

Unnamed: 0,user_id,beer_id,brewery_id,date,review,rating,overall,aroma,appearance,palate,taste,text,user_past_ratings_count,user_past_ratings_average,beer_past_ratings_count,beer_past_ratings_average,beer_global_style,user_beer_style_past_ratings_count,user_beer_style_past_ratings_average,gini_impurity
0,nmann08.184925,142544,37262,1440064800,True,2.88,3.0,2.75,3.25,3.25,2.75,"From a bottle, pours a piss yellow color with ...",5915,3.742926,0,,Pale Lager,324,3.183796,0.94766
1,stjamesgate.163714,19590,10093,1235127600,True,3.67,3.5,3.5,3.0,3.5,4.0,Pours pale copper with a thin head that quickl...,27,3.788148,3,3.903333,Pale Ale,4,3.765,0.881834
2,mdagnew.19527,19590,10093,1142247600,True,3.73,3.5,3.5,4.0,3.5,4.0,"500ml Bottle bought from The Vintage, Antrim.....",133,4.017068,2,3.99,Pale Ale,30,4.018667,0.952044
3,helloloser12345.10867,19590,10093,1101898800,True,3.98,4.5,3.5,4.0,4.0,4.0,Serving: 500ml brown bottlePour: Good head wit...,1,3.65,1,4.0,Pale Ale,2,3.815,0.0
4,cypressbob.3708,19590,10093,1093860000,True,4.0,4.0,4.0,4.0,4.0,4.0,"500ml bottlePours with a light, slightly hazy ...",168,3.229583,0,,Pale Ale,44,3.501818,0.837813


In [38]:
df_ba_ratings_filtered = df_ba_ratings.dropna(subset=['user_past_ratings_average', 'beer_past_ratings_average'])
X = df_ba_ratings_filtered[['user_past_ratings_average', 'beer_past_ratings_average']]
y = df_ba_ratings_filtered['rating']

In [39]:
len(X)

7993072

In [41]:
len(y)

7993072

In [48]:
reg = sm.OLS(y, X).fit()

In [49]:
reg.summary()

0,1,2,3
Dep. Variable:,rating,R-squared (uncentered):,0.987
Model:,OLS,Adj. R-squared (uncentered):,0.987
Method:,Least Squares,F-statistic:,301500000.0
Date:,"Sat, 30 Nov 2024",Prob (F-statistic):,0.0
Time:,18:18:48,Log-Likelihood:,-4954200.0
No. Observations:,7993072,AIC:,9908000.0
Df Residuals:,7993070,BIC:,9908000.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
user_past_ratings_average,0.2163,0.000,610.464,0.000,0.216,0.217
beer_past_ratings_average,0.7816,0.000,2237.780,0.000,0.781,0.782

0,1,2,3
Omnibus:,1700513.143,Durbin-Watson:,1.874
Prob(Omnibus):,0.0,Jarque-Bera (JB):,6948170.999
Skew:,-1.007,Prob(JB):,0.0
Kurtosis:,7.099,Cond. No.,17.2


## Interpretation

- We get a high $R^2$ ($0.987$). This indicates that the model explains $98.7\%$ of the variance in `rating`. This suggests a strong linear relationship between the predictors and the dependent variable.
- Coefficients: 
    - `user_past_ratings_average` has a coefficient of $0.2163$. So for every 1-unit increase in `user_past_ratings_average`, `rating` increases by $0.216$ (when we hold `beer_past_ratings_average` constant)
    - `beer_past_ratings_average`: has a coefficient of $0.7816$. so for every 1-unit increase in `beer_past_ratings_average`, `rating` increases by $0.782$ (when we hold `user_past_ratings_average` constant).
- $p$-values : we get for both predictors a $p$-value of zero, meaning that their coefficients are highly statistically significant. So they are both strong predictors of the `rating`.
- $F$-statistic: we get an extremely high value ($\texttt{3.015e+08}$), and with its corresponding $p$-value (0) we get that at least one of the predictors contributes meaningfully to the `rating`.

## Improve the model?

Now we can for example add the interaction term to see whether their combined effect significantly impacts `rating`. We will then look at the residuals to see if there is any linear relationship, and/or outliers/heteroskedasticity. In the case where we find nonlinear models (when residuals show symmetric patterns), we will consider a polynomial regression for example

In [50]:
from sklearn.preprocessing import PolynomialFeatures

In [51]:
interaction_term = PolynomialFeatures(interaction_only=True, include_bias=False)
interaction_term.fit_transform(X)

array([[ 3.78814815,  3.90333333, 14.78640494],
       [ 4.01706767,  3.99      , 16.0281    ],
       [ 3.65      ,  4.        , 14.6       ],
       ...,
       [ 3.81806452,  2.17      ,  8.2852    ],
       [ 3.75364929,  1.54      ,  5.78061991],
       [ 2.78571429,  3.        ,  8.35714286]])

In [54]:
X['interaction'] = X['user_past_ratings_average'] * X['beer_past_ratings_average']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['interaction'] = X['user_past_ratings_average'] * X['beer_past_ratings_average']


In [55]:
reg_inter = sm.OLS(y,X).fit()

In [56]:
reg_inter.summary()

0,1,2,3
Dep. Variable:,rating,R-squared (uncentered):,0.987
Model:,OLS,Adj. R-squared (uncentered):,0.987
Method:,Least Squares,F-statistic:,206000000.0
Date:,"Sat, 30 Nov 2024",Prob (F-statistic):,0.0
Time:,18:31:28,Log-Likelihood:,-4856100.0
No. Observations:,7993072,AIC:,9712000.0
Df Residuals:,7993069,BIC:,9712000.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
user_past_ratings_average,0.1375,0.000,350.525,0.000,0.137,0.138
beer_past_ratings_average,0.6170,0.001,1220.395,0.000,0.616,0.618
interaction,0.0623,0.000,445.515,0.000,0.062,0.063

0,1,2,3
Omnibus:,1536687.907,Durbin-Watson:,1.909
Prob(Omnibus):,0.0,Jarque-Bera (JB):,6961595.765
Skew:,-0.877,Prob(JB):,0.0
Kurtosis:,7.222,Cond. No.,55.3


## Interpretation

- We get the same $R^2$ ($0.987$), and the $R_{\text{adj}}^2$ is the same, so we have no penalty by adding the interaction term.
- Coefficients: 
    - `user_past_ratings_average` has a coefficient of $0.1375$. Which is less than before (**interpret more**) So for every 1-unit increase in `user_past_ratings_average`, `rating` increases by $0.1375$ (when we hold `beer_past_ratings_average` constant)
    - `beer_past_ratings_average`: has a coefficient of $0.6170$. Again less than before (**interpret more**) so for every 1-unit increase in `beer_past_ratings_average`, `rating` increases by $0.6170$ (when we hold `user_past_ratings_average` constant).
    - `user_past_ratings_average:beer_past_ratings_average` has a coefficient of $0.0623$ and a $p$-value of zero, showing that the interaction term is statistically significant
- $p$-values : we get for both predictors a $p$-value of zero, meaning that their coefficients are highly statistically significant. So they are both strong predictors of the `rating`.
- $F$-statistic: we get an extremely high value ($\texttt{3.015e+08}$), and with its corresponding $p$-value (0) we get that at least one of the predictors contributes meaningfully to the `rating`.

**FINISH**

In [57]:
df_ba_ratings.head()

Unnamed: 0,user_id,beer_id,brewery_id,date,review,rating,overall,aroma,appearance,palate,taste,text,user_past_ratings_count,user_past_ratings_average,beer_past_ratings_count,beer_past_ratings_average,beer_global_style,user_beer_style_past_ratings_count,user_beer_style_past_ratings_average,gini_impurity
0,nmann08.184925,142544,37262,1440064800,True,2.88,3.0,2.75,3.25,3.25,2.75,"From a bottle, pours a piss yellow color with ...",5915,3.742926,0,,Pale Lager,324,3.183796,0.94766
1,stjamesgate.163714,19590,10093,1235127600,True,3.67,3.5,3.5,3.0,3.5,4.0,Pours pale copper with a thin head that quickl...,27,3.788148,3,3.903333,Pale Ale,4,3.765,0.881834
2,mdagnew.19527,19590,10093,1142247600,True,3.73,3.5,3.5,4.0,3.5,4.0,"500ml Bottle bought from The Vintage, Antrim.....",133,4.017068,2,3.99,Pale Ale,30,4.018667,0.952044
3,helloloser12345.10867,19590,10093,1101898800,True,3.98,4.5,3.5,4.0,4.0,4.0,Serving: 500ml brown bottlePour: Good head wit...,1,3.65,1,4.0,Pale Ale,2,3.815,0.0
4,cypressbob.3708,19590,10093,1093860000,True,4.0,4.0,4.0,4.0,4.0,4.0,"500ml bottlePours with a light, slightly hazy ...",168,3.229583,0,,Pale Ale,44,3.501818,0.837813


In [60]:
df_ba_ratings_grouped = df_ba_ratings_filtered.groupby('beer_global_style')


In [62]:
len(df_ba_ratings_grouped)

15

In [71]:
def summary_per_style(df, style_column, independents, dependent):
    results = []
    
    grouped_data = df.groupby(style_column)
    
    for style, group in grouped_data:
        group = group.dropna(subset=independents + [dependent])
        
        if group.empty:
            continue
        X = group[independents].copy()
        # maybe add interaction but we will see depending on results
        X = sm.add_constant(X)
        y = group[dependent]
        
        model = sm.OLS(y, X).fit()
        
        result = {
            'Style': style,
            'R^2': model.rsquared,
            'Coeff: const': model.params.get('const', None),
            'Coeff: User Past Ratings (avg)': model.params.get(independents[0], None),
            'Coeff: Beer Past Ratings (avg)': model.params.get(independents[1], None),
            # maybe not now : 'Coeff: interaction': model.params.get('interaction', None),
            'P-value: User Past Ratings ': model.pvalues.get(independents[0], None),
            'P-value: Beer Past Ratings': model.pvalues.get(independents[1], None),
            # 'P-value: interaction': model.pvalues.get('interaction', None),
            'F-statistic': model.fvalue,
            'F P-value': model.f_pvalue,
        }
        results.append(result)
    
    summary_df = pd.DataFrame(results).set_index('Style')
    
    return summary_df

In [74]:
independents = ['user_past_ratings_average', 'beer_past_ratings_average']
dependent = 'rating'
style_column = 'beer_global_style'
summary_table = summary_per_style(df_ba_ratings, style_column, independents, dependent)

# Display the table
print(summary_table)

                       R^2  Coeff: const  Coeff: User Past Ratings (avg)  \
Style                                                                      
Bock              0.335119     -0.747250                        0.380904   
Brown Ale         0.258772     -0.590283                        0.409896   
Dark Ales         0.265376     -0.800020                        0.408315   
Dark Lager        0.296969     -0.671450                        0.362504   
Hybrid Beer       0.282970     -0.577498                        0.385518   
India Pale Ale    0.377873     -0.801683                        0.368157   
Low Alcohol Beer  0.267280     -0.931093                        0.389556   
Pale Ale          0.361233     -0.811586                        0.417286   
Pale Lager        0.575643     -0.910555                        0.327572   
Porter            0.316983     -0.645783                        0.397467   
Speciality Beer   0.361857     -0.878745                        0.384875   
Stout       

# Interpretation for 2 beers (IPA and Pale Lager)

- IPA : 
    - $R^2 = 0.377873$. This model explains $37.8\%$ of the variation in ratings for IPAs.
    - Coeff (intercept) = $-0.801683$. The intercept is negative, which represents the expected rating when all predictors are zero. (Not very interpretable)
    - Coeff `user_past_ratings_average` = $0.368157$