In [1]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.nn.functional import softmax
import matplotlib.pyplot as plt
import scipy.stats as stats
import seaborn as sns
import numpy as np
import random
import math
from sklearn.datasets import make_blobs, make_moons
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
%matplotlib inline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
sns.set_theme()

In [4]:
import os

In [5]:
os.getcwd()

'/Users/kilianwan/GitHub/ada-2024-project-pdfromscratch/src/workspaces'

In [4]:
df_ba_beers     = pd.read_csv('../data/beer_advocate/beers.csv')
df_ba_users     = pd.read_csv('../data/beer_advocate/users.csv')
df_ba_ratings   = pd.read_csv('../data/beer_advocate/ratings.csv')

In [5]:
df_ba_ratings.columns

Index(['user_id', 'beer_id', 'brewery_id', 'date', 'review', 'rating',
       'overall', 'aroma', 'appearance', 'palate', 'taste', 'text',
       'user_past_ratings_count', 'user_past_ratings_average',
       'beer_past_ratings_count', 'beer_past_ratings_average',
       'beer_global_style', 'user_beer_style_past_ratings_count',
       'user_beer_style_past_ratings_average', 'gini_impurity'],
      dtype='object')

In [6]:
df_ba_ratings.head()

Unnamed: 0,user_id,beer_id,brewery_id,date,review,rating,overall,aroma,appearance,palate,taste,text,user_past_ratings_count,user_past_ratings_average,beer_past_ratings_count,beer_past_ratings_average,beer_global_style,user_beer_style_past_ratings_count,user_beer_style_past_ratings_average,gini_impurity
0,nmann08.184925,142544,37262,1440064800,True,2.88,3.0,2.75,3.25,3.25,2.75,"From a bottle, pours a piss yellow color with ...",5915,3.742926,0,,Pale Lager,324,3.183796,0.94766
1,stjamesgate.163714,19590,10093,1235127600,True,3.67,3.5,3.5,3.0,3.5,4.0,Pours pale copper with a thin head that quickl...,27,3.788148,3,3.903333,Pale Ale,4,3.765,0.881834
2,mdagnew.19527,19590,10093,1142247600,True,3.73,3.5,3.5,4.0,3.5,4.0,"500ml Bottle bought from The Vintage, Antrim.....",133,4.017068,2,3.99,Pale Ale,30,4.018667,0.952044
3,helloloser12345.10867,19590,10093,1101898800,True,3.98,4.5,3.5,4.0,4.0,4.0,Serving: 500ml brown bottlePour: Good head wit...,1,3.65,1,4.0,Pale Ale,2,3.815,0.0
4,cypressbob.3708,19590,10093,1093860000,True,4.0,4.0,4.0,4.0,4.0,4.0,"500ml bottlePours with a light, slightly hazy ...",168,3.229583,0,,Pale Ale,44,3.501818,0.837813


In [7]:
df_ba_ratings_filtered = df_ba_ratings.dropna(subset=['user_beer_style_past_ratings_average', 'beer_past_ratings_average'])
X = df_ba_ratings_filtered[['user_beer_style_past_ratings_average', 'beer_past_ratings_average']]
y = df_ba_ratings_filtered['rating']

In [8]:
len(X)

8145050

In [9]:
len(y)

8145050

In [10]:
reg = sm.OLS(y, X).fit()

In [11]:
reg.summary()

0,1,2,3
Dep. Variable:,rating,R-squared (uncentered):,0.989
Model:,OLS,Adj. R-squared (uncentered):,0.989
Method:,Least Squares,F-statistic:,377800000.0
Date:,"Sun, 01 Dec 2024",Prob (F-statistic):,0.0
Time:,11:59:18,Log-Likelihood:,-4223900.0
No. Observations:,8145050,AIC:,8448000.0
Df Residuals:,8145048,BIC:,8448000.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
user_beer_style_past_ratings_average,0.5448,0.000,1623.609,0.000,0.544,0.545
beer_past_ratings_average,0.4557,0.000,1365.887,0.000,0.455,0.456

0,1,2,3
Omnibus:,1649292.337,Durbin-Watson:,1.758
Prob(Omnibus):,0.0,Jarque-Bera (JB):,6032645.629
Skew:,-0.991,Prob(JB):,0.0
Kurtosis:,6.721,Cond. No.,18.4


## Interpretation

- We get a high $R^2$ ($0.987$). This indicates that the model explains $98.7\%$ of the variance in `rating`. This suggests a strong linear relationship between the predictors and the dependent variable.
- Coefficients: 
    - `user_past_ratings_average` has a coefficient of $0.2163$. So for every 1-unit increase in `user_past_ratings_average`, `rating` increases by $0.216$ (when we hold `beer_past_ratings_average` constant)
    - `beer_past_ratings_average`: has a coefficient of $0.7816$. so for every 1-unit increase in `beer_past_ratings_average`, `rating` increases by $0.782$ (when we hold `user_past_ratings_average` constant).
- $p$-values : we get for both predictors a $p$-value of zero, meaning that their coefficients are highly statistically significant. So they are both strong predictors of the `rating`.
- $F$-statistic: we get an extremely high value ($\texttt{3.015e+08}$), and with its corresponding $p$-value (0) we get that at least one of the predictors contributes meaningfully to the `rating`.

# CHANGE

## Improve the model?

Now we can for example add the interaction term to see whether their combined effect significantly impacts `rating`. We will then look at the residuals to see if there is any linear relationship, and/or outliers/heteroskedasticity. In the case where we find nonlinear models (when residuals show symmetric patterns), we will consider a polynomial regression for example

In [50]:
from sklearn.preprocessing import PolynomialFeatures

In [51]:
interaction_term = PolynomialFeatures(interaction_only=True, include_bias=False)
interaction_term.fit_transform(X)

array([[ 3.78814815,  3.90333333, 14.78640494],
       [ 4.01706767,  3.99      , 16.0281    ],
       [ 3.65      ,  4.        , 14.6       ],
       ...,
       [ 3.81806452,  2.17      ,  8.2852    ],
       [ 3.75364929,  1.54      ,  5.78061991],
       [ 2.78571429,  3.        ,  8.35714286]])

In [12]:
X['interaction'] = X['user_beer_style_past_ratings_average'] * X['beer_past_ratings_average']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['interaction'] = X['user_beer_style_past_ratings_average'] * X['beer_past_ratings_average']


In [13]:
reg_inter = sm.OLS(y,X).fit()

In [14]:
reg_inter.summary()

0,1,2,3
Dep. Variable:,rating,R-squared (uncentered):,0.99
Model:,OLS,Adj. R-squared (uncentered):,0.99
Method:,Least Squares,F-statistic:,264700000.0
Date:,"Sun, 01 Dec 2024",Prob (F-statistic):,0.0
Time:,11:59:28,Log-Likelihood:,-4023200.0
No. Observations:,8145050,AIC:,8046000.0
Df Residuals:,8145047,BIC:,8046000.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
user_beer_style_past_ratings_average,0.4035,0.000,1022.498,0.000,0.403,0.404
beer_past_ratings_average,0.3378,0.000,903.349,0.000,0.337,0.338
interaction,0.0655,0.000,641.360,0.000,0.065,0.066

0,1,2,3
Omnibus:,1372507.787,Durbin-Watson:,1.821
Prob(Omnibus):,0.0,Jarque-Bera (JB):,5975822.53
Skew:,-0.774,Prob(JB):,0.0
Kurtosis:,6.9,Cond. No.,54.9


## Interpretation

- We get the same $R^2$ ($0.987$), and the $R_{\text{adj}}^2$ is the same, so we have no penalty by adding the interaction term.
- Coefficients: 
    - `user_past_ratings_average` has a coefficient of $0.1375$. Which is less than before (**interpret more**) So for every 1-unit increase in `user_past_ratings_average`, `rating` increases by $0.1375$ (when we hold `beer_past_ratings_average` constant)
    - `beer_past_ratings_average`: has a coefficient of $0.6170$. Again less than before (**interpret more**) so for every 1-unit increase in `beer_past_ratings_average`, `rating` increases by $0.6170$ (when we hold `user_past_ratings_average` constant).
    - `user_past_ratings_average:beer_past_ratings_average` has a coefficient of $0.0623$ and a $p$-value of zero, showing that the interaction term is statistically significant
- $p$-values : we get for both predictors a $p$-value of zero, meaning that their coefficients are highly statistically significant. So they are both strong predictors of the `rating`.
- $F$-statistic: we get an extremely high value ($\texttt{3.015e+08}$), and with its corresponding $p$-value (0) we get that at least one of the predictors contributes meaningfully to the `rating`.

# CHANGE

In [15]:
df_ba_ratings.head()

Unnamed: 0,user_id,beer_id,brewery_id,date,review,rating,overall,aroma,appearance,palate,taste,text,user_past_ratings_count,user_past_ratings_average,beer_past_ratings_count,beer_past_ratings_average,beer_global_style,user_beer_style_past_ratings_count,user_beer_style_past_ratings_average,gini_impurity
0,nmann08.184925,142544,37262,1440064800,True,2.88,3.0,2.75,3.25,3.25,2.75,"From a bottle, pours a piss yellow color with ...",5915,3.742926,0,,Pale Lager,324,3.183796,0.94766
1,stjamesgate.163714,19590,10093,1235127600,True,3.67,3.5,3.5,3.0,3.5,4.0,Pours pale copper with a thin head that quickl...,27,3.788148,3,3.903333,Pale Ale,4,3.765,0.881834
2,mdagnew.19527,19590,10093,1142247600,True,3.73,3.5,3.5,4.0,3.5,4.0,"500ml Bottle bought from The Vintage, Antrim.....",133,4.017068,2,3.99,Pale Ale,30,4.018667,0.952044
3,helloloser12345.10867,19590,10093,1101898800,True,3.98,4.5,3.5,4.0,4.0,4.0,Serving: 500ml brown bottlePour: Good head wit...,1,3.65,1,4.0,Pale Ale,2,3.815,0.0
4,cypressbob.3708,19590,10093,1093860000,True,4.0,4.0,4.0,4.0,4.0,4.0,"500ml bottlePours with a light, slightly hazy ...",168,3.229583,0,,Pale Ale,44,3.501818,0.837813


In [16]:
df_ba_ratings_grouped = df_ba_ratings_filtered.groupby('beer_global_style')

In [17]:
len(df_ba_ratings_grouped)

15

In [18]:
def summary_per_style(df, style_column, independents, dependent):
    results = []
    
    grouped_data = df.groupby(style_column)
    
    for style, group in grouped_data:
        group = group.dropna(subset=independents + [dependent])
        
        if group.empty:
            continue
        X = group[independents].copy()
        # maybe add interaction but we will see depending on results
        X = sm.add_constant(X)
        y = group[dependent]
        
        model = sm.OLS(y, X).fit()
        
        result = {
            'Style': style,
            'R^2': model.rsquared,
            '| Coeff: const': model.params.get('const', None),
            '| Coeff: User per beer Past Ratings (avg)': model.params.get(independents[0], None),
            '| Coeff: Beer Past Ratings (avg)': model.params.get(independents[1], None),
            # maybe not now : 'Coeff: interaction': model.params.get('interaction', None),
            '| P-value: User per beer Past Ratings ': model.pvalues.get(independents[0], None),
            '| P-value: Beer Past Ratings': model.pvalues.get(independents[1], None),
            # 'P-value: interaction': model.pvalues.get('interaction', None),
            'F-statistic': model.fvalue,
            'F P-value': model.f_pvalue,
        }
        results.append(result)
    
    summary_df = pd.DataFrame(results).set_index('Style')
    
    return summary_df

In [19]:
independents = ['user_beer_style_past_ratings_average', 'beer_past_ratings_average']
dependent = 'rating'
style_column = 'beer_global_style'
summary_table = summary_per_style(df_ba_ratings, style_column, independents, dependent)

print(summary_table)

                       R^2  | Coeff: const  \
Style                                        
Bock              0.599008       -1.467333   
Brown Ale         0.551852       -1.589077   
Dark Ales         0.525978       -1.918984   
Dark Lager        0.570635       -1.623234   
Hybrid Beer       0.710977       -1.066024   
India Pale Ale    0.511582       -1.924551   
Low Alcohol Beer  0.824118       -0.358360   
Pale Ale          0.492103       -1.685172   
Pale Lager        0.669550       -0.942026   
Porter            0.534926       -1.752181   
Speciality Beer   0.547459       -1.459975   
Stout             0.494627       -1.908537   
Strong Ale        0.490553       -2.148193   
Wheat Beer        0.564594       -1.551311   
Wild/Sour Beer    0.523799       -1.848571   

                  | Coeff: User per beer Past Ratings (avg)  \
Style                                                         
Bock                                               0.811293   
Brown Ale                   

# Interpretation for 2 beers (IPA and Pale Lager)

- IPA : 
    - $R^2 = 0.377873$. This model explains $37.8\%$ of the variation in ratings for IPAs.
    - Coeff (intercept) = $-0.801683$. The intercept is negative, which represents the expected rating when all predictors are zero. (Not very interpretable)
    - Coeff `user_past_ratings_average` = $0.368157$

In [20]:
def summary_per_style_interact(df, style_column, independents, dependent):
    results = []
    
    grouped_data = df.groupby(style_column)
    
    for style, group in grouped_data:
        group = group.dropna(subset=independents + [dependent])
        
        if group.empty:
            continue
        X = group[independents].copy()
        X['interaction'] = X[independents[0]] * X[independents[1]]
        X = sm.add_constant(X)
        y = group[dependent]
        
        model = sm.OLS(y, X).fit()
        
        result = {
            'Style': style,
            'R^2': model.rsquared,
            '| Coeff: const': model.params.get('const', None),
            '| Coeff: User per beer Past Ratings (avg)': model.params.get(independents[0], None),
            '| Coeff: Beer Past Ratings (avg)': model.params.get(independents[1], None),
            '| Coeff: interaction': model.params.get('interaction', None),
            '| P-value: User per beer Past Ratings ': model.pvalues.get(independents[0], None),
            '| P-value: Beer Past Ratings': model.pvalues.get(independents[1], None),
            '| P-value: interaction': model.pvalues.get('interaction', None),
            'F-statistic': model.fvalue,
            'F P-value': model.f_pvalue,
        }
        results.append(result)
    
    summary_df = pd.DataFrame(results).set_index('Style')
    
    return summary_df

In [None]:
independents = ['user_beer_style_past_ratings_average', 'beer_past_ratings_average']
dependent = 'rating'
style_column = 'beer_global_style'
summary_table = summary_per_style_interact(df_ba_ratings, style_column, independents, dependent)

print(summary_table)

                       R^2  | Coeff: const  \
Style                                        
Bock              0.599008       -1.469347   
Brown Ale         0.552191       -2.745773   
Dark Ales         0.525999       -2.255472   
Dark Lager        0.571005       -2.538585   
Hybrid Beer       0.711062       -1.435255   
India Pale Ale    0.512304       -3.870092   
Low Alcohol Beer  0.824203       -0.481803   
Pale Ale          0.493288       -3.785159   
Pale Lager        0.677567       -2.660281   
Porter            0.535709       -3.609966   
Speciality Beer   0.548345       -2.479802   
Stout             0.495179       -3.728350   
Strong Ale        0.490960       -3.780283   
Wheat Beer        0.565984       -3.289709   
Wild/Sour Beer    0.523897       -1.158959   

                  | Coeff: User per beer Past Ratings (avg)  \
Style                                                         
Bock                                               0.811830   
Brown Ale                   