# Beer type preferences

In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf

## Functions

In [2]:
def get_LR(data, columns):
    data_to_process = data.copy() # copy original dataset
    
    # create formula
    columns=list(columns)
    formula = 'rating ~ ' + columns[0]
    for el in columns[1:-1]:
        formula += ' + ' + el
    
    # standardization and creation of the formula
    columns.append('rating')  # add rating for the linear regression and standardization
    data_to_process = data_to_process[columns].dropna().sample(frac=1)  # only keeps columns of interest and shuffle the samples
    data_to_process['rating'] = (data_to_process['rating'] - data_to_process['rating'].mean()) / data_to_process['rating'].std()
    
    # create the model and fit it to the dataset
    mod = smf.ols(formula=formula, data=data_to_process)
    np.random.seed(2)
    res = mod.fit()
    return res

In [51]:
def styles_rephrase(lst_styles):
    # replace some characters
    old_values = [' ', '-', '/', '(', ')', '&', ',', '____', '___', '__', 'style_']
    for val in old_values:
        lst_styles = lst_styles.str.replace(val,'_')
    
    # remove '_' if in first or last position of style name
    lst_styles = [x.rstrip('_') for x in lst_styles]
    lst_styles = [x.lstrip('_') for x in lst_styles]
    
    return lst_styles

## Loading and selecting data

In [3]:
df_ba = pd.read_pickle('../data/preprocessed/BeerAdvocate/transformed_ratings_ba.pkl')
df_rb = pd.read_pickle('../data/preprocessed/RateBeer/transformed_ratings_rb.pkl')
print(df_ba.columns)
print(df_rb.columns)

Index(['beer_name', 'beer_id', 'brewery_name', 'brewery_id', 'style', 'abv',
       'date', 'user_name', 'user_id', 'appearance', 'aroma', 'palate',
       'taste', 'overall', 'rating', 'text', 'review', 'user_location',
       'brewery_location', 'nbr_beers_by_brewery', 'zscore'],
      dtype='object')
Index(['beer_name', 'beer_id', 'brewery_name', 'brewery_id', 'style', 'abv',
       'date', 'user_name', 'user_id', 'appearance', 'aroma', 'palate',
       'taste', 'overall', 'rating', 'text', 'user_location',
       'brewery_location', 'nbr_beers_by_brewery', 'zscore'],
      dtype='object')


In [4]:
df_ba_style = df_ba[['beer_id', 'beer_name', 'brewery_id', 'brewery_name', 'brewery_location', 'user_location', 'style','rating']]
df_rb_style = df_rb[['beer_id', 'beer_name', 'brewery_id', 'brewery_name', 'brewery_location', 'user_location', 'style','rating']]
df_styles_reviews = pd.concat([df_ba_style, df_rb_style]).reset_index().drop('index', axis=1).dropna()
print(df_styles_reviews.head())
print(len(df_styles_reviews))



   beer_id                            beer_name  brewery_id  \
0    10187          Downtown Nut Brown (D.T.B.)         682   
1    77352                      American Muscle       16773   
2   188079                          Three Lions         458   
3    35625                Weihenstephaner Vitus         252   
4      412  Old Rasputin Russian Imperial Stout         112   

                              brewery_name           brewery_location  \
0                     Pearl Street Brewery   United States, Wisconsin   
1                             Local Option    United States, Illinois   
2                 Real Ale Brewing Company       United States, Texas   
3  Bayerische Staatsbrauerei Weihenstephan                    Germany   
4                  North Coast Brewing Co.  United States, California   

                 user_location                           style  rating  
0      United States, Illinois               English Brown Ale    3.75  
1      United States, Illinois  Amer

## Sorting by countries

In [5]:
user_loc_count = df_styles_reviews.groupby(['user_location'])['user_location'].count().sort_values(ascending=False)

# select 10 samples with the most reviews:
user_loc_count = user_loc_count[:10]
print(user_loc_count)

user_location
United States, California       94391
United States, Pennsylvania     85412
Canada                          77944
United States, Illinois         70746
United States, New York         62099
Denmark                         57723
United States, Texas            51296
England                         48077
United States, Massachusetts    45477
United States, Ohio             44612
Name: user_location, dtype: int64


## Finding favourite styles

In [49]:
df_styles_location_dumm = pd.get_dummies(df_styles_reviews, columns=['style'])
best_type_per_user_location = {}

# remove special characters in columns names
df_styles_location_dumm.columns = styles_rephrase(df_styles_location_dumm.columns)

# transform boolean values to int values for linear regression
for col in df_styles_location_dumm.drop(['beer_id', 'beer_name', 'brewery_name', 'brewery_location','user_location', 'brewery_id', 'rating'], axis=1).columns:
    df_styles_location_dumm[col] = df_styles_location_dumm[col] * 1

# perform linear regression on each location
columns= list(df_styles_location_dumm.drop(['beer_id', 'beer_name', 'user_location', 'brewery_name', 'brewery_location', 'brewery_id', 'rating'], axis=1).columns)
for idx, loc in enumerate(list(user_loc_count.keys())):
    print(loc)
    df_reviews = df_styles_location_dumm[df_styles_reviews['user_location'] == loc]
    res = get_LR(df_reviews, columns)
    print(res.params.sort_values(ascending=False)[0:3])
    best_type_per_user_location[loc] = res.params.sort_values(ascending=False)[0:5]

United States, California
Gueuze                            1.565430
American_Double_Imperial_Stout    1.503391
Russian_Imperial_Stout            1.402145
dtype: float64
United States, Pennsylvania
Gueuze                            1.626245
American_Double_Imperial_Stout    1.570449
Quadrupel_Quad                    1.505597
dtype: float64
Canada
Lambic_Unblended                  1.192381
American_Double_Imperial_Stout    1.154319
Flanders_Red_Ale                  1.137932
dtype: float64
United States, Illinois
Gueuze                            1.284335
American_Double_Imperial_Stout    1.276363
Russian_Imperial_Stout            1.133976
dtype: float64
United States, New York
Gueuze                            1.720417
American_Double_Imperial_Stout    1.684690
Lambic_Fruit                      1.632737
dtype: float64
Denmark
Lambic_Unblended                2.610578
Quadrupel_Quad                  2.468576
American_Double_Imperial_IPA    2.463413
dtype: float64
United States, Texas
Gueu

In [7]:
print(best_type_per_user_location)

{'United States, California': Gueuze                            1.565430
American_Double_Imperial_Stout    1.503391
Russian_Imperial_Stout            1.402145
American_Wild_Ale                 1.398732
English_Barleywine                1.373325
dtype: float64, 'United States, Pennsylvania': Gueuze                            1.626245
American_Double_Imperial_Stout    1.570449
Quadrupel_Quad_                   1.505597
Flanders_Red_Ale                  1.451138
American_Double_Imperial_IPA      1.435362
dtype: float64, 'Canada': Lambic_Unblended                  1.192381
American_Double_Imperial_Stout    1.154319
Flanders_Red_Ale                  1.137932
Gueuze                            1.120182
Quadrupel_Quad_                   1.112203
dtype: float64, 'United States, Illinois': Gueuze                            1.284335
American_Double_Imperial_Stout    1.276363
Russian_Imperial_Stout            1.133976
English_Barleywine                1.132260
American_Double_Imperial_IPA      1.1

## Finding local breweries indicator

In [8]:
frac_reviews_per_brewery = {}

# loop on every location
for idx, loc in enumerate(list(user_loc_count.keys())):
    print("\n=== " + loc + " ===")

    # total number of reviews per brewery
    num_reviews_per_brewery = df_styles_reviews[df_styles_reviews['brewery_location'] == loc] \
    .groupby(['brewery_name'])['brewery_name'] \
    .count() \
    .sort_values(ascending=False)

    # number of local reviews per brewery
    num_reviews_per_brewery_loc = df_styles_reviews[(df_styles_reviews['brewery_location'] == loc) & (df_styles_reviews['user_location'] == loc)] \
    .groupby(['brewery_name'])['brewery_name'] \
    .count() \
    .sort_values(ascending=False)
    
    # fraction of local rewviews per brewery (90% and up)
    frac_reviews_per_brewery[loc] = num_reviews_per_brewery_loc.divide(num_reviews_per_brewery).dropna().sort_values(ascending=False)
    frac_reviews_per_brewery[loc] = frac_reviews_per_brewery[loc][frac_reviews_per_brewery[loc].values > .9]


=== United States, California ===

=== United States, Pennsylvania ===

=== Canada ===

=== United States, Illinois ===

=== United States, New York ===

=== Denmark ===

=== United States, Texas ===

=== England ===

=== United States, Massachusetts ===

=== United States, Ohio ===


In [9]:
frac_reviews_per_brewery

{'United States, California': brewery_name
 Pacific Brewing Company          1.000000
 Old Hangtown Beer Works          1.000000
 Old Baldy Brewery                1.000000
 Camino Brewing                   1.000000
 O'Meara Bros. Brewing Company    1.000000
                                    ...   
 Strike Brewing Co.               0.937500
 Claimstake Brewing               0.923077
 Island Brewing Company           0.923077
 Old Orange Brewing Co.           0.916667
 Hoparazzi Brewing Co.            0.916667
 Name: brewery_name, Length: 204, dtype: float64,
 'United States, Pennsylvania': brewery_name
 Zuckfoltzfus Brewing Co.     1.000000
 Coal Tipple Brewery          1.000000
 Howling Henrys Brewery      1.000000
 Howling Henry's Brewery      1.000000
 Hop Hill Brewing Co.         1.000000
                                ...   
 River House Brewpub          0.933333
 Spoonwood Brewing Company    0.923077
 Doylestown Brewing Co        0.916667
 Kutztown Tavern              0.913043

## Looking for each country if local breweries is answering to local preferences

### Get the global average representation of each style

In [11]:
"""df_brewery_style = df_styles_reviews[['brewery_name','style']] \
    .groupby(['brewery_name','style']) \
    .count()"""
print(df_styles_reviews[df_styles_reviews['brewery_name'] == 'Founders Brewing Company'].groupby(['style','beer_name']))
df_brewery_style = df_styles_reviews.groupby(['brewery_name','style','beer_id']).size().reset_index().rename(columns={0:'count'})
df_brewery_style = df_brewery_style.drop(['beer_id','count'], axis=1)
#df_glob_avg = df_styles_reviews.groupby(['brewery_name','style']).size().sort_values(ascending=False)
print(df_brewery_style.groupby(['brewery_name','style']).size().sort_values(ascending=False))

list_breweries = df_brewery_style.brewery_name.unique()
list_styles = df_brewery_style['style'].unique()
print(len(list_breweries))
print(len(list_styles))

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000011B86B09700>
brewery_name                                                        style                 
Tired Hands Brewing Company                                         American IPA              192
                                                                    Saison / Farmhouse Ale    149
Crooked Stave                                                       Sour/Wild Ale              97
Crooked Stave Artisan Beer Project                                  American Wild Ale          93
de Garde Brewing                                                    American Wild Ale          83
                                                                                             ... 
Fourpure Brewing Co.                                                Sour/Wild Ale               1
                                                                    Smoked                      1
                                                 

In [57]:
global_frac_per_style = (df_brewery_style.groupby('style')['style'].count()/len(df_brewery_style)).sort_values(ascending=False)
global_frac_per_style.index = styles_rephrase(global_frac_per_style.index)
print(global_frac_per_style)

India_Pale_Ale_IPA      0.054235
American_IPA            0.042821
American_Pale_Ale       0.032643
Golden_Ale_Blond_Ale    0.027105
Imperial_Stout          0.022538
                          ...   
Saké_Ginjo              0.000008
Saké_Futsu_shu          0.000008
Saké_Taru               0.000004
Saké_Koshu              0.000004
Saké_Genshu             0.000004
Name: style, Length: 180, dtype: float64


### Compare for each location

In [65]:
print(type(best_type_per_user_location['United States, California']))

<class 'pandas.core.series.Series'>


In [76]:
# loop on every location
for idx, loc in enumerate(list(user_loc_count.keys())):
    breweries = frac_reviews_per_brewery[loc].keys()
    styles = best_type_per_user_location[loc].keys()
    frac={}
    for style in styles:
        frac[style] = 0
        for brewery in breweries:
            df_brewery = df_brewery_style[df_brewery_style['brewery_name'] == brewery]
            print(type(df_brewery))
            style = 'American Strong Ale'
            print(style)
            print(df_brewery)
            print(df_brewery[df_brewery['style'] == style].groupby('style')['style'].count()/len(df_brewery))
            frac[style] += 1
            break
        break
    break

<class 'pandas.core.frame.DataFrame'>
American Strong Ale
                   brewery_name                    style
163560  Pacific Brewing Company  American Pale Ale (APA)
163561  Pacific Brewing Company      American Strong Ale
163562  Pacific Brewing Company      American Strong Ale
163563  Pacific Brewing Company                Cream Ale
163564  Pacific Brewing Company         English Pale Ale
163565  Pacific Brewing Company     India Pale Ale (IPA)
style
American Strong Ale    0.333333
Name: style, dtype: float64


KeyError: 'American Strong Ale'

# OLD STUFF

The first part of the analysis is to find the main predictor for each country

In [14]:
from utils import *
import numpy as np
import statsmodels.api as sm

# REDUCED = None
REDUCED = 2e5

# Preprocessed ratings 
preprocessed_ratings_ba = load_data("pre_ba", REDUCED)
preprocessed_ratings_ba['same_country'] = (preprocessed_ratings_ba['user_location'] == preprocessed_ratings_ba['brewery_location']).astype(int)


In [15]:
preprocessed_ratings_ba = pd.get_dummies(preprocessed_ratings_ba,columns=['style', 'user_location', 'brewery_location'], dtype='int')

In [16]:
columns_of_interest = ['abv','same_country']
columns_of_interest += [col for col in preprocessed_ratings_ba.columns if ('style_' in col or 'location_' in col)] 

In [17]:
X = preprocessed_ratings_ba[columns_of_interest]  # Predictors
y = preprocessed_ratings_ba['zscore']             # Target variable
X = sm.add_constant(X)
model = sm.OLS(y, X).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                 zscore   R-squared:                       0.158
Model:                            OLS   Adj. R-squared:                  0.157
Method:                 Least Squares   F-statistic:                     144.0
Date:                Fri, 15 Dec 2023   Prob (F-statistic):               0.00
Time:                        13:47:40   Log-Likelihood:            -2.5299e+05
No. Observations:              200000   AIC:                         5.065e+05
Df Residuals:                  199738   BIC:                         5.092e+05
Df Model:                         261                                         
Covariance Type:            nonrobust                                         
                                                   coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------

In [None]:
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf


def get_LR(data, columns):
    data_to_process = data.copy() # copy original dataset
    
    # create formula
    columns=list(columns)
    formula = 'rating ~ ' + columns[0]
    for el in columns[1:-1]:
        formula += ' + ' + el
    
    # standardization and creation of the formula
    columns.append('rating')  # add rating for the linear regression and standardization
    data_to_process = data_to_process[columns].dropna().sample(frac=1)  # only keeps columns of interest and shuffle the samples
    data_to_process = (data_to_process - data_to_process.mean()) / data_to_process.std()
    
    # create the model and fit it to the dataset
    mod = smf.ols(formula=formula, data=data_to_process)
    np.random.seed(2)
    res = mod.fit()
    return res

# Merge the datasets using the "user_id" column
merged_data = pd.merge(ratings_ba, users_ba[['user_id', 'location']], on='user_id', how='inner')

# Create an empty DataFrame to store regression results
results = pd.DataFrame(columns=['country', 'main_predictor'])

# Define the columns of interest (predictors)
columns_of_interest = ["appearance", "aroma", "palate", "taste", "overall"]

# Loop through each country and perform linear regression
for country in merged_data['location'].unique():
    country_data = merged_data[merged_data['location'] == country]
    try:
        res=get_LR(country_data, columns_of_interest)
        main_predictor = res.params.idxmax()  # Get the main predictor with the highest coefficient
        results = pd.concat([results, pd.DataFrame({'country': [country], 'main_predictor': [main_predictor]})], ignore_index=True)
    except:
        pass



# Load a world shapefile for mapping (you may need to download a suitable shapefile)
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))

# Merge the world shapefile with the results DataFrame
world = world.merge(results, left_on='name', right_on='country', how='left')

# Plot the main predictor on a world map
fig, ax = plt.subplots(1, 1, figsize=(15, 10))
world.boundary.plot(ax=ax, linewidth=1)
world.plot(column='main_predictor', cmap='coolwarm', ax=ax, legend=True)
plt.title('Main Predictor by Country')
plt.show()
