# Are gluten-free beers worse than traditional beers?

One of the stereotypes about gluten-free beers is that they taste different from normal beers, and most often worse. To examine this, we can look at the difference in overall, palate, aroma and taste scores between gluten-free and normal beers. However, not all beer types are equally represented among gluten-free beers, and it is quite possible that this has a consequence on the users' ratings. To check that, we can compare the gluten-free dataset with a subset of the all-beers dataset that has the same beer-type composition as the gluten-free one.

In [2]:
#Importing packages, not to put in actual notebook
import os
import sys
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import unidecode as ud
from tqdm import tqdm
from collections import Counter
from datetime import datetime
%matplotlib inline

### Importing data

In [8]:
# Locate the datasets.
# Add your data folder into the 'code' folder and make sure it is listed in the .gitignore file.

CODE_DIR = os.getcwd()  # current working directory of the kernel
DATA_DIR = os.path.join(CODE_DIR, 'data')  # every dataset path below is built from this folder

print('your data directory:')
DATA_DIR

your data directory:


'c:\\Users\\fulci\\git\\ada_23\\ada-2023-project-adarescueteam\\code\\data'

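Emile's paths — run these cells only if your data folder follows this layout; otherwise use the alternative path cells further below.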

In [4]:
# Dataset locations, all relative to DATA_DIR.
# NOTE: a later cell assigns these same three names again with different values;
# whichever of the two cells runs last determines the paths actually used.

# set path for BeerAdvocate data
DATA_BeerAdvocate = os.path.join(DATA_DIR, "BeerAdvocate")

# set path for RateBeer data
DATA_RateBeer = os.path.join(DATA_DIR, "RateBeer")

# set path for MatchedBeer data
DATA_MatchedBeers = os.path.join(DATA_DIR, "matched_beer_data.tar")

In [11]:
# Load the preprocessed gluten-free ratings (stored directly in DATA_DIR)
# and the per-site beer tables (stored in each site's folder).
ba_gf_csv = os.path.join(DATA_DIR, 'ba_gf_ratings.csv')
rb_gf_csv = os.path.join(DATA_DIR, 'rb_gf_ratings.csv')
ba_beers_csv = os.path.join(DATA_BeerAdvocate, 'beers.csv')
rb_beers_csv = os.path.join(DATA_RateBeer, 'beers.csv')

BA_gf_ratings = pd.read_csv(ba_gf_csv)
RB_gf_ratings = pd.read_csv(rb_gf_csv)
BA_beers = pd.read_csv(ba_beers_csv)
RB_beers = pd.read_csv(rb_beers_csv)


In [None]:
# Load the full ratings tables of both sites from the pickle files in DATA_DIR.
# NOTE: unpickling can execute arbitrary code, so only load pickle files you created yourself.
rb_pickle_path = os.path.join(DATA_DIR, "RB_ratings.pkl")
ba_pickle_path = os.path.join(DATA_DIR, "BA_ratings.pkl")

with open(rb_pickle_path, 'rb') as rb_file:
    RB_ratings = pickle.load(rb_file)

with open(ba_pickle_path, 'rb') as ba_file:
    BA_ratings = pickle.load(ba_file)

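Ale's paths — alternative to the path cells above; these cells reassign the same variables, so run only the set that matches your data layout.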

In [13]:
# Dataset locations, all relative to DATA_DIR (alternative layout).
# NOTE: these assignments replace any values previously given to the same three names.
# NOTE(review): the BeerAdvocate/RateBeer names end in ".tar" but are joined with file names
# in the cells below as if they were folders — presumably folders named that way; confirm.

# set path for BeerAdvocate data
DATA_BeerAdvocate = os.path.join(DATA_DIR, "BeerAdvocate.tar")

# set path for RateBeer data
DATA_RateBeer = os.path.join(DATA_DIR, "RateBeer.tar")

# set path for MatchedBeer data
DATA_MatchedBeers = os.path.join(DATA_DIR, "matched_beer_data.tar")

In [15]:
# Load the preprocessed gluten-free ratings and the beer tables;
# in this layout every file lives inside its site's folder.
ba_gf_csv = os.path.join(DATA_BeerAdvocate, 'ba_gf_ratings.csv')
rb_gf_csv = os.path.join(DATA_RateBeer, 'rb_gf_ratings.csv')
ba_beers_csv = os.path.join(DATA_BeerAdvocate, 'beers.csv')
rb_beers_csv = os.path.join(DATA_RateBeer, 'beers.csv')

BA_gf_ratings = pd.read_csv(ba_gf_csv)
RB_gf_ratings = pd.read_csv(rb_gf_csv)
BA_beers = pd.read_csv(ba_beers_csv)
RB_beers = pd.read_csv(rb_beers_csv)

In [14]:
# Load the full ratings tables of both sites from the pickle files in each site's folder.
# NOTE: unpickling can execute arbitrary code, so only load pickle files you created yourself.
rb_pickle_path = os.path.join(DATA_RateBeer, "RB_ratings.pkl")
ba_pickle_path = os.path.join(DATA_BeerAdvocate, "BA_ratings.pkl")

with open(rb_pickle_path, 'rb') as rb_file:
    RB_ratings = pickle.load(rb_file)

with open(ba_pickle_path, 'rb') as ba_file:
    BA_ratings = pickle.load(ba_file)

In [16]:
# Create the common gluten-free ratings dataframe (RateBeer rows first, then BeerAdvocate).
# The "review" column is removed from the BeerAdvocate table before stacking.
# errors="ignore" keeps this cell re-runnable: once the column is gone, a second
# run is a no-op instead of raising KeyError (as the in-place drop did).
BA_gf_ratings = BA_gf_ratings.drop(columns="review", errors="ignore")

# ignore_index=True gives the stacked table a fresh 0..n-1 index; without it the row
# labels of both inputs are kept and label-based lookups return duplicate rows.
gf_ratings = pd.concat([RB_gf_ratings, BA_gf_ratings], ignore_index=True)


### Cleaning the data further for this part of the analysis

-> Keep only the beer types present among gluten-free beers (this should normally already have been done in the other analyses)


In [17]:
# Keep only beer-types present in gluten-free beers

# Distinct names of the beers that received a gluten-free rating.
# The names in gf_ratings were observed to start with a stray space; strip leading
# whitespace instead of blindly cutting the first character, so that names without
# the stray space stay intact and missing names do not crash the cell.
gf_beers = gf_ratings['beer_name'].dropna().str.lstrip().unique()

# Styles of the catalogue beers whose name matches a gluten-free beer, per site.
# NOTE(review): matching is done on the beer name only, so same-named beers from
# different breweries are presumably all included — confirm this is acceptable.
BA_gf_beertypes = BA_beers.loc[BA_beers['beer_name'].isin(gf_beers), 'style'].unique()
RB_gf_beertypes = RB_beers.loc[RB_beers['beer_name'].isin(gf_beers), 'style'].unique()

In [18]:
# Merge the styles found on both sites and keep each style once (first occurrence order).
all_site_styles = pd.Series(np.concatenate((BA_gf_beertypes, RB_gf_beertypes)), name='style')
gf_beertypes = all_site_styles.unique()

In [133]:
# TODO: delete the ratings whose style is not among the gluten-free styles.
# This first needs a usable "style" column in BA_ratings:
# BA_ratings[~BA_ratings['style'].isin(gf_beertypes)]


In [147]:
# Filtering BA_ratings on its own 'style' column does not find any of the gf_beertypes:
# as the unique() check further below shows, that column does not hold single style names.
# Since every style in BA_beers is associated with a beer_id, the comparison is done
# through beer_id instead.
# TODO: repair the 'style' column where BA_ratings is built.

# Beers of the BeerAdvocate catalogue whose style is represented among gluten-free beers
# (169793 rows when this was last run).
ba_in_gf_styles = BA_beers['style'].isin(gf_beertypes)
BA_beerID_style = BA_beers.loc[ba_in_gf_styles, ['beer_id', 'style']]

# Ratings of those beers only.
BA_ratings_style = BA_ratings[BA_ratings['beer_id'].isin(BA_beerID_style['beer_id'])]

In [148]:
# Display the BeerAdvocate ratings kept by the style filter.
BA_ratings_style

Unnamed: 0,beer_name,beer_id,brewery_name,brewery_id,style,abv,date,user_name,user_id,appearance,aroma,palate,taste,overall,rating,text,review
0,Régab,142544,Societe des Brasseries du Gabon (SOBRAGA),37262,0 0 0 Euro Pale ...,4.5,2015-08-20 10:00:00,nmann08,nmann08.184925,3.25,2.75,3.25,2.75,3.0,2.88,"From a bottle, pours a piss yellow color with...",True
1,Barelegs Brew,19590,Strangford Lough Brewing Company Ltd,10093,0 0 0 Euro Pale ...,4.5,2009-02-20 11:00:00,StJamesGate,stjamesgate.163714,3.0,3.5,3.5,4.0,3.5,3.67,Pours pale copper with a thin head that quick...,True
2,Barelegs Brew,19590,Strangford Lough Brewing Company Ltd,10093,0 0 0 Euro Pale ...,4.5,2006-03-13 11:00:00,mdagnew,mdagnew.19527,4.0,3.5,3.5,4.0,3.5,3.73,"500ml Bottle bought from The Vintage, Antrim....",True
3,Barelegs Brew,19590,Strangford Lough Brewing Company Ltd,10093,0 0 0 Euro Pale ...,4.5,2004-12-01 11:00:00,helloloser12345,helloloser12345.10867,4.0,3.5,4.0,4.0,4.5,3.98,Serving: 500ml brown bottlePour: Good head wi...,True
4,Barelegs Brew,19590,Strangford Lough Brewing Company Ltd,10093,0 0 0 Euro Pale ...,4.5,2004-08-30 10:00:00,cypressbob,cypressbob.3708,4.0,4.0,4.0,4.0,4.0,4.00,"500ml bottlePours with a light, slightly hazy...",True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8393027,Kölsch Ale,19139,Summit Station Restaurant & Brewery,885,0 0 0 Euro Pale ...,5.0,2006-02-27 11:00:00,jaluria,jaluria.31362,3.5,3.5,3.5,4.0,4.0,3.80,I'll start off reviewing this beer with the f...,True
8393028,Kölsch Ale,19139,Summit Station Restaurant & Brewery,885,0 0 0 Euro Pale ...,5.0,2006-02-19 11:00:00,Dithyramb,dithyramb.4413,3.0,3.0,3.0,2.5,3.0,2.80,Served on draft at the pub. Poured a clear ye...,True
8393029,Kölsch Ale,19139,Summit Station Restaurant & Brewery,885,0 0 0 Euro Pale ...,5.0,2004-08-10 10:00:00,ManekiNeko,manekineko.2654,3.0,1.5,1.0,1.5,1.5,1.54,Served on tap at the brewpub in a pint glass....,True
8393030,Nut Brown Ale,19140,Summit Station Restaurant & Brewery,885,0 0 0 Euro Pale ...,5.3,2012-07-08 10:00:00,Redsox1539,redsox1539.683632,,,,,,3.50,,False


In [163]:
# There seems to be a problem with recognizing the style in the full dataset:
# list the distinct values of the 'style' column together with their count.
ba_distinct_styles = BA_ratings_style['style'].unique()
ba_distinct_styles, ba_distinct_styles.shape
# So just keep going with the beer_id unless a solution is found.

(array(['0          0          0             Euro Pale Lager\\n1   ...\n1          0          0             Euro Pale Lager\\n1   ...\n2          0          0             Euro Pale Lager\\n1   ...\n3          0          0             Euro Pale Lager\\n1   ...\n4          0          0             Euro Pale Lager\\n1   ...\n                                 ...                        \n8393027    0          0             Euro Pale Lager\\n1   ...\n8393028    0          0             Euro Pale Lager\\n1   ...\n8393029    0          0             Euro Pale Lager\\n1   ...\n8393030    0          0             Euro Pale Lager\\n1   ...\n8393031    0          0             Euro Pale Lager\\n1   ...\nName: style, Length: 8393032, dtype: object'],
       dtype=object),
 (1,))

In [169]:
# Compare the number of ratings in the original BeerAdvocate dataset with the number
# left after keeping only the beer styles represented in the gluten-free dataset.
BA_ratings.shape, BA_ratings_style.shape

((8393032, 17), (4810208, 17))

In [167]:
# Same filter for RateBeer: keep the ratings of beers whose style is represented among
# gluten-free beers, going through beer_id.

# Beers of the RateBeer catalogue whose style is one of the gluten-free styles.
rb_in_gf_styles = RB_beers['style'].isin(gf_beertypes)
RB_beerID_style = RB_beers.loc[rb_in_gf_styles, ['beer_id', 'style']]

# Ratings of those beers only.
RB_ratings_style = RB_ratings[RB_ratings['beer_id'].isin(RB_beerID_style['beer_id'])]

In [166]:
# Display the RateBeer ratings kept by the style filter.
RB_ratings_style

Unnamed: 0,beer_name,beer_id,brewery_name,brewery_id,style,abv,date,user_name,user_id,appearance,aroma,palate,taste,overall,rating,text
0,33 Export (Gabon),410549,Sobraga,3198,Pale Lager,5.0,2016-04-26 10:00:00,Manslow,175852,2,4,2,4,8,2.0,"Puszka 0,33l dzięki Christoph . Kolor jasnozł..."
1,Castel Beer (Gabon),105273,Sobraga,3198,Pale Lager,5.2,2017-02-17 11:00:00,MAGICuenca91,442761,2,3,2,4,8,1.9,Cerveza pale lager gabonesa. Más floja que la...
2,Castel Beer (Gabon),105273,Sobraga,3198,Pale Lager,5.2,2016-06-24 10:00:00,Sibarh,288889,3,3,2,3,5,1.6,"Kolor- złoty, klarowny. Piana - drobna, średn..."
3,Castel Beer (Gabon),105273,Sobraga,3198,Pale Lager,5.2,2016-01-01 11:00:00,fombe89,250510,4,3,1,2,5,1.5,"Botella, de Gabón regalo familiar.31/01/2015C..."
4,Castel Beer (Gabon),105273,Sobraga,3198,Pale Lager,5.2,2015-10-23 10:00:00,kevnic2008,122778,2,4,2,4,7,1.9,Many thanks for this beer to Erzengel. Pours ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7122069,Svejk Blonde,220897,Svejk Beer Garden,17155,Pale Lager,,2014-09-18 10:00:00,Travlr,83882,3,6,2,5,10,2.6,"Draft at the source. Clear golden color, fluf..."
7122070,Svejk Blonde,220897,Svejk Beer Garden,17155,Pale Lager,,2013-12-01 11:00:00,TBone,10233,2,5,2,6,10,2.5,"Tap @brewpub, TiraneClear golden color, good ..."
7122071,Svejk Dark,220898,Svejk Beer Garden,17155,Dunkel/Tmavý,,2014-11-04 11:00:00,Rob_D_UK,257161,3,4,2,5,9,2.3,In their beer garden after a walking tour aro...
7122072,Svejk Dark,220898,Svejk Beer Garden,17155,Dunkel/Tmavý,,2014-09-16 10:00:00,Travlr,83882,3,5,1,4,6,1.9,"Draft at the source. Hazy maroon color, tan h..."


In [168]:
# Compare the number of ratings in the original RateBeer dataset with the number
# left after keeping only the beer styles represented in the gluten-free dataset.
RB_ratings.shape, RB_ratings_style.shape

((7122074, 16), (4561726, 16))