# Make a single dataset

The goal of this notebook is to combine both datasets in a single one with matched beers.

## Breweries data

In [None]:
import os
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import copy
import ast

dataset_path = Path(os.getcwd())

ADVOCATE = "BeerAdvocate"
RATEBEER = "RateBeer"
MATCHED = "Matched"
FULL = "Full"

ADVOCATE_PATH = os.path.join(dataset_path,ADVOCATE)
RATEBEER_PATH = os.path.join(dataset_path,RATEBEER)
MATCHED_PATH = os.path.join(dataset_path,MATCHED)
FULL_PATH = os.path.join(dataset_path,FULL)

In [2]:
#Open this one already as it may become usefull for reweries already
beers_matched = pd.read_csv(os.path.join(MATCHED_PATH,'beers.csv'),header=1)

In [3]:
breweries_advocate = pd.read_csv(os.path.join(ADVOCATE_PATH,'breweries.csv'))
breweries_matched = pd.read_csv(os.path.join(MATCHED_PATH,'breweries.csv'),header=1)
breweries_ratebeer = pd.read_csv(os.path.join(RATEBEER_PATH,'breweries.csv'))

print("Length of the three datasets:\n-advocate:",len(breweries_advocate),"\n-matched:", len(breweries_matched),"\n-ratebeer:", len(breweries_ratebeer))

#Already delete the breweries presents in matched
breweries_ratebeer_solo = breweries_ratebeer[~breweries_ratebeer.id.isin(breweries_matched['id.1'])]
breweries_advocate_solo = breweries_advocate[~breweries_advocate.id.isin(breweries_matched['id'])]

Length of the three datasets:
-advocate: 16758 
-matched: 8281 
-ratebeer: 24189


In [4]:
breweries_duplicates_ratebeer = breweries_matched[breweries_matched['id.1'].duplicated(keep=False)]
breweries__not_duplicated_ratebeer = breweries_matched[~breweries_matched['id.1'].duplicated(keep=False)]#For later
breweries_duplicates_advocate = breweries_matched[breweries_matched['id'].duplicated(keep=False)]

print("Number of duplicated/tripled Ratbeer breweries",len(breweries_duplicates_ratebeer))
print("Number of duplicated/tripled Advocate breweries",len(breweries_duplicates_advocate))

Number of duplicated/tripled Ratbeer breweries 91
Number of duplicated/tripled Advocate breweries 0


In [5]:
#We are going to build a new dataset instead of playing with the dataset that contains every information for duplicates.

unique_ratebeer_brewery_id = breweries_duplicates_ratebeer['id.1'].unique()
new_matched_brewery_duplicate = pd.DataFrame(columns=['location', 'name', 'nbr_beers','old_advocate_id','old_ratebeer_id'])

for id in unique_ratebeer_brewery_id:
    # Define data for each row
    location = breweries_duplicates_ratebeer[breweries_duplicates_ratebeer['id.1']==id]['location.1'].iloc[0]#Always take the first as it is the same for the two/three of them
    name = breweries_duplicates_ratebeer[breweries_duplicates_ratebeer['id.1']==id]['name.1'].iloc[0]#Always take the first as it is the same for the two/three of them
    old_advocate_id = breweries_duplicates_ratebeer[breweries_duplicates_ratebeer['id.1']==id]['id']
    list_old_advocate_id = []
    for id_advocate in (old_advocate_id):
        list_old_advocate_id.append(id_advocate)
    old_ratebeer_id = breweries_duplicates_ratebeer[breweries_duplicates_ratebeer['id.1']==id]['id.1'].iloc[0]

    total_beers_advocate = breweries_duplicates_ratebeer[breweries_duplicates_ratebeer['id.1']==id]['nbr_beers'].sum()
    total_beers_ratebeer = breweries_duplicates_ratebeer[breweries_duplicates_ratebeer['id.1']==id]['nbr_beers.1'].iloc[0] #don't want to double the number of beers for the one duplicated

    nbr_beers = total_beers_ratebeer + total_beers_advocate

    # Append the new row to the DataFrame
    new_row = pd.DataFrame({'location': location, 'name': name, 'nbr_beers': nbr_beers,'old_advocate_id':[list_old_advocate_id],'old_ratebeer_id':old_ratebeer_id})
    new_matched_brewery_duplicate = pd.concat([new_matched_brewery_duplicate, new_row], ignore_index=True)

new_matched_brewery_duplicate.head(3)

Unnamed: 0,location,name,nbr_beers,old_advocate_id,old_ratebeer_id
0,England,Seven Bro7hers,11,"[37180, 45243]",20891
1,England,Dartmoor,19,"[25939, 22832]",3480
2,China,Great Leap Brewing,77,"[24935, 32111]",12325


In [6]:
new_matched_brewery_non_duplicate = copy.deepcopy(breweries__not_duplicated_ratebeer)
new_matched_brewery_non_duplicate['nbr_beers'] = new_matched_brewery_non_duplicate[['nbr_beers', 'nbr_beers.1']].sum(axis=1)
new_matched_brewery_non_duplicate = new_matched_brewery_non_duplicate.drop(columns=['location','name','diff', 'sim', 'nbr_beers.1'])
new_matched_brewery_non_duplicate = new_matched_brewery_non_duplicate.rename(columns={'location.1':'location','name.1':'name','id':'old_advocate_id', 'id.1':'old_ratebeer_id',})
new_matched_brewery_non_duplicate = new_matched_brewery_non_duplicate[['location', 'name', 'nbr_beers', 'old_advocate_id', 'old_ratebeer_id']]

display(new_matched_brewery_non_duplicate.head(3))

Unnamed: 0,location,name,nbr_beers,old_advocate_id,old_ratebeer_id
0,Northern Ireland,Strangford Lough,10,10093,4959
1,Northern Ireland,Sheelin,6,32848,17616
2,Northern Ireland,Walled City,9,40360,24866


In [7]:
#Concat the two datasets together
new_matched_brewery = pd.concat([new_matched_brewery_non_duplicate, new_matched_brewery_duplicate], ignore_index=True)
new_matched_brewery.sample(3)

Unnamed: 0,location,name,nbr_beers,old_advocate_id,old_ratebeer_id
2045,Germany,Brauhaus Südstern,41,14617,7278
1446,Italy,Birrificio della Ghironda,6,29120,17441
2054,Germany,Schlossbrauhaus Schwangau,10,38017,13817


### Check the previous split and concatenation operations

In [8]:
print(len(new_matched_brewery), len(new_matched_brewery_non_duplicate), len(new_matched_brewery_duplicate))
print(len(new_matched_brewery.old_ratebeer_id.unique()))

8235 8190 45
8235


### Number of beers

Now we need to count the number of beers that there really is. We assume that a beer can only be matched if its brewery is matched too.

In [9]:
new_matched_brewery_good_amount_rating = copy.deepcopy(new_matched_brewery)

# Count occurrences of each 'old_ratebeer_id' in 'beers_matched'
ratebeer_id_counts = beers_matched['brewery_id.1'].value_counts()

# Decrement 'nbr_beers' by the counts of each 'old_ratebeer_id' to remove the amount of beers that are already in the dataset
new_matched_brewery_good_amount_rating['nbr_beers'] -= new_matched_brewery_good_amount_rating['old_ratebeer_id'].map(ratebeer_id_counts).fillna(0).astype(int)

display(new_matched_brewery_good_amount_rating.head(3))

Unnamed: 0,location,name,nbr_beers,old_advocate_id,old_ratebeer_id
0,Northern Ireland,Strangford Lough,6,10093,4959
1,Northern Ireland,Sheelin,5,32848,17616
2,Northern Ireland,Walled City,8,40360,24866


In [10]:
breweries_ratebeer_solo = breweries_ratebeer_solo.rename(columns={'id': 'old_ratebeer_id'})
breweries_ratebeer_solo['old_advocate_id'] = np.nan
display(breweries_ratebeer_solo.sample(3))

breweries_advocate_solo = breweries_advocate_solo.rename(columns={'id': 'old_advocate_id'})
breweries_advocate_solo['old_ratebeer_id'] = np.nan
display(breweries_advocate_solo.sample(3))

full_breweries = pd.concat([new_matched_brewery_good_amount_rating,breweries_advocate_solo, breweries_ratebeer_solo], ignore_index=True)
full_breweries['id'] = range(1, len(full_breweries) + 1)
display(full_breweries.sample(10))

Unnamed: 0,old_ratebeer_id,location,name,nbr_beers,old_advocate_id
3955,7887,Denmark,Viby Bryghus,6,
2408,23331,Hungary,Hopkins,1,
16301,388,"United States, Pennsylvania",Ortliebs Brewery & Grille at Sunnybrook Ballroom,16,


Unnamed: 0,old_advocate_id,location,name,nbr_beers,old_ratebeer_id
5158,47166,Russia,Black Cat Brewery,1,
5150,8533,Russia,KV - SibPivKompania,0,
12900,33530,"United States, Ohio",Wolf's Ridge Brewing,122,


Unnamed: 0,location,name,nbr_beers,old_advocate_id,old_ratebeer_id,id
32353,France,Cévennes,6,,7347,32354
20102,Germany,Brauerei Hotel Hirsch,9,,12039,20103
21748,Chile,Cervecería Weisser Ltda.,4,,16102,21749
7520,Philippines,Palaweño Brewery,11,39985.0,19115,7521
28828,"United States, Idaho",Bi-Plane Brewing Company,17,,12946,28829
32461,Estonia,UJH Mans Brewery,3,,30957,32462
21651,Israel,Beera Ralf,1,,29017,21652
26498,Norway,Raulandsakademiet,2,,23945,26499
4825,"United States, New York",Suarez Family Brewery,51,41980.0,27672,4826
31560,Belgium,Brouwerij De Boeretang,2,,30465,31561


In [11]:
display(full_breweries.head(2))
print(type(full_breweries['old_advocate_id'].iloc[0]))
print(full_breweries['old_advocate_id'].iloc[16712])


Unnamed: 0,location,name,nbr_beers,old_advocate_id,old_ratebeer_id,id
0,Northern Ireland,Strangford Lough,6,10093,4959,1
1,Northern Ireland,Sheelin,5,32848,17616,2


<class 'int'>
nan


In [12]:
print(len(full_breweries))
print(len(breweries_matched),len(breweries_advocate),len(breweries_ratebeer))
print(-len(breweries_matched)+len(breweries_advocate)+len(breweries_ratebeer))

32666
8281 16758 24189
32666


In [13]:
full_breweries.to_csv(os.path.join(FULL_PATH,'breweries.csv'),index=False)

test_breweri = pd.read_csv(os.path.join(FULL_PATH,'breweries.csv'))
print(len(test_breweri))
del test_breweri

32666


## Users data

We prefer to consider the users approx rather than the users of the matched dataset. We saw that the users do have duplicates (unlike the normal file). We decided to take the different users in the approx file as a single user even for a lower similitude value (no sim value is below 0.8006407690254358). We consider the effect of this to be negligeable as it is a small percent of data. (like in the brewery file).

In [14]:
users_matched = pd.read_csv(os.path.join(MATCHED_PATH,'users_approx.csv'),header=1)
users_advocate = pd.read_csv(os.path.join(ADVOCATE_PATH,'users.csv'))
users_ratebeer = pd.read_csv(os.path.join(RATEBEER_PATH,'users.csv'))

ratings_matched = pd.read_csv(os.path.join(MATCHED_PATH,'ratings.csv'),header=1)#needed 


print("Length of the three datasets:\n-advocate:",len(users_advocate),"\n-matched:", len(users_matched),"\n-ratebeer:", len(users_ratebeer))

#Already delete the breweries presents in matched
users_ratebeer_solo = users_ratebeer[~users_ratebeer.user_name.isin(users_matched['user_name.1'])]
users_advocate_solo = users_advocate[~users_advocate.user_name.isin(users_matched['user_name'])]

print("New length of:\n-advocate:",len(users_advocate_solo),"\n-ratebeer:", len(users_ratebeer_solo))

display(users_matched.head(2))

Length of the three datasets:
-advocate: 153704 
-matched: 3341 
-ratebeer: 70174
New length of:
-advocate: 150388 
-ratebeer: 66833


Unnamed: 0,joined,location,nbr_ratings,nbr_reviews,user_id,user_name,user_name_lower,joined.1,location.1,nbr_ratings.1,user_id.1,user_name.1,user_name_lower.1,sim
0,1483009000.0,Spain,3,0,magicuenca.1185749,MAGICuenca,magicuenca,1484046000.0,Spain,89,442761,MAGICuenca91,magicuenca91,0.904534
1,1220868000.0,Germany,6,6,erzengel.248045,Erzengel,erzengel,1224324000.0,Germany,8781,83106,Erzengel,erzengel,1.0


In [15]:
users_duplicates_ratebeer = users_matched[users_matched['user_name.1'].duplicated(keep=False)]
users_duplicates_advocate = users_matched[users_matched['user_name'].duplicated(keep=False)]
users__not_duplicated_advocate = users_matched[~users_matched['user_name'].duplicated(keep=False)]#For later

print("Number of duplicated/tripled Ratbeer users",len(users_duplicates_ratebeer))
print("Number of duplicated/tripled Advocate users",len(users_duplicates_advocate))

Number of duplicated/tripled Ratbeer users 0
Number of duplicated/tripled Advocate users 47


Both advocate and ratebeer dataset have as columns ['user_id', 'location', 'user_name','user_name_lower', 'joined','nbr_ratings']. Advocate has nbr_reviews in addition. Matched has those columns with .1 referring to ratebeer. It also has a column 'sim' that we will drop. The end format we want to have: [general_id, old_user_id_advocate, old_user__id_ratebeer, location, user_name_lower, joined_advocate, joined_ratebeer and nbr_ratings]. We think that the other columns will not be usefull for our analysis. The next cell shows that the user_name_lower are equal between the two datasets. For the location we will in opposition to the brewery give priority to the advocate dataset, as it is this time the one having one correspondence to multiple ratebeer users.

In [16]:
display(users_duplicates_advocate.sample(3))
users_duplicates_advocate[users_duplicates_advocate['user_id']=='lonestar.677281'].head(2)

Unnamed: 0,joined,location,nbr_ratings,nbr_reviews,user_id,user_name,user_name_lower,joined.1,location.1,nbr_ratings.1,user_id.1,user_name.1,user_name_lower.1,sim
10,1245751000.0,England,32,32,leighton.343447,leighton,leighton,1209204000.0,England,19568,74136,Leighton,leighton,1.0
2469,1121422000.0,Canada,10,10,beers.29246,Beers,beers,1359457000.0,Canada,1,241070,beerseh,beerseh,0.816497
2797,1104318000.0,"United States, Ohio",1,1,beerbeerbeerbeer.12454,beerbeerbeerbeer,beerbeerbeerbeer,1107256000.0,"United States, Ohio",5,19505,Beerboy,beerboy,0.811107


Unnamed: 0,joined,location,nbr_ratings,nbr_reviews,user_id,user_name,user_name_lower,joined.1,location.1,nbr_ratings.1,user_id.1,user_name.1,user_name_lower.1,sim
562,1337422000.0,"United States, Texas",1,1,lonestar.677281,Lonestar,lonestar,1081332000.0,"United States, Texas",13,11446,oneStar,onestar,0.822609
2059,1337422000.0,"United States, Texas",1,1,lonestar.677281,Lonestar,lonestar,1162984000.0,"United States, Texas",6,44744,LONESTAR,lonestar,1.0


In [17]:
#We are going to build a new dataset instead of playing with the dataset that contains every information for duplicates.

unique_advocate_user_user_name = users_duplicates_advocate['user_name'].unique()
new_matched_user_duplicate = pd.DataFrame(columns=['location', 'joined_advocate', 'old_user_id_advocate','joined_ratebeer','old_user_id_ratebeer','user_name','nbr_ratings_total'])

for user_name in unique_advocate_user_user_name:  # Replace with your actual loop condition
    # Define data for each row
    location = users_duplicates_advocate[users_duplicates_advocate['user_name']==user_name]['location'].iloc[0]#Always take the first as it is the same for the two/three of them
    joined_advocate = users_duplicates_advocate[users_duplicates_advocate['user_name']==user_name]['joined'].iloc[0]
    joined_ratebeer = users_duplicates_advocate[users_duplicates_advocate['user_name']==user_name]['joined.1'].min() #take min value of joined time (we want the first time they joined)

    old_user_id_advocate = users_duplicates_advocate[users_duplicates_advocate['user_name']==user_name]['user_id'].iloc[0]
    old_user_id_ratebeer = users_duplicates_advocate[users_duplicates_advocate['user_name']==user_name]['user_id.1']
    name = users_duplicates_advocate[users_duplicates_advocate['user_name']==user_name]['user_name'].iloc[0]

    list_old_ratebeer_id = []
    for id_ratbeer in (old_user_id_ratebeer):
        list_old_ratebeer_id.append(id_ratbeer)

    total_ratings_advocate = users_duplicates_advocate[users_duplicates_advocate['user_name']==user_name]['nbr_ratings'].iloc[0]
    total_ratings_ratebeer = users_duplicates_advocate[users_duplicates_advocate['user_name']==user_name]['nbr_ratings.1'].sum()

    nbr_ratings_total = total_ratings_advocate + total_ratings_ratebeer

    # Append the new row to the DataFrame
    new_row = pd.DataFrame({'location': location, 'user_name': name,'old_user_id_ratebeer':[list_old_ratebeer_id],'old_user_id_advocate':old_user_id_advocate,'joined_advocate':joined_advocate,'joined_ratebeer':joined_ratebeer,'nbr_ratings_total':nbr_ratings_total})
    new_matched_user_duplicate = pd.concat([new_matched_user_duplicate, new_row], ignore_index=True)

new_matched_user_duplicate.sample(10)


  new_matched_user_duplicate = pd.concat([new_matched_user_duplicate, new_row], ignore_index=True)


Unnamed: 0,location,joined_advocate,old_user_id_advocate,joined_ratebeer,old_user_id_ratebeer,user_name,nbr_ratings_total
14,Canada,1121422000.0,beers.29246,1185444000.0,"[58154, 241070, 130784]",Beers,13
6,"United States, New York",1159956000.0,drinkinbuddy.101255,1052042000.0,"[7402, 7736]",DrinkinBuddy,175
1,England,1245751000.0,leighton.343447,1209204000.0,"[74136, 257478]",leighton,19601
21,"United States, Ohio",1104318000.0,beerbeerbeerbeer.12454,1107256000.0,"[19505, 411715]",beerbeerbeerbeer,8
8,"United States, Massachusetts",1336298000.0,maximusmaximus.675527,1108897000.0,"[20099, 356497]",Maximusmaximus,112
17,Norway,1421492000.0,ketil.928679,1292584000.0,"[119220, 256180]",Ketil,8
13,"United States, Pennsylvania",1200568000.0,beerfinder.187713,1004177000.0,"[54721, 2078]",beerfinder,244
5,"United States, California",1295521000.0,chadski.555343,1381054000.0,"[282710, 397860]",chadski,406
16,Norway,1334225000.0,morten.672301,1140606000.0,"[137013, 33840, 137302]",Morten,16
7,Canada,1156068000.0,nighthawk.93517,1150452000.0,"[124623, 38708]",nighthawk,212


In [18]:
users_matched_not_duplicated = users__not_duplicated_advocate.drop(columns=['nbr_reviews','user_name.1','user_name_lower','location','user_name_lower.1','sim'])
users_matched_not_duplicated = users_matched_not_duplicated.rename(columns={'joined': 'joined_advocate','joined.1': 'joined_ratebeer','user_id': 'old_user_id_advocate','user_id.1': 'old_user_id_ratebeer','location.1':'location'})
users_matched_not_duplicated["nbr_ratings_total"] = users_matched_not_duplicated.nbr_ratings+users_matched_not_duplicated["nbr_ratings.1"]
users_matched_not_duplicated = users_matched_not_duplicated.drop(columns=['nbr_ratings','nbr_ratings.1'])
users_matched_not_duplicated.sample(5)

Unnamed: 0,joined_advocate,old_user_id_advocate,user_name,joined_ratebeer,location,old_user_id_ratebeer,nbr_ratings_total
514,1178878000.0,mnstorm99.138083,mnstorm99,1171537000.0,"United States, Minnesota",49904,196
1671,1263208000.0,mdfb79.414386,mdfb79,1265627000.0,"United States, New York",101381,4515
1654,1281434000.0,bierinjisp.489918,bierinjisp,1278929000.0,Netherlands,109724,3
3302,1269601000.0,jtsingletary.442006,JTSingletary,1317118000.0,"United States, Florida",136569,5
1777,1224583000.0,sleuthdog.259536,sleuthdog,1299064000.0,"United States, Illinois",124539,382


In [19]:
#Concat the two datasets together
new_matched_user = pd.concat([users_matched_not_duplicated, new_matched_user_duplicate], ignore_index=True)
new_matched_user.sample(3)

Unnamed: 0,joined_advocate,old_user_id_advocate,user_name,joined_ratebeer,location,old_user_id_ratebeer,nbr_ratings_total
82,1289473000.0,llcooldave.524178,llcooldave,1186913000.0,"United States, Texas",59040,1054
2205,1280311000.0,prospero.484868,Prospero,1297336000.0,"United States, Colorado",123318,998
3173,1412417000.0,oleguerito.873310,Oleguerito,1412849000.0,Czech Republic,338844,3


In [20]:
users_matched_not_duplicated_good_amount_ratings = copy.deepcopy(new_matched_user)

# Count occurrences of each 'old_ratebeer_id' in 'beers_matched'
ratings_matched_id_counts = ratings_matched['user_id'].value_counts()

# Decrement 'nbr_beers' by the counts of each 'old_ratebeer_id' to remove the amount of beers that are already in the dataset
users_matched_not_duplicated_good_amount_ratings['nbr_ratings_total'] -= users_matched_not_duplicated_good_amount_ratings['old_user_id_advocate'].map(ratings_matched_id_counts).fillna(0).astype(int)

print(new_matched_user.nbr_ratings_total.sum())
print(users_matched_not_duplicated_good_amount_ratings.nbr_ratings_total.sum())

1608147
1586183


In [21]:
# Refactor the dataframe to prepare for the merge
users_ratebeer_solo = users_ratebeer_solo.rename(columns={'user_id': 'old_user_id_ratebeer','joined':'joined_ratebeer','nbr_ratings':'nbr_ratings_total'})
users_ratebeer_solo['old_user_id_advocate'] = np.nan
users_ratebeer_solo['joined_advocate'] = np.nan

users_advocate_solo = users_advocate_solo.rename(columns={'user_id': 'old_user_id_advocate','joined':'joined_advocate','nbr_ratings':'nbr_ratings_total'})
users_advocate_solo['old_user_id_ratebeer'] = np.nan
users_advocate_solo['joined_ratebeer'] = np.nan
users_advocate_solo = users_advocate_solo.drop(columns=['nbr_reviews'])

# Check the former operations
print("Sample of users only present in Ratebeer:")
display(users_ratebeer_solo.sample(3))

print("Sample of users only present in BeerAdvocate:")
display(users_advocate_solo.sample(3))


# Merge the three dataframes
full_users = pd.concat([users_matched_not_duplicated_good_amount_ratings,users_ratebeer_solo, users_advocate_solo], ignore_index=True)
full_users['id'] = range(1, len(full_users) + 1)

# Check the former operation
print("Sample of all users:")
display(full_users.sample(2))

print("Total number of users: ", len(full_users))
print("Number of users present in both datasets: ", len(users_matched))
print("Number of users present in BeerAdvocate: ",len(users_advocate))
print("Number of users present in RateBeer: ",len(users_ratebeer))
print("Sum of all the users (for chekcing purposes): ", len(users_matched_not_duplicated_good_amount_ratings)+len(users_advocate_solo)+len(users_ratebeer_solo))

Sample of users only present in Ratebeer:


Unnamed: 0,nbr_ratings_total,old_user_id_ratebeer,user_name,joined_ratebeer,location,old_user_id_advocate,joined_advocate
5975,82,241334,MarkQ,1359544000.0,Canada,,
26082,1,280342,RKurtzweil,1379758000.0,,,
11807,3,20232,LuciferSam,1109243000.0,"United States, New Hampshire",,


Sample of users only present in BeerAdvocate:


Unnamed: 0,nbr_ratings_total,old_user_id_advocate,user_name,joined_advocate,location,old_user_id_ratebeer,joined_ratebeer
78364,104,phuntasy.727283,phuntasy,1365156000.0,"United States, New Hampshire",,
24340,50,mrfreakybig.340598,mrfreakybig,1245146000.0,"United States, New York",,
123204,1,chehnly.756416,chehnly,1380449000.0,"United States, Pennsylvania",,


Sample of all users:


Unnamed: 0,joined_advocate,old_user_id_advocate,user_name,joined_ratebeer,location,old_user_id_ratebeer,nbr_ratings_total,id
130806,1292843000.0,ailong.541304,Ailong,,"United States, Illinois",,6,130807
107680,1359112000.0,opiate42.714956,Opiate42,,Canada,,1,107681


Total number of users:  220537
Number of users present in both datasets:  3341
Number of users present in BeerAdvocate:  153704
Number of users present in RateBeer:  70174
Sum of all the users (for chekcing purposes):  220537


In [22]:
test = full_users[full_users['user_name'].duplicated(keep=False)]
test[test['user_name']=='Elwood']

Unnamed: 0,joined_advocate,old_user_id_advocate,user_name,joined_ratebeer,location,old_user_id_ratebeer,nbr_ratings_total,id
990,1152871000.0,elwood.88673,Elwood,1374314000.0,Canada,270235,6,991
4952,,,Elwood,1235041000.0,"United States, Virginia",87609,2966,4953


Oh so in different dataset people can have the same user_name. We need to be carefull about this. If we treat the data we need to use id.

In [23]:
# Save the dataframe to a csv file
full_users.to_csv(os.path.join(FULL_PATH,'users.csv'),index=False)

# Check the former operation
test_user = pd.read_csv(os.path.join(FULL_PATH,'users.csv'))
assert len(test_user) == len(full_users), f"Expected {len(full_users)}, but got {len(test_user)}"
del test_user

  test_user = pd.read_csv(os.path.join(FULL_PATH,'users.csv'))


## Beer data

In this dataset we saw that there were no duplicates. We do not need to make the carefull analysis we made before. However we still need to look at the matched dataset. We also need to link the new brewery id, but we also need to count the number of ratings.

In [24]:
beers_matched = pd.read_csv(os.path.join(MATCHED_PATH,'beers.csv'),header=1)
beers_advocate = pd.read_csv(os.path.join(ADVOCATE_PATH,'beers.csv'))
beers_ratebeer = pd.read_csv(os.path.join(RATEBEER_PATH,'beers.csv'))

ratings_matched = pd.read_csv(os.path.join(MATCHED_PATH,'ratings.csv'),header=1)#needed 
full_breweries_for_beers = pd.read_csv(os.path.join(FULL_PATH,'breweries.csv'))#needed


print("Length of the three datasets:\n-advocate:",len(beers_advocate),"\n-matched:", len(beers_matched),"\n-ratebeer:", len(beers_ratebeer))

#Already delete the breweries presents in matched
beers_ratebeer_solo = beers_ratebeer[~beers_ratebeer.beer_id.isin(beers_matched['beer_id.1'])]
beers_advocate_solo = beers_advocate[~beers_advocate.beer_id.isin(beers_matched['beer_id'])]

print("New length of:\n-advocate:",len(beers_advocate_solo),"\n-ratebeer:", len(beers_ratebeer_solo))

Length of the three datasets:
-advocate: 280823 
-matched: 45640 
-ratebeer: 442081
New length of:
-advocate: 235183 
-ratebeer: 396441


In [25]:
display(beers_matched.sample(2))
beers_new = beers_matched.drop(columns=['avg','beer_wout_brewery_name.1','avg.1','avg_matched_valid_ratings.1','nbr_reviews','beer_name.1','brewery_name.1','brewery_name','avg_computed','avg_computed.1','avg_matched_valid_ratings','ba_score','beer_wout_brewery_name','sim','diff','zscore','zscore.1','overall_score','style_score','nbr_matched_valid_ratings','nbr_matched_valid_ratings.1'])
display(beers_new.sample(2))
print((beers_new['abv'] == beers_new['abv.1']).all())

Unnamed: 0,abv,avg,avg_computed,avg_matched_valid_ratings,ba_score,beer_id,beer_name,beer_wout_brewery_name,brewery_id,brewery_name,...,brewery_id.1,brewery_name.1,nbr_matched_valid_ratings.1,nbr_ratings.1,overall_score,style.1,style_score,zscore.1,diff,sim
15090,9.1,4.13,4.135333,4.123333,86.0,259331,5 Lb Sledgehammer,Sledgehammer 5 Lb,25920,El Segundo Brewing Company,...,12939,El Segundo Brewing Company,13,13,88.0,Imperial IPA,48.0,0.339849,1.0,1.0
42912,5.4,3.9,3.952072,3.881923,87.0,196618,Prairie Flare,Flare,30356,Prairie Artisan Ales,...,15476,Prairie Artisan Ales,217,217,93.0,Grodziskie/Gose/Lichtenhainer,94.0,0.29252,1.0,1.0


Unnamed: 0,abv,beer_id,beer_name,brewery_id,bros_score,nbr_ratings,style,abv.1,beer_id.1,brewery_id.1,nbr_ratings.1,style.1
38603,7.3,81993,Gigantic IPA,29003,90.0,472,American IPA,7.3,173411,14535,392,India Pale Ale (IPA)
30696,6.5,228385,Apogee Perigee Ale,35693,,5,American Pale Ale (APA),6.5,416962,20092,3,American Pale Ale


True


The alcohol by volume is exactly the same between the two datasets.

In [26]:
#We are going to build a new dataset instead of playing with the dataset that contains every information for duplicates.
#Not sure about valid ratings so prefer to keep it as it is for now
new_matched_beer = pd.DataFrame(columns=['abv', 'old_beer_id_advocate', 'old_beer_id_ratebeer','beer_name','brewery_id','bros_score','nbr_ratings','style_advocate','style_ratebeer'])

# Use a list to collect rows and create the DataFrame at once
rows = []

for index, row in beers_new.iterrows():
    # Define data for each row
    abv = row['abv']
    beer_id_advocate = row['beer_id']
    beer_id_ratebeer = row['beer_id.1']
    beer_name = row['beer_name']
    bros_score = row['bros_score']
    style_advocate = row['style']
    style_ratebeer = row['style.1']

    brewery_id = full_breweries_for_beers[full_breweries_for_beers['old_ratebeer_id']==row['brewery_id.1']]['id'].values[0]

    total_ratings_advocate = row['nbr_ratings']
    total_ratings_ratebeer = row['nbr_ratings.1']

    nbr_ratings_total = total_ratings_advocate + total_ratings_ratebeer

    # Append the new row to the list
    rows.append({'abv': abv, 'old_beer_id_advocate': beer_id_advocate, 'old_beer_id_ratebeer': beer_id_ratebeer, 'beer_name': beer_name, 'brewery_id': brewery_id, 'bros_score': bros_score, 'nbr_ratings': nbr_ratings_total, 'style_advocate': style_advocate, 'style_ratebeer': style_ratebeer})

# Create the DataFrame from the list of rows
new_matched_beer = pd.DataFrame(rows)

new_matched_beer.head(10)

Unnamed: 0,abv,old_beer_id_advocate,old_beer_id_ratebeer,beer_name,brewery_id,bros_score,nbr_ratings,style_advocate,style_ratebeer
0,4.8,19827,37923,Legbiter,1,80.0,164,English Pale Ale,Golden Ale/Blond Ale
1,6.0,20841,41286,St. Patrick's Ale,1,,19,English Pale Ale,Irish Ale
2,4.2,20842,41287,St. Patrick's Best,1,90.0,138,English Bitter,Bitter
3,4.8,22659,41285,St. Patrick's Gold,1,,5,American Pale Wheat Ale,Amber Ale
4,4.5,178681,230283,Sheelin Stout,2,,2,Irish Dry Stout,Mild Ale
5,4.2,178689,368966,Boom,3,,3,American Pale Ale (APA),American Pale Ale
6,4.6,169948,155699,Bally Black Stout,4,,6,Irish Dry Stout,Stout
7,5.2,169950,160664,Pig Island Pale Ale,4,,4,English Pale Ale,Bitter
8,4.4,169949,177517,Rockin’ Goose,4,,3,English Pale Mild Ale,Irish Ale
9,4.9,169951,299091,Scrabo Gold,4,,2,English Pale Ale,Golden Ale/Blond Ale


In [27]:
new_matched_beer_good_amount_ratings = copy.deepcopy(new_matched_beer)

# Count occurrences of each 'old_ratebeer_id' in 'beers_matched'
ratings_matched_beer_id_counts = ratings_matched['beer_id'].value_counts()

# Decrement 'nbr_beers' by the counts of each 'old_ratebeer_id' to remove the amount of beers that are already in the dataset
new_matched_beer_good_amount_ratings['nbr_ratings'] -= new_matched_beer_good_amount_ratings['old_beer_id_advocate'].map(ratings_matched_beer_id_counts).fillna(0).astype(int)

print(new_matched_beer.nbr_ratings.sum())
print(new_matched_beer_good_amount_ratings.nbr_ratings.sum())

1976606
1954642


In [28]:
beers_ratebeer_solo_new = beers_ratebeer_solo.copy()
beers_ratebeer_solo_new = beers_ratebeer_solo_new.drop(columns=['brewery_name','overall_score','style_score','avg','avg_computed','zscore','nbr_matched_valid_ratings','avg_matched_valid_ratings'])
beers_ratebeer_solo_new = beers_ratebeer_solo_new.rename(columns={'beer_id': 'old_beer_id_ratebeer','style':'style_ratebeer'})

beers_ratebeer_solo_new = pd.merge(beers_ratebeer_solo_new,full_breweries_for_beers[['old_ratebeer_id', 'id']],how='left', left_on='brewery_id',right_on='old_ratebeer_id')
beers_ratebeer_solo_new = beers_ratebeer_solo_new.drop(columns=['brewery_id','old_ratebeer_id'])
beers_ratebeer_solo_new = beers_ratebeer_solo_new.rename(columns={'id': 'brewery_id'})

beers_ratebeer_solo_new['old_beer_id_advocate'] = np.nan
beers_ratebeer_solo_new['bros_score'] = np.nan
beers_ratebeer_solo_new['style_advocate']=np.nan

beers_ratebeer_solo_new.sample(3)

Unnamed: 0,old_beer_id_ratebeer,beer_name,style_ratebeer,nbr_ratings,abv,brewery_id,old_beer_id_advocate,bros_score,style_advocate
215839,465055,Catalina Monkey Brew,German Hefeweizen,2,5.2,5140,,,
191181,346780,Evans Cat Fight IPA,Session IPA,2,3.0,27482,,,
133212,257564,Austmann / Voss The Hopressor,American Strong Ale,9,9.9,3692,,,


In [29]:
display(beers_advocate_solo.sample(1))
beers_advocate_solo_new = copy.deepcopy(beers_advocate_solo)
beers_advocate_solo_new = beers_advocate_solo_new.drop(columns=['brewery_name','nbr_reviews','ba_score','avg','avg_computed','zscore','nbr_matched_valid_ratings','avg_matched_valid_ratings'])
beers_advocate_solo_new = beers_advocate_solo_new.rename(columns={'beer_id': 'old_beer_id_advocate','style':'style_advocate'})

display(full_breweries_for_beers[full_breweries_for_beers['old_ratebeer_id']==20891].head(2))

# Create a dictionary that links each 'id' to 'old_ratebeer_id' and 'old_advocate_id'
id_dict = full_breweries_for_beers.groupby('id').agg({
    'old_ratebeer_id': 'first',
    'old_advocate_id': lambda x: list(x)
}).to_dict('index')

print(str(id_dict[8191]['old_ratebeer_id']))

beers_advocate_solo_new['brewery_id'] = beers_advocate_solo_new['brewery_id'].astype(str)
full_breweries_for_beers['old_advocate_id'] = full_breweries_for_beers['old_advocate_id'].astype(str)
full_breweries_for_beers['id'] = full_breweries_for_beers['id'].astype(str)

beers_advocate_solo_new = pd.merge(beers_advocate_solo_new,full_breweries_for_beers[['old_advocate_id', 'id']],how='left', left_on='brewery_id',right_on='old_advocate_id')
beers_advocate_solo_new = beers_advocate_solo_new.drop(columns=['brewery_id','old_advocate_id'])
beers_advocate_solo_new = beers_advocate_solo_new.rename(columns={'id': 'brewery_id'})

beers_advocate_solo_new['old_beer_id_ratebeer'] = np.nan
beers_advocate_solo_new['style_ratebeer']=np.nan

display(beers_advocate_solo_new.head())

Unnamed: 0,beer_id,beer_name,brewery_id,brewery_name,style,nbr_ratings,nbr_reviews,avg,ba_score,bros_score,abv,avg_computed,zscore,nbr_matched_valid_ratings,avg_matched_valid_ratings
127531,191529,Deserted Island IPA,41880,Rough Cut Brewing Co.,American IPA,2,0,4.25,,,6.7,4.25,,0,


Unnamed: 0,location,name,nbr_beers,old_advocate_id,old_ratebeer_id,id
8190,England,Seven Bro7hers,10,"[37180, 45243]",20891.0,8191


20891.0


Unnamed: 0,old_beer_id_advocate,beer_name,style_advocate,nbr_ratings,bros_score,abv,brewery_id,old_beer_id_ratebeer,style_ratebeer
0,166064,Nashe Moskovskoe,Euro Pale Lager,0,,4.7,8236,,
1,166065,Nashe Pivovskoe,Euro Pale Lager,0,,3.8,8236,,
2,166066,Nashe Shakhterskoe,Euro Pale Lager,0,,4.8,8236,,
3,166067,Nashe Zhigulevskoe,Euro Pale Lager,0,,4.0,8236,,
4,166063,Zhivoe,Euro Pale Lager,0,,4.5,8236,,


In [30]:
full_beers = pd.concat([new_matched_beer_good_amount_ratings,beers_ratebeer_solo_new, beers_advocate_solo_new], ignore_index=True)
full_beers['id'] = range(1, len(full_beers) + 1)
full_beers.sample(2)

Unnamed: 0,abv,old_beer_id_advocate,old_beer_id_ratebeer,beer_name,brewery_id,bros_score,nbr_ratings,style_advocate,style_ratebeer,id
76067,5.0,,74751.0,Minamishinshu Ki No Sato,18243,,5,,Golden Ale/Blond Ale,76068
248488,9.2,,247690.0,De Steeg Belgian Imperial Wheat,4914,,1,,Belgian Strong Ale,248489


In [31]:
print(len(full_beers))
print(len(beers_matched),len(beers_advocate),len(beers_ratebeer))
print(len(new_matched_beer_good_amount_ratings)+len(beers_advocate_solo_new)+len(beers_ratebeer_solo_new))

677264
45640 280823 442081
677264


In [32]:
full_beers.to_csv(os.path.join(FULL_PATH,'beers.csv'),index=False)

In [33]:
test_beer = pd.read_csv(os.path.join(FULL_PATH,'beers.csv'))
assert len(test_beer) == len(full_beers), f"Expected {len(full_beers)}, but got {len(test_beer)}"
del test_beer

  test_beer = pd.read_csv(os.path.join(FULL_PATH,'beers.csv'))


## Ratings

In [1]:
import os
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import copy
import ast

dataset_path = Path(os.getcwd())

ADVOCATE = "BeerAdvocate"
RATEBEER = "RateBeer"
MATCHED = "Matched"
FULL = "Full"

ADVOCATE_PATH = os.path.join(dataset_path,ADVOCATE)
RATEBEER_PATH = os.path.join(dataset_path,RATEBEER)
MATCHED_PATH = os.path.join(dataset_path,MATCHED)
FULL_PATH = os.path.join(dataset_path,FULL)

In [2]:
ratings_matched = pd.read_csv(os.path.join(MATCHED_PATH,'ratings.csv'),header=1)
ratings_advocate = pd.read_csv(os.path.join(ADVOCATE_PATH,'ratings-advocate.csv'))
ratings_ratebeer = pd.read_csv(os.path.join(RATEBEER_PATH,'ratings.csv'))

print("Length of the three datasets:\n-advocate:",len(ratings_advocate),"\n-matched:", len(ratings_matched),"\n-ratebeer:", len(ratings_ratebeer))

Length of the three datasets:
-advocate: 8393032 
-matched: 21964 
-ratebeer: 7122074


In [3]:
full_users = pd.read_csv(os.path.join(FULL_PATH,'users.csv'))
full_beers = pd.read_csv(os.path.join(FULL_PATH,'beers.csv'))
full_breweries = pd.read_csv(os.path.join(FULL_PATH,'breweries.csv'))

  full_users = pd.read_csv(os.path.join(FULL_PATH,'users.csv'))
  full_beers = pd.read_csv(os.path.join(FULL_PATH,'beers.csv'))


In [4]:
ratings_advocate = ratings_advocate.drop(columns=['review','beer_name','brewery_name','style','user_name','abv'])
ratings_matched = ratings_matched.drop(columns=['review','beer_name','beer_name.1','brewery_name','brewery_name.1','style','style.1','user_name','user_name.1','abv','abv.1'])
ratings_ratebeer = ratings_ratebeer.drop(columns=['beer_name','brewery_name','style','user_name','abv'])

In contradiction to the previous method, we would like to keep both comments and grades given the the datasets for a macthed comments. We want to do this as the text might differ and the grades too. First we just want to add a column 'dataset' to remind from which dataset the data contain. We also want to add a comment duplicate, which will link the comment 'id' to its matched comment. Naturally a comment id column would be added too. Finally the columns beer_id, brewery_id et user_id would be link to the one of the new id given in full_users, full_breweries and full_beers.

In [5]:
ratings_ratebeer['dataset'] = 'rb'
ratings_advocate['dataset'] = 'ad'

final_id_ratebeer = len(ratings_ratebeer) + 1
first_id_advocate = final_id_ratebeer
final_id_advocate = first_id_advocate + len(ratings_advocate)
ratings_ratebeer['id_rating'] = range(1, final_id_ratebeer) #For the moment call it id_rating, rename it later 
ratings_advocate['id_rating'] = range(first_id_advocate, final_id_advocate)

assert ratings_ratebeer.iloc[len(ratings_ratebeer)-1]['id_rating']!=ratings_advocate.iloc[0]['id_rating']

Give new brewery id

In [6]:
ratings_ratebeer = ratings_ratebeer.merge(full_breweries[['old_ratebeer_id', 'id']],how='left',left_on='brewery_id',right_on='old_ratebeer_id')
ratings_ratebeer = ratings_ratebeer.drop(columns='old_ratebeer_id')
ratings_ratebeer = ratings_ratebeer.rename(columns={'id':'id_brewery'})

# Convert the Series to a list and apply ast.literal_eval to each element if it's a string
result = [ast.literal_eval(x) if isinstance(x, str) else x for x in full_breweries['old_advocate_id'].tolist()]
full_breweries['old_advocate_id'] = pd.Series(result, index=full_breweries.index)
full_breweries = full_breweries.explode('old_advocate_id')

ratings_advocate = ratings_advocate.merge(full_breweries[['old_advocate_id', 'id']],how='left',left_on='brewery_id',right_on='old_advocate_id')
ratings_advocate = ratings_advocate.drop(columns='old_advocate_id')
ratings_advocate = ratings_advocate.rename(columns={'id':'id_brewery'})

Give new user id

In [7]:
# Convert the Series to a list and apply ast.literal_eval to each element if it's a string
result = [ast.literal_eval(x) if isinstance(x, str) else x for x in full_users['old_user_id_ratebeer'].tolist()]
full_users['old_user_id_ratebeer'] = pd.Series(result, index=full_users.index)
full_users = full_users.explode('old_user_id_ratebeer')

ratings_ratebeer = ratings_ratebeer.merge(full_users[['old_user_id_ratebeer', 'id']],how='left',left_on='user_id',right_on='old_user_id_ratebeer')
ratings_ratebeer = ratings_ratebeer.drop(columns='old_user_id_ratebeer')
ratings_ratebeer = ratings_ratebeer.rename(columns={'id':'id_user'})

ratings_advocate = ratings_advocate.merge(full_users[['old_user_id_advocate', 'id']],how='left',left_on='user_id',right_on='old_user_id_advocate')
ratings_advocate = ratings_advocate.drop(columns='old_user_id_advocate')
ratings_advocate = ratings_advocate.rename(columns={'id':'id_user'})

Get new beer id

In [10]:
ratings_ratebeer = ratings_ratebeer.merge(full_beers[['old_beer_id_ratebeer', 'id']],how='left',left_on='beer_id',right_on='old_beer_id_ratebeer')
ratings_ratebeer = ratings_ratebeer.drop(columns='old_beer_id_ratebeer')
ratings_ratebeer = ratings_ratebeer.rename(columns={'id':'id_beer'})

# Convert the Series to a list and apply ast.literal_eval to each element if it's a string
result = [ast.literal_eval(x) if isinstance(x, str) else x for x in full_beers['old_beer_id_ratebeer'].tolist()]
full_beers['old_beer_id_ratebeer'] = pd.Series(result, index=full_beers.index)
full_beers = full_beers.explode('old_beer_id_ratebeer')

ratings_advocate = ratings_advocate.merge(full_beers[['old_beer_id_ratebeer', 'id']],how='left',left_on='beer_id',right_on='old_beer_id_ratebeer')
ratings_advocate = ratings_advocate.drop(columns='old_beer_id_ratebeer')
ratings_advocate = ratings_advocate.rename(columns={'id':'id_beer'})

Find the matched commentss

In [9]:
ratings_advocate_matched = copy.deepcopy(ratings_advocate)
ratings_ratebeer_matched = copy.deepcopy(ratings_ratebeer)

ratings_advocate_matched['matched'] = np.nan
ratings_ratebeer_matched['matched'] = np.nan

# Create a dictionary to map (user_id, beer_id) to id_rating for both datasets
advocate_dict = ratings_advocate_matched.set_index(['user_id', 'beer_id'])['id_rating'].to_dict()
ratebeer_dict = ratings_ratebeer_matched.set_index(['user_id', 'beer_id'])['id_rating'].to_dict()

for index, row in ratings_matched.iterrows():
    beer_advocate = row['beer_id']
    beer_ratebeer = row['beer_id.1']
    user_advocate = row['user_id']
    user_ratebeer = row['user_id.1']

    id_advocate = advocate_dict.get((user_advocate, beer_advocate))
    id_ratebeer = ratebeer_dict.get((user_ratebeer, beer_ratebeer))

    if id_advocate is not None and id_ratebeer is not None:
        ratings_advocate_matched.loc[ratings_advocate_matched['id_rating'] == id_advocate, 'matched'] = id_ratebeer
        ratings_ratebeer_matched.loc[ratings_ratebeer_matched['id_rating'] == id_ratebeer, 'matched'] = id_advocate

ratings_advocate_matched = ratings_advocate_matched.drop(columns=['beer_id', 'brewery_id', 'user_id'])
ratings_ratebeer_matched = ratings_ratebeer_matched.drop(columns=['beer_id', 'brewery_id', 'user_id'])

ratings_advocate_matched = ratings_advocate_matched.rename(columns={'id_rating': 'id'})
ratings_ratebeer_matched = ratings_ratebeer_matched.rename(columns={'id_rating': 'id'})


In [10]:
print(len(ratings_advocate_matched[ratings_advocate_matched['id_beer'].notnull()]))
print(len(ratings_advocate_matched[ratings_advocate_matched['matched'].notnull()]))

1899356
21977


In [11]:
print(len(ratings_ratebeer_matched[ratings_ratebeer_matched['id_beer'].notnull()]))
print(len(ratings_ratebeer_matched[ratings_ratebeer_matched['matched'].notnull()]))

1668528
21965


In [12]:
full_breweries[full_breweries['old_advocate_id']==37180].head(2)

Unnamed: 0,location,name,nbr_beers,old_advocate_id,old_ratebeer_id,id
8190,England,Seven Bro7hers,10,37180.0,20891.0,8191


In [13]:
full_breweries[full_breweries['old_ratebeer_id']==20891].head(2)

Unnamed: 0,location,name,nbr_beers,old_advocate_id,old_ratebeer_id,id
8190,England,Seven Bro7hers,10,37180.0,20891.0,8191
8190,England,Seven Bro7hers,10,45243.0,20891.0,8191


In [14]:
display(ratings_advocate_matched.head(1))
display(ratings_ratebeer_matched.head(1))

Unnamed: 0,date,appearance,aroma,palate,taste,overall,rating,text,dataset,id,id_brewery,id_user,id_beer,matched
0,1440064800,3.25,2.75,3.25,2.75,3.0,2.88,"From a bottle, pours a piss yellow color with ...",ad,7122075,8244,2754,,


Unnamed: 0,date,appearance,aroma,palate,taste,overall,rating,text,dataset,id,id_brewery,id_user,id_beer,matched
0,1461664800,2,4,2,4,8,2.0,"Puszka 0,33l dzięki Christoph . Kolor jasnozło...",rb,1,16713,3317.0,,


In [15]:
display(ratings_advocate_matched[ratings_advocate_matched['matched']==3169])
display(ratings_ratebeer_matched[ratings_ratebeer_matched['id']==3169])

Unnamed: 0,date,appearance,aroma,palate,taste,overall,rating,text,dataset,id,id_brewery,id_user,id_beer,matched
31,1315821600,4.0,2.5,3.5,2.5,2.5,2.69,"Appearance: Pours a clear, medium-brown body, ...",ad,7122106,1,57,,3169.0


Unnamed: 0,date,appearance,aroma,palate,taste,overall,rating,text,dataset,id,id_brewery,id_user,id_beer,matched
3168,1315821600,4,5,3,5,9,2.6,"Appearance: Pours a clear, medium-brown body, ...",rb,3169,1,57.0,,7122106.0
