In [1]:
import os
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np

root = Path(os.getcwd()).parent.parent

#Change for each one of where your data is. For me in Dataset I have all the three folders
parent_directory = os.path.dirname(root)
dataset_path = os.path.join(root,'Dataset')

ADVOCATE = "advocate"
RATEBEER = "ratebeer"
MATCHED = "matched"
FULL = "full"

ADVOCATE_PATH = os.path.join(dataset_path,ADVOCATE)
RATEBEER_PATH = os.path.join(dataset_path,RATEBEER)
MATCHED_PATH = os.path.join(dataset_path,MATCHED)
FULL_PATH = os.path.join(dataset_path,FULL)

**Introduction**

The goal of this notebook is to found a way to transform every family dataset into one single dataset. At the end we should have one single user dataset, one single beer dataset and so on. Those datasets would be downloadable, except ratings where we want to create a function that we can run at the beginning of each script, this is done to avoid to have to download/load a much heavier dataset.

To facilitate our life we want to create a single id that could correspond accross the different datasets. It is important that there remains a way to connect to the dataset of origin, i.e that we do not drop the general id.

In [26]:
#Open this one already as it may become usefull for reweries already
beers_matched = pd.read_csv(os.path.join(MATCHED_PATH,'beers.csv'),header=1)

**Breweries dataset**

We saw that in data_understanding.ipynb, for breweries there were duplicates. So we need to treat them carefully.

In [27]:
breweries_advocate = pd.read_csv(os.path.join(ADVOCATE_PATH,'breweries.csv'))
breweries_matched = pd.read_csv(os.path.join(MATCHED_PATH,'breweries.csv'),header=1)
breweries_ratebeer = pd.read_csv(os.path.join(RATEBEER_PATH,'breweries.csv'))

print("Length of the three datasets:\n-advocate:",len(breweries_advocate),"\n-matched:", len(breweries_matched),"\n-ratebeer:", len(breweries_ratebeer))

#Already delete the breweries presents in matched
breweries_ratebeer_solo = breweries_ratebeer[~breweries_ratebeer.id.isin(breweries_matched['id.1'])]
breweries_advocate_solo = breweries_advocate[~breweries_advocate.id.isin(breweries_matched['id'])]

Length of the three datasets:
-advocate: 16758 
-matched: 8281 
-ratebeer: 24189


Both advocate and ratebeer dataset have as columns ['id', 'location', 'name', 'nbr_beers']. Matched has those columns with .1 referring to ratebeer, they also have a diff and sim column. The end format we want to have: [general_id, old_id_advocate, 'old_id_ratebeer', location, name and nbr_beers]. However we need to be carefull to the duplicates of beers in the beers dataset. As the duplicates where for advocate and not Ratebeer we make the convention to take the name and location from Ratebeer. We also want to drop diff and sim.  
Let us first get the duplicates of breweries.

In [28]:
breweries_duplicates_ratebeer = breweries_matched[breweries_matched['id.1'].duplicated(keep=False)]
breweries__not_duplicated_ratebeer = breweries_matched[~breweries_matched['id.1'].duplicated(keep=False)]#For later
breweries_duplicates_advocate = breweries_matched[breweries_matched['id'].duplicated(keep=False)]

print("Number of duplicated/tripled Ratbeer breweries",len(breweries_duplicates_ratebeer))
print("Number of duplicated/tripled Advocate breweries",len(breweries_duplicates_advocate))

Number of duplicated/tripled Ratbeer breweries 91
Number of duplicated/tripled Advocate breweries 0


In [29]:
#We are going to build a new dataset instead of playing with the dataset that contains every information for duplicates.

unique_ratebeer_brewery_id = breweries_duplicates_ratebeer['id.1'].unique()
new_matched_brewery_duplicate = pd.DataFrame(columns=['location', 'name', 'nbr_beers','old_advocate_id','old_ratebeer_id'])

for id in unique_ratebeer_brewery_id:  # Replace with your actual loop condition
    # Define data for each row
    location = breweries_duplicates_ratebeer[breweries_duplicates_ratebeer['id.1']==id]['location.1'].iloc[0]#Always take the first as it is the same for the two/three of them
    name = breweries_duplicates_ratebeer[breweries_duplicates_ratebeer['id.1']==id]['name.1'].iloc[0]#Always take the first as it is the same for the two/three of them
    old_advocate_id = breweries_duplicates_ratebeer[breweries_duplicates_ratebeer['id.1']==id]['id']
    list_old_advocate_id = []
    for id_advocate in (old_advocate_id):
        list_old_advocate_id.append(id_advocate)
    old_ratebeer_id = breweries_duplicates_ratebeer[breweries_duplicates_ratebeer['id.1']==id]['id.1'].iloc[0]

    total_beers_advocate = breweries_duplicates_ratebeer[breweries_duplicates_ratebeer['id.1']==id]['nbr_beers'].sum()
    total_beers_ratebeer = breweries_duplicates_ratebeer[breweries_duplicates_ratebeer['id.1']==id]['nbr_beers.1'].iloc[0] #don't want to double the number of beers for the one duplicated

    nbr_beers = total_beers_advocate+ total_beers_ratebeer

    # Append the new row to the DataFrame
    new_row = pd.DataFrame({'location': location, 'name': name, 'nbr_beers': nbr_beers,'old_advocate_id':[list_old_advocate_id],'old_ratebeer_id':old_ratebeer_id})
    new_matched_brewery_duplicate = pd.concat([new_matched_brewery_duplicate, new_row], ignore_index=True)

new_matched_brewery_duplicate.sample(10)


Unnamed: 0,location,name,nbr_beers,old_advocate_id,old_ratebeer_id
26,"United States, Virginia",Crooked Run Brewing,141,"[32661, 48463]",17177
0,England,Seven Bro7hers,11,"[37180, 45243]",20891
12,Australia,Cavalier Beer,98,"[25867, 36326]",12657
5,Canada,Skeena Brewing Company,2,"[38530, 13832]",11649
24,"United States, Maryland",DuClaw Brewing Company,307,"[1924, 16345]",779
17,"United States, Louisiana",Rikenjaks Brewing Company,6,"[44929, 970]",3860
15,Sweden,Carlsberg Sverige,265,"[5368, 10897]",765
37,"United States, Oregon",Pelican Pub & Brewery,202,"[48599, 1304]",1511
33,"United States, Ohio",Hoof Hearted Brewing,199,"[30179, 44305]",14572
29,"United States, North Carolina",Lynnwood Grill & Brewing Concern,60,"[47982, 33492]",19642


Did the next cell in a dumb way just need to rename the columns, would be faster.

In [30]:
unique_ratebeer_brewery_id = breweries__not_duplicated_ratebeer['id.1'].unique()
new_matched_brewery_non_duplicate = pd.DataFrame(columns=['location', 'name', 'nbr_beers','old_advocate_id','old_ratebeer_id'])

for id in unique_ratebeer_brewery_id:  # Replace with your actual loop condition
    # Define data for each row
    location = breweries__not_duplicated_ratebeer[breweries__not_duplicated_ratebeer['id.1']==id]['location.1'].iloc[0]#Always take the first as it is the same for the two/three of them
    name = breweries__not_duplicated_ratebeer[breweries__not_duplicated_ratebeer['id.1']==id]['name.1'].iloc[0]#Always take the first as it is the same for the two/three of them
    old_advocate_id = breweries__not_duplicated_ratebeer[breweries__not_duplicated_ratebeer['id.1']==id]['id'].iloc[0]
    old_ratebeer_id = breweries__not_duplicated_ratebeer[breweries__not_duplicated_ratebeer['id.1']==id]['id.1'].iloc[0]

    total_beers_advocate = breweries__not_duplicated_ratebeer[breweries__not_duplicated_ratebeer['id.1']==id]['nbr_beers'].sum()
    total_beers_ratebeer = breweries__not_duplicated_ratebeer[breweries__not_duplicated_ratebeer['id.1']==id]['nbr_beers.1'].sum()

    nbr_beers = total_beers_advocate+ total_beers_ratebeer

    # Append the new row to the DataFrame
    new_row = pd.DataFrame({'location': location, 'name': name, 'nbr_beers': nbr_beers,'old_advocate_id':[old_advocate_id],'old_ratebeer_id':old_ratebeer_id})
    new_matched_brewery_non_duplicate = pd.concat([new_matched_brewery_non_duplicate, new_row], ignore_index=True)

new_matched_brewery_non_duplicate.sample(3)

Unnamed: 0,location,name,nbr_beers,old_advocate_id,old_ratebeer_id
3058,Czech Republic,Pivovar Chomout,41,39847,21344
6805,"United States, Oregon",Agrarian Ales,183,31704,15788
1352,Italy,Il Quarto dOra Granata (Alvemar srl),27,38859,16464


In [31]:
#Concat the two datasets together
new_matched_brewery = pd.concat([new_matched_brewery_non_duplicate, new_matched_brewery_duplicate], ignore_index=True)
new_matched_brewery.sample(3)

Unnamed: 0,location,name,nbr_beers,old_advocate_id,old_ratebeer_id
910,Canada,Hell Or High Water Beer And Spirits Inc,2,39897,22602
2643,Spain,Monsieur Gordo Brewery,8,46050,21308
954,Canada,Phillips Brewing Co.,309,2675,1844


In [32]:
print(len(new_matched_brewery), len(new_matched_brewery_non_duplicate), len(new_matched_brewery_duplicate))
print(len(new_matched_brewery.old_ratebeer_id.unique()))

8235 8190 45
8235


Now we need to count the number of beers that there really is. We assume that a beer can only be matched if its brewery is matched too.

In [33]:
new_matched_brewery_good_amount_rating = new_matched_brewery.copy()
for id in beers_matched['brewery_id.1']:
    new_matched_brewery_good_amount_rating.loc[new_matched_brewery_good_amount_rating['old_ratebeer_id'] == id, 'nbr_beers'] -= 1
    


In [34]:
new_matched_brewery_good_amount_rating.sample(3)

Unnamed: 0,location,name,nbr_beers,old_advocate_id,old_ratebeer_id
4066,"United States, California",Indian Joe Brewing,56,30546,15479
2146,Germany,Privatbrauerei Rogg,38,6790,9921
979,Canada,BRB &#40;Be Right Back - Big River&#41;,102,7527,3288


Now let's match the three datasets together.

In [35]:
breweries_ratebeer_solo = breweries_ratebeer_solo.rename(columns={'id': 'old_ratebeer_id'})
breweries_ratebeer_solo['old_advocate_id'] = np.nan
breweries_ratebeer_solo.sample(3)  

Unnamed: 0,old_ratebeer_id,location,name,nbr_beers,old_advocate_id
14465,5945,"United States, Washington",Elkhead Brewing Company,26,
10000,26170,Vietnam,Tê Tê Brewing Co.,1,
9296,28846,Austria,GROK,4,


In [36]:
breweries_advocate_solo = breweries_advocate_solo.rename(columns={'id': 'old_advocate_id'})
breweries_advocate_solo['old_ratebeer_id'] = np.nan
breweries_advocate_solo.sample(3)  

Unnamed: 0,old_advocate_id,location,name,nbr_beers,old_ratebeer_id
16189,1887,"United States, Pennsylvania",Jack's Mountain Restaurant and Brewery,6,
6580,34185,Brazil,Serra de Três Pontas Cervejaria Artesanal,4,
635,71,England,Fuller Smith & Turner PLC,61,


In [37]:
full_breweries = pd.concat([new_matched_brewery_good_amount_rating,breweries_advocate_solo, breweries_ratebeer_solo], ignore_index=True)
full_breweries['id'] = range(1, len(full_breweries) + 1)
full_breweries.sample(10)


Unnamed: 0,location,name,nbr_beers,old_advocate_id,old_ratebeer_id,id
22424,Australia,Currumbin Valley Brewing,2,,29486.0,22425
31269,England,Tom Woods,51,,53.0,31270
32134,France,La Ferme de Kergador (Louis Pierre Le Meur),1,,27980.0,32135
25212,Netherlands,Hettingabier,10,,10224.0,25213
30572,England,Hop Studio,53,,14557.0,30573
19354,Italy,Terra del Sole,1,,13961.0,19355
9339,Canada,Axe & Barrel Brewing Company,6,43343.0,,9340
17391,Ukraine,Ohtyrskyj Pyvovarnyj Zavod (Obolon),10,,17472.0,17392
15726,"United States, Indiana",Mad Anthony's Old State Alehouse,0,21785.0,,15727
9694,Bulgaria,Ale House,1,21746.0,,9695


In [38]:
print(len(full_breweries))
print(len(breweries_matched),len(breweries_advocate),len(breweries_ratebeer))
print(-len(breweries_matched)+len(breweries_advocate)+len(breweries_ratebeer))

32666
8281 16758 24189
32666


This confirms the number of single breweries we computed in data_understanding.ipynb.

In [39]:
full_breweries.to_csv(os.path.join(FULL_PATH,'breweries.csv'),index=False)

In [40]:
test_breweri = pd.read_csv(os.path.join(FULL_PATH,'breweries.csv'))
print(len(test_breweri))
del test_breweri

32666


**Users dataset**

We prefer to consider the users approx rather than the users of the matched dataset. We saw that the users do have duplicates (unlike the normal file). We decided to take the different users in the approx file as a single user even for a lower similitude value (no sim value is below 0.8006407690254358). We consider the effect of this to be negligeable as it is a small percent of data. (like in the brewery file).

In [11]:
users_matched = pd.read_csv(os.path.join(MATCHED_PATH,'users_approx.csv'),header=1)
users_advocate = pd.read_csv(os.path.join(ADVOCATE_PATH,'users.csv'))
users_ratebeer = pd.read_csv(os.path.join(RATEBEER_PATH,'users.csv'))

ratings_matched = pd.read_csv(os.path.join(MATCHED_PATH,'ratings.csv'),header=1)#needed 


print("Length of the three datasets:\n-advocate:",len(users_advocate),"\n-matched:", len(users_matched),"\n-ratebeer:", len(users_ratebeer))

#Already delete the breweries presents in matched
users_ratebeer_solo = users_ratebeer[~users_ratebeer.user_name.isin(users_matched['user_name.1'])]
users_advocate_solo = users_advocate[~users_advocate.user_name.isin(users_matched['user_name'])]

print("New length of:\n-advocate:",len(users_advocate_solo),"\n-ratebeer:", len(users_ratebeer_solo))

Length of the three datasets:
-advocate: 153704 
-matched: 3341 
-ratebeer: 70174
New length of:
-advocate: 150388 
-ratebeer: 66833


In [12]:
users_matched.head(2)

Unnamed: 0,joined,location,nbr_ratings,nbr_reviews,user_id,user_name,user_name_lower,joined.1,location.1,nbr_ratings.1,user_id.1,user_name.1,user_name_lower.1,sim
0,1483009000.0,Spain,3,0,magicuenca.1185749,MAGICuenca,magicuenca,1484046000.0,Spain,89,442761,MAGICuenca91,magicuenca91,0.904534
1,1220868000.0,Germany,6,6,erzengel.248045,Erzengel,erzengel,1224324000.0,Germany,8781,83106,Erzengel,erzengel,1.0


In [13]:
users_duplicates_ratebeer = users_matched[users_matched['user_name.1'].duplicated(keep=False)]
users_duplicates_advocate = users_matched[users_matched['user_name'].duplicated(keep=False)]
users__not_duplicated_advocate = users_matched[~users_matched['user_name'].duplicated(keep=False)]#For later

print("Number of duplicated/tripled Ratbeer users",len(users_duplicates_ratebeer))
print("Number of duplicated/tripled Advocate users",len(users_duplicates_advocate))

Number of duplicated/tripled Ratbeer users 0
Number of duplicated/tripled Advocate users 47


Both advocate and ratebeer dataset have as columns ['user_id', 'location', 'user_name','user_name_lower', 'joined','nbr_ratings']. Advocate has nbr_reviews in addition. Matched has those columns with .1 referring to ratebeer. It also has a column 'sim' that we will drop. The end format we want to have: [general_id, old_user_id_advocate, old_user__id_ratebeer, location, user_name_lower, joined_advocate, joined_ratebeer and nbr_ratings]. We think that the other columns will not be usefull for our analysis. The next cell shows that the user_name_lower are equal between the two datasets. For the location we will in opposition to the brewery give priority to the advocate dataset, as it is this time the one having one correspondence to multiple ratebeer users.

In [14]:
users_duplicates_advocate.sample(2)

Unnamed: 0,joined,location,nbr_ratings,nbr_reviews,user_id,user_name,user_name_lower,joined.1,location.1,nbr_ratings.1,user_id.1,user_name.1,user_name_lower.1,sim
1186,1392462000.0,Australia,1,0,azza.782374,Azza,azza,1415272000.0,Australia,53,343397,wazzawazza,wazzawazza,0.840168
10,1245751000.0,England,32,32,leighton.343447,leighton,leighton,1209204000.0,England,19568,74136,Leighton,leighton,1.0


In [15]:
users_duplicates_advocate[users_duplicates_advocate['user_id']=='lonestar.677281'].head(2)

Unnamed: 0,joined,location,nbr_ratings,nbr_reviews,user_id,user_name,user_name_lower,joined.1,location.1,nbr_ratings.1,user_id.1,user_name.1,user_name_lower.1,sim
562,1337422000.0,"United States, Texas",1,1,lonestar.677281,Lonestar,lonestar,1081332000.0,"United States, Texas",13,11446,oneStar,onestar,0.822609
2059,1337422000.0,"United States, Texas",1,1,lonestar.677281,Lonestar,lonestar,1162984000.0,"United States, Texas",6,44744,LONESTAR,lonestar,1.0


In [16]:
#We are going to build a new dataset instead of playing with the dataset that contains every information for duplicates.

unique_advocate_user_user_name = users_duplicates_advocate['user_name'].unique()
new_matched_user_duplicate = pd.DataFrame(columns=['location', 'joined_advocate', 'old_user_id_advocate','joined_ratebeer','old_user_id_ratebeer','user_name','nbr_ratings_total'])

for user_name in unique_advocate_user_user_name:  # Replace with your actual loop condition
    # Define data for each row
    location = users_duplicates_advocate[users_duplicates_advocate['user_name']==user_name]['location'].iloc[0]#Always take the first as it is the same for the two/three of them
    joined_advocate = users_duplicates_advocate[users_duplicates_advocate['user_name']==user_name]['joined'].iloc[0]
    joined_ratebeer = users_duplicates_advocate[users_duplicates_advocate['user_name']==user_name]['joined.1'].min() #take min value of joined time (we want the first time they joined)

    old_user_id_advocate = users_duplicates_advocate[users_duplicates_advocate['user_name']==user_name]['user_id'].iloc[0]
    old_user_id_ratebeer = users_duplicates_advocate[users_duplicates_advocate['user_name']==user_name]['user_id.1']
    name = users_duplicates_advocate[users_duplicates_advocate['user_name']==user_name]['user_name'].iloc[0]

    list_old_ratebeer_id = []
    for id_ratbeer in (old_user_id_ratebeer):
        list_old_ratebeer_id.append(id_ratbeer)

    total_ratings_advocate = users_duplicates_advocate[users_duplicates_advocate['user_name']==user_name]['nbr_ratings'].iloc[0]
    total_ratings_ratebeer = users_duplicates_advocate[users_duplicates_advocate['user_name']==user_name]['nbr_ratings.1'].sum()

    nbr_ratings_total = total_ratings_advocate+ total_ratings_ratebeer

    # Append the new row to the DataFrame
    new_row = pd.DataFrame({'location': location, 'user_name': name,'old_user_id_ratebeer':[list_old_ratebeer_id],'old_user_id_advocate':old_user_id_advocate,'joined_advocate':joined_advocate,'joined_ratebeer':joined_ratebeer,'nbr_ratings_total':nbr_ratings_total})
    new_matched_user_duplicate = pd.concat([new_matched_user_duplicate, new_row], ignore_index=True)

new_matched_user_duplicate.sample(10)


  new_matched_user_duplicate = pd.concat([new_matched_user_duplicate, new_row], ignore_index=True)


Unnamed: 0,location,joined_advocate,old_user_id_advocate,joined_ratebeer,old_user_id_ratebeer,user_name,nbr_ratings_total
11,"United States, Indiana",1399975000.0,indianabeerman.799605,1070104000.0,"[9581, 283897]",indianabeerman,668
0,Spain,1483009000.0,magicuenca.1185749,1481108000.0,"[442761, 437310]",MAGICuenca,101
7,Canada,1156068000.0,nighthawk.93517,1150452000.0,"[124623, 38708]",nighthawk,212
12,Netherlands,1218103000.0,schuim.240905,1007118000.0,"[2462, 105981]",schuim,79
2,"United States, Florida",1350209000.0,porterporter.698876,1023790000.0,"[4759, 72217]",PorterPorter,7382
6,"United States, New York",1159956000.0,drinkinbuddy.101255,1052042000.0,"[7402, 7736]",DrinkinBuddy,175
13,"United States, Pennsylvania",1200568000.0,beerfinder.187713,1004177000.0,"[54721, 2078]",beerfinder,244
1,England,1245751000.0,leighton.343447,1209204000.0,"[74136, 257478]",leighton,19601
10,Australia,1392462000.0,azza.782374,1174903000.0,"[343397, 355085, 51942]",Azza,56
16,Norway,1334225000.0,morten.672301,1140606000.0,"[137013, 33840, 137302]",Morten,16


In [17]:
users_matched_not_duplicated = users__not_duplicated_advocate.drop(columns=['nbr_reviews','user_name.1','user_name_lower','location','user_name_lower.1','sim'])
users_matched_not_duplicated = users_matched_not_duplicated.rename(columns={'joined': 'joined_advocate','joined.1': 'joined_ratebeer','user_id': 'old_user_id_advocate','user_id.1': 'old_user_id_ratebeer','location.1':'location'})
users_matched_not_duplicated["nbr_ratings_total"] = users_matched_not_duplicated.nbr_ratings+users_matched_not_duplicated["nbr_ratings.1"]
users_matched_not_duplicated = users_matched_not_duplicated.drop(columns=['nbr_ratings','nbr_ratings.1'])
users_matched_not_duplicated.sample(5)

Unnamed: 0,joined_advocate,old_user_id_advocate,user_name,joined_ratebeer,location,old_user_id_ratebeer,nbr_ratings_total
613,1383217000.0,treeoceaneast.761611,treeoceaneast,1378980000.0,Canada,279043,3
1356,1042110000.0,joss.1315,Joss,1348135000.0,Germany,218034,14
2543,1111662000.0,schooly.16779,Schooly,1106046000.0,"United States, Pennsylvania",19115,23
1355,1233745000.0,jgurley.294858,jgurley,1308478000.0,"United States, Florida",130408,72
2803,1205320000.0,skibumdc.202443,skibumdc,1197976000.0,"United States, Virginia",66059,11


In [18]:
#Concat the two datasets together
new_matched_user = pd.concat([users_matched_not_duplicated, new_matched_user_duplicate], ignore_index=True)
new_matched_user.sample(3)

Unnamed: 0,joined_advocate,old_user_id_advocate,user_name,joined_ratebeer,location,old_user_id_ratebeer,nbr_ratings_total
2101,1053511000.0,hopjack13.2120,hopjack13,1049796000.0,"United States, California",7291,6
2828,1099044000.0,edbeered.9765,edbeered,1094378000.0,"United States, Minnesota",15032,169
1241,1373537000.0,fl1pzomg.741216,Fl1pzomg,1373450000.0,"United States, Washington",268492,3


In [19]:
users_matched_not_duplicated_good_amount_ratings = new_matched_user.copy()
for id in ratings_matched['user_id']:
    users_matched_not_duplicated_good_amount_ratings.loc[users_matched_not_duplicated_good_amount_ratings['old_user_id_advocate'] == id, 'nbr_ratings_total'] -= 1

print(new_matched_user.nbr_ratings_total.sum())
print(users_matched_not_duplicated_good_amount_ratings.nbr_ratings_total.sum())

1608147
1586183


In [20]:
users_ratebeer_solo = users_ratebeer_solo.rename(columns={'user_id': 'old_user_id_ratebeer','joined':'joined_ratebeer','nbr_ratings':'nbr_ratings_total'})
users_ratebeer_solo['old_user_id_advocate'] = np.nan
users_ratebeer_solo['joined_advocate'] = np.nan
users_ratebeer_solo.sample(3)  

Unnamed: 0,nbr_ratings_total,old_user_id_ratebeer,user_name,joined_ratebeer,location,old_user_id_advocate,joined_advocate
42328,8,75008,firstdraft,1210586000.0,,,
47237,3,258552,scott060462,1368439000.0,"United States, Illinois",,
47025,4,38896,jujutsuweasel,1150884000.0,"United States, Washington",,


In [21]:
users_advocate_solo = users_advocate_solo.rename(columns={'user_id': 'old_user_id_advocate','joined':'joined_advocate','nbr_ratings':'nbr_ratings_total'})
users_advocate_solo = users_advocate_solo.drop(columns='nbr_reviews')
users_advocate_solo['old_user_id_ratebeer'] = np.nan
users_advocate_solo['joined_ratebeer'] = np.nan
users_advocate_solo.sample(3)  

Unnamed: 0,nbr_ratings_total,old_user_id_advocate,user_name,joined_advocate,location,old_user_id_ratebeer,joined_ratebeer
21643,22,gary63.930312,Gary63,1421752000.0,"United States, Illinois",,
133912,1,_dreamchaser305.1014716,_dreamchaser305,1436954000.0,"United States, Florida",,
119189,9,mmiotke.683091,mmiotke,1341137000.0,"United States, Massachusetts",,


In [22]:
full_users = pd.concat([users_matched_not_duplicated_good_amount_ratings,users_ratebeer_solo, users_advocate_solo], ignore_index=True)
full_users['id'] = range(1, len(full_users) + 1)
full_users.sample(2)


Unnamed: 0,joined_advocate,old_user_id_advocate,user_name,joined_ratebeer,location,old_user_id_ratebeer,nbr_ratings_total,id
54000,,,Beerman88,1453460000.0,"United States, Colorado",398424.0,1,54001
101779,1205060000.0,weapontheyfear.201605,WeaponTheyFear,,"United States, Connecticut",,32,101780


In [23]:
print(len(full_users))
print(len(users_matched),len(users_advocate),len(users_ratebeer))
print(len(users_matched_not_duplicated_good_amount_ratings)+len(users_advocate_solo)+len(users_ratebeer_solo))

220537
3341 153704 70174
220537


In [24]:
test = full_users[full_users['user_name'].duplicated(keep=False)]
test[test['user_name']=='Elwood']

Unnamed: 0,joined_advocate,old_user_id_advocate,user_name,joined_ratebeer,location,old_user_id_ratebeer,nbr_ratings_total,id
990,1152871000.0,elwood.88673,Elwood,1374314000.0,Canada,270235,6,991
4952,,,Elwood,1235041000.0,"United States, Virginia",87609,2966,4953


Oh so in different dataset people can have the same user_name. We need to be carefull avout this. If we treat the data we need to use id.

In [25]:
full_users.to_csv(os.path.join(FULL_PATH,'users.csv'),index=False)

In [26]:
test_user = pd.read_csv(os.path.join(FULL_PATH,'users.csv'))
print(len(test_user))
del test_user

220537


  test_user = pd.read_csv(os.path.join(FULL_PATH,'users.csv'))


**Beer dataset**

In this dataset we saw that there were no duplicates. We do not need to make the carefull analysis we made before. However we still need to look at the matched dataset. We also need to link the new brewery id, but we also need to count the number of ratings.

In [202]:
beers_matched = pd.read_csv(os.path.join(MATCHED_PATH,'beers.csv'),header=1)
beers_advocate = pd.read_csv(os.path.join(ADVOCATE_PATH,'beers.csv'))
beers_ratebeer = pd.read_csv(os.path.join(RATEBEER_PATH,'beers.csv'))

ratings_matched = pd.read_csv(os.path.join(MATCHED_PATH,'ratings.csv'),header=1)#needed 
full_breweries_for_beers = pd.read_csv(os.path.join(FULL_PATH,'breweries.csv'))#needed



print("Length of the three datasets:\n-advocate:",len(beers_advocate),"\n-matched:", len(beers_matched),"\n-ratebeer:", len(beers_ratebeer))

#Already delete the breweries presents in matched
beers_ratebeer_solo = beers_ratebeer[~beers_ratebeer.beer_id.isin(beers_matched['beer_id.1'])]
beers_advocate_solo = beers_advocate[~beers_advocate.beer_id.isin(beers_matched['beer_id'])]

print("New length of:\n-advocate:",len(beers_advocate_solo),"\n-ratebeer:", len(beers_ratebeer_solo))

Length of the three datasets:
-advocate: 280823 
-matched: 45640 
-ratebeer: 442081
New length of:
-advocate: 235183 
-ratebeer: 396441


In [203]:
beers_matched.sample(2)

Unnamed: 0,abv,avg,avg_computed,avg_matched_valid_ratings,ba_score,beer_id,beer_name,beer_wout_brewery_name,brewery_id,brewery_name,...,brewery_id.1,brewery_name.1,nbr_matched_valid_ratings.1,nbr_ratings.1,overall_score,style.1,style_score,zscore.1,diff,sim
36584,11.7,3.48,3.677955,3.470385,80.0,88047,Death,Death,22157,Rivertown Brewing Co.,...,11245,Rivertown Brewery,48,48,60.0,Imperial Stout,10.0,-0.198514,1.0,1.0
29005,5.2,3.63,3.675,3.6,,274565,Pauline,Pauline,48752,Two Plumbers Brewery + Arcade,...,30677,Two Plumbers Brewery + Arcade,2,2,,Golden Ale/Blond Ale,,-0.509111,1.0,1.0


In [204]:
beers_new = beers_matched.drop(columns=['avg','beer_wout_brewery_name.1','avg.1','avg_matched_valid_ratings.1','nbr_reviews','beer_name.1','brewery_name.1','brewery_name','avg_computed','avg_computed.1','avg_matched_valid_ratings','ba_score','beer_wout_brewery_name','sim','diff','zscore','zscore.1','overall_score','style_score','nbr_matched_valid_ratings','nbr_matched_valid_ratings.1'])

In [205]:
beers_new.sample(2)

Unnamed: 0,abv,beer_id,beer_name,brewery_id,bros_score,nbr_ratings,style,abv.1,beer_id.1,brewery_id.1,nbr_ratings.1,style.1
20951,4.5,141742,Spellbound Session Porter,34780,,11,American Porter,4.5,292681,21087,8,Porter
34248,6.6,235589,Hoity Toity,33255,,0,American Strong Ale,6.6,292421,18745,1,American Strong Ale


In [206]:
print((beers_new['abv'] == beers_new['abv.1']).all())

True


Can use just one

In [208]:
#We are going to build a new dataset instead of playing with the dataset that contains every information for duplicates.
#Not sure about valid ratings so prefer to keep it as it is for now
new_matched_beer = pd.DataFrame(columns=['abv', 'old_beer_id_advocate', 'old_beer_id_ratebeer','beer_name','brewery_id','bros_score','nbr_ratings','style_advocate','style_ratebeer'])

for index, row in beers_new.iterrows():
    # Define data for each row
    abv = row['abv']
    beer_id_advocate = row['beer_id']
    beer_id_ratebeer = row['beer_id.1']
    beer_name = row['beer_name']
    bros_score = row['bros_score']
    style_advocate = row['style']
    style_ratebeer = row['style.1']


    brewery_id = full_breweries_for_beers[full_breweries_for_beers['old_ratebeer_id']==row['brewery_id.1']]['id']


    total_ratings_advocate = row['nbr_ratings']
    total_ratings_ratebeer = row['nbr_ratings.1']

    nbr_ratings_total = total_ratings_advocate+ total_ratings_ratebeer

    # Append the new row to the DataFrame
    new_row = pd.DataFrame({'abv':abv, 'old_beer_id_advocate':beer_id_advocate, 'old_beer_id_ratebeer':beer_id_ratebeer,'beer_name':beer_name,'brewery_id':brewery_id,'bros_score':bros_score,'nbr_ratings':nbr_ratings_total,'style_advocate':style_advocate,'style_ratebeer':style_ratebeer})
    new_matched_beer = pd.concat([new_matched_beer, new_row], ignore_index=True)

new_matched_beer.sample(10)

  new_matched_beer = pd.concat([new_matched_beer, new_row], ignore_index=True)


Unnamed: 0,abv,old_beer_id_advocate,old_beer_id_ratebeer,beer_name,brewery_id,bros_score,nbr_ratings,style_advocate,style_ratebeer
29269,10.5,257321,466109,Middle Earth,5660,,3,American Double / Imperial Stout,Imperial Stout
9228,6.6,251223,419598,Cerveza Negra,2416,,8,Euro Dark Lager,Dunkel/Tmavý
33181,5.5,139671,284998,Strawberry Whale Cake,6128,,88,Cream Ale,Fruit Beer
41233,6.0,257110,477972,Cold Town,7100,,2,Cream Ale,Cream Ale
1230,4.2,103759,49634,Kilmington Best,278,,10,English Bitter,Bitter
30330,7.1,135044,279375,Peloton Saison,5789,,11,Saison / Farmhouse Ale,Saison
11064,5.0,15304,30183,Godelief,3194,,29,Belgian Pale Ale,Belgian Ale
6944,13.5,58896,107790,Xyauyù Etichetta Argento (Silver),1477,,127,English Barleywine,Barley Wine
45312,3.5,30385,62658,Mussel Ridge Summer Ale,8047,,2,American Blonde Ale,Golden Ale/Blond Ale
37796,3.0,231373,421988,Cucumber And Lime Maigre,6659,,28,Fruit / Vegetable Beer,Berliner Weisse


In [214]:
new_matched_beer_good_amount_ratings = new_matched_beer.copy()
for id in ratings_matched['beer_id']:
    new_matched_beer_good_amount_ratings.loc[new_matched_beer_good_amount_ratings['old_beer_id_advocate'] == id, 'nbr_ratings'] -= 1

print(new_matched_beer.nbr_ratings.sum())
print(new_matched_beer_good_amount_ratings.nbr_ratings.sum())

1976606
1954642


In [218]:
beers_ratebeer_solo_new = beers_ratebeer_solo.copy()
beers_ratebeer_solo_new = beers_ratebeer_solo_new.drop(columns=['brewery_name','overall_score','style_score','avg','avg_computed','zscore','nbr_matched_valid_ratings','avg_matched_valid_ratings'])
beers_ratebeer_solo_new = beers_ratebeer_solo_new.rename(columns={'beer_id': 'old_beer_id_ratebeer','style':'style_ratebeer'})

beers_ratebeer_solo_new = pd.merge(beers_ratebeer_solo_new,full_breweries_for_beers[['old_ratebeer_id', 'id']],how='left', left_on='brewery_id',right_on='old_ratebeer_id')
beers_ratebeer_solo_new = beers_ratebeer_solo_new.drop(columns=['brewery_id','old_ratebeer_id'])
beers_ratebeer_solo_new = beers_ratebeer_solo_new.rename(columns={'id': 'brewery_id'})

beers_ratebeer_solo_new['old_beer_id_advocate'] = np.nan
beers_ratebeer_solo_new['bros_score'] = np.nan
beers_ratebeer_solo_new['style_advocate']=np.nan

beers_ratebeer_solo_new.sample(3)

Unnamed: 0,old_beer_id_ratebeer,beer_name,style_ratebeer,nbr_ratings,abv,brewery_id,old_beer_id_advocate,bros_score,style_advocate
73145,187059,Steinfels Dunkel,Dunkelweizen,2,,2199,,,
160312,351104,Kuracali Session Brown,Brown Ale,1,4.5,3939,,,
172236,95407,Schooners Blueberry Wheat,Fruit Beer,2,,27149,,,


In [221]:
beers_advocate_solo.sample(1)

Unnamed: 0,beer_id,beer_name,brewery_id,brewery_name,style,nbr_ratings,nbr_reviews,avg,ba_score,bros_score,abv,avg_computed,zscore,nbr_matched_valid_ratings,avg_matched_valid_ratings
190306,198703,Pound Of Flesh,35374,Haw River Farmhouse Ales,American Wild Ale,2,2,4.15,,,,4.155,,0,


In [240]:
beers_advocate_solo_new = beers_advocate_solo.copy()
beers_advocate_solo_new = beers_advocate_solo_new.drop(columns=['brewery_name','nbr_reviews','ba_score','avg','avg_computed','zscore','nbr_matched_valid_ratings','avg_matched_valid_ratings'])
beers_advocate_solo_new = beers_advocate_solo_new.rename(columns={'beer_id': 'old_beer_id_advocate','style':'style_advocate'})


full_breweries_for_beers_exploded = full_breweries_for_beers.explode('old_advocate_id')

beers_advocate_solo_new['brewery_id'] = beers_advocate_solo_new['brewery_id'].astype(str)
full_breweries_for_beers_exploded['old_advocate_id'] = full_breweries_for_beers_exploded['old_advocate_id'].astype(str)
full_breweries_for_beers_exploded['id'] = full_breweries_for_beers_exploded['id'].astype(str)

beers_advocate_solo_new = pd.merge(beers_advocate_solo_new,full_breweries_for_beers_exploded[['old_advocate_id', 'id']],how='left', left_on='brewery_id',right_on='old_advocate_id')
beers_advocate_solo_new = beers_advocate_solo_new.drop(columns=['brewery_id','old_advocate_id'])
beers_advocate_solo_new = beers_advocate_solo_new.rename(columns={'id': 'brewery_id'})

beers_advocate_solo_new['old_beer_id_ratebeer'] = np.nan
beers_advocate_solo_new['style_ratebeer']=np.nan

beers_advocate_solo_new.head()


Unnamed: 0,old_beer_id_advocate,beer_name,style_advocate,nbr_ratings,bros_score,abv,brewery_id,old_beer_id_ratebeer,style_ratebeer
0,166064,Nashe Moskovskoe,Euro Pale Lager,0,,4.7,8236,,
1,166065,Nashe Pivovskoe,Euro Pale Lager,0,,3.8,8236,,
2,166066,Nashe Shakhterskoe,Euro Pale Lager,0,,4.8,8236,,
3,166067,Nashe Zhigulevskoe,Euro Pale Lager,0,,4.0,8236,,
4,166063,Zhivoe,Euro Pale Lager,0,,4.5,8236,,


In [241]:
full_beers = pd.concat([new_matched_beer_good_amount_ratings,beers_ratebeer_solo_new, beers_advocate_solo_new], ignore_index=True)
full_beers['id'] = range(1, len(full_beers) + 1)
full_beers.sample(2)


Unnamed: 0,abv,old_beer_id_advocate,old_beer_id_ratebeer,beer_name,brewery_id,bros_score,nbr_ratings,style_advocate,style_ratebeer,id
584745,4.7,197818.0,,Belle Isle Blonde,13936,,4,American Blonde Ale,,584746
82481,7.2,,197333.0,Buonconvento Ambrio,18807,,0,,Abbey Dubbel,82482


In [242]:
print(len(full_beers))
print(len(beers_matched),len(beers_advocate),len(beers_ratebeer))
print(len(new_matched_beer_good_amount_ratings)+len(beers_advocate_solo_new)+len(beers_ratebeer_solo_new))

677264
45640 280823 442081
677264


In [244]:
full_beers.to_csv(os.path.join(FULL_PATH,'beers.csv'),index=False)

In [245]:
test_beer = pd.read_csv(os.path.join(FULL_PATH,'beers.csv'))
print(len(test_beer))
del test_beer

  test_beer = pd.read_csv(os.path.join(FULL_PATH,'beers.csv'))


677264


**Ratings**

In [2]:
ratings_matched = pd.read_csv(os.path.join(MATCHED_PATH,'ratings.csv'),header=1)
ratings_advocate = pd.read_csv(os.path.join(ADVOCATE_PATH,'ratings-advocate.csv'))
ratings_ratebeer = pd.read_csv(os.path.join(RATEBEER_PATH,'ratings.csv'))

In [3]:
full_users = pd.read_csv(os.path.join(FULL_PATH,'users.csv'))
full_beers = pd.read_csv(os.path.join(FULL_PATH,'beers.csv'))
full_breweries = pd.read_csv(os.path.join(FULL_PATH,'breweries.csv'))

  full_users = pd.read_csv(os.path.join(FULL_PATH,'users.csv'))
  full_beers = pd.read_csv(os.path.join(FULL_PATH,'beers.csv'))


In [4]:
ratings_advocate_dropped_columns = ratings_advocate.drop(columns=['review','beer_name','brewery_name','style','user_name','abv'])
ratings_matched_dropped_columns = ratings_matched.drop(columns=['review','beer_name','beer_name.1','brewery_name','brewery_name.1','style','style.1','user_name','user_name.1','abv','abv.1'])
ratings_ratebeer_dropped_columns = ratings_ratebeer.drop(columns=['beer_name','brewery_name','style','user_name','abv'])


In contradiction to the previous method, we would like to keep both comments and grades given the the datasets for a macthed comments. We want to do this as the text might differ and the grades too. First we just want to add a column 'dataset' to remind from which dataset the data contain. We also want to add a comment duplicate, which will link the comment 'id' to its matched comment. Naturally a comment id column would be added too. Finally the columns beer_id, brewery_id et user_id would be link to the one of the new id given in full_users, full_breweries and full_beers.

In [5]:
ratings_ratebeer_dropped_columns['dataset'] = 'rb'
ratings_advocate_dropped_columns['dataset'] = 'ad'

final_id_ratebeer = len(ratings_ratebeer_dropped_columns) + 1
first_id_advocate = final_id_ratebeer
final_id_advocate = first_id_advocate + len(ratings_advocate_dropped_columns)
ratings_ratebeer_dropped_columns['id_rating'] = range(1, final_id_ratebeer) #For the moment call it id_rating, rename it later 
ratings_advocate_dropped_columns['id_rating'] = range(first_id_advocate, final_id_advocate)

assert ratings_ratebeer_dropped_columns.iloc[len(ratings_ratebeer_dropped_columns)-1]['id_rating']!=ratings_advocate_dropped_columns.iloc[0]['id_rating']

Give new rewery id

In [6]:
#ratings_ratebeer_dropped_columns['new_brewery_id'] = full_breweries[full_breweries.old_ratebeer_id == ratings_ratebeer_dropped_columns.brewery_id]['id']
ratings_ratebeer_new_brewery_id = ratings_ratebeer_dropped_columns.merge(full_breweries[['old_ratebeer_id', 'id']],how='left',left_on='brewery_id',right_on='old_ratebeer_id')
ratings_ratebeer_new_brewery_id = ratings_ratebeer_new_brewery_id.drop(columns='old_ratebeer_id')
ratings_ratebeer_new_brewery_id = ratings_ratebeer_new_brewery_id.rename(columns={'id':'id_brewery'})

In [7]:
full_breweries_for_ratings_exploded = full_breweries.explode('old_advocate_id')

ratings_advocate_dropped_columns['brewery_id'] = ratings_advocate_dropped_columns['brewery_id'].astype(str)
full_breweries_for_ratings_exploded['old_advocate_id'] = full_breweries_for_ratings_exploded['old_advocate_id'].astype(str)
full_breweries_for_ratings_exploded['id'] = full_breweries_for_ratings_exploded['id'].astype(str)

ratings_advocate_new_brewery_id = ratings_advocate_dropped_columns.merge(full_breweries_for_ratings_exploded[['old_advocate_id', 'id']],how='left',left_on='brewery_id',right_on='old_advocate_id')
ratings_advocate_new_brewery_id = ratings_advocate_new_brewery_id.drop(columns='old_advocate_id')
ratings_advocate_new_brewery_id = ratings_advocate_new_brewery_id.rename(columns={'id':'id_brewery'})

assert len(ratings_advocate_dropped_columns) == len(ratings_advocate_new_brewery_id)

Give new user id

In [8]:
full_users_for_ratings_exploded = full_users.explode('old_user_id_ratebeer')

ratings_ratebeer_new_brewery_id['user_id'] = ratings_ratebeer_new_brewery_id['user_id'].astype(str)
full_users_for_ratings_exploded['old_user_id_ratebeer'] = full_users_for_ratings_exploded['old_user_id_ratebeer'].astype(str)
full_users_for_ratings_exploded['id'] = full_users_for_ratings_exploded['id'].astype(str)

ratings_ratebeer_new_user_id = ratings_ratebeer_new_brewery_id.merge(full_users_for_ratings_exploded[['old_user_id_ratebeer', 'id']],how='left',left_on='user_id',right_on='old_user_id_ratebeer')
ratings_ratebeer_new_user_id = ratings_ratebeer_new_user_id.drop(columns='old_user_id_ratebeer')
ratings_ratebeer_new_user_id = ratings_ratebeer_new_user_id.rename(columns={'id':'id_user'})

In [9]:
ratings_advocate_new_user_id = ratings_advocate_new_brewery_id.merge(full_users[['old_user_id_advocate', 'id']],how='left',left_on='user_id',right_on='old_user_id_advocate')
ratings_advocate_new_user_id = ratings_advocate_new_user_id.drop(columns='old_user_id_advocate')
ratings_advocate_new_user_id = ratings_advocate_new_user_id.rename(columns={'id':'id_user'})

Give new beer id

In [10]:
ratings_ratebeer_new_beer_id = ratings_ratebeer_new_user_id.merge(full_breweries[['old_ratebeer_id', 'id']],how='left',left_on='beer_id',right_on='old_ratebeer_id')
ratings_ratebeer_new_beer_id = ratings_ratebeer_new_beer_id.drop(columns='old_ratebeer_id')
ratings_ratebeer_new_beer_id = ratings_ratebeer_new_beer_id.rename(columns={'id':'id_beer'})

In [11]:
full_users_for_ratings_exploded = full_breweries.explode('old_advocate_id')

ratings_advocate_new_user_id['beer_id'] = ratings_advocate_new_user_id['beer_id'].astype(str)
full_users_for_ratings_exploded['old_advocate_id'] = full_users_for_ratings_exploded['old_advocate_id'].astype(str)
full_users_for_ratings_exploded['id'] = full_users_for_ratings_exploded['id'].astype(str)

ratings_advocate_new_beer_id = ratings_advocate_new_user_id.merge(full_users_for_ratings_exploded[['old_advocate_id', 'id']],how='left',left_on='beer_id',right_on='old_advocate_id')
ratings_advocate_new_beer_id = ratings_advocate_new_beer_id.drop(columns='old_advocate_id')
ratings_advocate_new_beer_id = ratings_advocate_new_beer_id.rename(columns={'id':'id_beer'})

Now fin the matched comments

In [12]:
ratings_advocate_matched = ratings_advocate_new_beer_id.copy()
ratings_ratebeer_matched = ratings_ratebeer_new_beer_id.copy()

ratings_advocate_matched['matched'] = np.nan
ratings_ratebeer_matched['matched'] = np.nan

print(len(ratings_matched_dropped_columns))
for index, row in ratings_matched_dropped_columns.iterrows():
    print(index)
    beer_advocate = row.beer_id
    beer_ratebeer = row['beer_id.1']
    user_advocate = row.user_id
    user_ratebeer = row['user_id.1']

    id_advocate = ratings_advocate_matched[(ratings_advocate_matched.user_id==user_advocate)&(ratings_advocate_matched.beer_id==beer_advocate)]['id_rating']
    id_ratebeer = ratings_ratebeer_matched[(ratings_ratebeer_matched.user_id==user_ratebeer)&(ratings_ratebeer_matched.beer_id==beer_ratebeer)]['id_rating']

    #ratings_advocate_matched[ratings_advocate_matched.id_rating==id_advocate]['matched'] = id_ratebeer
    #ratings_ratebeer_matched[ratings_ratebeer_matched.id_rating==id_ratebeer]['matched'] = id_advocate
    if not id_advocate.empty and not id_ratebeer.empty:
        ratings_advocate_matched.loc[ratings_advocate_matched['id_rating'] == id_advocate.iloc[0], 'matched'] = id_ratebeer.iloc[0]
        ratings_ratebeer_matched.loc[ratings_ratebeer_matched['id_rating'] == id_ratebeer.iloc[0], 'matched'] = id_advocate.iloc[0]

ratings_advocate_matched = ratings_advocate_matched.drop(columns=['beer_id','brewery_id','user_id'])
ratings_ratebeer_matched = ratings_ratebeer_matched.drop(columns=['beer_id','brewery_id','user_id'])

ratings_advocate_matched =ratings_advocate_matched.rename(columns={'id_rating':'id'})
ratings_ratebeer_matched =ratings_ratebeer_matched.rename(columns={'id_rating':'id'})


21964
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14


KeyboardInterrupt: 

In [141]:
ratings_ratebeer_new_beer_id.head(2)

Unnamed: 0,beer_id,brewery_id,date,user_id,appearance,aroma,palate,taste,overall,rating,text,dataset,id_rating,id_brewery,id_user,id_beer
0,410549,3198,1461664800,175852,2,4,2,4,8,2.0,"Puszka 0,33l dzięki Christoph . Kolor jasnozło...",rb,1,16713,3317.0,
1,105273,3198,1487329200,442761,2,3,2,4,8,1.9,Cerveza pale lager gabonesa. MÃ¡s floja que la...,rb,2,16713,,


In [136]:
ratings_advocate_new_beer_id.head(2)

Unnamed: 0,beer_id,brewery_id,date,user_id,appearance,aroma,palate,taste,overall,rating,text,dataset,id_rating,id_brewery,id_user,id_beer
0,142544,37262,1440064800,nmann08.184925,3.25,2.75,3.25,2.75,3.0,2.88,"From a bottle, pours a piss yellow color with ...",ad,7122075,8244,2754,
1,19590,10093,1235127600,stjamesgate.163714,3.0,3.5,3.5,4.0,3.5,3.67,Pours pale copper with a thin head that quickl...,ad,7122076,1,70150,


In [133]:
full_breweries.head(2)

Unnamed: 0,location,name,nbr_beers,old_advocate_id,old_ratebeer_id,id
0,Northern Ireland,Strangford Lough,6,10093,4959.0,1
1,Northern Ireland,Sheelin,5,32848,17616.0,2


In [97]:
ratings_advocate_new_user_id[ratings_advocate_new_user_id['id_brewery'].isna()].head()

Unnamed: 0,beer_id,brewery_id,date,user_id,appearance,aroma,palate,taste,overall,rating,text,dataset,id_rating,id_brewery,id_user
96730,143185,37180,1441533600,dispydnb.981403,3.75,4.0,4.0,4.25,4.0,4.09,,ad,7218805,,70849
96731,143185,37180,1426935600,stjamesgate.163714,3.75,3.75,3.5,4.0,4.0,3.88,Chartreuse with a finger of snowy froth. 3.75C...,ad,7218806,,70150
96732,143185,37180,1414321200,emperorbevis.621888,3.75,3.25,3.75,2.25,3.5,2.98,Cask pulled by an gram handpump at the first I...,ad,7218807,,70362
96733,273540,37180,1490353200,emperorbevis.621888,3.75,2.25,3.75,3.75,3.5,3.34,Bottled and possibly bottle conditionedPours a...,ad,7218808,,70362
96734,178323,37180,1461578400,tomcostello.1111455,3.0,3.0,3.0,3.0,3.0,3.0,,ad,7218809,,80353


In [102]:
full_breweries.head(2)

Unnamed: 0,location,name,nbr_beers,old_advocate_id,old_ratebeer_id,id
0,Northern Ireland,Strangford Lough,6,10093,4959.0,1
1,Northern Ireland,Sheelin,5,32848,17616.0,2


Trying to understand th eerror we have

In [118]:
full_breweries[full_breweries['old_advocate_id']==37180].head(2)

Unnamed: 0,location,name,nbr_beers,old_advocate_id,old_ratebeer_id,id


In [117]:
full_breweries[full_breweries['old_ratebeer_id']==20891].head(2)

Unnamed: 0,location,name,nbr_beers,old_advocate_id,old_ratebeer_id,id
8190,England,Seven Bro7hers,10,"[37180, 45243]",20891.0,8191


In [119]:
full_breweries_for_ratings_exploded[full_breweries_for_ratings_exploded['old_advocate_id']==37180]

Unnamed: 0,location,name,nbr_beers,old_advocate_id,old_ratebeer_id,id


In [120]:
full_breweries_for_ratings_exploded[full_breweries_for_ratings_exploded['old_ratebeer_id']==20891]

Unnamed: 0,location,name,nbr_beers,old_advocate_id,old_ratebeer_id,id
8190,England,Seven Bro7hers,10,"[37180, 45243]",20891.0,8191


In [122]:
print(len(full_breweries_for_ratings_exploded),len(full_users))

32666 220537


In [113]:
breweries_advocate = pd.read_csv(os.path.join(MATCHED_PATH,'breweries.csv'),header=1)

In [115]:
breweries_advocate[breweries_advocate['id']==37180]

Unnamed: 0,id,location,name,nbr_beers,id.1,location.1,name.1,nbr_beers.1,diff,sim
219,37180,England,Seven Bro7hers,4,20891,England,Seven Bro7hers,7,0.528802,1.0


In [114]:
breweries_advocate.head()

Unnamed: 0,id,location,name,nbr_beers,id.1,location.1,name.1,nbr_beers.1,diff,sim
0,10093,Northern Ireland,Strangford Lough Brewing Company Ltd,5,4959,Northern Ireland,Strangford Lough,5,0.431275,0.889062
1,32848,Northern Ireland,The Sheelin Brewery,4,17616,Northern Ireland,Sheelin,2,0.526388,0.863596
2,40360,Northern Ireland,Walled City Brewing Company,6,24866,Northern Ireland,Walled City,3,0.527852,0.954183
3,40309,Northern Ireland,Ards Brewing Company,7,13538,Northern Ireland,Ards Brewing Co.,13,0.554395,0.896098
4,41205,Northern Ireland,Barrahooley Brewery,3,22304,Northern Ireland,Barrahooley Craft Brewery,4,0.602544,0.896205
