# Make a single dataset

The goal of this notebook is to combine both datasets in a single one with matched beers.

## Breweries data

In [1]:
import os
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import copy

dataset_path = Path(os.getcwd())

ADVOCATE = "BeerAdvocate"
RATEBEER = "RateBeer"
MATCHED = "Matched"
FULL = "Full"

ADVOCATE_PATH = os.path.join(dataset_path,ADVOCATE)
RATEBEER_PATH = os.path.join(dataset_path,RATEBEER)
MATCHED_PATH = os.path.join(dataset_path,MATCHED)
FULL_PATH = os.path.join(dataset_path,FULL)

In [2]:
#Open this one already as it may become usefull for reweries already
beers_matched = pd.read_csv(os.path.join(MATCHED_PATH,'beers.csv'),header=1)

In [3]:
breweries_advocate = pd.read_csv(os.path.join(ADVOCATE_PATH,'breweries.csv'))
breweries_matched = pd.read_csv(os.path.join(MATCHED_PATH,'breweries.csv'),header=1)
breweries_ratebeer = pd.read_csv(os.path.join(RATEBEER_PATH,'breweries.csv'))

print("Length of the three datasets:\n-advocate:",len(breweries_advocate),"\n-matched:", len(breweries_matched),"\n-ratebeer:", len(breweries_ratebeer))

#Already delete the breweries presents in matched
breweries_ratebeer_solo = breweries_ratebeer[~breweries_ratebeer.id.isin(breweries_matched['id.1'])]
breweries_advocate_solo = breweries_advocate[~breweries_advocate.id.isin(breweries_matched['id'])]

Length of the three datasets:
-advocate: 16758 
-matched: 8281 
-ratebeer: 24189


In [4]:
breweries_duplicates_ratebeer = breweries_matched[breweries_matched['id.1'].duplicated(keep=False)]
breweries__not_duplicated_ratebeer = breweries_matched[~breweries_matched['id.1'].duplicated(keep=False)]#For later
breweries_duplicates_advocate = breweries_matched[breweries_matched['id'].duplicated(keep=False)]

print("Number of duplicated/tripled Ratbeer breweries",len(breweries_duplicates_ratebeer))
print("Number of duplicated/tripled Advocate breweries",len(breweries_duplicates_advocate))

Number of duplicated/tripled Ratbeer breweries 91
Number of duplicated/tripled Advocate breweries 0


In [5]:
#We are going to build a new dataset instead of playing with the dataset that contains every information for duplicates.

unique_ratebeer_brewery_id = breweries_duplicates_ratebeer['id.1'].unique()
new_matched_brewery_duplicate = pd.DataFrame(columns=['location', 'name', 'nbr_beers','old_advocate_id','old_ratebeer_id'])

for id in unique_ratebeer_brewery_id:
    # Define data for each row
    location = breweries_duplicates_ratebeer[breweries_duplicates_ratebeer['id.1']==id]['location.1'].iloc[0]#Always take the first as it is the same for the two/three of them
    name = breweries_duplicates_ratebeer[breweries_duplicates_ratebeer['id.1']==id]['name.1'].iloc[0]#Always take the first as it is the same for the two/three of them
    old_advocate_id = breweries_duplicates_ratebeer[breweries_duplicates_ratebeer['id.1']==id]['id']
    list_old_advocate_id = []
    for id_advocate in (old_advocate_id):
        list_old_advocate_id.append(id_advocate)
    old_ratebeer_id = breweries_duplicates_ratebeer[breweries_duplicates_ratebeer['id.1']==id]['id.1'].iloc[0]

    total_beers_advocate = breweries_duplicates_ratebeer[breweries_duplicates_ratebeer['id.1']==id]['nbr_beers'].sum()
    total_beers_ratebeer = breweries_duplicates_ratebeer[breweries_duplicates_ratebeer['id.1']==id]['nbr_beers.1'].iloc[0] #don't want to double the number of beers for the one duplicated

    nbr_beers = total_beers_ratebeer + total_beers_advocate

    # Append the new row to the DataFrame
    new_row = pd.DataFrame({'location': location, 'name': name, 'nbr_beers': nbr_beers,'old_advocate_id':[list_old_advocate_id],'old_ratebeer_id':old_ratebeer_id})
    new_matched_brewery_duplicate = pd.concat([new_matched_brewery_duplicate, new_row], ignore_index=True)

new_matched_brewery_duplicate.head(3)

Unnamed: 0,location,name,nbr_beers,old_advocate_id,old_ratebeer_id
0,England,Seven Bro7hers,11,"[37180, 45243]",20891
1,England,Dartmoor,19,"[25939, 22832]",3480
2,China,Great Leap Brewing,77,"[24935, 32111]",12325


In [6]:
new_matched_brewery_non_duplicate = copy.deepcopy(breweries__not_duplicated_ratebeer)
new_matched_brewery_non_duplicate['nbr_beers'] = new_matched_brewery_non_duplicate[['nbr_beers', 'nbr_beers.1']].sum(axis=1)
new_matched_brewery_non_duplicate = new_matched_brewery_non_duplicate.drop(columns=['location','name','diff', 'sim', 'nbr_beers.1'])
new_matched_brewery_non_duplicate = new_matched_brewery_non_duplicate.rename(columns={'location.1':'location','name.1':'name','id':'old_advocate_id', 'id.1':'old_ratebeer_id',})
new_matched_brewery_non_duplicate = new_matched_brewery_non_duplicate[['location', 'name', 'nbr_beers', 'old_advocate_id', 'old_ratebeer_id']]

display(new_matched_brewery_non_duplicate.head(3))

Unnamed: 0,location,name,nbr_beers,old_advocate_id,old_ratebeer_id
0,Northern Ireland,Strangford Lough,10,10093,4959
1,Northern Ireland,Sheelin,6,32848,17616
2,Northern Ireland,Walled City,9,40360,24866


In [7]:
#Concat the two datasets together
new_matched_brewery = pd.concat([new_matched_brewery_non_duplicate, new_matched_brewery_duplicate], ignore_index=True)
new_matched_brewery.sample(3)

Unnamed: 0,location,name,nbr_beers,old_advocate_id,old_ratebeer_id
5617,"United States, Missouri",Third Wheel Brewing,15,49391,31676
5752,"United States, Iowa",Albia Brewing Company,48,30714,17110
2665,Spain,Cerveza Dolina,9,43053,19496


### Check the previous split and concatenation operations

In [8]:
print(len(new_matched_brewery), len(new_matched_brewery_non_duplicate), len(new_matched_brewery_duplicate))
print(len(new_matched_brewery.old_ratebeer_id.unique()))

8235 8190 45
8235


### Number of beers

Now we need to count the number of beers that there really is. We assume that a beer can only be matched if its brewery is matched too.

In [9]:
new_matched_brewery_good_amount_rating = copy.deepcopy(new_matched_brewery)

# Count occurrences of each 'old_ratebeer_id' in 'beers_matched'
ratebeer_id_counts = beers_matched['brewery_id.1'].value_counts()

# Decrement 'nbr_beers' by the counts of each 'old_ratebeer_id' to remove the amount of beers that are already in the dataset
new_matched_brewery_good_amount_rating['nbr_beers'] -= new_matched_brewery_good_amount_rating['old_ratebeer_id'].map(ratebeer_id_counts).fillna(0).astype(int)

display(new_matched_brewery_good_amount_rating.head(3))

Unnamed: 0,location,name,nbr_beers,old_advocate_id,old_ratebeer_id
0,Northern Ireland,Strangford Lough,6,10093,4959
1,Northern Ireland,Sheelin,5,32848,17616
2,Northern Ireland,Walled City,8,40360,24866


In [10]:
breweries_ratebeer_solo = breweries_ratebeer_solo.rename(columns={'id': 'old_ratebeer_id'})
breweries_ratebeer_solo['old_advocate_id'] = np.nan
display(breweries_ratebeer_solo.sample(3))

breweries_advocate_solo = breweries_advocate_solo.rename(columns={'id': 'old_advocate_id'})
breweries_advocate_solo['old_ratebeer_id'] = np.nan
display(breweries_advocate_solo.sample(3))

full_breweries = pd.concat([new_matched_brewery_good_amount_rating,breweries_advocate_solo, breweries_ratebeer_solo], ignore_index=True)
full_breweries['id'] = range(1, len(full_breweries) + 1)
display(full_breweries.sample(10))

Unnamed: 0,old_ratebeer_id,location,name,nbr_beers,old_advocate_id
1583,20446,Canada,Dominion City Brewing Co.,54,
11767,23686,Latvia,Viedi,13,
17391,22826,"United States, Minnesota",Union Pizza & Brewing Company,5,


Unnamed: 0,old_advocate_id,location,name,nbr_beers,old_ratebeer_id
16192,1725,"United States, Virginia",Hilltop Brewing Company,21,
5725,1320,Australia,Scharers Little Brewery,3,
4,39916,Kyrgyzstan,Kellers Bier,2,


Unnamed: 0,location,name,nbr_beers,old_advocate_id,old_ratebeer_id,id
28677,"United States, New Mexico",Milagro Brewery,14,,3839.0,28678
26771,"United States, California",Armstrong Brewing Company,15,,16334.0,26772
5623,"United States, Missouri",Charleville Vineyard & Microbrewery,77,10549.0,8504.0,5624
26786,"United States, California",Bear Republic Brewing Company,253,,284.0,26787
27588,"United States, New York",Table 41 Brewing Company,7,,28084.0,27589
27052,"United States, California",The Great Beer Company,2,,11455.0,27053
12076,Netherlands,Delftse Stadsbrouwerij De Koperen Kat,12,47483.0,,12077
25774,Sweden,Dykes Brewery,9,,18356.0,25775
26433,Norway,Håndbryggeriet En Liten Øl,4,,28240.0,26434
21983,Wales,Bragdyr Bryn Cyf,3,,6267.0,21984


In [11]:
print(len(full_breweries))
print(len(breweries_matched),len(breweries_advocate),len(breweries_ratebeer))
print(-len(breweries_matched)+len(breweries_advocate)+len(breweries_ratebeer))

32666
8281 16758 24189
32666


In [12]:
full_breweries.to_csv(os.path.join(FULL_PATH,'breweries.csv'),index=False)

test_breweri = pd.read_csv(os.path.join(FULL_PATH,'breweries.csv'))
print(len(test_breweri))
del test_breweri

32666


## Users data

We prefer to consider the users approx rather than the users of the matched dataset. We saw that the users do have duplicates (unlike the normal file). We decided to take the different users in the approx file as a single user even for a lower similitude value (no sim value is below 0.8006407690254358). We consider the effect of this to be negligeable as it is a small percent of data. (like in the brewery file).

In [13]:
users_matched = pd.read_csv(os.path.join(MATCHED_PATH,'users_approx.csv'),header=1)
users_advocate = pd.read_csv(os.path.join(ADVOCATE_PATH,'users.csv'))
users_ratebeer = pd.read_csv(os.path.join(RATEBEER_PATH,'users.csv'))

ratings_matched = pd.read_csv(os.path.join(MATCHED_PATH,'ratings.csv'),header=1)#needed 


print("Length of the three datasets:\n-advocate:",len(users_advocate),"\n-matched:", len(users_matched),"\n-ratebeer:", len(users_ratebeer))

#Already delete the breweries presents in matched
users_ratebeer_solo = users_ratebeer[~users_ratebeer.user_name.isin(users_matched['user_name.1'])]
users_advocate_solo = users_advocate[~users_advocate.user_name.isin(users_matched['user_name'])]

print("New length of:\n-advocate:",len(users_advocate_solo),"\n-ratebeer:", len(users_ratebeer_solo))

display(users_matched.head(2))

Length of the three datasets:
-advocate: 153704 
-matched: 3341 
-ratebeer: 70174
New length of:
-advocate: 150388 
-ratebeer: 66833


Unnamed: 0,joined,location,nbr_ratings,nbr_reviews,user_id,user_name,user_name_lower,joined.1,location.1,nbr_ratings.1,user_id.1,user_name.1,user_name_lower.1,sim
0,1483009000.0,Spain,3,0,magicuenca.1185749,MAGICuenca,magicuenca,1484046000.0,Spain,89,442761,MAGICuenca91,magicuenca91,0.904534
1,1220868000.0,Germany,6,6,erzengel.248045,Erzengel,erzengel,1224324000.0,Germany,8781,83106,Erzengel,erzengel,1.0


In [14]:
users_duplicates_ratebeer = users_matched[users_matched['user_name.1'].duplicated(keep=False)]
users_duplicates_advocate = users_matched[users_matched['user_name'].duplicated(keep=False)]
users__not_duplicated_advocate = users_matched[~users_matched['user_name'].duplicated(keep=False)]#For later

print("Number of duplicated/tripled Ratbeer users",len(users_duplicates_ratebeer))
print("Number of duplicated/tripled Advocate users",len(users_duplicates_advocate))

Number of duplicated/tripled Ratbeer users 0
Number of duplicated/tripled Advocate users 47


Both advocate and ratebeer dataset have as columns ['user_id', 'location', 'user_name','user_name_lower', 'joined','nbr_ratings']. Advocate has nbr_reviews in addition. Matched has those columns with .1 referring to ratebeer. It also has a column 'sim' that we will drop. The end format we want to have: [general_id, old_user_id_advocate, old_user__id_ratebeer, location, user_name_lower, joined_advocate, joined_ratebeer and nbr_ratings]. We think that the other columns will not be usefull for our analysis. The next cell shows that the user_name_lower are equal between the two datasets. For the location we will in opposition to the brewery give priority to the advocate dataset, as it is this time the one having one correspondence to multiple ratebeer users.

In [15]:
display(users_duplicates_advocate.sample(3))
users_duplicates_advocate[users_duplicates_advocate['user_id']=='lonestar.677281'].head(2)

Unnamed: 0,joined,location,nbr_ratings,nbr_reviews,user_id,user_name,user_name_lower,joined.1,location.1,nbr_ratings.1,user_id.1,user_name.1,user_name_lower.1,sim
2280,1089886000.0,"United States, California",444,443,brewmaster.7302,BrewMaster,brewmaster,1309169000.0,"United States, California",1,130842,brewmaster99,brewmaster99,0.904534
821,1156068000.0,Canada,207,207,nighthawk.93517,nighthawk,nighthawk,1299150000.0,Canada,4,124623,nighthawk55,nighthawk55,0.894427
10,1245751000.0,England,32,32,leighton.343447,leighton,leighton,1209204000.0,England,19568,74136,Leighton,leighton,1.0


Unnamed: 0,joined,location,nbr_ratings,nbr_reviews,user_id,user_name,user_name_lower,joined.1,location.1,nbr_ratings.1,user_id.1,user_name.1,user_name_lower.1,sim
562,1337422000.0,"United States, Texas",1,1,lonestar.677281,Lonestar,lonestar,1081332000.0,"United States, Texas",13,11446,oneStar,onestar,0.822609
2059,1337422000.0,"United States, Texas",1,1,lonestar.677281,Lonestar,lonestar,1162984000.0,"United States, Texas",6,44744,LONESTAR,lonestar,1.0


In [16]:
#We are going to build a new dataset instead of playing with the dataset that contains every information for duplicates.

unique_advocate_user_user_name = users_duplicates_advocate['user_name'].unique()
new_matched_user_duplicate = pd.DataFrame(columns=['location', 'joined_advocate', 'old_user_id_advocate','joined_ratebeer','old_user_id_ratebeer','user_name','nbr_ratings_total'])

for user_name in unique_advocate_user_user_name:  # Replace with your actual loop condition
    # Define data for each row
    location = users_duplicates_advocate[users_duplicates_advocate['user_name']==user_name]['location'].iloc[0]#Always take the first as it is the same for the two/three of them
    joined_advocate = users_duplicates_advocate[users_duplicates_advocate['user_name']==user_name]['joined'].iloc[0]
    joined_ratebeer = users_duplicates_advocate[users_duplicates_advocate['user_name']==user_name]['joined.1'].min() #take min value of joined time (we want the first time they joined)

    old_user_id_advocate = users_duplicates_advocate[users_duplicates_advocate['user_name']==user_name]['user_id'].iloc[0]
    old_user_id_ratebeer = users_duplicates_advocate[users_duplicates_advocate['user_name']==user_name]['user_id.1']
    name = users_duplicates_advocate[users_duplicates_advocate['user_name']==user_name]['user_name'].iloc[0]

    list_old_ratebeer_id = []
    for id_ratbeer in (old_user_id_ratebeer):
        list_old_ratebeer_id.append(id_ratbeer)

    total_ratings_advocate = users_duplicates_advocate[users_duplicates_advocate['user_name']==user_name]['nbr_ratings'].iloc[0]
    total_ratings_ratebeer = users_duplicates_advocate[users_duplicates_advocate['user_name']==user_name]['nbr_ratings.1'].sum()

    nbr_ratings_total = total_ratings_advocate+ total_ratings_ratebeer

    # Append the new row to the DataFrame
    new_row = pd.DataFrame({'location': location, 'user_name': name,'old_user_id_ratebeer':[list_old_ratebeer_id],'old_user_id_advocate':old_user_id_advocate,'joined_advocate':joined_advocate,'joined_ratebeer':joined_ratebeer,'nbr_ratings_total':nbr_ratings_total})
    new_matched_user_duplicate = pd.concat([new_matched_user_duplicate, new_row], ignore_index=True)

new_matched_user_duplicate.sample(10)


  new_matched_user_duplicate = pd.concat([new_matched_user_duplicate, new_row], ignore_index=True)


Unnamed: 0,location,joined_advocate,old_user_id_advocate,joined_ratebeer,old_user_id_ratebeer,user_name,nbr_ratings_total
0,Spain,1483009000.0,magicuenca.1185749,1481108000.0,"[442761, 437310]",MAGICuenca,101
18,"United States, Wisconsin",1339409000.0,jro.680480,1238580000.0,"[179356, 89508]",JRO,32
14,Canada,1121422000.0,beers.29246,1185444000.0,"[58154, 241070, 130784]",Beers,13
3,Sweden,1260702000.0,mattias.403838,1076411000.0,"[116100, 10562]",Mattias,2771
9,Canada,1407578000.0,spikedlemon.842005,1075115000.0,"[10289, 128469]",spikedlemon,9
21,"United States, Ohio",1104318000.0,beerbeerbeerbeer.12454,1107256000.0,"[19505, 411715]",beerbeerbeerbeer,8
16,Norway,1334225000.0,morten.672301,1140606000.0,"[137013, 33840, 137302]",Morten,16
8,"United States, Massachusetts",1336298000.0,maximusmaximus.675527,1108897000.0,"[20099, 356497]",Maximusmaximus,112
1,England,1245751000.0,leighton.343447,1209204000.0,"[74136, 257478]",leighton,19601
17,Norway,1421492000.0,ketil.928679,1292584000.0,"[119220, 256180]",Ketil,8


In [17]:
users_matched_not_duplicated = users__not_duplicated_advocate.drop(columns=['nbr_reviews','user_name.1','user_name_lower','location','user_name_lower.1','sim'])
users_matched_not_duplicated = users_matched_not_duplicated.rename(columns={'joined': 'joined_advocate','joined.1': 'joined_ratebeer','user_id': 'old_user_id_advocate','user_id.1': 'old_user_id_ratebeer','location.1':'location'})
users_matched_not_duplicated["nbr_ratings_total"] = users_matched_not_duplicated.nbr_ratings+users_matched_not_duplicated["nbr_ratings.1"]
users_matched_not_duplicated = users_matched_not_duplicated.drop(columns=['nbr_ratings','nbr_ratings.1'])
users_matched_not_duplicated.sample(5)

Unnamed: 0,joined_advocate,old_user_id_advocate,user_name,joined_ratebeer,location,old_user_id_ratebeer,nbr_ratings_total
3336,1095847000.0,ypsifly.8833,ypsifly,1139569000.0,"United States, Michigan",33393,588
87,1330081000.0,matt.661184,Matt,1315476000.0,"United States, California",135122,750
931,1435658000.0,mambossa.1007501,mambossa,1418728000.0,"United States, Ohio",348869,238
1958,1147082000.0,kzoobrew.77815,kzoobrew,1282903000.0,"United States, Michigan",112421,557
282,1410862000.0,celikelf.864388,celikelf,1378030000.0,Turkey,277480,139


In [18]:
#Concat the two datasets together
new_matched_user = pd.concat([users_matched_not_duplicated, new_matched_user_duplicate], ignore_index=True)
new_matched_user.sample(3)

Unnamed: 0,joined_advocate,old_user_id_advocate,user_name,joined_ratebeer,location,old_user_id_ratebeer,nbr_ratings_total
1938,1417604000.0,whileseated.902760,whileseated,1240481000.0,"United States, Georgia",90310,3
948,1312884000.0,capman62.615020,Capman62,1313402000.0,"United States, New York",133682,25
662,1135076000.0,baggio.56031,baggio,1173092000.0,Sweden,50889,660


In [19]:
users_matched_not_duplicated_good_amount_ratings = copy.deepcopy(new_matched_user)

# Count occurrences of each 'old_ratebeer_id' in 'beers_matched'
ratings_matched_id_counts = ratings_matched['user_id'].value_counts()

# Decrement 'nbr_beers' by the counts of each 'old_ratebeer_id' to remove the amount of beers that are already in the dataset
users_matched_not_duplicated_good_amount_ratings['nbr_ratings_total'] -= users_matched_not_duplicated_good_amount_ratings['old_user_id_advocate'].map(ratings_matched_id_counts).fillna(0).astype(int)

print(new_matched_user.nbr_ratings_total.sum())
print(users_matched_not_duplicated_good_amount_ratings.nbr_ratings_total.sum())

1608147
1586183


In [20]:
# Refactor the dataframe to prepare for the merge
users_ratebeer_solo = users_ratebeer_solo.rename(columns={'user_id': 'old_user_id_ratebeer','joined':'joined_ratebeer','nbr_ratings':'nbr_ratings_total'})
users_ratebeer_solo['old_user_id_advocate'] = np.nan
users_ratebeer_solo['joined_advocate'] = np.nan

users_advocate_solo = users_advocate_solo.rename(columns={'user_id': 'old_user_id_advocate','joined':'joined_advocate','nbr_ratings':'nbr_ratings_total'})
users_advocate_solo['old_user_id_ratebeer'] = np.nan
users_advocate_solo['joined_ratebeer'] = np.nan

# Check the former operations
print("Sample of users only present in Ratebeer:")
display(users_ratebeer_solo.sample(3))

print("Sample of users only present in BeerAdvocate:")
display(users_advocate_solo.sample(3))

# Merge the three dataframes
full_users = pd.concat([users_matched_not_duplicated_good_amount_ratings,users_ratebeer_solo, users_advocate_solo], ignore_index=True)
full_users['id'] = range(1, len(full_users) + 1)

# Check the former operation
print("Sample of all users:")
display(full_users.sample(2))

print("Total number of users: ", len(full_users))
print("Number of users present in both datasets: ", len(users_matched))
print("Number of users present in BeerAdvocate: ",len(users_advocate))
print("Number of users present in RateBeer: ",len(users_ratebeer))
print("Sum of all the users (for chekcing purposes): ", len(users_matched_not_duplicated_good_amount_ratings)+len(users_advocate_solo)+len(users_ratebeer_solo))

Sample of users only present in Ratebeer:


Unnamed: 0,nbr_ratings_total,old_user_id_ratebeer,user_name,joined_ratebeer,location,old_user_id_advocate,joined_advocate
37218,5,3758,KiwiKid2199,1016276000.0,,,
50022,3,2169,bunyan,1005131000.0,"United States, Connecticut",,
3217,23,3147,jmusial99,1012648000.0,,,


Sample of users only present in BeerAdvocate:


Unnamed: 0,nbr_ratings_total,nbr_reviews,old_user_id_advocate,user_name,joined_advocate,location,old_user_id_ratebeer,joined_ratebeer
145177,1,1,muddywolf.1041998,MuddyWolf,1442138000.0,"United States, Washington",,
78416,19,3,haoletoyou.790623,HaoleToYou,1395832000.0,"United States, Hawaii",,
8606,16,16,opalchemist.511761,opalchemist,1286446000.0,United Arab Emirates,,


Sample of all users:


Unnamed: 0,joined_advocate,old_user_id_advocate,user_name,joined_ratebeer,location,old_user_id_ratebeer,nbr_ratings_total,nbr_reviews,id
187819,1376042000.0,wraynathan.747765,wraynathan,,"United States, Pennsylvania",,5,5.0,187820
59889,,,dgarratt,1374142000.0,"United States, New York",269908.0,1,,59890


Total number of users:  220537
Number of users present in both datasets:  3341
Number of users present in BeerAdvocate:  153704
Number of users present in RateBeer:  70174
Sum of all the users (for chekcing purposes):  220537


In [21]:
test = full_users[full_users['user_name'].duplicated(keep=False)]
test[test['user_name']=='Elwood']

Unnamed: 0,joined_advocate,old_user_id_advocate,user_name,joined_ratebeer,location,old_user_id_ratebeer,nbr_ratings_total,nbr_reviews,id
990,1152871000.0,elwood.88673,Elwood,1374314000.0,Canada,270235,6,,991
4952,,,Elwood,1235041000.0,"United States, Virginia",87609,2966,,4953


Oh so in different dataset people can have the same user_name. We need to be carefull about this. If we treat the data we need to use id.

In [22]:
# Save the dataframe to a csv file
full_users.to_csv(os.path.join(FULL_PATH,'users.csv'),index=False)

# Check the former operation
test_user = pd.read_csv(os.path.join(FULL_PATH,'users.csv'))
assert len(test_user) == len(full_users), f"Expected {len(full_users)}, but got {len(test_user)}"
del test_user

  test_user = pd.read_csv(os.path.join(FULL_PATH,'users.csv'))


## Beer data

In this dataset we saw that there were no duplicates. We do not need to make the carefull analysis we made before. However we still need to look at the matched dataset. We also need to link the new brewery id, but we also need to count the number of ratings.

In [23]:
beers_matched = pd.read_csv(os.path.join(MATCHED_PATH,'beers.csv'),header=1)
beers_advocate = pd.read_csv(os.path.join(ADVOCATE_PATH,'beers.csv'))
beers_ratebeer = pd.read_csv(os.path.join(RATEBEER_PATH,'beers.csv'))

ratings_matched = pd.read_csv(os.path.join(MATCHED_PATH,'ratings.csv'),header=1)#needed 
full_breweries_for_beers = pd.read_csv(os.path.join(FULL_PATH,'breweries.csv'))#needed


print("Length of the three datasets:\n-advocate:",len(beers_advocate),"\n-matched:", len(beers_matched),"\n-ratebeer:", len(beers_ratebeer))

#Already delete the breweries presents in matched
beers_ratebeer_solo = beers_ratebeer[~beers_ratebeer.beer_id.isin(beers_matched['beer_id.1'])]
beers_advocate_solo = beers_advocate[~beers_advocate.beer_id.isin(beers_matched['beer_id'])]

print("New length of:\n-advocate:",len(beers_advocate_solo),"\n-ratebeer:", len(beers_ratebeer_solo))

Length of the three datasets:
-advocate: 280823 
-matched: 45640 
-ratebeer: 442081
New length of:
-advocate: 235183 
-ratebeer: 396441


In [24]:
display(beers_matched.sample(2))
beers_new = beers_matched.drop(columns=['avg','beer_wout_brewery_name.1','avg.1','avg_matched_valid_ratings.1','nbr_reviews','beer_name.1','brewery_name.1','brewery_name','avg_computed','avg_computed.1','avg_matched_valid_ratings','ba_score','beer_wout_brewery_name','sim','diff','zscore','zscore.1','overall_score','style_score','nbr_matched_valid_ratings','nbr_matched_valid_ratings.1'])
display(beers_new.sample(2))
print((beers_new['abv'] == beers_new['abv.1']).all())

Unnamed: 0,abv,avg,avg_computed,avg_matched_valid_ratings,ba_score,beer_id,beer_name,beer_wout_brewery_name,brewery_id,brewery_name,...,brewery_id.1,brewery_name.1,nbr_matched_valid_ratings.1,nbr_ratings.1,overall_score,style.1,style_score,zscore.1,diff,sim
26075,5.5,3.82,3.86,3.65,,35111,Woolslayer Alt,Alt Woolslayer,603,Church Brew Works,...,620,Church Brew Works,5,5,,Altbier,,-0.170699,0.565198,1.0
38106,5.3,3.5,3.5,,,139201,Mocha Porter,Mocha Porter,30567,Rockford Brewing Company,...,15758,Rockford Brewing Company,1,1,,Porter,,-0.347271,0.610818,1.0


Unnamed: 0,abv,beer_id,beer_name,brewery_id,bros_score,nbr_ratings,style,abv.1,beer_id.1,brewery_id.1,nbr_ratings.1,style.1
2672,6.0,257429,Hop Hooligans Crowd Control,45376,,11,American IPA,6.0,465981,27019,21,India Pale Ale (IPA)
11568,10.5,38684,Trumpet,8284,,2,English Barleywine,10.5,78591,1774,24,Barley Wine


True


The alcohol by volume is exactly the same between the two datasets.

In [25]:
#We are going to build a new dataset instead of playing with the dataset that contains every information for duplicates.
#Not sure about valid ratings so prefer to keep it as it is for now
new_matched_beer = pd.DataFrame(columns=['abv', 'old_beer_id_advocate', 'old_beer_id_ratebeer','beer_name','brewery_id','bros_score','nbr_ratings','style_advocate','style_ratebeer'])

# Use a list to collect rows and create the DataFrame at once
rows = []

for index, row in beers_new.iterrows():
    # Define data for each row
    abv = row['abv']
    beer_id_advocate = row['beer_id']
    beer_id_ratebeer = row['beer_id.1']
    beer_name = row['beer_name']
    bros_score = row['bros_score']
    style_advocate = row['style']
    style_ratebeer = row['style.1']

    brewery_id = full_breweries_for_beers[full_breweries_for_beers['old_ratebeer_id']==row['brewery_id.1']]['id'].values[0]

    total_ratings_advocate = row['nbr_ratings']
    total_ratings_ratebeer = row['nbr_ratings.1']

    nbr_ratings_total = total_ratings_advocate + total_ratings_ratebeer

    # Append the new row to the list
    rows.append({'abv': abv, 'old_beer_id_advocate': beer_id_advocate, 'old_beer_id_ratebeer': beer_id_ratebeer, 'beer_name': beer_name, 'brewery_id': brewery_id, 'bros_score': bros_score, 'nbr_ratings': nbr_ratings_total, 'style_advocate': style_advocate, 'style_ratebeer': style_ratebeer})

# Create the DataFrame from the list of rows
new_matched_beer = pd.DataFrame(rows)

new_matched_beer.head(10)

Unnamed: 0,abv,old_beer_id_advocate,old_beer_id_ratebeer,beer_name,brewery_id,bros_score,nbr_ratings,style_advocate,style_ratebeer
0,4.8,19827,37923,Legbiter,1,80.0,164,English Pale Ale,Golden Ale/Blond Ale
1,6.0,20841,41286,St. Patrick's Ale,1,,19,English Pale Ale,Irish Ale
2,4.2,20842,41287,St. Patrick's Best,1,90.0,138,English Bitter,Bitter
3,4.8,22659,41285,St. Patrick's Gold,1,,5,American Pale Wheat Ale,Amber Ale
4,4.5,178681,230283,Sheelin Stout,2,,2,Irish Dry Stout,Mild Ale
5,4.2,178689,368966,Boom,3,,3,American Pale Ale (APA),American Pale Ale
6,4.6,169948,155699,Bally Black Stout,4,,6,Irish Dry Stout,Stout
7,5.2,169950,160664,Pig Island Pale Ale,4,,4,English Pale Ale,Bitter
8,4.4,169949,177517,Rockin’ Goose,4,,3,English Pale Mild Ale,Irish Ale
9,4.9,169951,299091,Scrabo Gold,4,,2,English Pale Ale,Golden Ale/Blond Ale


In [26]:
new_matched_beer_good_amount_ratings = copy.deepcopy(new_matched_beer)

# Count occurrences of each 'old_ratebeer_id' in 'beers_matched'
ratings_matched_beer_id_counts = ratings_matched['beer_id'].value_counts()

# Decrement 'nbr_beers' by the counts of each 'old_ratebeer_id' to remove the amount of beers that are already in the dataset
new_matched_beer_good_amount_ratings['nbr_ratings'] -= new_matched_beer_good_amount_ratings['old_beer_id_advocate'].map(ratings_matched_beer_id_counts).fillna(0).astype(int)

print(new_matched_beer.nbr_ratings.sum())
print(new_matched_beer_good_amount_ratings.nbr_ratings.sum())

1976606
1954642


In [27]:
beers_ratebeer_solo_new = beers_ratebeer_solo.copy()
beers_ratebeer_solo_new = beers_ratebeer_solo_new.drop(columns=['brewery_name','overall_score','style_score','avg','avg_computed','zscore','nbr_matched_valid_ratings','avg_matched_valid_ratings'])
beers_ratebeer_solo_new = beers_ratebeer_solo_new.rename(columns={'beer_id': 'old_beer_id_ratebeer','style':'style_ratebeer'})

beers_ratebeer_solo_new = pd.merge(beers_ratebeer_solo_new,full_breweries_for_beers[['old_ratebeer_id', 'id']],how='left', left_on='brewery_id',right_on='old_ratebeer_id')
beers_ratebeer_solo_new = beers_ratebeer_solo_new.drop(columns=['brewery_id','old_ratebeer_id'])
beers_ratebeer_solo_new = beers_ratebeer_solo_new.rename(columns={'id': 'brewery_id'})

beers_ratebeer_solo_new['old_beer_id_advocate'] = np.nan
beers_ratebeer_solo_new['bros_score'] = np.nan
beers_ratebeer_solo_new['style_advocate']=np.nan

beers_ratebeer_solo_new.sample(3)

Unnamed: 0,old_beer_id_ratebeer,beer_name,style_ratebeer,nbr_ratings,abv,brewery_id,old_beer_id_advocate,bros_score,style_advocate
62816,177160,Bucher Hefe-Weizen,German Hefeweizen,1,5.0,20577,,,
67451,139208,Engel Kicker,Oktoberfest/Märzen,31,5.6,2139,,,
176956,265912,Lazy Boy Blonde Ale,Golden Ale/Blond Ale,1,5.0,4462,,,


In [28]:
display(beers_advocate_solo.sample(1))
beers_advocate_solo_new = copy.deepcopy(beers_advocate_solo)
beers_advocate_solo_new = beers_advocate_solo_new.drop(columns=['brewery_name','nbr_reviews','ba_score','avg','avg_computed','zscore','nbr_matched_valid_ratings','avg_matched_valid_ratings'])
beers_advocate_solo_new = beers_advocate_solo_new.rename(columns={'beer_id': 'old_beer_id_advocate','style':'style_advocate'})

display(full_breweries_for_beers[full_breweries_for_beers['old_ratebeer_id']==20891].head(2))

# Create a dictionary that links each 'id' to 'old_ratebeer_id' and 'old_advocate_id'
id_dict = full_breweries_for_beers.groupby('id').agg({
    'old_ratebeer_id': 'first',
    'old_advocate_id': lambda x: list(x)
}).to_dict('index')

print(str(id_dict[8191]['old_ratebeer_id']))

beers_advocate_solo_new['brewery_id'] = beers_advocate_solo_new['brewery_id'].astype(str)
full_breweries_for_beers['old_advocate_id'] = full_breweries_for_beers['old_advocate_id'].astype(str)
full_breweries_for_beers['id'] = full_breweries_for_beers['id'].astype(str)

beers_advocate_solo_new = pd.merge(beers_advocate_solo_new,full_breweries_for_beers[['old_advocate_id', 'id']],how='left', left_on='brewery_id',right_on='old_advocate_id')
beers_advocate_solo_new = beers_advocate_solo_new.drop(columns=['brewery_id','old_advocate_id'])
beers_advocate_solo_new = beers_advocate_solo_new.rename(columns={'id': 'brewery_id'})

beers_advocate_solo_new['old_beer_id_ratebeer'] = np.nan
beers_advocate_solo_new['style_ratebeer']=np.nan

display(beers_advocate_solo_new.head())

Unnamed: 0,beer_id,beer_name,brewery_id,brewery_name,style,nbr_ratings,nbr_reviews,avg,ba_score,bros_score,abv,avg_computed,zscore,nbr_matched_valid_ratings,avg_matched_valid_ratings
132421,183694,Rind & Pepperberry,18120,Trinity Brewing Company,Saison / Farmhouse Ale,1,0,3.91,,,6.8,3.91,,0,


Unnamed: 0,location,name,nbr_beers,old_advocate_id,old_ratebeer_id,id
8190,England,Seven Bro7hers,10,"[37180, 45243]",20891.0,8191


20891.0


Unnamed: 0,old_beer_id_advocate,beer_name,style_advocate,nbr_ratings,bros_score,abv,brewery_id,old_beer_id_ratebeer,style_ratebeer
0,166064,Nashe Moskovskoe,Euro Pale Lager,0,,4.7,8236,,
1,166065,Nashe Pivovskoe,Euro Pale Lager,0,,3.8,8236,,
2,166066,Nashe Shakhterskoe,Euro Pale Lager,0,,4.8,8236,,
3,166067,Nashe Zhigulevskoe,Euro Pale Lager,0,,4.0,8236,,
4,166063,Zhivoe,Euro Pale Lager,0,,4.5,8236,,


In [29]:
full_beers = pd.concat([new_matched_beer_good_amount_ratings,beers_ratebeer_solo_new, beers_advocate_solo_new], ignore_index=True)
full_beers['id'] = range(1, len(full_beers) + 1)
full_beers.sample(2)

Unnamed: 0,abv,old_beer_id_advocate,old_beer_id_ratebeer,beer_name,brewery_id,bros_score,nbr_ratings,style_advocate,style_ratebeer,id
677120,6.1,61458.0,,Hop Farmer IPA,16487,,3,American IPA,,677121
493734,5.7,251670.0,,Leon Steiner Premium Lager Beer Unpasteurised,12343,,3,Euro Pale Lager,,493735


In [30]:
print(len(full_beers))
print(len(beers_matched),len(beers_advocate),len(beers_ratebeer))
print(len(new_matched_beer_good_amount_ratings)+len(beers_advocate_solo_new)+len(beers_ratebeer_solo_new))

677264
45640 280823 442081
677264


In [31]:
full_beers.to_csv(os.path.join(FULL_PATH,'beers.csv'),index=False)

In [32]:
test_beer = pd.read_csv(os.path.join(FULL_PATH,'beers.csv'))
assert len(test_beer) == len(full_beers), f"Expected {len(full_beers)}, but got {len(test_beer)}"
del test_beer

  test_beer = pd.read_csv(os.path.join(FULL_PATH,'beers.csv'))


## Ratings

In [2]:
ratings_matched = pd.read_csv(os.path.join(MATCHED_PATH,'ratings.csv'),header=1)
ratings_advocate = pd.read_csv(os.path.join(ADVOCATE_PATH,'ratings-advocate.csv'))
ratings_ratebeer = pd.read_csv(os.path.join(RATEBEER_PATH,'ratings.csv'))

print("Length of the three datasets:\n-advocate:",len(ratings_advocate),"\n-matched:", len(ratings_matched),"\n-ratebeer:", len(ratings_ratebeer))

Length of the three datasets:
-advocate: 8393032 
-matched: 21964 
-ratebeer: 7122074


In [3]:
full_users = pd.read_csv(os.path.join(FULL_PATH,'users.csv'))
full_beers = pd.read_csv(os.path.join(FULL_PATH,'beers.csv'))
full_breweries = pd.read_csv(os.path.join(FULL_PATH,'breweries.csv'))

  full_users = pd.read_csv(os.path.join(FULL_PATH,'users.csv'))
  full_beers = pd.read_csv(os.path.join(FULL_PATH,'beers.csv'))


In [4]:
ratings_advocate = ratings_advocate.drop(columns=['review','beer_name','brewery_name','style','user_name','abv'])
ratings_matched = ratings_matched.drop(columns=['review','beer_name','beer_name.1','brewery_name','brewery_name.1','style','style.1','user_name','user_name.1','abv','abv.1'])
ratings_ratebeer = ratings_ratebeer.drop(columns=['beer_name','brewery_name','style','user_name','abv'])

In [5]:
display(ratings_advocate.head(10))
display(ratings_ratebeer.head(10))
display(ratings_matched.head(10))

Unnamed: 0,beer_id,brewery_id,date,user_id,appearance,aroma,palate,taste,overall,rating,text
0,142544,37262,1440064800,nmann08.184925,3.25,2.75,3.25,2.75,3.0,2.88,"From a bottle, pours a piss yellow color with ..."
1,19590,10093,1235127600,stjamesgate.163714,3.0,3.5,3.5,4.0,3.5,3.67,Pours pale copper with a thin head that quickl...
2,19590,10093,1142247600,mdagnew.19527,4.0,3.5,3.5,4.0,3.5,3.73,"500ml Bottle bought from The Vintage, Antrim....."
3,19590,10093,1101898800,helloloser12345.10867,4.0,3.5,4.0,4.0,4.5,3.98,Serving: 500ml brown bottlePour: Good head wit...
4,19590,10093,1093860000,cypressbob.3708,4.0,4.0,4.0,4.0,4.0,4.0,"500ml bottlePours with a light, slightly hazy ..."
5,19827,10093,1417431600,hellpop65.48993,,,,,,3.25,
6,19827,10093,1401357600,latarnik.52897,,,,,,3.5,
7,19827,10093,1393412400,rochefortchris.697017,,,,,,3.5,
8,19827,10093,1392030000,okcnittany.144868,,,,,,3.75,
9,19827,10093,1390647600,jaydoc.265507,,,,,,3.25,


Unnamed: 0,beer_id,brewery_id,date,user_id,appearance,aroma,palate,taste,overall,rating,text
0,410549,3198,1461664800,175852,2,4,2,4,8,2.0,"Puszka 0,33l dzięki Christoph . Kolor jasnozło..."
1,105273,3198,1487329200,442761,2,3,2,4,8,1.9,Cerveza pale lager gabonesa. MÃ¡s floja que la...
2,105273,3198,1466762400,288889,3,3,2,3,5,1.6,"Kolor- złoty, klarowny. Piana - drobna, średni..."
3,105273,3198,1451646000,250510,4,3,1,2,5,1.5,"Botella, de GabÃ³n regalo familiar.31/01/2015C..."
4,105273,3198,1445594400,122778,2,4,2,4,7,1.9,Many thanks for this beer to Erzengel. Pours l...
5,105273,3198,1444644000,227834,2,3,2,3,9,1.9,Many thanks to Erzengel for sharing! Pours cle...
6,105273,3198,1444212000,83106,4,5,4,5,8,2.6,Many thanks to Travlr for this can! Grainy no...
7,105273,3198,1430820000,175852,2,4,2,3,6,1.7,"Puszka pita w Gabonie. Kolor jasnosłomkowy, pi..."
8,105273,3198,1401357600,37316,3,5,2,3,7,2.0,"330ml bottle. 29-05-2014.From Gabon, courtesy ..."
9,105273,3198,1374141600,75452,2,3,3,4,6,1.8,Light yellow with quickly disappearing foam to...


Unnamed: 0,appearance,aroma,beer_id,brewery_id,date,overall,palate,rating,taste,text,...,aroma.1,beer_id.1,brewery_id.1,date.1,overall.1,palate.1,rating.1,taste.1,text.1,user_id.1
0,4.5,4.5,645,207,1324810800,5.0,4.5,4.8,5.0,Best before 27.07.2016Directly reviewed in com...,...,10.0,2360,406,1387710000,19.0,4.0,4.6,9.0,a) Geruch malzig-schwer-sÃ¼Ã. Riecht schon ...,83106
1,,,28191,9369,1322650800,,,3.0,,,...,3.0,17109,2921,1322564400,6.0,2.0,1.7,4.0,"Can. Weak and watery, not the best beer of the...",91324
2,3.5,3.5,57911,388,1344074400,4.0,4.0,3.85,4.0,"Bottle @ One Pint Pub, Helsinki. 2006 vintage....",...,8.0,35298,1069,1353582000,17.0,4.0,4.1,8.0,"Bottle @ One Pint Pub, Helsinki. Originally ra...",98624
3,4.0,3.5,57913,388,1344074400,4.0,4.0,3.68,3.5,"Originally rated on 16.11.2009, draught @ Pikk...",...,8.0,113596,1069,1416222000,16.0,4.0,4.1,9.0,"Draught @Â Pikkulintu, Helsinki, Finland. A pr...",98624
4,4.0,4.0,81125,2216,1346234400,4.0,4.0,4.0,4.0,"750ml bottle, originally rated on 18.8.2012.Bo...",...,8.0,173481,2058,1345284000,16.0,4.0,4.0,8.0,750ml bottleBottling date: 2011/02/17 - Pours ...,98624
5,4.0,4.0,67932,388,1344074400,4.0,4.5,4.05,4.0,"375ml bottle @ Pikkulintu, Helsinki. Originall...",...,8.0,59194,1069,1353754800,16.0,5.0,4.1,8.0,"375ml bottle @ Pikkulintu, HelsinkiPours orang...",98624
6,3.5,3.0,32787,11941,1217498400,4.0,2.5,2.98,2.5,Got this beer from my friend who just visited ...,...,6.0,12040,2242,1231585200,11.0,2.0,2.7,4.0,500 ml can. From Hungary (H).Nice dark brown c...,82173
7,4.25,4.25,171095,187,1438164000,4.25,4.25,4.35,4.5,Bottle shared in London - many thanks to Paul....,...,9.0,330384,62,1438164000,18.0,4.0,4.3,8.0,Bottle shared in London - many thanks to Paul....,74136
8,4.75,4.75,81697,22511,1384858800,4.75,4.75,4.75,4.75,"Bottle at THE Sour and Saison Tasting, London....",...,9.0,175070,11233,1351159200,19.0,5.0,4.6,9.0,"Bottle at THE Sour and Saison Tasting, London....",74136
9,4.0,4.0,76421,23222,1456830000,4.0,4.0,4.0,4.0,Draft. Pours oily black with a creamy brown he...,...,8.0,151817,11242,1455620400,17.0,3.0,4.0,8.0,Draft at RateBeer Best 2016. Pours oily black ...,74136


In contradiction to the previous method, we would like to keep both comments and grades given the the datasets for a macthed comments. We want to do this as the text might differ and the grades too. First we just want to add a column 'dataset' to remind from which dataset the data contain. We also want to add a comment duplicate, which will link the comment 'id' to its matched comment. Naturally a comment id column would be added too. Finally the columns beer_id, brewery_id et user_id would be link to the one of the new id given in full_users, full_breweries and full_beers.

In [6]:
ratings_ratebeer['dataset'] = 'rb'
ratings_advocate['dataset'] = 'ad'

final_id_ratebeer = len(ratings_ratebeer) + 1
first_id_advocate = final_id_ratebeer
final_id_advocate = first_id_advocate + len(ratings_advocate)
ratings_ratebeer['id_rating'] = range(1, final_id_ratebeer) #For the moment call it id_rating, rename it later 
ratings_advocate['id_rating'] = range(first_id_advocate, final_id_advocate)

assert ratings_ratebeer.iloc[len(ratings_ratebeer)-1]['id_rating']!=ratings_advocate.iloc[0]['id_rating']

In [7]:
display(ratings_ratebeer.head(1))
display(ratings_ratebeer.tail(1))
display(ratings_advocate.head(1))
display(ratings_advocate.tail(1))

display(full_breweries[full_breweries['old_ratebeer_id']==3198])

Unnamed: 0,beer_id,brewery_id,date,user_id,appearance,aroma,palate,taste,overall,rating,text,dataset,id_rating
0,410549,3198,1461664800,175852,2,4,2,4,8,2.0,"Puszka 0,33l dzięki Christoph . Kolor jasnozło...",rb,1


Unnamed: 0,beer_id,brewery_id,date,user_id,appearance,aroma,palate,taste,overall,rating,text,dataset,id_rating
7122073,220898,17155,1385895600,10233,4,4,2,4,8,2.2,"Tap @brewpub, TiraneDark brown color, nice cre...",rb,7122074


Unnamed: 0,beer_id,brewery_id,date,user_id,appearance,aroma,palate,taste,overall,rating,text,dataset,id_rating
0,142544,37262,1440064800,nmann08.184925,3.25,2.75,3.25,2.75,3.0,2.88,"From a bottle, pours a piss yellow color with ...",ad,7122075


Unnamed: 0,beer_id,brewery_id,date,user_id,appearance,aroma,palate,taste,overall,rating,text,dataset,id_rating
8393031,19140,885,1140346800,dithyramb.4413,3.0,3.0,3.0,3.0,3.0,3.0,Dark brown brew served on tap at the brewpub. ...,ad,15515106


Unnamed: 0,location,name,nbr_beers,old_advocate_id,old_ratebeer_id,id
16712,Gabon,Sobraga,3,,3198.0,16713


Give new brewery id

In [8]:
ratings_ratebeer = ratings_ratebeer.merge(full_breweries[['old_ratebeer_id', 'id']],how='left',left_on='brewery_id',right_on='old_ratebeer_id')
ratings_ratebeer = ratings_ratebeer.drop(columns='old_ratebeer_id')
ratings_ratebeer = ratings_ratebeer.rename(columns={'id':'id_brewery'})

display(ratings_ratebeer.head(1))

Unnamed: 0,beer_id,brewery_id,date,user_id,appearance,aroma,palate,taste,overall,rating,text,dataset,id_rating,id_brewery
0,410549,3198,1461664800,175852,2,4,2,4,8,2.0,"Puszka 0,33l dzięki Christoph . Kolor jasnozło...",rb,1,16713


In [None]:
display(full_breweries.tail(1))
display(full_breweries.iloc[32665])

full_breweries['old_advocate_id'] = full_breweries['old_advocate_id'].apply(
    lambda x: [x] if pd.notna(x) and not isinstance(x, list) else x if isinstance(x, list) else []
)

print(full_breweries['old_advocate_id'].apply(type).unique())

Unnamed: 0,location,name,nbr_beers,old_advocate_id,old_ratebeer_id,id
32665,Albania,Rozafa Brewery,1,[],9928.0,32666


location                  Albania
name               Rozafa Brewery
nbr_beers                       1
old_advocate_id                []
old_ratebeer_id            9928.0
id                          32666
Name: 32665, dtype: object

[<class 'list'>]
<bound method IndexOpsMixin.value_counts of 0        [10093]
1        [32848]
2        [40360]
3        [40309]
4        [41205]
          ...   
32661         []
32662         []
32663         []
32664         []
32665         []
Name: old_advocate_id, Length: 32666, dtype: object>


  lambda x: [x] if pd.notna(x) and not isinstance(x, list) else x if isinstance(x, list) else []


In [10]:
full_breweries_for_ratings_exploded = full_breweries.explode('old_advocate_id')

display(full_breweries_for_ratings_exploded.head(1))

ratings_advocate['brewery_id'] = ratings_advocate['brewery_id'].astype(str)
full_breweries_for_ratings_exploded['old_advocate_id'] = full_breweries_for_ratings_exploded['old_advocate_id'].astype(str)
full_breweries_for_ratings_exploded['id'] = full_breweries_for_ratings_exploded['id'].astype(str)

ratings_advocate = ratings_advocate.merge(full_breweries_for_ratings_exploded[['old_advocate_id', 'id']],how='left',left_on='brewery_id',right_on='old_advocate_id')
ratings_advocate = ratings_advocate.drop(columns='old_advocate_id')
ratings_advocate = ratings_advocate.rename(columns={'id':'id_brewery'})

Unnamed: 0,location,name,nbr_beers,old_advocate_id,old_ratebeer_id,id
0,Northern Ireland,Strangford Lough,6,10093,4959.0,1


Give new user id

In [11]:
full_users['old_user_id_ratebeer'] = full_users['old_user_id_ratebeer'].apply(
    lambda x: [x] if pd.notna(x) and not isinstance(x, list) else x if isinstance(x, list) else []
)

full_users_for_ratings_exploded = full_users.explode('old_user_id_ratebeer')

ratings_ratebeer['user_id'] = ratings_ratebeer['user_id'].astype(str)
full_users_for_ratings_exploded['old_user_id_ratebeer'] = full_users_for_ratings_exploded['old_user_id_ratebeer'].astype(str)
full_users_for_ratings_exploded['id'] = full_users_for_ratings_exploded['id'].astype(str)

ratings_ratebeer = ratings_ratebeer.merge(full_users_for_ratings_exploded[['old_user_id_ratebeer', 'id']],how='left',left_on='user_id',right_on='old_user_id_ratebeer')
ratings_ratebeer = ratings_ratebeer.drop(columns='old_user_id_ratebeer')
ratings_ratebeer = ratings_ratebeer.rename(columns={'id':'id_user'})

In [12]:
ratings_advocate = ratings_advocate.merge(full_users[['old_user_id_advocate', 'id']],how='left',left_on='user_id',right_on='old_user_id_advocate')
ratings_advocate = ratings_advocate.drop(columns='old_user_id_advocate')
ratings_advocate = ratings_advocate.rename(columns={'id':'id_user'})

Give new beer id

In [13]:
ratings_ratebeer = ratings_ratebeer.merge(full_breweries[['old_ratebeer_id', 'id']],how='left',left_on='beer_id',right_on='old_ratebeer_id')
ratings_ratebeer = ratings_ratebeer.drop(columns='old_ratebeer_id')
ratings_ratebeer = ratings_ratebeer.rename(columns={'id':'id_beer'})

In [None]:
full_breweries['old_advocate_id'] = full_breweries['old_advocate_id'].apply(
    lambda x: [x] if pd.notna(x) and not isinstance(x, list) else x if isinstance(x, list) else []
)

print(full_breweries['old_advocate_id'].value_counts())

full_users_for_ratings_exploded = full_breweries.explode('old_advocate_id')

ratings_advocate['beer_id'] = ratings_advocate['beer_id'].astype(str)
full_users_for_ratings_exploded['old_advocate_id'] = full_users_for_ratings_exploded['old_advocate_id'].astype(str)
full_users_for_ratings_exploded['id'] = full_users_for_ratings_exploded['id'].astype(str)

ratings_advocate = ratings_advocate.merge(full_users_for_ratings_exploded[['old_advocate_id', 'id']],how='left',left_on='beer_id',right_on='old_advocate_id')
ratings_advocate = ratings_advocate.drop(columns='old_advocate_id')
ratings_advocate = ratings_advocate.rename(columns={'id':'id_beer'})

  lambda x: [x] if pd.notna(x) and not isinstance(x, list) else x if isinstance(x, list) else []


old_advocate_id
[]         15954
[28398]        1
[28289]        1
[6484]         1
[9090]         1
           ...  
[45336]        1
[37910]        1
[43115]        1
[35855]        1
[31831]        1
Name: count, Length: 16713, dtype: int64


In [15]:
ratings_advocate_matched = copy.deepcopy(ratings_advocate)
ratings_ratebeer_matched = copy.deepcopy(ratings_ratebeer)

ratings_advocate_matched['matched'] = np.nan
ratings_ratebeer_matched['matched'] = np.nan

# Create a dictionary to map (user_id, beer_id) to id_rating for both datasets
advocate_dict = ratings_advocate_matched.set_index(['user_id', 'beer_id'])['id_rating'].to_dict()
ratebeer_dict = ratings_ratebeer_matched.set_index(['user_id', 'beer_id'])['id_rating'].to_dict()

for index, row in ratings_matched.iterrows():
    beer_advocate = row.beer_id
    beer_ratebeer = row['beer_id.1']
    user_advocate = row.user_id
    user_ratebeer = row['user_id.1']

    id_advocate = advocate_dict.get((user_advocate, beer_advocate))
    id_ratebeer = ratebeer_dict.get((user_ratebeer, beer_ratebeer))

    if id_advocate is not None and id_ratebeer is not None:
        ratings_advocate_matched.loc[ratings_advocate_matched['id_rating'] == id_advocate, 'matched'] = id_ratebeer
        ratings_ratebeer_matched.loc[ratings_ratebeer_matched['id_rating'] == id_ratebeer, 'matched'] = id_advocate

# Drop unnecessary columns and rename 'id_rating' to 'id'
ratings_advocate_matched = ratings_advocate_matched.drop(columns=['beer_id', 'brewery_id', 'user_id'])
ratings_ratebeer_matched = ratings_ratebeer_matched.drop(columns=['beer_id', 'brewery_id', 'user_id'])

ratings_advocate_matched = ratings_advocate_matched.rename(columns={'id_rating': 'id'})
ratings_ratebeer_matched = ratings_ratebeer_matched.rename(columns={'id_rating': 'id'})


In [16]:
ratings_advocate_matched.head(10)

Unnamed: 0,date,appearance,aroma,palate,taste,overall,rating,text,dataset,id,id_brewery,id_user,id_beer,matched
0,1440064800,3.25,2.75,3.25,2.75,3.0,2.88,"From a bottle, pours a piss yellow color with ...",ad,7122075,8244,2754,,
1,1235127600,3.0,3.5,3.5,4.0,3.5,3.67,Pours pale copper with a thin head that quickl...,ad,7122076,1,70150,,
2,1142247600,4.0,3.5,3.5,4.0,3.5,3.73,"500ml Bottle bought from The Vintage, Antrim.....",ad,7122077,1,70151,,
3,1101898800,4.0,3.5,4.0,4.0,4.5,3.98,Serving: 500ml brown bottlePour: Good head wit...,ad,7122078,1,70152,,
4,1093860000,4.0,4.0,4.0,4.0,4.0,4.0,"500ml bottlePours with a light, slightly hazy ...",ad,7122079,1,70153,,
5,1417431600,,,,,,3.25,,ad,7122080,1,70154,,
6,1401357600,,,,,,3.5,,ad,7122081,1,70155,,
7,1393412400,,,,,,3.5,,ad,7122082,1,70156,,
8,1392030000,,,,,,3.75,,ad,7122083,1,70157,,
9,1390647600,,,,,,3.25,,ad,7122084,1,70158,,


In [17]:
ratings_ratebeer_matched.head(10)

Unnamed: 0,date,appearance,aroma,palate,taste,overall,rating,text,dataset,id,id_brewery,id_user,id_beer,matched
0,1461664800,2,4,2,4,8,2.0,"Puszka 0,33l dzięki Christoph . Kolor jasnozło...",rb,1,16713,3317.0,,
1,1487329200,2,3,2,4,8,1.9,Cerveza pale lager gabonesa. MÃ¡s floja que la...,rb,2,16713,,,
2,1466762400,3,3,2,3,5,1.6,"Kolor- złoty, klarowny. Piana - drobna, średni...",rb,3,16713,3318.0,,
3,1451646000,4,3,1,2,5,1.5,"Botella, de GabÃ³n regalo familiar.31/01/2015C...",rb,4,16713,3319.0,,
4,1445594400,2,4,2,4,7,1.9,Many thanks for this beer to Erzengel. Pours l...,rb,5,16713,3320.0,,
5,1444644000,2,3,2,3,9,1.9,Many thanks to Erzengel for sharing! Pours cle...,rb,6,16713,3321.0,,
6,1444212000,4,5,4,5,8,2.6,Many thanks to Travlr for this can! Grainy no...,rb,7,16713,1.0,,
7,1430820000,2,4,2,3,6,1.7,"Puszka pita w Gabonie. Kolor jasnosłomkowy, pi...",rb,8,16713,3317.0,,
8,1401357600,3,5,2,3,7,2.0,"330ml bottle. 29-05-2014.From Gabon, courtesy ...",rb,9,16713,3322.0,,
9,1374141600,2,3,3,4,6,1.8,Light yellow with quickly disappearing foam to...,rb,10,16713,3323.0,,


In [18]:
ratings_ratebeer.head(2)

Unnamed: 0,beer_id,brewery_id,date,user_id,appearance,aroma,palate,taste,overall,rating,text,dataset,id_rating,id_brewery,id_user,id_beer
0,410549,3198,1461664800,175852,2,4,2,4,8,2.0,"Puszka 0,33l dzięki Christoph . Kolor jasnozło...",rb,1,16713,3317.0,
1,105273,3198,1487329200,442761,2,3,2,4,8,1.9,Cerveza pale lager gabonesa. MÃ¡s floja que la...,rb,2,16713,,


In [19]:
ratings_advocate.head(2)

Unnamed: 0,beer_id,brewery_id,date,user_id,appearance,aroma,palate,taste,overall,rating,text,dataset,id_rating,id_brewery,id_user,id_beer
0,142544,37262,1440064800,nmann08.184925,3.25,2.75,3.25,2.75,3.0,2.88,"From a bottle, pours a piss yellow color with ...",ad,7122075,8244,2754,
1,19590,10093,1235127600,stjamesgate.163714,3.0,3.5,3.5,4.0,3.5,3.67,Pours pale copper with a thin head that quickl...,ad,7122076,1,70150,


In [20]:
full_breweries.head(2)

Unnamed: 0,location,name,nbr_beers,old_advocate_id,old_ratebeer_id,id
0,Northern Ireland,Strangford Lough,6,[10093],4959.0,1
1,Northern Ireland,Sheelin,5,[32848],17616.0,2


In [21]:
ratings_advocate[ratings_advocate['id_brewery'].isna()].head()

Unnamed: 0,beer_id,brewery_id,date,user_id,appearance,aroma,palate,taste,overall,rating,text,dataset,id_rating,id_brewery,id_user,id_beer
96730,143185,37180,1441533600,dispydnb.981403,3.75,4.0,4.0,4.25,4.0,4.09,,ad,7218805,,70849,
96731,143185,37180,1426935600,stjamesgate.163714,3.75,3.75,3.5,4.0,4.0,3.88,Chartreuse with a finger of snowy froth. 3.75C...,ad,7218806,,70150,
96732,143185,37180,1414321200,emperorbevis.621888,3.75,3.25,3.75,2.25,3.5,2.98,Cask pulled by an gram handpump at the first I...,ad,7218807,,70362,
96733,273540,37180,1490353200,emperorbevis.621888,3.75,2.25,3.75,3.75,3.5,3.34,Bottled and possibly bottle conditionedPours a...,ad,7218808,,70362,
96734,178323,37180,1461578400,tomcostello.1111455,3.0,3.0,3.0,3.0,3.0,3.0,,ad,7218809,,80353,


In [22]:
full_breweries.head(2)

Unnamed: 0,location,name,nbr_beers,old_advocate_id,old_ratebeer_id,id
0,Northern Ireland,Strangford Lough,6,[10093],4959.0,1
1,Northern Ireland,Sheelin,5,[32848],17616.0,2


In [23]:
full_breweries[full_breweries['old_advocate_id']==37180].head(2)

Unnamed: 0,location,name,nbr_beers,old_advocate_id,old_ratebeer_id,id


In [24]:
full_breweries[full_breweries['old_ratebeer_id']==20891].head(2)

Unnamed: 0,location,name,nbr_beers,old_advocate_id,old_ratebeer_id,id
8190,England,Seven Bro7hers,10,"[[37180, 45243]]",20891.0,8191


In [25]:
print(len(full_breweries_for_ratings_exploded),len(full_users))

32666 220537


In [26]:
breweries_advocate = pd.read_csv(os.path.join(MATCHED_PATH,'breweries.csv'),header=1)

In [27]:
breweries_advocate[breweries_advocate['id']==37180]

Unnamed: 0,id,location,name,nbr_beers,id.1,location.1,name.1,nbr_beers.1,diff,sim
219,37180,England,Seven Bro7hers,4,20891,England,Seven Bro7hers,7,0.528802,1.0


In [28]:
breweries_advocate.head()

Unnamed: 0,id,location,name,nbr_beers,id.1,location.1,name.1,nbr_beers.1,diff,sim
0,10093,Northern Ireland,Strangford Lough Brewing Company Ltd,5,4959,Northern Ireland,Strangford Lough,5,0.431275,0.889062
1,32848,Northern Ireland,The Sheelin Brewery,4,17616,Northern Ireland,Sheelin,2,0.526388,0.863596
2,40360,Northern Ireland,Walled City Brewing Company,6,24866,Northern Ireland,Walled City,3,0.527852,0.954183
3,40309,Northern Ireland,Ards Brewing Company,7,13538,Northern Ireland,Ards Brewing Co.,13,0.554395,0.896098
4,41205,Northern Ireland,Barrahooley Brewery,3,22304,Northern Ireland,Barrahooley Craft Brewery,4,0.602544,0.896205
