In [152]:
import os
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np

root = Path(os.getcwd()).parent.parent

#Change for each one of where your data is. For me in Dataset I have all the three folders
parent_directory = os.path.dirname(root)
dataset_path = os.path.join(root,'Dataset')

ADVOCATE = "advocate"
RATEBEER = "ratebeer"
MATCHED = "matched"
FULL = "full"

ADVOCATE_PATH = os.path.join(dataset_path,ADVOCATE)
RATEBEER_PATH = os.path.join(dataset_path,RATEBEER)
MATCHED_PATH = os.path.join(dataset_path,MATCHED)
FULL_PATH = os.path.join(dataset_path,FULL)

**Introduction**

The goal of this notebook is to found a way to transform every family dataset into one single dataset. At the end we should have one single user dataset, one single beer dataset and so on. Those datasets would be downloadable, except ratings where we want to create a function that we can run at the beginning of each script, this is done to avoid to have to download/load a much heavier dataset.

To facilitate our life we want to create a single id that could correspond accross the different datasets. It is important that there remains a way to connect to the dataset of origin, i.e that we do not drop the general id.

In [29]:
#Open this one already as it may become usefull for reweries already
beers_matched = pd.read_csv(os.path.join(MATCHED_PATH,'beers.csv'),header=1)

**Breweries dataset**

We saw that in data_understanding.ipynb, for breweries there were duplicates. So we need to treat them carefully.

In [24]:
breweries_advocate = pd.read_csv(os.path.join(ADVOCATE_PATH,'breweries.csv'))
breweries_matched = pd.read_csv(os.path.join(MATCHED_PATH,'breweries.csv'),header=1)
breweries_ratebeer = pd.read_csv(os.path.join(RATEBEER_PATH,'breweries.csv'))

print("Length of the three datasets:\n-advocate:",len(breweries_advocate),"\n-matched:", len(breweries_matched),"\n-ratebeer:", len(breweries_ratebeer))

#Already delete the breweries presents in matched
breweries_ratebeer_solo = breweries_ratebeer[~breweries_ratebeer.id.isin(breweries_matched['id.1'])]
breweries_advocate_solo = breweries_advocate[~breweries_advocate.id.isin(breweries_matched['id'])]

Length of the three datasets:
-advocate: 16758 
-matched: 8281 
-ratebeer: 24189


Both advocate and ratebeer dataset have as columns ['id', 'location', 'name', 'nbr_beers']. Matched has those columns with .1 referring to ratebeer, they also have a diff and sim column. The end format we want to have: [general_id, old_id_advocate, 'old_id_ratebeer', location, name and nbr_beers]. However we need to be carefull to the duplicates of beers in the beers dataset. As the duplicates where for advocate and not Ratebeer we make the convention to take the name and location from Ratebeer. We also want to drop diff and sim.  
Let us first get the duplicates of breweries.

In [116]:
breweries_duplicates_ratebeer = breweries_matched[breweries_matched['id.1'].duplicated(keep=False)]
breweries__not_duplicated_ratebeer = breweries_matched[~breweries_matched['id.1'].duplicated(keep=False)]#For later
breweries_duplicates_advocate = breweries_matched[breweries_matched['id'].duplicated(keep=False)]

print("Number of duplicated/tripled Ratbeer breweries",len(breweries_duplicates_ratebeer))
print("Number of duplicated/tripled Advocate breweries",len(breweries_duplicates_advocate))

Number of duplicated/tripled Ratbeer breweries 91
Number of duplicated/tripled Advocate breweries 0


In [None]:
#We are going to build a new dataset instead of playing with the dataset that contains every information for duplicates.

unique_ratebeer_brewery_id = breweries_duplicates_ratebeer['id.1'].unique()
new_matched_brewery_duplicate = pd.DataFrame(columns=['location', 'name', 'nbr_beers','old_advocate_id','old_ratebeer_id'])

for id in unique_ratebeer_brewery_id:  # Replace with your actual loop condition
    # Define data for each row
    location = breweries_duplicates_ratebeer[breweries_duplicates_ratebeer['id.1']==id]['location.1'].iloc[0]#Always take the first as it is the same for the two/three of them
    name = breweries_duplicates_ratebeer[breweries_duplicates_ratebeer['id.1']==id]['name.1'].iloc[0]#Always take the first as it is the same for the two/three of them
    old_advocate_id = breweries_duplicates_ratebeer[breweries_duplicates_ratebeer['id.1']==id]['id']
    list_old_advocate_id = []
    for id_advocate in (old_advocate_id):
        list_old_advocate_id.append(id_advocate)
    old_ratebeer_id = breweries_duplicates_ratebeer[breweries_duplicates_ratebeer['id.1']==id]['id.1'].iloc[0]

    total_beers_advocate = breweries_duplicates_ratebeer[breweries_duplicates_ratebeer['id.1']==id]['nbr_beers'].sum()
    total_beers_ratebeer = breweries_duplicates_ratebeer[breweries_duplicates_ratebeer['id.1']==id]['nbr_beers.1'].sum()

    nbr_beers = total_beers_advocate+ total_beers_ratebeer

    # Append the new row to the DataFrame
    new_row = pd.DataFrame({'location': location, 'name': name, 'nbr_beers': nbr_beers,'old_advocate_id':[list_old_advocate_id],'old_ratebeer_id':old_ratebeer_id})
    new_matched_brewery_duplicate = pd.concat([new_matched_brewery_duplicate, new_row], ignore_index=True)

new_matched_brewery_duplicate.sample(10)


Unnamed: 0,location,name,nbr_beers,old_advocate_id,old_ratebeer_id
15,Sweden,Carlsberg Sverige,421,"[5368, 10897]",765
7,Denmark,Heimdal-Bryg,23,"[15048, 13444]",5786
9,Germany,Vetter Alt Heidelberger Brauhaus,42,"[4490, 5562]",2370
6,Italy,Il Giardino della Birra,10,"[7731, 18088]",4089
3,Mexico,Cervecería Agua Mala,42,"[34454, 34439]",15949
8,Germany,Gold Ochsen Brauerei,53,"[6789, 3180]",1845
44,"United States, Colorado",Crested Butte Brewing,29,"[15634, 5894, 23082]",2142
36,"United States, Nevada",High Sierra Brewing Company,72,"[38006, 24605]",13047
34,"United States, Michigan",Walldorff Brewpub & Bistro,149,"[14177, 25947]",7731
18,"United States, California",Elevation 66 Brewing Company,58,"[25256, 29622]",13383


In [125]:
unique_ratebeer_brewery_id = breweries__not_duplicated_ratebeer['id.1'].unique()
new_matched_brewery_non_duplicate = pd.DataFrame(columns=['location', 'name', 'nbr_beers','old_advocate_id','old_ratebeer_id'])

for id in unique_ratebeer_brewery_id:  # Replace with your actual loop condition
    # Define data for each row
    location = breweries__not_duplicated_ratebeer[breweries__not_duplicated_ratebeer['id.1']==id]['location.1'].iloc[0]#Always take the first as it is the same for the two/three of them
    name = breweries__not_duplicated_ratebeer[breweries__not_duplicated_ratebeer['id.1']==id]['name.1'].iloc[0]#Always take the first as it is the same for the two/three of them
    old_advocate_id = breweries__not_duplicated_ratebeer[breweries__not_duplicated_ratebeer['id.1']==id]['id'].iloc[0]
    old_ratebeer_id = breweries__not_duplicated_ratebeer[breweries__not_duplicated_ratebeer['id.1']==id]['id.1'].iloc[0]

    total_beers_advocate = breweries__not_duplicated_ratebeer[breweries__not_duplicated_ratebeer['id.1']==id]['nbr_beers'].sum()
    total_beers_ratebeer = breweries__not_duplicated_ratebeer[breweries__not_duplicated_ratebeer['id.1']==id]['nbr_beers.1'].sum()

    nbr_beers = total_beers_advocate+ total_beers_ratebeer

    # Append the new row to the DataFrame
    new_row = pd.DataFrame({'location': location, 'name': name, 'nbr_beers': nbr_beers,'old_advocate_id':[old_advocate_id],'old_ratebeer_id':old_ratebeer_id})
    new_matched_brewery_non_duplicate = pd.concat([new_matched_brewery_non_duplicate, new_row], ignore_index=True)

new_matched_brewery_non_duplicate.sample(3)

Unnamed: 0,location,name,nbr_beers,old_advocate_id,old_ratebeer_id
3379,Finland,Takatalo & Tompuri Brewery,6,49387,29496
7949,"United States, Connecticut",Tullycross Tavern & Microbrewery,97,25204,12629
5355,"United States, Indiana",TwoDEEP Brewing Company,57,36177,20192


In [126]:
#Concat the two datasets together
new_matched_brewery = pd.concat([new_matched_brewery_non_duplicate, new_matched_brewery_duplicate], ignore_index=True)
new_matched_brewery.sample(3)

Unnamed: 0,location,name,nbr_beers,old_advocate_id,old_ratebeer_id
2055,Germany,Neanderle Craft Biere,2,46978,28514
6724,"United States, Nevada",Able Baker Brewing,12,40162,29774
1139,Canada,Square Timber Brewing Company,15,37966,20469


In [131]:
print(len(new_matched_brewery), len(new_matched_brewery_non_duplicate), len(new_matched_brewery_duplicate))
print(len(new_matched_brewery.old_ratebeer_id.unique()))

8235 8190 45
8235


Now we need to count the number of beers that there really is. We assume that a beer can only be matched if its brewery is matched too.

In [None]:
for id in beers_matched['brewery_id.1']:
    new_matched_brewery.loc[new_matched_brewery['old_ratebeer_id'] == id, 'nbr_beers'] -= 1
    


In [136]:
new_matched_brewery.sample(3)

Unnamed: 0,location,name,nbr_beers,old_advocate_id,old_ratebeer_id
2519,Australia,HopDog BeerWorks,82,26646,13298
3231,Netherlands,Lowlander Beer,7,44478,24001
2750,Spain,Magister Fabrica de Cervezas,10,18260,4041


Now let's match the three datasets together.

In [140]:
breweries_ratebeer_solo = breweries_ratebeer_solo.rename(columns={'id': 'old_ratebeer_id'})
breweries_ratebeer_solo['old_advocate_id'] = np.nan
breweries_ratebeer_solo.sample(3)  

Unnamed: 0,old_ratebeer_id,location,name,nbr_beers,old_advocate_id
11879,23717,Finland,Paloaseman Panimo,9,
9718,30198,Brazil,Cervejaria Red Door,1,
23081,17047,France,Biercors,10,


In [141]:
breweries_advocate_solo = breweries_advocate_solo.rename(columns={'id': 'old_advocate_id'})
breweries_advocate_solo['old_ratebeer_id'] = np.nan
breweries_advocate_solo.sample(3)  

Unnamed: 0,old_advocate_id,location,name,nbr_beers,old_ratebeer_id
3510,3340,Germany,Dinkelacker-Schwabenbraeu AG,32,
16115,2655,"United States, Michigan",Zig's Kettle & Brew,2,
13554,16862,"United States, Oregon",Deschutes Brewery & Public House,431,


In [151]:
full_breweries = pd.concat([new_matched_brewery,breweries_advocate_solo, breweries_ratebeer_solo], ignore_index=True)
full_breweries['id'] = range(1, len(full_breweries) + 1)
full_breweries.sample(10)


Unnamed: 0,location,name,nbr_beers,old_advocate_id,old_ratebeer_id,id
6048,"United States, North Carolina",Wise Man Brewing,30,47200.0,30121.0,6049
26417,Norway,Fjord Bryggeri,5,,26890.0,26418
21577,Switzerland,Faiseurs de Bière,24,,1374.0,21578
29183,"United States, Oregon",Alameda Brewing Co.,81,,1471.0,29184
8327,England,Bankes Arms Country Inn / Isle of Purbeck Brewery,3,17888.0,,8328
27293,"United States, Washington",Twin Rivers Brewing/Adams Northwest Bistro,23,,656.0,27294
7150,"United States, Illinois",Millrose Restaurant and Brewing Co.,22,5442.0,3841.0,7151
26406,Norway,Eik & Tid,14,,28818.0,26407
8070,"United States, Oklahoma",Redbud Brewing Company,14,24925.0,12431.0,8071
7583,France,L&L Alphand,6,9409.0,1441.0,7584


In [None]:
print(len(full_breweries))
print(len(breweries_matched),len(breweries_advocate),len(breweries_ratebeer))
print(-len(breweries_matched)+len(breweries_advocate)+len(breweries_ratebeer))

32666
8281 16758 24189
32666


This confirms the number of single breweries we computed in data_understanding.ipynb.

In [153]:
full_breweries.to_csv(os.path.join(FULL_PATH,'breweries.csv'),index=False)

**Beer dataset**

In [6]:
beers_matched = pd.read_csv(os.path.join(MATCHED_PATH,'beers.csv'),header=1)
beers_advocate = pd.read_csv(os.path.join(ADVOCATE_PATH,'beers.csv'))
beers_ratebeer = pd.read_csv(os.path.join(RATEBEER_PATH,'beers.csv'))

print(len(beers_matched))
print(len(beers_advocate))
print(len(beers_ratebeer))

45640
280823
442081


In [None]:
beers_matched.sample(10)

Unnamed: 0,abv,avg,avg_computed,avg_matched_valid_ratings,ba_score,beer_id,beer_name,beer_wout_brewery_name,brewery_id,brewery_name,...,brewery_id.1,brewery_name.1,nbr_matched_valid_ratings.1,nbr_ratings.1,overall_score,style.1,style_score,zscore.1,diff,sim
16147,5.7,4.07,3.883333,4.15,,97173,Extraterrestrial Space Beer,Extraterrestrial Space Beer,32092,Intergalactic Brewing Co.,...,17642,Intergalactic Brewing Company,1,1,,Premium Bitter/ESB,,-0.195097,0.802319,1.0
15834,12.0,4.23,4.233333,4.95,,145222,AleSmith Speedway Stout - Honey Graham,Graham Speedway Honey Stout,396,AleSmith Brewing Company,...,432,AleSmith Brewing Company,5,5,,Imperial Stout,,0.691466,0.722944,1.0
7954,5.6,3.56,3.540526,3.48,83.0,98054,Ratsherrn Pale Ale,Pale Ale,31793,Ratsherrn Brauerei GmbH,...,14561,Ratsherrn Brauerei,182,182,45.0,American Pale Ale,37.0,-0.541521,0.731194,1.0
42941,6.0,3.61,3.585614,3.475,83.0,129335,Arjuna,Arjuna,30164,Anthem Brewing Company,...,14617,Anthem Brewing Company,29,29,69.0,Witbier,92.0,-0.09156,0.460728,1.0
40569,6.2,3.64,3.531316,3.68,83.0,83345,Beardy Guard,Guard Beardy,29438,Rogness Brewing Company,...,14294,Rogness Brewing Company,29,29,47.0,Bière de Garde,50.0,-0.461041,1.0,1.0
9202,4.6,3.24,3.24,3.24,,48754,Effen Lager,Lager Effen,14014,Southern Bay Brewing Company,...,8056,Southern Bay Brewing Company,10,10,29.0,Pale Lager,98.0,-0.93121,0.448773,0.808393
3362,5.4,,,,,155915,Équinoxe Pale Ale,Pale Ale Équinoxe,1141,Brasserie Dieu du Ciel!,...,364,Dieu du Ciel,2,2,,American Pale Ale,,-0.044463,0.426658,1.0
11123,0.5,1.65,1.65,1.65,,89257,Royal Club Shandy,Royal Shandy Club,81,Heineken Nederland B.V.,...,9,Heineken Nederland,18,18,5.0,Low Alcohol,59.0,-2.374874,0.438918,1.0
2529,4.1,3.41,3.472,3.38,,63691,THAT,THAT,9568,Teme Valley Brewery,...,3086,Teme Valley,58,58,27.0,Bitter,30.0,-0.832641,1.0,1.0
20455,7.2,4.51,4.409186,4.532941,94.0,255380,Party Wave,Party Wave,26676,Kane Brewing Company,...,13267,Kane Brewing Company,15,15,96.0,India Pale Ale (IPA),96.0,0.822648,1.0,1.0
