In [1]:
import os
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np

# Project root: two directory levels above the current working directory.
root = Path(os.getcwd()).parent.parent

# Adjust this to wherever your data lives. Here the `Dataset` folder holds the
# three source folders (advocate, ratebeer, matched) next to the generated `full` one.
parent_directory = os.path.dirname(root)
dataset_path = os.path.join(root, 'Dataset')

# Folder name of each dataset family.
ADVOCATE, RATEBEER, MATCHED, FULL = "advocate", "ratebeer", "matched", "full"

# Folder path of each dataset family, built in the same order as the names above.
ADVOCATE_PATH, RATEBEER_PATH, MATCHED_PATH, FULL_PATH = (
    os.path.join(dataset_path, folder_name)
    for folder_name in (ADVOCATE, RATEBEER, MATCHED, FULL)
)

**Introduction**

The goal of this notebook is to find a way to transform every dataset family into one single dataset. At the end we should have one single user dataset, one single beer dataset and so on. Those datasets will be downloadable, except for ratings, where we want to create a function that we can run at the beginning of each script; this avoids having to download/load a much heavier dataset.

To make our life easier, we want to create a single id that corresponds across the different datasets. It is important that there remains a way to connect to the dataset of origin, i.e. that we do not drop the general id.

In [26]:
# Load the matched beers up front: they may already be useful for the breweries section.
# header=1 takes the column names from the second line of the file.
matched_beers_csv = os.path.join(MATCHED_PATH, 'beers.csv')
beers_matched = pd.read_csv(matched_beers_csv, header=1)

**Breweries dataset**

We saw that in data_understanding.ipynb, for breweries there were duplicates. So we need to treat them carefully.

In [27]:
# Load the brewery table of each dataset family.
# For the matched file, header=1 takes the column names from its second line.
breweries_advocate = pd.read_csv(os.path.join(ADVOCATE_PATH, 'breweries.csv'))
breweries_matched = pd.read_csv(os.path.join(MATCHED_PATH, 'breweries.csv'), header=1)
breweries_ratebeer = pd.read_csv(os.path.join(RATEBEER_PATH, 'breweries.csv'))

print(
    "Length of the three datasets:\n-advocate:", len(breweries_advocate),
    "\n-matched:", len(breweries_matched),
    "\n-ratebeer:", len(breweries_ratebeer),
)

# Keep only the breweries that are absent from the matched dataset
# (in the matched file, 'id' is compared with advocate ids and 'id.1' with ratebeer ids).
ratebeer_in_matched = breweries_ratebeer['id'].isin(breweries_matched['id.1'])
advocate_in_matched = breweries_advocate['id'].isin(breweries_matched['id'])
breweries_ratebeer_solo = breweries_ratebeer[~ratebeer_in_matched]
breweries_advocate_solo = breweries_advocate[~advocate_in_matched]

Length of the three datasets:
-advocate: 16758 
-matched: 8281 
-ratebeer: 24189


Both the advocate and ratebeer datasets have the columns ['id', 'location', 'name', 'nbr_beers']. Matched has those columns too, with the `.1` suffix referring to ratebeer, plus a diff and a sim column. The end format we want to have: [general_id, old_id_advocate, old_id_ratebeer, location, name and nbr_beers]. However, we need to be careful about the duplicates of beers in the beers dataset. As the duplicates were for advocate and not Ratebeer, we make the convention to take the name and location from Ratebeer. We also want to drop diff and sim.  
Let us first get the duplicates of breweries.

In [28]:
# keep=False flags every occurrence of a repeated id, not only the later ones.
ratebeer_id_repeated = breweries_matched['id.1'].duplicated(keep=False)
advocate_id_repeated = breweries_matched['id'].duplicated(keep=False)

breweries_duplicates_ratebeer = breweries_matched[ratebeer_id_repeated]
breweries__not_duplicated_ratebeer = breweries_matched[~ratebeer_id_repeated]  # used later
breweries_duplicates_advocate = breweries_matched[advocate_id_repeated]

print("Number of duplicated/tripled Ratbeer breweries", len(breweries_duplicates_ratebeer))
print("Number of duplicated/tripled Advocate breweries", len(breweries_duplicates_advocate))

Number of duplicated/tripled Ratbeer breweries 91
Number of duplicated/tripled Advocate breweries 0


In [29]:
# Collapse the matched rows that share one ratebeer brewery into a single row each.
# A new frame is built instead of editing `breweries_duplicates_ratebeer`, which
# keeps the complete per-row information.
#
# The rows are collected as plain records and turned into a DataFrame once:
# growing a DataFrame with pd.concat inside the loop is quadratic and depends on
# how pandas treats the initial empty frame when inferring dtypes.
duplicate_brewery_records = []

# sort=False yields the groups in order of first appearance of each ratebeer id.
for ratebeer_id, same_brewery in breweries_duplicates_ratebeer.groupby('id.1', sort=False):
    duplicate_brewery_records.append({
        # Ratebeer fields are the same for the two/three rows, so take the first.
        'location': same_brewery['location.1'].iloc[0],
        'name': same_brewery['name.1'].iloc[0],
        # Advocate beers are summed over all rows; ratebeer beers are taken once
        # so that the duplicated ratebeer brewery is not counted several times.
        'nbr_beers': same_brewery['nbr_beers'].sum() + same_brewery['nbr_beers.1'].iloc[0],
        # Every advocate id mapped to this ratebeer brewery, kept as a list.
        'old_advocate_id': same_brewery['id'].tolist(),
        'old_ratebeer_id': ratebeer_id,
    })

new_matched_brewery_duplicate = pd.DataFrame(
    duplicate_brewery_records,
    columns=['location', 'name', 'nbr_beers', 'old_advocate_id', 'old_ratebeer_id'],
)

new_matched_brewery_duplicate.sample(10)


Unnamed: 0,location,name,nbr_beers,old_advocate_id,old_ratebeer_id
26,"United States, Virginia",Crooked Run Brewing,141,"[32661, 48463]",17177
0,England,Seven Bro7hers,11,"[37180, 45243]",20891
12,Australia,Cavalier Beer,98,"[25867, 36326]",12657
5,Canada,Skeena Brewing Company,2,"[38530, 13832]",11649
24,"United States, Maryland",DuClaw Brewing Company,307,"[1924, 16345]",779
17,"United States, Louisiana",Rikenjaks Brewing Company,6,"[44929, 970]",3860
15,Sweden,Carlsberg Sverige,265,"[5368, 10897]",765
37,"United States, Oregon",Pelican Pub & Brewery,202,"[48599, 1304]",1511
33,"United States, Ohio",Hoof Hearted Brewing,199,"[30179, 44305]",14572
29,"United States, North Carolina",Lynnwood Grill & Brewing Concern,60,"[47982, 33492]",19642


For the non-duplicated breweries, the target layout only requires selecting and renaming columns, which is faster than building the rows one by one in a loop.

In [30]:
# Every remaining matched row pairs exactly one advocate brewery with one
# ratebeer brewery (the ratebeer id is unique here), so the target layout is a
# plain column selection/rename; no row-by-row loop or repeated concat is needed.
matched_once = breweries__not_duplicated_ratebeer

new_matched_brewery_non_duplicate = pd.DataFrame({
    # Name and location follow the ratebeer convention (the `.1` columns).
    'location': matched_once['location.1'],
    'name': matched_once['name.1'],
    # Beers from both sites are added up; fillna(0) treats a missing count as 0,
    # as a NaN-skipping sum would.
    'nbr_beers': matched_once['nbr_beers'].fillna(0) + matched_once['nbr_beers.1'].fillna(0),
    'old_advocate_id': matched_once['id'],
    'old_ratebeer_id': matched_once['id.1'],
}).reset_index(drop=True)

new_matched_brewery_non_duplicate.sample(3)

Unnamed: 0,location,name,nbr_beers,old_advocate_id,old_ratebeer_id
3058,Czech Republic,Pivovar Chomout,41,39847,21344
6805,"United States, Oregon",Agrarian Ales,183,31704,15788
1352,Italy,Il Quarto dOra Granata (Alvemar srl),27,38859,16464


In [31]:
# Stack the single-match breweries and the collapsed duplicates into one frame.
matched_brewery_parts = [new_matched_brewery_non_duplicate, new_matched_brewery_duplicate]
new_matched_brewery = pd.concat(matched_brewery_parts, ignore_index=True)
new_matched_brewery.sample(n=3)

Unnamed: 0,location,name,nbr_beers,old_advocate_id,old_ratebeer_id
910,Canada,Hell Or High Water Beer And Spirits Inc,2,39897,22602
2643,Spain,Monsieur Gordo Brewery,8,46050,21308
954,Canada,Phillips Brewing Co.,309,2675,1844


In [32]:
# Sanity check: size of the combined frame and of its two parts,
# then the number of distinct ratebeer ids it contains.
print(*(len(frame) for frame in (new_matched_brewery,
                                 new_matched_brewery_non_duplicate,
                                 new_matched_brewery_duplicate)))
print(len(new_matched_brewery['old_ratebeer_id'].unique()))

8235 8190 45
8235


Now we need to count the number of beers that there really are. We assume that a beer can only be matched if its brewery is matched too.

In [33]:
# nbr_beers was obtained by adding the advocate and ratebeer counts, so remove
# one beer from a brewery for every matched beer that belongs to it.
# Counting the matched beers per ratebeer brewery once and mapping the counts
# onto the frame replaces a Python loop that re-scanned the whole frame for
# each of the matched beers.
matched_beers_per_brewery = beers_matched['brewery_id.1'].value_counts()
beers_to_remove = (
    new_matched_brewery['old_ratebeer_id']
    .map(matched_beers_per_brewery)
    .fillna(0)        # breweries without any matched beer keep their count
    .astype(int)
)

new_matched_brewery_good_amount_rating = new_matched_brewery.copy()
new_matched_brewery_good_amount_rating['nbr_beers'] = (
    new_matched_brewery_good_amount_rating['nbr_beers'] - beers_to_remove
)
    


In [34]:
# Random preview of the corrected beer counts.
new_matched_brewery_good_amount_rating.sample(n=3)

Unnamed: 0,location,name,nbr_beers,old_advocate_id,old_ratebeer_id
4066,"United States, California",Indian Joe Brewing,56,30546,15479
2146,Germany,Privatbrauerei Rogg,38,6790,9921
979,Canada,BRB &#40;Be Right Back - Big River&#41;,102,7527,3288


Now let's match the three datasets together.

In [35]:
# Bring the ratebeer-only breweries to the merged layout: their id becomes
# `old_ratebeer_id`, and they have no advocate counterpart.
breweries_ratebeer_solo = (
    breweries_ratebeer_solo
    .rename(columns={'id': 'old_ratebeer_id'})
    .assign(old_advocate_id=np.nan)
)
breweries_ratebeer_solo.sample(n=3)

Unnamed: 0,old_ratebeer_id,location,name,nbr_beers,old_advocate_id
14465,5945,"United States, Washington",Elkhead Brewing Company,26,
10000,26170,Vietnam,Tê Tê Brewing Co.,1,
9296,28846,Austria,GROK,4,


In [36]:
# Bring the advocate-only breweries to the merged layout: their id becomes
# `old_advocate_id`, and they have no ratebeer counterpart.
breweries_advocate_solo = (
    breweries_advocate_solo
    .rename(columns={'id': 'old_advocate_id'})
    .assign(old_ratebeer_id=np.nan)
)
breweries_advocate_solo.sample(n=3)

Unnamed: 0,old_advocate_id,location,name,nbr_beers,old_ratebeer_id
16189,1887,"United States, Pennsylvania",Jack's Mountain Restaurant and Brewery,6,
6580,34185,Brazil,Serra de Três Pontas Cervejaria Artesanal,4,
635,71,England,Fuller Smith & Turner PLC,61,


In [37]:
# Stack matched, advocate-only and ratebeer-only breweries (in that order).
brewery_parts = [
    new_matched_brewery_good_amount_rating,
    breweries_advocate_solo,
    breweries_ratebeer_solo,
]
full_breweries = pd.concat(brewery_parts, ignore_index=True)

# General id: running number starting at 1 over the combined table.
full_breweries = full_breweries.assign(id=range(1, len(full_breweries) + 1))
full_breweries.sample(n=10)


Unnamed: 0,location,name,nbr_beers,old_advocate_id,old_ratebeer_id,id
22424,Australia,Currumbin Valley Brewing,2,,29486.0,22425
31269,England,Tom Woods,51,,53.0,31270
32134,France,La Ferme de Kergador (Louis Pierre Le Meur),1,,27980.0,32135
25212,Netherlands,Hettingabier,10,,10224.0,25213
30572,England,Hop Studio,53,,14557.0,30573
19354,Italy,Terra del Sole,1,,13961.0,19355
9339,Canada,Axe & Barrel Brewing Company,6,43343.0,,9340
17391,Ukraine,Ohtyrskyj Pyvovarnyj Zavod (Obolon),10,,17472.0,17392
15726,"United States, Indiana",Mad Anthony's Old State Alehouse,0,21785.0,,15727
9694,Bulgaria,Ale House,1,21746.0,,9695


In [38]:
# Sanity check: the combined table should hold advocate + ratebeer - matched rows.
n_matched = len(breweries_matched)
n_advocate = len(breweries_advocate)
n_ratebeer = len(breweries_ratebeer)

print(len(full_breweries))
print(n_matched, n_advocate, n_ratebeer)
print(n_advocate + n_ratebeer - n_matched)

32666
8281 16758 24189
32666


This confirms the number of single breweries we computed in data_understanding.ipynb.

In [39]:
# Make sure the output folder exists: to_csv does not create missing directories.
os.makedirs(FULL_PATH, exist_ok=True)
# Export the unified breweries table without the pandas index.
full_breweries.to_csv(os.path.join(FULL_PATH, 'breweries.csv'), index=False)

In [40]:
# Read the export back to confirm the row count, then free the frame.
breweries_export_csv = os.path.join(FULL_PATH, 'breweries.csv')
test_breweri = pd.read_csv(breweries_export_csv)
print(test_breweri.shape[0])
del test_breweri

32666


**Users dataset**

We prefer to use the approximately matched users (users_approx) rather than the users of the matched dataset. We saw that these users do have duplicates (unlike the normal file). We decided to treat the different users in the approx file as a single user even for a lower similarity value (no sim value is below 0.8006407690254358). We consider the effect of this to be negligible as it concerns a small percentage of the data (like in the brewery file).

In [91]:
# Load the user tables. The approximate match file (users_approx.csv) provides the
# matched users; header=1 takes the column names from the file's second line.
users_matched = pd.read_csv(os.path.join(MATCHED_PATH, 'users_approx.csv'), header=1)
users_advocate = pd.read_csv(os.path.join(ADVOCATE_PATH, 'users.csv'))
users_ratebeer = pd.read_csv(os.path.join(RATEBEER_PATH, 'users.csv'))

# Matched ratings, needed further down.
ratings_matched = pd.read_csv(os.path.join(MATCHED_PATH, 'ratings.csv'), header=1)

print(
    "Length of the three datasets:\n-advocate:", len(users_advocate),
    "\n-matched:", len(users_matched),
    "\n-ratebeer:", len(users_ratebeer),
)

# Keep only the users that are absent from the matched dataset (compared by user name).
ratebeer_name_in_matched = users_ratebeer['user_name'].isin(users_matched['user_name.1'])
advocate_name_in_matched = users_advocate['user_name'].isin(users_matched['user_name'])
users_ratebeer_solo = users_ratebeer[~ratebeer_name_in_matched]
users_advocate_solo = users_advocate[~advocate_name_in_matched]

print(
    "New length of:\n-advocate:", len(users_advocate_solo),
    "\n-ratebeer:", len(users_ratebeer_solo),
)

Length of the three datasets:
-advocate: 153704 
-matched: 3341 
-ratebeer: 70174
New length of:
-advocate: 150388 
-ratebeer: 66833


In [92]:
# First two matched users, to inspect the column layout.
users_matched.head(n=2)

Unnamed: 0,joined,location,nbr_ratings,nbr_reviews,user_id,user_name,user_name_lower,joined.1,location.1,nbr_ratings.1,user_id.1,user_name.1,user_name_lower.1,sim
0,1483009000.0,Spain,3,0,magicuenca.1185749,MAGICuenca,magicuenca,1484046000.0,Spain,89,442761,MAGICuenca91,magicuenca91,0.904534
1,1220868000.0,Germany,6,6,erzengel.248045,Erzengel,erzengel,1224324000.0,Germany,8781,83106,Erzengel,erzengel,1.0


In [93]:
# keep=False flags every occurrence of a repeated user name, not only the later ones.
ratebeer_name_repeated = users_matched['user_name.1'].duplicated(keep=False)
advocate_name_repeated = users_matched['user_name'].duplicated(keep=False)

users_duplicates_ratebeer = users_matched[ratebeer_name_repeated]
users_duplicates_advocate = users_matched[advocate_name_repeated]
users__not_duplicated_advocate = users_matched[~advocate_name_repeated]  # used later

print("Number of duplicated/tripled Ratbeer users", len(users_duplicates_ratebeer))
print("Number of duplicated/tripled Advocate users", len(users_duplicates_advocate))

Number of duplicated/tripled Ratbeer users 0
Number of duplicated/tripled Advocate users 47


Both the advocate and ratebeer datasets have the columns ['user_id', 'location', 'user_name', 'user_name_lower', 'joined', 'nbr_ratings']. Advocate has nbr_reviews in addition. Matched has those columns, with the `.1` suffix referring to ratebeer. It also has a column 'sim' that we will drop. The end format we want to have: [general_id, old_user_id_advocate, old_user_id_ratebeer, location, user_name_lower, joined_advocate, joined_ratebeer and nbr_ratings]. We think that the other columns will not be useful for our analysis. The next cell shows that the user_name_lower are equal between the two datasets. For the location we will, in contrast to the breweries, give priority to the advocate dataset, as this time it is the one where a single user corresponds to multiple ratebeer users.

In [94]:
# Random preview of matched rows whose advocate user name is repeated.
users_duplicates_advocate.sample(n=2)

Unnamed: 0,joined,location,nbr_ratings,nbr_reviews,user_id,user_name,user_name_lower,joined.1,location.1,nbr_ratings.1,user_id.1,user_name.1,user_name_lower.1,sim
1243,1350209000.0,"United States, Florida",1,1,porterporter.698876,PorterPorter,porterporter,1206443000.0,"United States, Florida",29,72217,porter4porter,porter4porter,0.891695
1399,1200568000.0,"United States, Pennsylvania",211,16,beerfinder.187713,beerfinder,beerfinder,1179828000.0,"United States, Pennsylvania",31,54721,beerfinder1,beerfinder1,0.957427


In [95]:
# Example of one advocate user matched to several ratebeer users.
is_lonestar = users_duplicates_advocate['user_id'] == 'lonestar.677281'
users_duplicates_advocate[is_lonestar].head(n=2)

Unnamed: 0,joined,location,nbr_ratings,nbr_reviews,user_id,user_name,user_name_lower,joined.1,location.1,nbr_ratings.1,user_id.1,user_name.1,user_name_lower.1,sim
562,1337422000.0,"United States, Texas",1,1,lonestar.677281,Lonestar,lonestar,1081332000.0,"United States, Texas",13,11446,oneStar,onestar,0.822609
2059,1337422000.0,"United States, Texas",1,1,lonestar.677281,Lonestar,lonestar,1162984000.0,"United States, Texas",6,44744,LONESTAR,lonestar,1.0


In [97]:
# Collapse the matched rows that share one advocate user name into a single row each.
# A new frame is built instead of editing `users_duplicates_advocate`, which keeps
# the complete per-row information.
#
# The rows are collected as records and turned into a DataFrame once. The previous
# version concatenated each new row onto `new_matched_brewery_duplicate` (the
# brewery frame), so the result held the brewery rows plus only the last user.
duplicate_user_records = []

# sort=False yields the groups in order of first appearance of each user name.
for advocate_user_name, same_user in users_duplicates_advocate.groupby('user_name', sort=False):
    duplicate_user_records.append({
        # Advocate fields are the same for the two/three rows, so take the first.
        'location': same_user['location'].iloc[0],
        'joined_advocate': same_user['joined'].iloc[0],
        'old_user_id_advocate': same_user['user_id'].iloc[0],
        # Earliest ratebeer join date: the first time the user joined.
        'joined_ratebeer': same_user['joined.1'].min(),
        # Every ratebeer id mapped to this advocate user, kept as a list.
        'old_user_id_ratebeer': same_user['user_id.1'].tolist(),
        'user_name': advocate_user_name,
        # Advocate ratings are taken once; ratebeer ratings are summed over all accounts.
        'nbr_ratings_total': same_user['nbr_ratings'].iloc[0] + same_user['nbr_ratings.1'].sum(),
    })

new_matched_user_duplicate = pd.DataFrame(
    duplicate_user_records,
    columns=['location', 'joined_advocate', 'old_user_id_advocate', 'joined_ratebeer',
             'old_user_id_ratebeer', 'user_name', 'nbr_ratings_total'],
)

new_matched_user_duplicate.sample(10)


Unnamed: 0,location,joined_advocate,old_user_id_advocate,joined_ratebeer,old_user_id_ratebeer,user_name,nbr_ratings_total
12,Netherlands,1218103000.0,schuim.240905,1007118000.0,"[2462, 105981]",schuim,79
10,Australia,1392462000.0,azza.782374,1174903000.0,"[343397, 355085, 51942]",Azza,56
14,Canada,1121422000.0,beers.29246,1185444000.0,"[58154, 241070, 130784]",Beers,13
17,Norway,1421492000.0,ketil.928679,1292584000.0,"[119220, 256180]",Ketil,8
19,"United States, Idaho",1434622000.0,ralphie9.1001278,1160388000.0,"[363652, 43370]",Ralphie9,12
15,Brazil,1365502000.0,marcelo.727734,1146823000.0,"[294202, 36866]",Marcelo,11
7,Canada,1156068000.0,nighthawk.93517,1150452000.0,"[124623, 38708]",nighthawk,212
4,"United States, Texas",1337422000.0,lonestar.677281,1081332000.0,"[11446, 44744]",Lonestar,20
21,"United States, Ohio",1104318000.0,beerbeerbeerbeer.12454,1107256000.0,"[19505, 411715]",beerbeerbeerbeer,8
5,"United States, California",1295521000.0,chadski.555343,1381054000.0,"[282710, 397860]",chadski,406


In [98]:
# Matched users whose advocate name is unique: build the merged layout directly.
# The location comes from the advocate columns, in line with the duplicated-user
# handling (the previous version kept the ratebeer `location.1` instead).
# nbr_reviews, the lower-cased names, the ratebeer user name and sim are left out.
matched_single_users = users__not_duplicated_advocate

users_matched_not_duplicated = pd.DataFrame({
    'joined_advocate': matched_single_users['joined'],
    'old_user_id_advocate': matched_single_users['user_id'],
    'user_name': matched_single_users['user_name'],
    'joined_ratebeer': matched_single_users['joined.1'],
    'location': matched_single_users['location'],
    'old_user_id_ratebeer': matched_single_users['user_id.1'],
    # Ratings from both sites are added up.
    'nbr_ratings_total': matched_single_users['nbr_ratings'] + matched_single_users['nbr_ratings.1'],
})
users_matched_not_duplicated.sample(5)

Unnamed: 0,joined_advocate,old_user_id_advocate,user_name,joined_ratebeer,location,old_user_id_ratebeer,nbr_ratings_total
1862,1348308000.0,dcmike.695547,dcmike,1403345000.0,"United States, Florida",323131,2
1610,1322737000.0,hubbabub.640049,Hubbabub,1287137000.0,Sweden,115190,12
2236,1347098000.0,ford.693336,Ford,1348394000.0,"United States, Texas",218595,403
2003,1397124000.0,werd10101.793733,werd10101,1399025000.0,"United States, New Jersey",316064,8
2595,1289819000.0,efisher63.526072,efisher63,1346494000.0,"United States, Pennsylvania",214529,6


In [99]:
# Stack the single-match users and the collapsed duplicates into one frame.
matched_user_parts = [users_matched_not_duplicated, new_matched_user_duplicate]
new_matched_user = pd.concat(matched_user_parts, ignore_index=True)
new_matched_user.sample(n=3)

Unnamed: 0,joined_advocate,old_user_id_advocate,user_name,joined_ratebeer,location,old_user_id_ratebeer,nbr_ratings_total
2302,1365934000.0,adamwn.728614,adamwn,1365934000.0,"United States, South Carolina",254177,71
2612,1281953000.0,dbraz45.492323,dbraz45,1183975000.0,"United States, Arizona",57210,15
2320,1196334000.0,thebeerdoctor.175911,thebeerdoctor,1198667000.0,"United States, Ohio",66474,79


In [100]:
# nbr_ratings_total was obtained by adding the advocate and ratebeer counts, so
# remove one rating from a user for every matched rating made by that user.
# Counting the matched ratings per advocate user id once and mapping the counts
# onto the frame replaces a Python loop that re-scanned the whole frame for
# each matched rating.
matched_ratings_per_user = ratings_matched['user_id'].value_counts()
ratings_to_remove = (
    new_matched_user['old_user_id_advocate']
    .map(matched_ratings_per_user)
    .fillna(0)        # users without any matched rating keep their total
    .astype(int)
)

users_matched_not_duplicated_good_amount_ratings = new_matched_user.copy()
users_matched_not_duplicated_good_amount_ratings['nbr_ratings_total'] = (
    users_matched_not_duplicated_good_amount_ratings['nbr_ratings_total'] - ratings_to_remove
)

# Totals before and after the correction.
print(new_matched_user.nbr_ratings_total.sum())
print(users_matched_not_duplicated_good_amount_ratings.nbr_ratings_total.sum())

1608155
1586191


In [101]:
# Bring the ratebeer-only users to the merged layout; the advocate-side
# columns exist but stay empty.
ratebeer_user_columns = {
    'user_id': 'old_user_id_ratebeer',
    'joined': 'joined_ratebeer',
    'nbr_ratings': 'nbr_ratings_total',
}
users_ratebeer_solo = (
    users_ratebeer_solo
    .rename(columns=ratebeer_user_columns)
    .assign(old_user_id_advocate=np.nan, joined_advocate=np.nan)
)
users_ratebeer_solo.sample(n=3)

Unnamed: 0,nbr_ratings_total,old_user_id_ratebeer,user_name,joined_ratebeer,location,old_user_id_advocate,joined_advocate
31375,2,321145,jualbuquerque,1402135000.0,,,
50757,3,6523,cubedbee,1041678000.0,,,
29817,11,333431,Terebia,1409479000.0,,,


In [102]:
# Bring the advocate-only users to the merged layout; the ratebeer-side
# columns exist but stay empty.
advocate_user_columns = {
    'user_id': 'old_user_id_advocate',
    'joined': 'joined_advocate',
    'nbr_ratings': 'nbr_ratings_total',
}
users_advocate_solo = (
    users_advocate_solo
    .rename(columns=advocate_user_columns)
    .assign(old_user_id_ratebeer=np.nan, joined_ratebeer=np.nan)
)
users_advocate_solo.sample(n=3)

Unnamed: 0,nbr_ratings_total,nbr_reviews,old_user_id_advocate,user_name,joined_advocate,location,old_user_id_ratebeer,joined_ratebeer
36488,4,0,gallaghercm.1105925,gallaghercm,1454929000.0,Canada,,
57743,8,1,rossioak.176919,rossioak,1196766000.0,"United States, Wisconsin",,
47990,1,0,iatcnc01949.999916,iatcnc01949,1434276000.0,"United States, New York",,


In [103]:
# Stack matched, ratebeer-only and advocate-only users (in that order).
user_parts = [
    users_matched_not_duplicated_good_amount_ratings,
    users_ratebeer_solo,
    users_advocate_solo,
]
full_users = pd.concat(user_parts, ignore_index=True)

# General id: running number starting at 1 over the combined table.
full_users = full_users.assign(id=range(1, len(full_users) + 1))
full_users.sample(n=2)


Unnamed: 0,joined_advocate,old_user_id_advocate,user_name,joined_ratebeer,location,old_user_id_ratebeer,nbr_ratings_total,nbr_reviews,id
121270,1364641000.0,igorquad.726313,igorquad,,England,,6,1.0,121271
207181,1426590000.0,fishsticks85.960036,Fishsticks85,,,,2,0.0,207182


In [104]:
# Sanity check: size of the combined table, sizes of the inputs, and the size
# expected if every matched row were kept as its own user.
n_users_matched = len(users_matched)

print(len(full_users))
print(n_users_matched, len(users_advocate), len(users_ratebeer))
print(n_users_matched + len(users_advocate_solo) + len(users_ratebeer_solo))

220538
3341 153704 70174
220562


In [121]:
# Rows whose user name appears more than once in the combined table,
# then the entries of one such name.
user_name_repeated = full_users['user_name'].duplicated(keep=False)
test = full_users[user_name_repeated]
test[test['user_name'] == 'Elwood']

Unnamed: 0,joined_advocate,old_user_id_advocate,user_name,joined_ratebeer,location,old_user_id_ratebeer,nbr_ratings_total,nbr_reviews,id
990,1152871000.0,elwood.88673,Elwood,1374314000.0,Canada,270235,6,,991
4953,,,Elwood,1235041000.0,"United States, Virginia",87609,2966,,4954


Oh, so in different datasets people can have the same user_name. We need to be careful about this: if we process the data, we need to use the id.

In [122]:
# Make sure the output folder exists: to_csv does not create missing directories.
os.makedirs(FULL_PATH, exist_ok=True)
# Export the unified users table without the pandas index.
full_users.to_csv(os.path.join(FULL_PATH, 'users.csv'), index=False)

In [123]:
# Read the export back to confirm the row count, then free the frame.
users_export_csv = os.path.join(FULL_PATH, 'users.csv')
test_user = pd.read_csv(users_export_csv)
print(test_user.shape[0])
del test_user

220538


  test_user = pd.read_csv(os.path.join(FULL_PATH,'users.csv'))


**Beer dataset**

In this dataset we saw that there were no duplicates, so we do not need the careful analysis we made before. However, we still need to look at the matched dataset. We also need to link the new brewery id.

In [6]:
# Load the beer table of each dataset family.
# For the matched file, header=1 takes the column names from its second line.
beers_matched = pd.read_csv(os.path.join(MATCHED_PATH, 'beers.csv'), header=1)
beers_advocate = pd.read_csv(os.path.join(ADVOCATE_PATH, 'beers.csv'))
beers_ratebeer = pd.read_csv(os.path.join(RATEBEER_PATH, 'beers.csv'))

# One row count per line: matched, advocate, ratebeer.
print(len(beers_matched), len(beers_advocate), len(beers_ratebeer), sep="\n")

45640
280823
442081


In [None]:
# Random preview of the matched beers.
beers_matched.sample(n=10)

Unnamed: 0,abv,avg,avg_computed,avg_matched_valid_ratings,ba_score,beer_id,beer_name,beer_wout_brewery_name,brewery_id,brewery_name,...,brewery_id.1,brewery_name.1,nbr_matched_valid_ratings.1,nbr_ratings.1,overall_score,style.1,style_score,zscore.1,diff,sim
16147,5.7,4.07,3.883333,4.15,,97173,Extraterrestrial Space Beer,Extraterrestrial Space Beer,32092,Intergalactic Brewing Co.,...,17642,Intergalactic Brewing Company,1,1,,Premium Bitter/ESB,,-0.195097,0.802319,1.0
15834,12.0,4.23,4.233333,4.95,,145222,AleSmith Speedway Stout - Honey Graham,Graham Speedway Honey Stout,396,AleSmith Brewing Company,...,432,AleSmith Brewing Company,5,5,,Imperial Stout,,0.691466,0.722944,1.0
7954,5.6,3.56,3.540526,3.48,83.0,98054,Ratsherrn Pale Ale,Pale Ale,31793,Ratsherrn Brauerei GmbH,...,14561,Ratsherrn Brauerei,182,182,45.0,American Pale Ale,37.0,-0.541521,0.731194,1.0
42941,6.0,3.61,3.585614,3.475,83.0,129335,Arjuna,Arjuna,30164,Anthem Brewing Company,...,14617,Anthem Brewing Company,29,29,69.0,Witbier,92.0,-0.09156,0.460728,1.0
40569,6.2,3.64,3.531316,3.68,83.0,83345,Beardy Guard,Guard Beardy,29438,Rogness Brewing Company,...,14294,Rogness Brewing Company,29,29,47.0,Bière de Garde,50.0,-0.461041,1.0,1.0
9202,4.6,3.24,3.24,3.24,,48754,Effen Lager,Lager Effen,14014,Southern Bay Brewing Company,...,8056,Southern Bay Brewing Company,10,10,29.0,Pale Lager,98.0,-0.93121,0.448773,0.808393
3362,5.4,,,,,155915,Équinoxe Pale Ale,Pale Ale Équinoxe,1141,Brasserie Dieu du Ciel!,...,364,Dieu du Ciel,2,2,,American Pale Ale,,-0.044463,0.426658,1.0
11123,0.5,1.65,1.65,1.65,,89257,Royal Club Shandy,Royal Shandy Club,81,Heineken Nederland B.V.,...,9,Heineken Nederland,18,18,5.0,Low Alcohol,59.0,-2.374874,0.438918,1.0
2529,4.1,3.41,3.472,3.38,,63691,THAT,THAT,9568,Teme Valley Brewery,...,3086,Teme Valley,58,58,27.0,Bitter,30.0,-0.832641,1.0,1.0
20455,7.2,4.51,4.409186,4.532941,94.0,255380,Party Wave,Party Wave,26676,Kane Brewing Company,...,13267,Kane Brewing Company,15,15,96.0,India Pale Ale (IPA),96.0,0.822648,1.0,1.0
