In [16]:
import pandas as pd
from tqdm import tqdm

## Preprocess ratings from txt to csv while keeping only essential information

In [17]:
# RateBeer (<60s on my computer, around 22Gb of ram)
ratings_list = []
rating_dic = {}
with open("data/RateBeer/ratings.txt", encoding= "utf8") as f:
    for i, line in tqdm(enumerate(f)):
        field = line.split(": ")[0]
        if field == "\n":
            ratings_list.append(rating_dic)
            rating_dic = {}
            continue
        content = line.split(": ")[1:]
        content = ": ".join(content)
        rating_dic[field] = content.strip()
            
ratings_RB = pd.DataFrame.from_dict(ratings_list)
ratings_RB.to_csv("data/RateBeer/ratings.csv", index=False)

3635363it [00:04, 883399.67it/s]


KeyboardInterrupt: 

In [None]:
del ratings_RB, ratings_list

In [None]:
# BeerAdvocate (<60s on my computer, around 28Gb of ram)
ratings_list = []
rating_dic = {}
with open("data/BeerAdvocate/ratings.txt", encoding= "utf8") as f:
    for i, line in tqdm(enumerate(f)):
        field = line.split(": ")[0]
        if field == "\n":
            ratings_list.append(rating_dic)
            rating_dic = {}
            continue
        content = line.split(": ")[1:]
        content = ": ".join(content)
        rating_dic[field] = content.strip()
            
ratings_BA = pd.DataFrame.from_dict(ratings_list)
ratings_BA.to_csv("data/BeerAdvocate/ratings.csv", index=False)

In [None]:
del ratings_BA, ratings_list

## Load data

In [13]:
# Data from RateBeer 'RB'
beers_RB = pd.read_table("./data/RateBeer/beers.csv", sep=",")
breweries_RB = pd.read_table("./data/RateBeer/breweries.csv", sep=",").rename(columns={"id":"brewery_id","location":"brewery_location","name":"brewery_name"})
users_RB = pd.read_table("./data/RateBeer/users.csv", sep=",").rename(columns={"location":"user_location"})
ratings_RB = pd.read_table("./data/RateBeer/ratings.csv", sep=",")

# Data from BeerAdvocate 'BA'
beers_BA = pd.read_table("./data/BeerAdvocate/beers.csv", sep=",")
breweries_BA = pd.read_table("./data/BeerAdvocate/breweries.csv", sep=",").rename(columns={"id":"brewery_id","location":"brewery_location","name":"brewery_name"})
users_BA = pd.read_table("./data/BeerAdvocate/users.csv", sep=",").rename(columns={"location":"user_location"})
ratings_BA = pd.read_table("./data/BeerAdvocate/ratings.csv", sep=",")

# Data from MixedDataset 'MD'
beers_MD = pd.read_table("./data/matched_beer_data/beers.csv", sep=",")
breweries_MD = pd.read_table("./data/matched_beer_data/breweries.csv", sep=",")
users_MD = pd.read_table("./data/matched_beer_data/users.csv", sep=",")
users_approx_MD = pd.read_table("./data/matched_beer_data/users_approx.csv", sep=",")
ratings_MD = pd.read_table("./data/matched_beer_data/ratings.csv", sep=",")

  beers_MD = pd.read_table("./data/matched_beer_data/beers.csv", sep=",")
  ratings_MD = pd.read_table("./data/matched_beer_data/ratings.csv", sep=",")


## Merge data by dataset (every information available populates ratings)

In [14]:
# Data from RateBeer 'RB'
ratings_RBm = ratings_RB.merge(users_RB ,on="user_id", how="left", suffixes=('', '_drop'))
ratings_RBm = ratings_RBm.merge(breweries_RB,on="brewery_id", how="left", suffixes=('', '_drop'))
ratings_RBm = ratings_RBm.merge(beers_RB,on="beer_id", how="left", suffixes=('', '_drop'))
ratings_RBm.drop([col for col in ratings_RBm.columns if 'drop' in col], axis=1, inplace=True)
ratings_RBm['dataset']="RB"

# Data from BeerAdvocate 'BA'
ratings_BAm = ratings_BA.merge(users_BA ,on="user_id", how="left", suffixes=('', '_drop'))
ratings_BAm = ratings_BAm.merge(breweries_BA,on="brewery_id", how="left", suffixes=('', '_drop'))
ratings_BAm = ratings_BAm.merge(beers_BA,on="beer_id", how="left", suffixes=('', '_drop'))
ratings_BAm.drop([col for col in ratings_BAm.columns if 'drop' in col], axis=1, inplace=True)
ratings_BAm['dataset']="BA"

## Merge ratings_BAm and ratings_RBm

In [15]:
ratings_mixed = pd.concat([ratings_BAm, ratings_RBm], axis=0, ignore_index=True)
cols = [
    'beer_name',
    'beer_id',
    'style',
    'abv',
    'nbr_ratings',
    'nbr_reviews',
    'avg',
    'ba_score',
    'bros_score',
    'avg_computed',
    'zscore',
    'overall_score',
    'style_score',
    'nbr_matched_valid_ratings',
    'avg_matched_valid_ratings',
    'joined',
    'brewery_name',
    'brewery_id',
    'brewery_location',
    'nbr_beers',
    'date',
    'user_name',
    'user_id',
    'user_location',
    'appearance',
    'aroma',
    'palate',
    'taste',
    'overall',
    'rating',
    'text',
    'review',
    'dataset'
]
ratings_mixed = ratings_mixed[cols]
ratings_mixed.to_csv("data/ratings_BAm_RBm.csv", index=False)
ratings_mixed.columns

Index(['beer_name', 'beer_id', 'style', 'abv', 'nbr_ratings', 'nbr_reviews',
       'avg', 'ba_score', 'bros_score', 'avg_computed', 'zscore',
       'overall_score', 'style_score', 'nbr_matched_valid_ratings',
       'avg_matched_valid_ratings', 'joined', 'brewery_name', 'brewery_id',
       'brewery_location', 'nbr_beers', 'date', 'user_name', 'user_id',
       'user_location', 'appearance', 'aroma', 'palate', 'taste', 'overall',
       'rating', 'text', 'review', 'dataset'],
      dtype='object')

In [5]:
ratings_mixed_subsample = ratings_mixed.sample(frac = 0.1, random_state=0)
ratings_mixed_subsample.to_csv("data/ratings_BAm_RB_sample.csv", index=False)