In [1]:
import pandas as pd

In [2]:
# Root directory that contains one sub-folder per dataset.
BASE_PATH = "./Data"

# Short dataset key -> name of its sub-folder under BASE_PATH.
FOLDERS = dict(
    ba="BeerAdvocate",
    rb="RateBeer",
    mbd="MatchedBeerData",
)

# Dataset key -> {logical file key -> file name inside that dataset's folder}.
# "ba" and "rb" use the same file names; the comprehension builds a separate
# dict for each so the two entries stay independent objects.
FILES = {
    site: {
        "beers": "beers.csv",
        "breweries": "breweries.csv",
        "users": "users.csv",
        "ratings": "ratings.txt.gz",
        "reviews": "reviews.txt.gz",
    }
    for site in ("ba", "rb")
}
FILES["mbd"] = {
    "beers": "beers.csv",
    "breweries": "breweries.csv",
    "ratings": "ratings.csv",
    "users": "users.csv",
    "users approx": "users_approx.csv",
}

In [3]:
def build_path(folderind: str, fileind: str, basepath=BASE_PATH):
    """Return the "/"-joined path of one dataset file.

    Parameters
    ----------
    folderind : str
        Key into ``FOLDERS`` (and into the outer level of ``FILES``).
    fileind : str
        Key into ``FILES[folderind]``.
    basepath : str, optional
        Leading path component; defaults to ``BASE_PATH``.

    Returns
    -------
    str
        ``basepath``, the folder name and the file name joined with "/".

    Raises
    ------
    KeyError
        If either key is missing from its lookup table.
    """
    folder_name = FOLDERS[folderind]
    file_name = FILES[folderind][fileind]
    return "/".join((basepath, folder_name, file_name))

def sample(df, n=10, keepna=True, random_state=None):
    """Return up to ``n`` rows of ``df`` in random order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to draw rows from; it is not modified.
    n : int, optional
        Number of rows to keep after shuffling (passed to ``DataFrame.head``).
    keepna : bool, optional
        When False, rows containing any missing value are removed before
        rows are drawn, so the result holds complete rows only.
    random_state : optional
        Forwarded to ``DataFrame.sample``. Pass a fixed seed to get the same
        rows on every run; the default ``None`` keeps the draw unseeded.

    Returns
    -------
    pandas.DataFrame
        At most ``n`` rows of ``df``, carrying their original index labels.
    """
    # Filter first so rows that would be discarded are never shuffled.
    candidates = df if keepna else df.dropna()
    # frac=1 draws every row without replacement, i.e. a full permutation.
    shuffled = candidates.sample(frac=1, random_state=random_state)
    return shuffled.head(n)

# BeerAdvocate

### BeerAdvocate: Beers

In [14]:
# Preview load: only the first 100 rows of the BeerAdvocate beers CSV.
beeradvoc_beers = pd.read_csv(
    build_path("ba", "beers"),
    nrows=100,
)
# Last expression of the cell: display the column names.
beeradvoc_beers.columns

Index(['beer_id', 'beer_name', 'brewery_id', 'brewery_name', 'style',
       'nbr_ratings', 'nbr_reviews', 'avg', 'ba_score', 'bros_score', 'abv',
       'avg_computed', 'zscore', 'nbr_matched_valid_ratings',
       'avg_matched_valid_ratings'],
      dtype='object')

In [15]:
# Show up to 10 randomly ordered rows; rows with missing values are kept.
sample(beeradvoc_beers, n=10, keepna=True)

Unnamed: 0,beer_id,beer_name,brewery_id,brewery_name,style,nbr_ratings,nbr_reviews,avg,ba_score,bros_score,abv,avg_computed,zscore,nbr_matched_valid_ratings,avg_matched_valid_ratings
57,19322,Nut Brown,3415,Whitewater Brewing Co,English Brown Ale,1,1,4.1,,,4.0,4.1,,0,
46,38838,Belfast Lager,3415,Whitewater Brewing Co,Munich Helles Lager,17,11,3.6,83.0,,4.5,3.632353,,0,
52,89841,Hoppelhammer,3415,Whitewater Brewing Co,English India Pale Ale (IPA),13,5,3.85,84.0,,6.0,3.870769,,0,
55,20627,Mill Ale,3415,Whitewater Brewing Co,English Pale Ale,1,1,3.87,,,3.7,3.87,,0,
39,178690,Stitch,40360,Walled City Brewing Company,American IPA,1,0,3.73,,,5.0,3.73,,0,
83,219504,D'Etre Deux,40307,Boundary Brewing Cooperative,Saison / Farmhouse Ale,0,0,,,,6.7,,,0,
80,206041,Brett Pale Ale,40307,Boundary Brewing Cooperative,American Pale Ale (APA),0,0,,,,3.5,,,0,
68,169951,Scrabo Gold,40309,Ards Brewing Company,English Pale Ale,1,0,3.58,,,4.9,3.58,-0.748954,0,
73,206039,A Berliner Vice #1,40307,Boundary Brewing Cooperative,Berliner Weissbier,0,0,,,,2.7,,,0,
67,169949,Rockin’ Goose,40309,Ards Brewing Company,English Pale Mild Ale,0,0,,,,4.4,,,0,


### BeerAdvocate: Breweries

In [6]:
# Preview load: only the first 100 rows of the BeerAdvocate breweries CSV.
beeradvoc_breweries = pd.read_csv(
    build_path("ba", "breweries"),
    nrows=100,
)
# Last expression of the cell: display the column names.
beeradvoc_breweries.columns

Index(['id', 'location', 'name', 'nbr_beers'], dtype='object')

In [7]:
# Show up to 10 randomly ordered rows; rows with missing values are kept.
sample(beeradvoc_breweries, n=10, keepna=True)

Unnamed: 0,id,location,name,nbr_beers
34,8803,Wales,Snowdonia Park Pub,0
46,13093,Wales,Vale of Glamorgan Brewery,6
18,45082,Northern Ireland,Bullhouse Brewing Company,6
90,31842,Wales,Gwaun Valley Brewery,5
40,8406,Wales,The Joiners Arms / Swansea Brewery,0
2,39914,Kyrgyzstan,Bear Beer,4
70,34367,Wales,Rotters Brewery,3
25,3414,Northern Ireland,Hilden Brewing Company / Taproom,14
84,13900,Wales,Evan Evans Brewery,4
27,38547,Northern Ireland,Inishmacsaint Brewing Company,3


### BeerAdvocate: Users

In [8]:
# Preview load: only the first 100 rows of the BeerAdvocate users CSV.
beeradvoc_users = pd.read_csv(
    build_path("ba", "users"),
    nrows=100,
)
# Last expression of the cell: display the column names.
beeradvoc_users.columns

Index(['nbr_ratings', 'nbr_reviews', 'user_id', 'user_name', 'joined',
       'location'],
      dtype='object')

In [9]:
# Show up to 10 randomly ordered rows; rows with missing values are kept.
sample(beeradvoc_users, n=10, keepna=True)

Unnamed: 0,nbr_ratings,nbr_reviews,user_id,user_name,joined,location
53,1241,1231,jayli.193663,Jayli,1202468000.0,"United States, Massachusetts"
35,914,890,jays2629.517433,JayS2629,1287828000.0,"United States, Alabama"
11,2329,5,graduatedcashew.730089,GraduatedCashew,1366884000.0,"United States, California"
0,7820,465,nmann08.184925,nmann08,1199704000.0,"United States, Washington"
42,1888,1887,beeradvocate.1,BeerAdvocate,840794400.0,"United States, Massachusetts"
91,3586,2,sendbeer.446332,sendbeer,1270721000.0,"United States, Georgia"
70,7346,780,oriolesfan4.212193,oriolesfan4,1208340000.0,"United States, Maryland"
75,116,116,stevenbilodeau.408838,StevenBilodeau,1261825000.0,"United States, Connecticut"
40,1949,1948,ronaldtheriot.241704,RonaldTheriot,1218449000.0,"United States, Louisiana"
3,31,31,helloloser12345.10867,helloloser12345,1101380000.0,Northern Ireland


### BeerAdvocate: Ratings

In [10]:
# Preview load: parse only the first 100 rows of the BeerAdvocate ratings
# file, like the other preview cells. Without `nrows` the whole gzip file
# would be decompressed and parsed just to look at the columns.
# The last recorded run failed with FileNotFoundError, so the file must be
# present at ./Data/BeerAdvocate/ratings.txt.gz before this cell can run.
# NOTE(review): `read_fwf` assumes a fixed-width layout - confirm that this
# matches the actual file format. `encoding_errors="replace"` was tried here
# earlier and may be needed if decoding fails.
beeradvoc_ratings = pd.read_fwf(build_path("ba", "ratings"), nrows=100)
beeradvoc_ratings.columns

FileNotFoundError: [Errno 2] No such file or directory: './Data/BeerAdvocate/ratings.txt.gz'

### BeerAdvocate: Reviews

# RateBeer

### RateBeer: Beers

In [None]:
# Preview load: only the first 100 rows of the RateBeer beers CSV.
ratebeer_beers = pd.read_csv(
    build_path("rb", "beers"),
    nrows=100,
)
# Last expression of the cell: display the column names.
ratebeer_beers.columns

In [None]:
# Show up to 10 randomly ordered rows; rows with missing values are kept.
sample(ratebeer_beers, n=10, keepna=True)

### RateBeer: Breweries

In [None]:
# Preview load: only the first 100 rows of the RateBeer breweries CSV.
ratebeer_breweries = pd.read_csv(
    build_path("rb", "breweries"),
    nrows=100,
)
# Last expression of the cell: display the column names.
ratebeer_breweries.columns

In [None]:
# Show up to 10 randomly ordered rows; rows with missing values are kept.
sample(ratebeer_breweries, n=10, keepna=True)

### RateBeer: Users

In [None]:
# Preview load: only the first 100 rows of the RateBeer users CSV.
ratebeer_users = pd.read_csv(
    build_path("rb", "users"),
    nrows=100,
)
# Last expression of the cell: display the column names.
ratebeer_users.columns

In [None]:
# Show up to 10 randomly ordered rows; rows with missing values are kept.
sample(ratebeer_users, n=10, keepna=True)

# MatchedBeerData

### MatchedBeerData: Beers

In [None]:
# Read a single data row, with the file's first line taken as the header, and
# display the resulting column names.
# NOTE(review): presumably a peek at the top header line before the next cell
# re-reads the file with skiprows=1 - confirm.
matchedbeer_beers = pd.read_csv(
    build_path("mbd", "beers"),
    nrows=1,
)
matchedbeer_beers.columns

In [None]:
# Preview load of the MatchedBeerData beers CSV: skip the first line of the
# file, take the header from the next line, and read 100 rows.
# NOTE(review): presumably the file has two header lines - confirm.
matchedbeer_beers = pd.read_csv(
    build_path("mbd", "beers"),
    skiprows=1,
    nrows=100,
)
matchedbeer_beers.columns

In [None]:
# Show up to 10 randomly ordered rows; rows with missing values are kept.
sample(matchedbeer_beers, n=10, keepna=True)

### MatchedBeerData: Breweries

In [None]:
# Preview load of the MatchedBeerData breweries CSV: skip the first line of
# the file, take the header from the next line, and read 100 rows.
# NOTE(review): presumably the file has two header lines - confirm.
matchedbeer_breweries = pd.read_csv(
    build_path("mbd", "breweries"),
    skiprows=1,
    nrows=100,
)
matchedbeer_breweries.columns

In [None]:
# Show up to 10 randomly ordered rows; rows with missing values are kept.
sample(matchedbeer_breweries, n=10, keepna=True)

### MatchedBeerData: Ratings

In [None]:
# Preview load of the MatchedBeerData ratings CSV: skip the first line of the
# file, take the header from the next line, and read 100 rows.
# NOTE(review): presumably the file has two header lines - confirm.
matchedbeer_ratings = pd.read_csv(
    build_path("mbd", "ratings"),
    skiprows=1,
    nrows=100,
)
matchedbeer_ratings.columns

In [None]:
# Show up to 10 randomly ordered rows; rows with missing values are kept.
sample(matchedbeer_ratings, n=10, keepna=True)

### MatchedBeerData: Users

In [None]:
# Preview load of the MatchedBeerData users CSV: skip the first line of the
# file, take the header from the next line, and read 100 rows.
# NOTE(review): presumably the file has two header lines - confirm.
matchedbeer_users = pd.read_csv(
    build_path("mbd", "users"),
    skiprows=1,
    nrows=100,
)
matchedbeer_users.columns

In [None]:
# Show up to 10 randomly ordered rows; rows with missing values are kept.
sample(matchedbeer_users, n=10, keepna=True)

### MatchedBeerData: Users Approx

In [None]:
# Peek at the column names produced by the file's first line. Only
# `.columns` is displayed and the next cell reassigns this variable, so a
# single data row is enough. This mirrors the nrows=1 peek used for the
# MatchedBeerData beers file and avoids loading the whole CSV.
matchedbeer_usersapprox = pd.read_csv(build_path("mbd", "users approx"), nrows=1)
matchedbeer_usersapprox.columns

In [None]:
# Preview load of the MatchedBeerData users_approx CSV: skip the first line
# of the file, take the header from the next line, and read 100 rows.
# NOTE(review): presumably the file has two header lines - confirm.
matchedbeer_usersapprox = pd.read_csv(
    build_path("mbd", "users approx"),
    skiprows=1,
    nrows=100,
)
matchedbeer_usersapprox.columns

In [None]:
# Show up to 10 randomly ordered rows; rows with missing values are kept.
sample(matchedbeer_usersapprox, n=10, keepna=True)