In [1]:
import os
# Move the working directory two levels up (presumably to the project root so
# that the `classes` package and the relative data paths resolve -- confirm).
# NOTE: the path is relative, so re-running this cell without restarting the
# kernel climbs two more levels.
os.chdir('../..')

In [26]:
from classes.matching import Matching
from classes.helpers import *
import numpy as np
import json
import pandas as pd
from datetime import datetime
import gzip

import ast

# For the Python notebook
%matplotlib inline
%reload_ext autoreload
%autoreload 2

# Base folder of the data files; relative to the current working directory.
data_folder = '../data/'

# Properties

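We compare the properties (style, ABV, brewery, provenance) of the matched beers that end up in opposite classes on the two sites: high on one site and low on the other. The goal is to show that the beers loved on one site are not systematically of a different kind from those loved on the other, i.e. the two groups of beers are indistinguishable.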

In [32]:
# Time series of the matched beers; the two header rows form (site, field) columns.
ts = pd.read_csv(data_folder + 'tmp/time_series_1_valid.csv', header=[0, 1])

# The list-valued columns come back from the CSV as strings; parse them into
# Python lists one column at a time. (`DataFrame.set_value`, used previously,
# was removed in pandas 1.0.)
for key1 in ['ba', 'rb']:
    for key2 in ['dates', 'ratings', 'z_scores']:
        ts[(key1, key2)] = ts[(key1, key2)].apply(ast.literal_eval)

# Metadata of the matched beers, also with (site, field) columns.
df = pd.read_csv(data_folder + 'matched/beers.csv', header=[0, 1])

In [42]:
# Percentile cut-offs (in %) separating the 'L', 'M' and 'H' classes.
low_pct, high_pct = 15, 85
sites = ['ba', 'rb']

# First z-score of every matched beer on each site, kept as plain lists in row
# order so that they can be indexed by position. Iterating over the column
# avoids treating index labels as positions and building one Series per row.
diffs = {key: [z_scores[0] for z_scores in ts[(key, 'z_scores')]] for key in sites}

# Per-site thresholds: the low/high percentiles of those first z-scores.
thresholds = {
    key: {
        'low': np.percentile(diffs[key], low_pct),
        'high': np.percentile(diffs[key], high_pct),
    }
    for key in sites
}

for key in sites:
    low, high = thresholds[key]['low'], thresholds[key]['high']

    # 'H' strictly above the high threshold, 'L' strictly below the low one,
    # 'M' for everything in between (both bounds included).
    ts[(key, 'class')] = [
        'H' if value > high else 'M' if value >= low else 'L'
        for value in diffs[key]
    ]
    # Number of ratings each beer received on this site.
    ts[(key, 'nbr_ratings')] = [len(ratings) for ratings in ts[(key, 'ratings')]]

### Classes (Check)

In [37]:
def _z_class(value, bounds):
    """Return 'H', 'M' or 'L' for `value` given a {'low': ..., 'high': ...} dict.

    'H' is strictly above `bounds['high']`, 'L' strictly below `bounds['low']`,
    and 'M' covers everything in between (both bounds included).
    """
    if value > bounds['high']:
        return 'H'
    if value >= bounds['low']:
        return 'M'
    return 'L'


# Count the beers per '<BA class>-<RB class>' pair. The two lists are walked in
# parallel by position instead of indexing them with the labels of `ts.index`.
classes = {}
for ba, rb in zip(diffs['ba'], diffs['rb']):
    class_ = _z_class(ba, thresholds['ba']) + '-' + _z_class(rb, thresholds['rb'])
    classes[class_] = classes.get(class_, 0) + 1

In [41]:
# Number of beers per '<BA class>-<RB class>' combination
classes

{'H-H': 1290,
 'H-L': 300,
 'H-M': 2499,
 'L-H': 234,
 'L-L': 1351,
 'L-M': 2529,
 'M-H': 2560,
 'M-L': 2451,
 'M-M': 14218}

### Isolate the beers

In [46]:
# Class labels of every beer on each site
ba_class = ts['ba']['class']
rb_class = ts['rb']['class']

# Beers classed high on BA but low on RB, and the reverse
high_ba_low_rb = (ba_class == 'H') & (rb_class == 'L')
low_ba_high_rb = (ba_class == 'L') & (rb_class == 'H')

h_ba_l_rb = ts[high_ba_low_rb]
l_ba_h_rb = ts[low_ba_high_rb]

In [50]:
# BA beer ids of the two groups, as numpy arrays
h_ba_id, l_ba_id = (
    np.array(group[('ba', 'beer_id')])
    for group in (h_ba_l_rb, l_ba_h_rb)
)

In [54]:
# BA-side metadata of the two groups, restricted to the properties we compare.
# A single `.loc` selection followed by `.copy()` yields independent frames, so
# adding columns to them later does not trigger SettingWithCopyWarning.
property_cols = ['style', 'abv', 'beer_name', 'brewery_name']
ba_beers = df['ba']

h_ba = ba_beers.loc[ba_beers['beer_id'].isin(h_ba_id), property_cols].copy()
l_ba = ba_beers.loc[ba_beers['beer_id'].isin(l_ba_id), property_cols].copy()

### Style

In [63]:
# Style summary for the beers classed 'H' on BA and 'L' on RB
h_ba['style'].describe()

count              300
unique              74
top       American IPA
freq                28
Name: style, dtype: object

In [64]:
# Style summary for the beers classed 'L' on BA and 'H' on RB
l_ba['style'].describe()

count              234
unique              69
top       American IPA
freq                20
Name: style, dtype: object

In [68]:
# Ten most frequent styles among the high-BA / low-RB beers
h_ba['style'].value_counts().head(10)

American IPA                      28
American Amber / Red Ale          16
American Double / Imperial IPA    15
American Pale Ale (APA)           15
Saison / Farmhouse Ale            13
Fruit / Vegetable Beer            10
American Wild Ale                 10
American Stout                     9
English Pale Ale                   8
German Pilsener                    8
Name: style, dtype: int64

In [69]:
# Ten most frequent styles among the low-BA / high-RB beers
l_ba['style'].value_counts().head(10)

American IPA                      20
Saison / Farmhouse Ale            16
American Wild Ale                 14
American Double / Imperial IPA    13
American Pale Ale (APA)            8
American Stout                     7
American Blonde Ale                7
Russian Imperial Stout             7
American Porter                    6
English Porter                     6
Name: style, dtype: int64

### ABV

In [70]:
# ABV summary statistics for the high-BA / low-RB beers
h_ba['abv'].describe()

count    300.000000
mean       6.288433
std        2.057763
min        2.500000
25%        5.000000
50%        5.750000
75%        7.500000
max       15.000000
Name: abv, dtype: float64

In [71]:
# ABV summary statistics for the low-BA / high-RB beers
l_ba['abv'].describe()

count    234.000000
mean       7.038120
std        2.447357
min        2.400000
25%        5.200000
50%        6.400000
75%        8.500000
max       16.000000
Name: abv, dtype: float64

In [72]:
# Ten most frequent ABV values among the high-BA / low-RB beers
h_ba['abv'].value_counts().head(10)

5.0    32
6.0    16
5.5    15
4.5    13
6.5    13
8.0    10
7.0     9
7.5     8
5.2     8
4.7     8
Name: abv, dtype: int64

In [73]:
# Ten most frequent ABV values among the low-BA / high-RB beers
l_ba['abv'].value_counts().head(10)

5.0     18
5.5     16
7.0     14
6.5     10
5.2      9
4.8      8
8.0      7
12.0     6
9.0      6
7.5      6
Name: abv, dtype: int64

### Brewery

In [74]:
# Brewery summary for the high-BA / low-RB beers
h_ba['brewery_name'].describe()

count                         300
unique                        277
top       Equinox Brewing Company
freq                            6
Name: brewery_name, dtype: object

In [75]:
# Brewery summary for the low-BA / high-RB beers
l_ba['brewery_name'].describe()

count       234
unique      210
top       To Øl
freq          3
Name: brewery_name, dtype: object

In [76]:
# Ten breweries with the most beers in the high-BA / low-RB group
h_ba['brewery_name'].value_counts().head(10)

Equinox Brewing Company                      6
J. Wakefield Brewing                         3
Brauerei Ried                                2
Shebeen Brewing Company                      2
Jester King Brewery                          2
Pateros Creek Brewing Co.                    2
Sixpoint Brewery                             2
Empyrean Brewing Company                     2
D9 Brewing Company                           2
Herzoglich Bayerisches Brauhaus Tegernsee    2
Name: brewery_name, dtype: int64

In [77]:
# Ten breweries with the most beers in the low-BA / high-RB group
l_ba['brewery_name'].value_counts().head(10)

To Øl                           3
Silverton Brewery               2
Angry Chair Brewing             2
Birrificio L'Orso Verde         2
Renegade Brewing Company        2
Southern Bay Brewing Company    2
Tempel Brygghus                 2
Brasserie Cantillon             2
AleSmith Brewing Company        2
Shady Oak Barrel House          2
Name: brewery_name, dtype: int64

In [87]:
# Breweries that have at least one beer in each of the two groups
set(h_ba['brewery_name']).intersection(l_ba['brewery_name'])

{'August Schell Brewing Company',
 'Beerbliotek',
 'Brasserie Cantillon',
 'Brasserie Fantôme',
 'Brauerei Hirt GmbH',
 'Brouwerij Anders',
 'Burnt City Brewing',
 'Buxton Brewery',
 'Cape Cod Beer',
 'Dead Frog Brewing',
 'Feral Brewing Co.',
 'Half Acre Beer Company',
 'Interboro Spirits and Ales',
 'Marz Community Brewing',
 'Rockyard Brewing',
 'The North Brewery'}

In [89]:
# Share (in %) of breweries present in both groups, relative to the group with
# the fewer distinct breweries. The overlap is computed rather than hard-coded
# so the figure stays correct if the data or the thresholds change.
h_breweries = set(h_ba['brewery_name'])
l_breweries = set(l_ba['brewery_name'])
len(h_breweries & l_breweries) / min(len(h_breweries), len(l_breweries)) * 100

7.6190476190476195

### Provenance

In [90]:
# BA brewery table; its 'name' and 'location' columns are used below to look
# up where each beer's brewery is.
brew = pd.read_csv(data_folder + 'ba/breweries.csv')

In [111]:
def brewery_locations(beers, breweries):
    """Return the location of each beer's brewery, aligned on `beers.index`.

    Breweries are matched by name; when several rows of `breweries` share a
    name, the first one is used. Any location containing 'United States' is
    collapsed to the single value 'United States'. Beers whose brewery name
    is absent from `breweries` get NaN instead of raising.

    Parameters
    ----------
    beers : DataFrame with a 'brewery_name' column.
    breweries : DataFrame with 'name' and 'location' columns.
    """
    location_by_name = (
        breweries.drop_duplicates(subset='name').set_index('name')['location']
    )
    locations = beers['brewery_name'].map(location_by_name)
    # The isinstance guard leaves missing (NaN) locations untouched.
    return locations.map(
        lambda loc: 'United States'
        if isinstance(loc, str) and 'United States' in loc
        else loc
    )


# `assign` returns new frames, so nothing is written into a slice of `df` and
# the cell can be re-run safely.
h_ba = h_ba.assign(location=brewery_locations(h_ba, brew))
l_ba = l_ba.assign(location=brewery_locations(l_ba, brew))

In [113]:
# Brewery-location summary for the high-BA / low-RB beers
h_ba['location'].describe()

count               300
unique               27
top       United States
freq                208
Name: location, dtype: object

In [114]:
# Brewery-location summary for the low-BA / high-RB beers
l_ba['location'].describe()

count               234
unique               26
top       United States
freq                123
Name: location, dtype: object

In [115]:
# Ten most frequent brewery locations among the high-BA / low-RB beers
h_ba['location'].value_counts().head(10)

United States    208
Canada            16
Belgium           12
England           11
Germany           11
Austria            5
Australia          5
Ireland            5
Sweden             4
Spain              3
Name: location, dtype: int64

In [116]:
# Ten most frequent brewery locations among the low-BA / high-RB beers
l_ba['location'].value_counts().head(10)

United States    123
England           15
Canada            14
Italy             10
Australia         10
Sweden             9
Denmark            6
Germany            6
Belgium            6
Poland             4
Name: location, dtype: int64