In [1]:
import os
# Move the working directory two levels up; every relative path used afterwards
# is resolved from there (presumably the project root -- TODO confirm).
# NOTE: the path is relative, so re-running this cell moves up two MORE levels.
# Restart the kernel before re-running the notebook.
os.chdir('../..')

In [2]:
from classes.matching import Matching
from classes.helpers import *
import numpy as np
import json
import pandas as pd
from datetime import datetime
import gzip

import ast

# For the Python notebook
%matplotlib inline
%reload_ext autoreload
%autoreload 2

# Base folder of all input/output CSV files (relative to the current working directory)
data_folder = '../data/'

In [3]:
# Time series table; columns are a two-level index: (site, field) with site in {'ba', 'rb'}
ts = pd.read_csv(data_folder + 'tmp/time_series_1_valid.csv', header=[0,1])

# Transform strings into arrays.
# The list-valued fields are stored in the CSV as their string representation,
# so each column is parsed back with ast.literal_eval. This is done column by
# column: DataFrame.set_value no longer exists in pandas >= 1.0, and per-cell
# chained indexing (ts.loc[i][key1][key2]) rebuilds a full row on every access.
for key1 in ['ba', 'rb']:
    for key2 in ['dates', 'ratings', 'z_scores']:
        ts[(key1, key2)] = ts[(key1, key2)].map(ast.literal_eval)

# Matched beers table; columns are a two-level index: (site, field)
df = pd.read_csv(data_folder + 'matched/beers.csv', header=[0,1])

# Breweries table of each site, keyed by site.
# (The dict used to be created as `brews` but filled through the undefined
# name `brew`, which raised a NameError on a fresh kernel.)
brews = {
    'ba': pd.read_csv(data_folder + 'ba/breweries.csv'),
    'rb': pd.read_csv(data_folder + 'rb/breweries.csv'),
}

In [11]:
# Percentiles of the first z-score that separate the L / M / H classes
LOW_PERCENTILE = 15
HIGH_PERCENTILE = 85

# diffs[site]      -> first z-score of every time series, in row order
# thresholds[site] -> {'low': ..., 'high': ...} percentile cut-offs for that site
diffs = {}
thresholds = {}

for key in ['ba', 'rb']:
    # Work on whole columns (aligned on ts.index) instead of mixing positional
    # access (iloc / list index) with index labels, which is only correct when
    # the index happens to be 0..n-1.
    first_z = ts[(key, 'z_scores')].map(lambda z_scores: z_scores[0])
    diffs[key] = first_z.tolist()

    thresholds[key] = {
        'low': np.percentile(diffs[key], LOW_PERCENTILE),
        'high': np.percentile(diffs[key], HIGH_PERCENTILE),
    }

    # 'H': strictly above the high cut-off
    # 'M': between the two cut-offs (low cut-off included)
    # 'L': everything else
    labels = np.where(first_z > thresholds[key]['high'], 'H',
                      np.where(first_z >= thresholds[key]['low'], 'M', 'L'))

    ts.loc[:, (key, 'class')] = labels.tolist()
    # Number of ratings contained in each time series
    ts.loc[:, (key, 'nbr_ratings')] = ts[(key, 'ratings')].map(len).tolist()

In [49]:
# Per-site table with one row per beer: beer_id, class, style, brewery_name, location
dfs = {'ba': None, 'rb': None}

for key in dfs.keys():
    # Explicit copy: new columns are added below, and writing into a slice of
    # `ts` triggers SettingWithCopyWarning.
    dfs[key] = ts[key][['beer_id', 'class']].copy()

    # Rows of the matched-beers table whose beer_id appears in the time series.
    # NOTE(review): the values are copied positionally, which assumes these rows
    # come in the same order as the rows of `ts` -- TODO confirm.
    matched = df[df[key]['beer_id'].isin(dfs[key]['beer_id'])][key]
    dfs[key]['style'] = np.array(matched['style'])
    dfs[key]['brewery_name'] = np.array(matched['brewery_name'])

    # Location of the FIRST brewery carrying each name, built once instead of
    # filtering the whole breweries table for every beer.
    first_location = (brews[key]
                      .drop_duplicates(subset='name')
                      .set_index('name')['location'])

    # Fail with an explicit message rather than an opaque IndexError
    unknown = set(dfs[key]['brewery_name']) - set(first_location.index)
    if unknown:
        raise KeyError('Breweries missing from the {} breweries table: {}'
                       .format(key, sorted(unknown, key=str)))

    # Any location mentioning the United States is collapsed to the country
    # name; other locations are kept as they are.
    locations = []
    for brew_name in dfs[key]['brewery_name']:
        loc = first_location.loc[brew_name]

        if 'United States' in loc:
            loc = 'United States'

        locations.append(loc)

    dfs[key]['location'] = locations

# Both sites share the BA style labels (assignment is aligned on the index)
dfs['rb']['style'] = dfs['ba']['style']

# Style

In [149]:
# Distinct style values found in the BA table
styles = dfs['ba'].loc[:, 'style'].unique()

In [150]:
# Class transitions, written "<first letter>-<last letter>" with H/M/L classes
classes = ['H-L', 'L-H', 'H-M', 'M-H', 'M-L', 'L-M']

# One empty list of counts per site and per transition
df_json = {site: {transition: [] for transition in classes}
           for site in ('ba', 'rb')}

In [151]:
for cl in classes:
    # Beers whose BA class is the first letter of `cl` and whose RB class is the last one
    idx = np.array((dfs['ba']['class'] == cl[0]) & (dfs['rb']['class'] == cl[-1]))

    # Explicit list of sites: `df_json` later receives a non-site key
    # ('styles'), so iterating over its keys breaks when this cell is re-run.
    for key in ['ba', 'rb']:
        subdf = dfs[key][idx]

        # The RB counts are stored under the reversed label. A separate variable
        # is used so the loop variable `cl` is never overwritten (the previous
        # in-place reversal was only correct if 'rb' was iterated last).
        label = cl if key == 'ba' else cl[::-1]

        # First entry: total number of beers; then one count per style.
        # The list is assigned (not appended to) so the cell can be re-run.
        counts = [len(subdf)]
        counts.extend(int((subdf['style'] == s).sum()) for s in styles)

        df_json[key][label] = counts

In [152]:
# Prepend the label of the leading "all styles" count.
# Guarded so that re-running the cell does not prepend 'Total' a second time.
if len(styles) == 0 or styles[0] != 'Total':
    styles = np.insert(styles, 0, 'Total')

In [153]:
# Extra group holding the row labels, stored under the single sub-key '-'
df_json['styles'] = {'-': styles}

In [154]:
# Flatten the two-level dict into one column per (group, label) pair
df_styles = pd.DataFrame.from_dict({
    (group, label): values
    for group, per_label in df_json.items()
    for label, values in per_label.items()
})

In [155]:
# Persist the per-style counts, without the row index
df_styles.to_csv(path_or_buf=data_folder + 'tmp/styles.csv', index=False)

# Country

In [170]:
# Distinct location values found in the BA table
countries = dfs['ba'].loc[:, 'location'].unique()

In [171]:
# Class transitions, written "<first letter>-<last letter>" with H/M/L classes
classes = ['H-L', 'L-H', 'H-M', 'M-H', 'M-L', 'L-M']

# One empty list of counts per site and per transition
df_json = {site: {transition: [] for transition in classes}
           for site in ('ba', 'rb')}

In [172]:
for cl in classes:
    # Beers whose BA class is the first letter of `cl` and whose RB class is the last one
    idx = np.array((dfs['ba']['class'] == cl[0]) & (dfs['rb']['class'] == cl[-1]))

    # Explicit list of sites: `df_json` later receives a non-site key
    # ('countries'), so iterating over its keys breaks when this cell is re-run.
    for key in ['ba', 'rb']:
        subdf = dfs[key][idx]

        # The RB counts are stored under the reversed label. A separate variable
        # is used so the loop variable `cl` is never overwritten (the previous
        # in-place reversal was only correct if 'rb' was iterated last).
        label = cl if key == 'ba' else cl[::-1]

        # First entry: total number of beers; then one count per country.
        # The list is assigned (not appended to) so the cell can be re-run.
        counts = [len(subdf)]
        counts.extend(int((subdf['location'] == c).sum()) for c in countries)

        df_json[key][label] = counts

In [173]:
# Prepend the label of the leading "all countries" count.
# Guarded so that re-running the cell does not prepend 'Total' a second time.
if len(countries) == 0 or countries[0] != 'Total':
    countries = np.insert(countries, 0, 'Total')

In [174]:
# Extra group holding the row labels, stored under the single sub-key '-'
df_json['countries'] = {'-': countries}

In [175]:
# Flatten the two-level dict into one column per (group, label) pair
df_countries = pd.DataFrame.from_dict({
    (group, label): values
    for group, per_label in df_json.items()
    for label, values in per_label.items()
})

In [178]:
# Persist the per-country counts, without the row index
df_countries.to_csv(path_or_buf=data_folder + 'tmp/countries.csv', index=False)

In [179]:
# Preview only the first rows instead of dumping the whole table
df_countries.head(10)

Unnamed: 0_level_0,ba,ba,ba,ba,ba,ba,countries,rb,rb,rb,rb,rb,rb
Unnamed: 0_level_1,H-L,H-M,L-H,L-M,M-H,M-L,-,H-L,H-M,L-H,L-M,M-H,M-L
0,300,2499,234,2529,2560,2451,Total,234,2560,300,2451,2499,2529
1,0,2,0,3,0,2,Northern Ireland,0,0,0,2,2,3
2,1,2,1,14,9,12,Wales,1,9,1,12,2,14
3,0,2,2,36,19,15,Scotland,2,19,0,15,2,36
4,11,76,15,142,101,134,England,15,101,11,134,76,142
5,0,0,0,1,0,0,Singapore,0,0,0,0,0,1
6,1,0,0,1,1,1,China,0,1,1,1,0,1
7,0,0,0,0,0,1,Saint Lucia,0,0,0,1,0,0
8,1,0,3,1,10,3,Romania,3,10,1,3,0,1
9,0,1,0,2,0,1,South Korea,0,0,0,1,1,2
