In [1]:
import os
# Move the working directory two levels up. The path is relative, so
# re-running this cell climbs two more levels each time.
# NOTE(review): presumably this targets the project root so that the `classes`
# package imported in the next cell resolves - confirm.
os.chdir('../..')

In [2]:
from classes.matching import Matching
from classes.helpers import *
import numpy as np
import json
import pandas as pd
from datetime import datetime
import gzip

import ast

# Notebook-only setup (IPython magics): draw matplotlib figures inline and let
# the autoreload extension reload imported modules before each cell runs.
%matplotlib inline
%reload_ext autoreload
%autoreload 2

# Prefix for every data path used below. Paths are built by plain string
# concatenation, so the trailing slash is required; being relative, it is
# resolved against the current working directory.
data_folder = '../data/'

In [4]:
# Time-series table. The CSV has a two-level column header: the site key
# ('ba' / 'rb') on top and the field name underneath.
ts = pd.read_csv(data_folder + 'tmp/time_series_5_valid.csv', header=[0,1])

# The sequence-valued fields are stored in the CSV as strings holding Python
# literals; parse every such column back into real Python objects.
# This is done column-wise with Series.map because DataFrame.set_value no
# longer exists in current pandas, and the chained `ts.loc[i][k1][k2]` lookup
# materialised a whole row for every single cell.
for key1 in ['ba', 'rb']:
    for key2 in ['dates', 'ratings', 'z_scores']:
        ts[(key1, key2)] = ts[(key1, key2)].map(ast.literal_eval)

# Matched beers, same two-level header layout as `ts`.
df = pd.read_csv(data_folder + 'matched/beers.csv', header=[0,1])

# One brewery table per site.
brews = {
    'ba': pd.read_csv(data_folder + 'ba/breweries.csv'),
    'rb': pd.read_csv(data_folder + 'rb/breweries.csv'),
}

In [5]:
# Percentiles of the first z-score that separate the three classes.
LOW_PERCENTILE = 15
HIGH_PERCENTILE = 85

# diffs[site]      -> first z-score of every row of `ts`, in row order
# thresholds[site] -> {'low': ..., 'high': ...} percentile cut-offs of those values
diffs = {}
thresholds = {}

for key in ['ba', 'rb']:
    # Work on whole columns instead of `ts.iloc[i]` / `diffs[key][i]` driven by
    # index labels: the result no longer depends on `ts` having a 0..n-1 index.
    diffs[key] = [z_scores[0] for z_scores in ts[(key, 'z_scores')]]

    thresholds[key] = {
        'low': np.percentile(diffs[key], LOW_PERCENTILE),
        'high': np.percentile(diffs[key], HIGH_PERCENTILE),
    }

    # 'H' strictly above the high cut-off, 'M' from the low cut-off (inclusive)
    # up to the high one, 'L' otherwise (NaN also ends up in 'L', because both
    # comparisons are False for it).
    first_z = np.asarray(diffs[key], dtype=float)
    labels = np.where(first_z > thresholds[key]['high'], 'H',
                      np.where(first_z >= thresholds[key]['low'], 'M', 'L'))

    ts[(key, 'class')] = labels.tolist()
    # Number of ratings stored for each row.
    ts[(key, 'nbr_ratings')] = ts[(key, 'ratings')].map(len)

In [6]:
# dfs[site] -> one row per row of `ts` with columns
# beer_id, class, style, brewery_name, location.
dfs = {'ba': None, 'rb': None}

for key in dfs.keys():
    # Explicit copy: columns are added below, and adding them to a slice of
    # `ts` triggers pandas' SettingWithCopy warning.
    site_df = ts[key][['beer_id', 'class']].copy()

    # Attach style / brewery by looking each beer_id up in the matched-beers
    # table. Filtering `df` with isin() and copying the result positionally
    # only gives the right answer when `df` and `ts` list the beers in exactly
    # the same order.
    beer_info = df[key].drop_duplicates(subset='beer_id').set_index('beer_id')
    matched = beer_info.reindex(site_df['beer_id'].values)
    site_df['style'] = matched['style'].values
    site_df['brewery_name'] = matched['brewery_name'].values

    # Location of the first brewery carrying each name, built once instead of
    # re-scanning the brewery table for every row.
    first_location = (brews[key]
                      .drop_duplicates(subset='name')
                      .set_index('name')['location'])
    locations = site_df['brewery_name'].map(first_location)

    # Fail with a readable message instead of an IndexError/TypeError from
    # deep inside the lookup when a brewery (or its location) is missing.
    unknown = site_df.loc[locations.isnull().values, 'brewery_name'].unique()
    if len(unknown) > 0:
        raise KeyError('no location found in brews[%r] for breweries: %s'
                       % (key, list(unknown)))

    # Any location string that contains 'United States' is collapsed to
    # exactly 'United States'.
    site_df['location'] = ['United States' if 'United States' in loc else loc
                           for loc in locations]

    dfs[key] = site_df

# The RB frame takes over the BA style labels (index-aligned assignment).
# NOTE(review): presumably so both sites share one style vocabulary - confirm.
dfs['rb']['style'] = dfs['ba']['style']

# Style

In [37]:
# Distinct style labels of the BA frame.
ba_style_column = dfs['ba'].loc[:, 'style']
styles = ba_style_column.unique()

In [38]:
# Class pairs to compare; each pair is stored next to its mirrored spelling
# (e.g. 'H-L' and 'L-H').
classes = ['H-L', 'H-M', 'L-M']

# One empty, independent list per pair and per mirrored pair, in the order
# pair, mirror, pair, mirror, ...
df_json = {
    label: []
    for pair in classes
    for label in (pair, pair[::-1])
}

In [39]:
# Private generator with a fixed seed: the balanced subsamples drawn below are
# identical on every run and do not depend on NumPy's global random state.
SAMPLING_SEED = 0
rng = np.random.RandomState(SAMPLING_SEED)

for cl in classes:
    first_label, second_label = cl[0], cl[-1]
    reverse_cl = cl[::-1]

    # Group 1: BA class == first letter and RB class == last letter of `cl`.
    # Group 2: the mirrored combination. Both groups are rows of the BA frame.
    idx1 = np.array((dfs['ba']['class'] == first_label) & (dfs['rb']['class'] == second_label))
    idx2 = np.array((dfs['ba']['class'] == second_label) & (dfs['rb']['class'] == first_label))
    subdf1 = dfs['ba'][idx1]
    subdf2 = dfs['ba'][idx2]

    # Downsample both groups to the size of the smaller one. Slicing a
    # permutation is a draw without replacement that also works when a group
    # is empty.
    nbr = min(len(subdf1), len(subdf2))
    subdf1 = subdf1.iloc[rng.permutation(len(subdf1))[:nbr]].reset_index(drop=True)
    subdf2 = subdf2.iloc[rng.permutation(len(subdf2))[:nbr]].reset_index(drop=True)

    # First entry is the group size; (re)starting the lists here keeps the
    # cell safe to re-run instead of appending a second set of counts.
    df_json[cl] = [nbr]
    df_json[reverse_cl] = [nbr]

    # Then one count per style, in the order of `styles`.
    for s in styles:
        df_json[cl].append(int((subdf1['style'] == s).sum()))
        df_json[reverse_cl].append(int((subdf2['style'] == s).sum()))

In [40]:
# Prepend the label 'Total' to the style names and store the result as the
# 'styles' entry of df_json.
styles = np.insert(arr=styles, obj=0, values='Total')
df_json.update({'styles': styles})

In [42]:
# Turn the dict of equally long lists into a table: one column per key.
df_styles = pd.DataFrame(df_json)

In [44]:
# Write the style table to disk without the row index.
styles_csv_path = data_folder + 'tmp/styles_5.csv'
df_styles.to_csv(styles_csv_path, index=False)

# Country

In [51]:
# Distinct location labels of the BA frame.
ba_location_column = dfs['ba'].loc[:, 'location']
countries = ba_location_column.unique()

In [52]:
# Start a fresh result dict for the location breakdown: one empty,
# independent list per class pair and per mirrored pair (e.g. 'H-L' / 'L-H').
classes = ['H-L', 'H-M', 'L-M']

df_json = {
    label: []
    for pair in classes
    for label in (pair, pair[::-1])
}

In [53]:
# Private generator with a fixed seed: the balanced subsamples drawn below are
# identical on every run and do not depend on NumPy's global random state.
SAMPLING_SEED = 0
rng = np.random.RandomState(SAMPLING_SEED)

for cl in classes:
    first_label, second_label = cl[0], cl[-1]
    reverse_cl = cl[::-1]

    # Group 1: BA class == first letter and RB class == last letter of `cl`.
    # Group 2: the mirrored combination. Both groups are rows of the BA frame.
    idx1 = np.array((dfs['ba']['class'] == first_label) & (dfs['rb']['class'] == second_label))
    idx2 = np.array((dfs['ba']['class'] == second_label) & (dfs['rb']['class'] == first_label))
    subdf1 = dfs['ba'][idx1]
    subdf2 = dfs['ba'][idx2]

    # Downsample both groups to the size of the smaller one. Slicing a
    # permutation is a draw without replacement that also works when a group
    # is empty.
    nbr = min(len(subdf1), len(subdf2))
    subdf1 = subdf1.iloc[rng.permutation(len(subdf1))[:nbr]].reset_index(drop=True)
    subdf2 = subdf2.iloc[rng.permutation(len(subdf2))[:nbr]].reset_index(drop=True)

    # First entry is the group size; (re)starting the lists here keeps the
    # cell safe to re-run instead of appending a second set of counts.
    df_json[cl] = [nbr]
    df_json[reverse_cl] = [nbr]

    # Then one count per location, in the order of `countries`.
    for c in countries:
        df_json[cl].append(int((subdf1['location'] == c).sum()))
        df_json[reverse_cl].append(int((subdf2['location'] == c).sum()))

In [54]:
# Prepend the label 'Total' to the location names and store the result as the
# 'countries' entry of df_json.
countries = np.insert(arr=countries, obj=0, values='Total')
df_json.update({'countries': countries})


In [55]:
# Turn the dict of equally long lists into a table: one column per key.
df_countries = pd.DataFrame(df_json)

In [58]:
# Write the location table to disk without the row index.
countries_csv_path = data_folder + 'tmp/countries_5.csv'
df_countries.to_csv(countries_csv_path, index=False)