In [1]:
import os
# Move the working directory two levels up.
# NOTE(review): presumably done so that the `classes.helpers` import and the
# relative '../data/' paths used further down resolve — confirm against the repo layout.
os.chdir('../..')

In [35]:
import numpy as np
import json
import pandas as pd
from datetime import datetime
import time

from classes.helpers import *

# For the Python notebook
%matplotlib inline
%reload_ext autoreload
%autoreload 2

# Base directory, relative to the current working directory, used to build data file paths below.
data_folder = '../data/'

# Compute yearly z-score

In [6]:
# Load the matched-beers table; the two header rows produce (site, field) columns.
df = pd.read_csv(data_folder + 'matched/beers.csv', header=[0,1])

# Beer ids listed in the matched table, one array per site.
beers_ids = {site: np.array(df[site]['beer_id']) for site in ('ba', 'rb')}

In [21]:
# Collect, per site and per calendar year, the ratings given to matched beers:
# ratings_year[site][year] -> list of float ratings.
ratings_year = {'ba': {}, 'rb': {}}

for key in ratings_year:
    print('Parsing {} reviews.'.format(key.upper()))

    # Build the lookup set once per site. `x in ndarray` is a linear scan that
    # would be repeated for every rating; a set gives constant-time membership.
    matched_ids = set(beers_ids[key].tolist())

    gen = parse(data_folder + key + '/ratings.txt.gz')

    for item in gen:
        beer_id = int(item['beer_id'])

        # Only ratings of matched beers contribute to the yearly statistics.
        if beer_id not in matched_ids:
            continue

        # fromtimestamp() interprets the timestamp in the machine's local time zone.
        year = datetime.fromtimestamp(int(item['date'])).year

        ratings_year[key].setdefault(year, []).append(float(item['rating']))

Parsing BA reviews.
Parsing RB reviews.


In [24]:
# Per-site, per-year normalisation parameters: mean and standard deviation of the ratings.
z_score_params = {}
for site, ratings_by_year in ratings_year.items():
    site_params = {}
    for yr, values in ratings_by_year.items():
        average = np.mean(values)
        spread = np.std(values)
        # A zero spread cannot be used as a divisor in the z-score; fall back to 1.
        site_params[yr] = {'mean': average, 'std': 1 if spread == 0 else spread}
    z_score_params[site] = site_params

In [29]:
# Register neutral parameters (mean 0, std 1) for the BA years 1996 and 1997, so
# ratings from those years pass through the z-score formula unchanged.
# NOTE(review): presumably these years have no ratings among the matched beers — confirm.
for missing_year in (1996, 1997):
    z_score_params['ba'][missing_year] = {'mean': 0, 'std': 1}

In [31]:
# Persist the parameters. JSON object keys are always strings, so the integer
# year keys are written out as text.
params_path = data_folder + 'tmp/z_score_params_matched.json'
with open(params_path, 'w') as out_file:
    json.dump(z_score_params, out_file)

# Add z-score to beers

In [32]:
# Reload the saved parameters from disk; the year keys come back as strings.
with open('../data/tmp/z_score_params_matched.json') as params_file:
    z_score_params = json.load(params_file)

In [36]:
# Z-score every rating with the parameters of its site and year, grouped by beer:
# beers_zscore[site][beer_id] -> list of z-scored ratings.
beers_zscore = {}
for key in ['ba', 'rb']:

    site_scores = {}
    beers_zscore[key] = site_scores

    print('Parse ratings from {}'.format(key.upper()))

    # Stream the ratings one by one from the generator
    for item in parse('../data/{}/ratings.txt.gz'.format(key)):
        beer_id = int(item['beer_id'])
        scores = site_scores.setdefault(beer_id, [])

        rat = float(item['rating'])

        # Four-digit year in local time, as a string: the JSON-loaded parameters
        # are keyed by string years.
        year = time.strftime('%Y', time.localtime(int(item['date'])))
        params = z_score_params[key][year]

        scores.append((rat - params['mean']) / (params['std']))

Parse ratings from BA
Parse ratings from RB


In [39]:
# Attach each beer's mean z-scored rating to the per-site beers table and save it.
for key in ['ba', 'rb']:
    beers_path = '../data/{}/beers.csv'.format(key)
    df = pd.read_csv(beers_path)

    site_scores = beers_zscore[key]

    # Iterate over the id column directly instead of `df.iloc[i]` with index
    # labels, which is slow and only correct for a default RangeIndex.
    # Beers without any parsed rating get NaN. The explicit membership test
    # (rather than catching KeyError) means a missing 'beer_id' column now
    # raises instead of silently producing an all-NaN column.
    ratings = [
        np.mean(site_scores[beer_id]) if beer_id in site_scores else np.nan
        for beer_id in map(int, df['beer_id'])
    ]

    # Sanity check: average z-score over the beers that have ratings.
    print(np.nanmean(ratings))

    df['zscore'] = ratings

    # The input file is overwritten in place with the extra 'zscore' column.
    df.to_csv(beers_path, index=False)

-0.390539303533
-0.302215344173
