# Time Series

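In this notebook, we build the rating time series (dates, ratings, and z-scores) for the matched beers that have at least 10 ratings in both the `ba` and `rb` datasets, and save them to `tmp/time_series_10.csv`.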


In [1]:
import os
# Move the working directory two levels up.
# NOTE(review): presumably this is what makes the `classes.helpers` import and
# the relative `data_folder` path in the next cell resolve — confirm against
# the repository layout.
# The path is relative, so running this cell a second time moves up two more
# levels; run it once per kernel session.
os.chdir('../..')

In [2]:
# NOTE(review): `parse`, called later in this notebook, is neither defined nor
# imported by name here; presumably it is provided by this star import — confirm.
from classes.helpers import *
import numpy as np
import json
import pandas as pd
from datetime import datetime

# Root folder for every input and output file of this notebook; it is relative
# to the working directory set by the os.chdir call in the first cell.
data_folder = '../data/'

In [3]:
# Load the matched-beers table. Rows 0 and 1 of the CSV are both read as header
# rows, so columns are addressed with two levels, e.g. beers['ba']['nbr_ratings'].
beers = pd.read_csv(
    filepath_or_buffer=data_folder + 'matched/beers.csv',
    header=[0, 1],
)

In [4]:
# Keep only the beers with at least 10 ratings in both datasets, then renumber
# the remaining rows 0..n-1.
has_enough_ba = beers['ba']['nbr_ratings'] >= 10
has_enough_rb = beers['rb']['nbr_ratings'] >= 10
beers = beers[has_enough_ba & has_enough_rb].reset_index(drop=True)

In [5]:
# For each dataset, map beer_id -> {'date': [...], 'rating': [...]}, where the
# two lists are parallel (one entry per parsed rating, in file order).
ratings = {'ba': {}, 'rb': {}}

for key, per_beer in ratings.items():
    print('Parse {} ratings'.format(key.upper()))

    for item in parse(data_folder + key + '/ratings.txt.gz'):
        beer_id, rating, date = item['beer_id'], item['rating'], item['date']

        # Create the per-beer record the first time this beer_id is seen.
        record = per_beer.setdefault(beer_id, {'date': [], 'rating': []})

        # Dates are stored as int and ratings as float.
        record['date'].append(int(date))
        record['rating'].append(float(rating))

Parse BA ratings
Parse RB ratings


In [6]:
def flatten(l):
    """
    Concatenate the elements of every sub-iterable of ``l`` into one list.

    If ``l`` itself, or any of its elements, cannot be iterated over, the
    input is handed back unchanged instead of raising.

    :param l: List of lists
    :return: flattened list, or ``l`` itself when it cannot be flattened
    """
    flat = []
    try:
        for sublist in l:
            flat.extend(sublist)
    except TypeError:
        # Not a list of iterables: fall back to the untouched input.
        return l
    return flat

In [8]:
# Per-dataset summary statistics over all parsed ratings. The zeros are only
# placeholders (same type for both datasets); every entry is overwritten in the
# loop below.
global_average = {site: {'rating': 0, 'z_score': 0, 'std': 0} for site in ('ba', 'rb')}

# Parameters used to turn a raw rating into a z-score, indexed below as
# z[dataset][year as a string]['mean' | 'std'].
with open(data_folder + 'tmp/z_score_params_matched.json') as file:
    z = json.load(file)

for key in ratings.keys():
    all_ratings = []
    all_z_score = []

    # The keys of ratings[key] are beer ids (see the lookup by beer_id when the
    # time series are built), not user ids.
    for beer_id in ratings[key].keys():
        rats = ratings[key][beer_id]['rating']
        dates = ratings[key][beer_id]['date']

        # fromtimestamp() converts using the machine's local time zone, so a
        # rating made close to New Year can fall in a different year on a
        # different machine. NOTE(review): left unchanged because the z-score
        # parameters were presumably computed with the same convention — confirm.
        years = [str(datetime.fromtimestamp(d).year) for d in dates]
        z_scores = [(r - z[key][y]['mean']) / z[key][y]['std'] for r, y in zip(rats, years)]

        # Accumulate flat lists directly instead of building lists of lists
        # and flattening them afterwards.
        all_ratings.extend(rats)
        all_z_score.extend(z_scores)

    # Cast to plain floats so the values are unambiguously JSON-serialisable.
    global_average[key]['std'] = float(np.std(all_ratings))
    global_average[key]['rating'] = float(np.mean(all_ratings))
    global_average[key]['z_score'] = float(np.mean(all_z_score))

with open(data_folder + 'tmp/global_averages.json', 'w') as file:
    json.dump(global_average, file)

In [9]:
# Column-oriented storage: for each dataset, one list per output column, with
# one entry appended per beer.
df_json = {
    site: {'beer_id': [], 'dates': [], 'ratings': [], 'z_scores': [], 'avg_ratings': [], 'avg_z_scores': []}
    for site in ('ba', 'rb')
}

# Iterate by position: .iloc is positional, so this does not depend on the
# index labels of `beers` being exactly 0..n-1.
for i in range(len(beers)):
    row = beers.iloc[i]

    for key in ['ba', 'rb']:
        beer_id = row[key]['beer_id']
        df_json[key]['beer_id'].append(beer_id)

        # The parsed ratings are looked up with the string form of the id.
        # A beer without any parsed rating raises KeyError here.
        ratings_user = ratings[key][str(beer_id)]

        # The stored order is reversed before sorting. With NumPy's default
        # (not guaranteed stable) argsort this can only influence the relative
        # order of ratings that share the same timestamp.
        dates = ratings_user['date'][::-1]
        rats = ratings_user['rating'][::-1]

        # Year of each rating (local time zone of the machine), used to pick
        # the z-score parameters z[key][year].
        years = [str(datetime.fromtimestamp(d).year) for d in dates]
        z_scores = [(r - z[key][y]['mean']) / z[key][y]['std'] for r, y in zip(rats, years)]

        dates = np.array(dates)
        rats = np.array(rats)
        z_scores = np.array(z_scores)

        # Put the three parallel series in chronological order.
        idx = np.argsort(dates)
        dates = dates[idx]
        rats = rats[idx]
        z_scores = z_scores[idx]

        # .tolist() yields plain Python ints/floats. list(array) would keep
        # NumPy scalars, whose repr under NumPy >= 2 looks like `np.int64(5)`;
        # because these lists become cell values that are later written to CSV
        # as text, that repr would end up in the file.
        df_json[key]['dates'].append(dates.tolist())
        df_json[key]['ratings'].append(rats.tolist())
        df_json[key]['z_scores'].append(z_scores.tolist())
        df_json[key]['avg_ratings'].append(np.mean(rats))
        df_json[key]['avg_z_scores'].append(np.mean(z_scores))

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  """


In [10]:
# Flatten the nested {dataset: {field: values}} mapping into a single DataFrame
# whose columns are keyed by (dataset, field) tuples.
df = pd.DataFrame.from_dict({
    (site, field): values
    for site, fields in df_json.items()
    for field, values in fields.items()
})

In [12]:
# Persist the table without the row index.
df.to_csv(path_or_buf=data_folder + 'tmp/time_series_10.csv', index=False)