# Time Series

We create the time series for the matched beers in this notebook


In [1]:
import os
os.chdir('../..')

In [22]:
from classes.helpers import *
import numpy as np
import json
import pandas as pd
from datetime import datetime
import scipy.stats as ss

data_folder = '../data/'

# For the Python notebook
%matplotlib inline
%reload_ext autoreload
%autoreload 2

nbr_rats = 5

In [12]:
beers = pd.read_csv(data_folder + 'matched/beers.csv', header=[0,1])

In [13]:
beers = beers[(beers['ba']['nbr_ratings'] >= nbr_rats) & (beers['rb']['nbr_ratings'] >= nbr_rats)]
beers.index = range(len(beers))

In [14]:
ratings = {'ba': {}, 'rb': {}}

for key in ratings.keys():
    print('Parse {} ratings'.format(key.upper()))
    gen = parse(data_folder + 'matched/ratings_{}.txt.gz'.format(key))
    
    for item in gen:
        beer_id = item['beer_id']
        rating = item['rating']
        date = item['date']
        
        if beer_id not in ratings[key].keys():
            ratings[key][beer_id] = {'date': [], 'rating': []}
            
        ratings[key][beer_id]['date'].append(int(date))
        ratings[key][beer_id]['rating'].append(float(rating))

Parse BA ratings
Parse RB ratings


In [15]:
def flatten(l):
    """
    Flatten a list of lists
    :param l: List of lists
    :return: flattened list
    """
    try:
        return [item for sublist in l for item in sublist]
    except TypeError:
        return l

In [16]:
global_average = {'ba': {'rating': 0, 'z_score': 0, 'std': 0},
                  'rb': {'rating': 0, 'z_score': 0, 'std': 0}}

with open(data_folder + 'tmp/z_score_params_matched_ratings.json') as file:
    z = json.load(file)

for key in ratings.keys():
    all_ratings = []
    all_z_score = []
    nbr = 0
    for usr in ratings[key].keys():
        rats = ratings[key][usr]['rating']
        dates = ratings[key][usr]['date']
        
        years = [str(datetime.fromtimestamp(d).year) for d in dates]
        z_scores = [(r-z[key][y]['mean'])/z[key][y]['std'] for r,y in zip(rats, years)]
        
        all_ratings.append(rats)
        all_z_score.append(z_scores)
        
    all_ratings = flatten(all_ratings)
    all_z_score = flatten(all_z_score)
    
    global_average[key]['std'] = np.std(all_ratings)
    global_average[key]['rating'] = np.mean(all_ratings)
    global_average[key]['z_score'] = np.mean(all_z_score)
    
with open(data_folder + 'tmp/global_averages.json', 'w') as file:
    json.dump(global_average, file)

In [17]:
df_json = {'ba': {'beer_id': [], 'dates': [], 'ratings': [], 'z_scores': [], 'avg_ratings': [], 'avg_z_scores': []},
           'rb': {'beer_id': [], 'dates': [], 'ratings': [], 'z_scores': [], 'avg_ratings': [], 'avg_z_scores': []}}

for i in beers.index:
    row = beers.iloc[i]
    
    for key in ['ba', 'rb']:
        df_json[key]['beer_id'].append(row[key]['beer_id'])
        
        ratings_user = ratings[key][str(row[key]['beer_id'])]
        
        dates = ratings_user['date'][::-1]
        rats = ratings_user['rating'][::-1]
        
        years = [str(datetime.fromtimestamp(d).year) for d in dates]
        z_scores = [(r-z[key][y]['mean'])/z[key][y]['std'] for r,y in zip(rats, years)]
        
        dates = np.array(dates)
        rats = np.array(rats)
        z_scores = np.array(z_scores)
        
        idx = np.argsort(dates)
        dates = dates[idx]
        rats = rats[idx]
        z_scores = z_scores[idx]
        
        df_json[key]['dates'].append(list(dates))
        df_json[key]['ratings'].append(list(rats))
        df_json[key]['z_scores'].append(list(z_scores))
        df_json[key]['avg_ratings'].append(np.mean(rats))
        df_json[key]['avg_z_scores'].append(np.mean(z_scores))

In [18]:
df = pd.DataFrame.from_dict({(i, j): df_json[i][j] 
                             for i in df_json.keys() 
                             for j in df_json[i].keys()})

In [19]:
df.to_csv(data_folder + 'tmp/time_series_{}.csv'.format(nbr_rats), index=False)

## Add ranks

In [20]:
with open(data_folder + 'tmp/global_averages.json') as file:
    global_avg = json.load(file)

diffs = {'ba': [], 'rb': []}
for i in df.index:
    row = df.loc[i]
    for key in ['rb', 'ba']:
        diffs[key].append(row[key]['z_scores'][0] - global_avg[key]['z_score'])
        
thresholds = {}

for key in ['ba', 'rb']:
    thresholds[key] = {}
    thresholds[key]['low'] = np.percentile(diffs[key], 15)
    thresholds[key]['high'] = np.percentile(diffs[key], 85)

In [23]:
for key in ['ba', 'rb']:
    tmp = []
    nbr_rats = []
    
    for i in df.index:      
        if diffs[key][i] > thresholds[key]['high']:
            tmp.append('H')
        elif diffs[key][i] >= thresholds[key]['low']:
            tmp.append('M')
        else:
            tmp.append('L')
            
        nbr_rats.append(len(df.loc[i][key]['ratings']))

    df.loc[:, ((key, 'class'))] = tmp
    df.loc[:, ((key, 'nbr_ratings'))] = nbr_rats

    rank = ss.rankdata(-df[key]['avg_z_scores'], method='min')
    rank = 1-(rank-1)/(len(rank)-1)
        
    df.loc[:, ((key, 'rank_avg'))] = rank

In [25]:
ranks = {}
for key in ['ba', 'rb']:
    ranks[key] = []
    for i in df.index:
        ranks[key].append([])
    
    for i in range(100):
        tmp = []
        
        subdf = df[(df['ba']['nbr_ratings'] >= i+1) & (df['rb']['nbr_ratings'] >= i+1)]
        tmp = [subdf.loc[j][key]['z_scores'][i] for j in subdf.index]
            
        tmp = np.array(tmp)
        rank = ss.rankdata(-tmp, method='min')
        rank = 1-(rank-1)/(len(rank)-1)
        
        
        for idx, j in enumerate(subdf.index):
            ranks[key][j].append(rank[idx])
            
    df.loc[:, ((key, 'ranks'.format(i+1)))] = ranks[key]

In [26]:
df.to_csv(data_folder + 'tmp/time_series_{}_ranks.csv'.format(min_nbr_rats), index=False)

NameError: name 'min_nbr_rats' is not defined