In [1]:
import pandas as pd

In [2]:
import os
import matplotlib.pyplot as plt
import time

# Project root: two levels above the current working directory.
parentDirectory = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))

# Both directory constants keep a trailing slash because later cells build
# file paths by appending a file name to them.
DATA_DIR = parentDirectory + '/data/'
FIGURES_DIR = parentDirectory + '/figures/'

In [3]:
# Country-code lookup table read from the data directory.
df_codes = pd.read_csv(os.path.join(DATA_DIR, "country_codes_filtered.csv"))

In [4]:
# Two-letter country codes to process; the order here drives the loops
# that iterate over selected_codes.
selected_codes = [
    'FR', 'DE', 'US', 'IT', 'CA',
    'GB', 'ES', 'AU', 'NG', 'KE',
    'JP', 'MX', 'BR', 'IN', 'DK',
]

In [5]:
# Query definitions: one dict per search topic with its knowledge-graph id
# ('kg_id'), display name ('name') and mode label ('type').
# Each tuple below is (kg_id, name, type); list order is preserved.
dict_queries = [
    {'kg_id': topic_id, 'name': topic_name, 'type': topic_mode}
    for topic_id, topic_name, topic_mode in (
        ('/m/0p57p', 'Recipe', 'Mode 1'),
        ('/m/0dv34', 'Baking', 'Mode 1'),
        ('/m/01mtb', 'Cooking', 'Mode 1'),
        ('/m/01jpn4', 'Grocery store', 'Mode 1'),
        ('/m/0dnkv', 'Supermarket', 'Mode 1'),
        ('/g/1224tf85', 'Food delivery', 'Mode 2'),
        ('/m/01w53b', 'Take-out', 'Mode 2'),
        ('/m/01t40w', 'Drive-in', 'Mode 2'),
        ('/m/06l8d', 'Restaurant', 'Mode 3'),
        ('/m/02pdnx', 'Cafeteria', 'Mode 3'),
        ('/m/01nq30', 'Diner', 'Mode 3'),
        ('/m/020fb2', 'Cafe', 'Mode 3'),
        ('/m/01kt56', 'Picnic', 'Mode 4'),
        ('/m/0983v', 'Barbecue', 'Mode 4'),
        ('/m/052qxz', 'Food festival', 'Mode 3'),
        ('/m/02lfqj', 'Lunchbox', 'Mode 4'),
    )
]

In [6]:
# Tabulate the query definitions (columns: kg_id, name, type) and preview them.
df_query = pd.DataFrame(data=dict_queries)
df_query.head(n=20)

Unnamed: 0,kg_id,name,type
0,/m/0p57p,Recipe,Mode 1
1,/m/0dv34,Baking,Mode 1
2,/m/01mtb,Cooking,Mode 1
3,/m/01jpn4,Grocery store,Mode 1
4,/m/0dnkv,Supermarket,Mode 1
5,/g/1224tf85,Food delivery,Mode 2
6,/m/01w53b,Take-out,Mode 2
7,/m/01t40w,Drive-in,Mode 2
8,/m/06l8d,Restaurant,Mode 3
9,/m/02pdnx,Cafeteria,Mode 3


In [15]:
# Collect one record per (country, query) result file found on disk.
# Each record carries the country code, the query name, its knowledge-graph id
# ("mid"), its mode label ("category") and the file contents as a nested dict
# ("ts": column name -> {date -> value}).
list_entries = []

for country_code in selected_codes:
    print(country_code)
    for _, row in df_query.iterrows():
        name = row['name']
        kg_id = row['kg_id']

        # Build the path once; it is needed for the existence check and the read.
        path = "results_modes/{}_{}.tsv".format(country_code, name)

        # Not every (country, query) pair has a result file; skip absent ones.
        if not os.path.exists(path):
            continue

        try:
            # NOTE(review): the files carry a .tsv extension but are parsed with
            # read_csv's default comma separator. The recorded output shows
            # properly parsed columns, so they are presumably comma-delimited;
            # confirm before adding sep='\t'.
            ts = pd.read_csv(path).set_index('date').to_dict()
        except KeyError as e:
            # set_index raises KeyError when the file has no 'date' column.
            # Keep the best-effort skip, but report it so a dropped series is
            # visible instead of silently vanishing from the dataset.
            print("  skipped {}: missing column {}".format(path, e))
            continue

        list_entries.append({
            "country_code": country_code,
            "name": name,
            "mid": kg_id,
            "category": row['type'],
            "ts": ts,
        })

        

FR
DE
US
IT
CA
GB
ES
AU
NG
KE
JP
MX
BR
IN
DK


In [16]:
# One row per loaded (country, query) record; the 'ts' column holds each
# record's nested dict of time series.
df = pd.DataFrame(list_entries)

In [17]:
# Number of (country, query) records that were loaded.
df.shape[0]

239

In [19]:
# Display the fine-grained frame (country_code, name, mid, category, ts).
df

Unnamed: 0,country_code,name,mid,category,ts
0,FR,Recipe,/m/0p57p,Mode 1,{'max_ratio': {'2019-01-06': 2014.120062354311...
1,FR,Baking,/m/0dv34,Mode 1,{'max_ratio': {'2019-01-06': 333.3592494314424...
2,FR,Cooking,/m/01mtb,Mode 1,"{'max_ratio': {'2019-01-06': 380.98199935022, ..."
3,FR,Grocery store,/m/01jpn4,Mode 1,{'max_ratio': {'2019-01-06': 8.434547908232119...
4,FR,Supermarket,/m/0dnkv,Mode 1,{'max_ratio': {'2019-01-06': 49.06493371142087...
...,...,...,...,...,...
234,DK,Cafe,/m/020fb2,Mode 3,{'max_ratio': {'2019-01-06': 31.89493433395873...
235,DK,Picnic,/m/01kt56,Mode 4,{'max_ratio': {'2019-01-06': 0.292682926829268...
236,DK,Barbecue,/m/0983v,Mode 4,{'max_ratio': {'2019-01-06': 2.700348432055750...
237,DK,Food festival,/m/052qxz,Mode 3,"{'max_ratio': {'2019-01-06': 0.0, '2019-01-13'..."


In [20]:
# Persist the fine-grained (per country, per query) frame.
df.to_parquet(os.path.join(DATA_DIR, 'modes_fine.parquet'))

### Generate aggregated dataframe

In [21]:
# Distinct mode labels present in df, in order of first appearance.
categories = df['category'].unique().tolist()
len(categories)

4

In [22]:
# For every (country, category) pair, add up the time series of all queries
# in that category. Each stored value is the result of summing the per-query
# series date by date.
entry_list = []

# (output column, key inside each row's 'ts' dict)
series_columns = (
    ('volume_total', 'max_ratio'),
    ('volume_total_l', 'max_ratio_lo'),
    ('volume_total_h', 'max_ratio_hi'),
)

for country_code in selected_codes:
    in_country = df['country_code'] == country_code

    for category in categories:
        df_temp = df.loc[in_country & (df['category'] == category)]

        entry = {'country': country_code, 'category': category}
        for column, ts_key in series_columns:
            # Bind ts_key as a default so each lambda reads its own key.
            entry[column] = df_temp['ts'].apply(
                lambda ts, ts_key=ts_key: pd.Series(ts[ts_key])
            ).sum()

        entry_list.append(entry)

In [23]:
# One row per (country, category) with the summed volume columns built in entry_list.
df_agg = pd.DataFrame(entry_list)

In [24]:
# Display the per-(country, category) totals.
df_agg

Unnamed: 0,country,category,volume_total,volume_total_l,volume_total_h
0,FR,Mode 1,2019-01-06 2785.960793 2019-01-13 2773.5...,2019-01-06 2458.715557 2019-01-13 2448.5...,2019-01-06 3160.899878 2019-01-13 3145.9...
1,FR,Mode 2,2019-01-06 3.640363 2019-01-13 3.91838...,2019-01-06 3.399005 2019-01-13 3.66225...,2019-01-06 3.920400 2019-01-13 4.19104...
2,FR,Mode 3,2019-01-06 1422.335594 2019-01-13 1525.7...,2019-01-06 1257.986565 2019-01-13 1350.5...,2019-01-06 1610.285529 2019-01-13 1726.1...
3,FR,Mode 4,2019-01-06 10.452691 2019-01-13 12.73262...,2019-01-06 8.503778 2019-01-13 10.65624...,2019-01-06 12.645970 2019-01-13 15.06377...
4,DE,Mode 1,2019-01-06 639.344263 2019-01-13 624.6...,2019-01-06 577.355549 2019-01-13 563.9...,2019-01-06 708.781324 2019-01-13 692.6...
5,DE,Mode 2,2019-01-06 5.845501 2019-01-13 5.93010...,2019-01-06 5.458692 2019-01-13 5.54840...,2019-01-06 6.257302 2019-01-13 6.33646...
6,DE,Mode 3,2019-01-06 482.106699 2019-01-13 497.390...,2019-01-06 435.653090 2019-01-13 449.706...,2019-01-06 534.205604 2019-01-13 550.841...
7,DE,Mode 4,2019-01-06 6.968641 2019-01-13 8.01393...,2019-01-06 6.452518 2019-01-13 7.45810...,2019-01-06 7.519291 2019-01-13 8.60641...
8,US,Mode 1,2019-01-06 1024.325276 2019-01-13 1035.2...,2019-01-06 919.558903 2019-01-13 929.4...,2019-01-06 1142.404042 2019-01-13 1154.4...
9,US,Mode 2,2019-01-06 28.183634 2019-01-13 26.82762...,2019-01-06 26.311899 2019-01-13 25.01567...,2019-01-06 30.207382 2019-01-13 28.78782...


In [25]:
# Per country, add up the 'volume_total' values of all its categories.
country_totals = [
    {'country': country, 'total': group['volume_total'].sum()}
    for country, group in df_agg.groupby('country')
]

In [26]:
# Turn the list of per-country records into a frame (columns: country, total).
country_totals = pd.DataFrame(country_totals)

In [27]:
# Display the per-country totals.
country_totals

Unnamed: 0,country,total
0,AU,2019-01-06 1662.694431 2019-01-13 1543.3...
1,BR,2019-01-06 7523.741768 2019-01-13 7406...
2,CA,2019-01-06 1494.861051 2019-01-13 1495.4...
3,DE,2019-01-06 1134.265104 2019-01-13 1135.9...
4,DK,2019-01-06 558.532346 2019-01-13 538.982...
5,ES,2019-01-06 2526.470255 2019-01-13 2529.1...
6,FR,2019-01-06 4222.389441 2019-01-13 4316.0...
7,GB,2019-01-06 1308.127730 2019-01-13 1291.9...
8,IN,2019-01-06 4515.299582 2019-01-13 4274.7...
9,IT,2019-01-06 1615.021277 2019-01-13 1655.6...


In [28]:
# For every (country, category) pair, build the summed weekly series (point
# estimate plus the lo/hi variants) and the category's share of the country's
# overall volume.
entry_list = []

for country_code in selected_codes:
    in_country = df['country_code'] == country_code

    # The country-wide total does not depend on the category, so look it up
    # once per country rather than once per (country, category) pair.
    country_total = country_totals.loc[
        country_totals['country'] == country_code
    ].iloc[0]['total']

    for category in categories:
        df_temp = df.loc[in_country & (df['category'] == category)]
        ts_column = df_temp['ts']

        # Computed once and reused for both the total and the share below.
        weekly_total = ts_column.apply(lambda x: pd.Series(x['max_ratio'])).sum()

        entry = {}
        entry['country'] = country_code
        entry['category'] = category

        entry['volume_weekly_total'] = weekly_total
        entry['volume_weekly_total_l'] = ts_column.apply(lambda x: pd.Series(x['max_ratio_lo'])).sum()
        entry['volume_weekly_total_h'] = ts_column.apply(lambda x: pd.Series(x['max_ratio_hi'])).sum()

        # Despite "percent" in the key, this is a 0-1 fraction (category total
        # divided by the country total); the key is kept for compatibility.
        entry['volume_percent_weekly_total'] = weekly_total / country_total

        entry_list.append(entry)


In [29]:
# One row per (country, category) with the weekly volume columns and the share column.
df_agg = pd.DataFrame(entry_list)

In [38]:
# Keep only this subset of countries and renumber the rows from 0.
countries_kept = ['FR','DE','US','IT','CA','GB','ES','AU','MX','BR','IN','DK']
df_agg = (
    df_agg[df_agg['country'].isin(countries_kept)]
    .reset_index(drop=True)
)

In [41]:
# Display the filtered per-(country, category) weekly aggregates.
df_agg

Unnamed: 0,country,category,volume_weekly_total,volume_weekly_total_l,volume_weekly_total_h,volume_percent_weekly_total
0,FR,Mode 1,2019-01-06 2785.960793 2019-01-13 2773.5...,2019-01-06 2458.715557 2019-01-13 2448.5...,2019-01-06 3160.899878 2019-01-13 3145.9...,2019-01-06 0.659807 2019-01-13 0.642624 ...
1,FR,Mode 2,2019-01-06 3.640363 2019-01-13 3.91838...,2019-01-06 3.399005 2019-01-13 3.66225...,2019-01-06 3.920400 2019-01-13 4.19104...,2019-01-06 0.000862 2019-01-13 0.000908 ...
2,FR,Mode 3,2019-01-06 1422.335594 2019-01-13 1525.7...,2019-01-06 1257.986565 2019-01-13 1350.5...,2019-01-06 1610.285529 2019-01-13 1726.1...,2019-01-06 0.336856 2019-01-13 0.353518 ...
3,FR,Mode 4,2019-01-06 10.452691 2019-01-13 12.73262...,2019-01-06 8.503778 2019-01-13 10.65624...,2019-01-06 12.645970 2019-01-13 15.06377...,2019-01-06 0.002476 2019-01-13 0.002950 ...
4,DE,Mode 1,2019-01-06 639.344263 2019-01-13 624.6...,2019-01-06 577.355549 2019-01-13 563.9...,2019-01-06 708.781324 2019-01-13 692.6...,2019-01-06 0.563664 2019-01-13 0.549868 ...
5,DE,Mode 2,2019-01-06 5.845501 2019-01-13 5.93010...,2019-01-06 5.458692 2019-01-13 5.54840...,2019-01-06 6.257302 2019-01-13 6.33646...,2019-01-06 0.005154 2019-01-13 0.005220 ...
6,DE,Mode 3,2019-01-06 482.106699 2019-01-13 497.390...,2019-01-06 435.653090 2019-01-13 449.706...,2019-01-06 534.205604 2019-01-13 550.841...,2019-01-06 0.425039 2019-01-13 0.437857 ...
7,DE,Mode 4,2019-01-06 6.968641 2019-01-13 8.01393...,2019-01-06 6.452518 2019-01-13 7.45810...,2019-01-06 7.519291 2019-01-13 8.60641...,2019-01-06 0.006144 2019-01-13 0.007055 ...
8,US,Mode 1,2019-01-06 1024.325276 2019-01-13 1035.2...,2019-01-06 919.558903 2019-01-13 929.4...,2019-01-06 1142.404042 2019-01-13 1154.4...,2019-01-06 0.561272 2019-01-13 0.564239 ...
9,US,Mode 2,2019-01-06 28.183634 2019-01-13 26.82762...,2019-01-06 26.311899 2019-01-13 25.01567...,2019-01-06 30.207382 2019-01-13 28.78782...,2019-01-06 0.015443 2019-01-13 0.014622 ...


In [42]:
# Persist the coarse (per country, per category) aggregates.
df_agg.to_pickle(os.path.join(DATA_DIR, 'modes_coarse.pickle'))