In [1]:
import pandas as pd
import os
import numpy as np
from scipy.stats import pearsonr
from scipy.stats import spearmanr

In [2]:
# Base directory: two levels above the current working directory.
parentDirectory = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
# Input data and figure locations, kept as plain strings with a trailing slash
# because later cells build file paths by string concatenation.
DATA_DIR = f'{parentDirectory}/data/'
FIGURES_DIR = f'{parentDirectory}/figures/'

def make_stars(val, thresholds=(0.05,)):
    """Return a significance marker for the p-value ``val``.

    One ``'*'`` is produced for every entry of ``thresholds`` that ``val`` is
    strictly below. With the default single threshold the result is ``'*'``
    when ``val < 0.05`` and ``''`` otherwise.

    Args:
        val: p-value to annotate. NaN compares false against every threshold
            and therefore gets no star.
        thresholds: iterable of significance levels. Pass e.g.
            ``(0.05, 0.01, 0.001, 0.0001)`` to get up to four stars.

    Returns:
        A string of zero or more ``'*'`` characters.
    """
    return '*' * sum(1 for threshold in thresholds if val < threshold)

In [3]:
# Load the food time-series table and leave out the rows named 'Bánh mì'.
df = pd.read_parquet(DATA_DIR + 'food_timeseries.parquet')
df = df[df['name'].ne('Bánh mì')]

In [4]:
df.head()

Unnamed: 0,country_code,name,mid,category,ts
0,FR,Cookie,/m/021mn,dessert,{'max_ratio': {'2019-01-06': 54.39198365723229...
1,FR,Coca-Cola,/m/01yvs,soft drink,{'max_ratio': {'2019-01-06': 27.31837379184492...
2,FR,Rice,/m/09759,rice dish,"{'max_ratio': {'2019-01-06': 130.39174164405, ..."
3,FR,Miso,/m/057z_,soup,{'max_ratio': {'2019-01-06': 3.470213996529786...
4,FR,Table salt,/m/05g0_z,spice,{'max_ratio': {'2019-01-06': 85.77197422871245...


In [5]:
df_mobility = pd.read_csv(DATA_DIR+'df_mobility.csv')

In [6]:
# Two-letter country code -> English country name.
# The keys of this mapping also define the set of studied countries: later
# cells keep only rows whose country code is one of them.
full_names = {
    'AU': 'Australia',
    'BR': 'Brazil',
    'CA': 'Canada',
    'FR': 'France',
    'DE': 'Germany',
    'IN': 'India',
    'IT': 'Italy',
    'MX': 'Mexico',
    'ES': 'Spain',
    'GB': 'United Kingdom',
    'US': 'United States',
    'DK': 'Denmark',
    'KE': 'Kenya', 
    'NG': 'Nigeria',
    'JP': 'Japan',
    'SE': 'Sweden',
    'ID': 'Indonesia',
    'EG': 'Egypt'
}

def chunker(seq, size):
    """Split ``seq`` into consecutive slices of at most ``size`` items.

    Args:
        seq: any object supporting ``len()`` and slice indexing (list, string,
            DataFrame, ...).
        size: positive chunk length.

    Returns:
        A generator yielding ``seq[0:size]``, ``seq[size:2*size]``, ...; the
        final slice may be shorter than ``size``.

    Raises:
        ValueError: if ``size`` is smaller than 1. Without this check a
            negative ``size`` would silently yield no chunks at all.
    """
    if size < 1:
        raise ValueError(f'size must be a positive integer, got {size!r}')
    return (seq[start:start + size] for start in range(0, len(seq), size))

# One record per country, written as (country, start_md_1, end_md_1, start_md_2).
# np.nan marks countries without a start_md_2 date.
# NOTE(review): "md" presumably abbreviates a mobility-related event -- confirm
# against the source of these dates.
event_dicts = [
    {'country': country_code, 'end_md_1': end_1, 'start_md_1': start_1, 'start_md_2': start_2}
    for country_code, start_1, end_1, start_2 in (
        ('AU', '2020-03-27', '2020-06-07', np.nan),
        ('BR', '2020-03-23', '2020-08-09', np.nan),
        ('CA', '2020-03-19', '2020-06-21', '2020-10-12'),
        ('DE', '2020-03-21', '2020-05-09', '2020-12-18'),
        ('DK', '2020-03-17', '2020-05-07', np.nan),
        ('EG', '2020-03-24', '2020-07-01', np.nan),
        ('ES', '2020-03-17', '2020-06-14', '2020-11-07'),
        ('FR', '2020-03-18', '2020-06-08', '2020-11-01'),
        ('GB', '2020-03-23', '2020-08-03', '2020-10-21'),
        ('ID', '2020-03-24', '2020-08-10', np.nan),
        ('IN', '2020-03-24', '2020-10-29', np.nan),
        ('IT', '2020-03-11', '2020-06-06', '2020-11-06'),
        ('JP', '2020-04-12', '2020-05-30', np.nan),
        ('KE', '2020-03-24', '2020-10-04', np.nan),
        ('MX', '2020-03-25', '2020-10-06', np.nan),
        ('NG', '2020-03-27', '2020-08-09', np.nan),
        ('SE', '2020-04-03', '2020-04-09', np.nan),
        ('US', '2020-03-21', '2020-06-11', '2020-11-26'),
    )
]

# Tabulate the records and parse the three date columns (np.nan becomes NaT).
df_events = pd.DataFrame(event_dicts).assign(
    start_md_1=lambda frame: pd.to_datetime(frame['start_md_1']),
    end_md_1=lambda frame: pd.to_datetime(frame['end_md_1']),
    start_md_2=lambda frame: pd.to_datetime(frame['start_md_2']),
)

# Load the aggregated table used below to filter countries and read weekly volumes.
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling --
# only load this file when it comes from a trusted source.
df_agg = pd.read_pickle(DATA_DIR+'df_agg_cats.pickle')

In [7]:
df = df.loc[df['country_code'].isin(full_names.keys())]

In [8]:
# Restrict the aggregated table to the studied countries.
df_agg = df_agg[df_agg['country'].isin(full_names.keys())]
# Index labels of the last 46 entries of the first row's weekly volume series.
studied_weeks = list(df_agg.iloc[0]['volume_weekly_total'].index)[-46:]

# country code -> {date of the first row of each 7-row chunk: chunk mean of the
# residential mobility change}. Rows 1..322 of every country are used, i.e. 46
# complete chunks of 7 rows.
# NOTE(review): assumes one mobility row per day per country, in date order -- confirm.
mobility_ts = {
    country_code: {
        week_rows.iloc[0]['date']: week_rows['residential_percent_change_from_baseline'].mean()
        for week_rows in chunker(country_rows.iloc[1:323], 7)
    }
    for country_code, country_rows in df_mobility.groupby('country_region_code')
}

In [9]:
# Correlate, per (entity name, country), the year-over-year relative change in
# search interest with the country's chunked residential-mobility series.
entry_list = []

for category, gr1 in df.groupby('name'):
    for country, gr2 in gr1.groupby('country_code'):
        # Mobility averages for this country, in dict insertion order.
        y = list(mobility_ts[country].values())

        # Interest values of the group's first row, converted once instead of
        # three times.
        ratios = list(gr2.iloc[0]['ts']['max_ratio'].values())
        recent = np.array(ratios[-46:])        # last 46 samples
        reference = np.array(ratios[-98:-52])  # same window shifted back by 52 samples
        # Relative change of `recent` against `reference`; the 0.00001 in the
        # denominator avoids division by zero.
        # NOTE(review): the extra "- 0.00001" in the numerator is kept as-is
        # to preserve results -- confirm it is intended.
        x = (recent - reference - 0.00001) / (reference + 0.00001)

        # Run the rank correlation once and unpack both outputs (the original
        # computed it twice, doubling runtime and emitted warnings).
        corr, p = spearmanr(x, y)

        entry_list.append({
            'name': category,
            'category': gr2.iloc[0]['category'],
            'country': country,
            'corr': corr,
            'p': p,
        })

  c /= stddev[:, None]
  c /= stddev[None, :]
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


In [10]:
df_results = pd.DataFrame(entry_list)

In [11]:
df_results

Unnamed: 0,name,category,country,corr,p
0,'Nduja,sausage,AU,-0.065820,0.663856
1,'Nduja,sausage,BR,0.050153,0.740644
2,'Nduja,sausage,CA,-0.227733,0.127977
3,'Nduja,sausage,DK,-0.013557,0.928745
4,'Nduja,sausage,ES,-0.247847,0.096765
...,...,...,...,...,...
20196,Époisses de Bourgogne,cheese,IT,0.149585,0.321094
20197,Époisses de Bourgogne,cheese,JP,-0.042554,0.778866
20198,Époisses de Bourgogne,cheese,MX,0.112002,0.458654
20199,Époisses de Bourgogne,cheese,SE,-0.429072,0.002926


### We consider only individual entities with non-zero interest in most of the countries (results available for at least 10 countries)

In [12]:
# Collapse the per-(entity, country) results into one row per entity name.
tmp = df_results.groupby('name').apply(lambda x: pd.Series({
    # Mean correlation over the entity's countries (pandas skips NaN by default).
    'corr_avg': x['corr'].mean(), 
    # First category value found for this entity.
    'category': x['category'].unique()[0],
    # True when the entity has result rows for at least 10 countries.
    'exists_most_countries': len(x)>=10,
    # The remaining fields keep the per-country values nested inside the row.
    'country': x['country'], 
    'corr': x['corr'],
    'p': x['p'],
    'all': x[['country','corr','p']]
    })).reset_index()
# Keep only the entities that pass the country-coverage flag.
tmp = tmp.loc[tmp['exists_most_countries']]

In [13]:
tmp.sort_values(by = 'corr_avg', ascending =  False).head(10)[['name','corr_avg','category']]

Unnamed: 0,name,corr_avg,category
890,Pancake,0.606012,dessert
71,Baking powder,0.583275,pastry and bakery product
163,Bread (Literature Subject),0.582979,bread and flatbread
70,Baker's yeast,0.580545,pastry and bakery product
352,Cookie,0.56894,dessert
306,Chocolate brownie,0.56384,dessert
289,Chicken meat,0.550372,chicken dish
307,Chocolate cake,0.545589,dessert
124,Biscuit,0.535692,dessert
915,Pasta,0.53396,"pasta, pizza and noodle dish"


In [14]:
tmp.sort_values(by = 'corr_avg').head(10)[['name','corr_avg','category']]

Unnamed: 0,name,corr_avg,category
1260,Tapas,-0.337654,snack
447,Energy drink,-0.271715,soft drink
676,Korean barbecue,-0.262708,beef dish
225,Campanelle,-0.221799,"pasta, pizza and noodle dish"
1238,Sushi,-0.194677,rice dish
515,Gelato,-0.190158,dessert
282,Chewing gum,-0.174086,dessert
1040,Ramen,-0.173289,soup
12,Agnolotti,-0.173056,"pasta, pizza and noodle dish"
189,Burrata,-0.163816,cheese


In [15]:
# Names of the ten entities with the highest average correlation.
top_entities = tmp.sort_values(by='corr_avg', ascending=False).head(10)['name'].values

In [16]:
top_entities

array(['Pancake', 'Baking powder', 'Bread (Literature Subject)',
       "Baker's yeast", 'Cookie', 'Chocolate brownie', 'Chicken meat',
       'Chocolate cake', 'Biscuit', 'Pasta'], dtype=object)

In [17]:
# Print, for every selected entity, its per-country correlations (sorted by
# country code) as LaTeX table cells, with a star on significant values.
for entity in top_entities:
    entity_rows = df_results[df_results['name'] == entity].sort_values(by='country')
    n_rows = len(entity_rows)
    print(n_rows)

    # Studied countries for which this entity has no result row.
    present_codes = entity_rows['country'].values
    missing = [code for code in full_names.keys() if code not in present_codes]
    # 18 is the number of entries in full_names.
    if n_rows < 18:
        print(missing)

    print(entity + ' (' + entity_rows.iloc[0]['category'] + ') & corr avg &')

    for _, result in entity_rows.iterrows():
        print(str(round(result['corr'], 2)) + make_stars(result['p']) + ' &')

    print('----------------------------------')
    print('\n')
    


18
Pancake (dessert) & corr avg &
0.73* &
0.87* &
0.62* &
0.64* &
0.53* &
0.47* &
0.67* &
0.61* &
0.56* &
0.54* &
0.74* &
0.8* &
0.12 &
0.59* &
0.72* &
0.58* &
0.53* &
0.62* &
----------------------------------


18
Baking powder (pastry and bakery product) & corr avg &
0.61* &
0.73* &
0.76* &
0.6* &
0.16 &
0.33* &
0.57* &
0.61* &
0.82* &
0.76* &
0.91* &
0.81* &
0.61* &
0.3* &
0.7* &
0.29* &
0.31* &
0.63* &
----------------------------------


18
Bread (Literature Subject) (bread and flatbread) & corr avg &
0.81* &
0.74* &
0.69* &
0.5* &
0.37* &
0.58* &
0.54* &
0.58* &
0.77* &
0.63* &
0.92* &
0.82* &
-0.14 &
0.57* &
0.56* &
0.56* &
0.29 &
0.71* &
----------------------------------


17
['MX']
Baker's yeast (pastry and bakery product) & corr avg &
0.79* &
0.84* &
0.71* &
0.56* &
0.33* &
0.24 &
0.68* &
0.7* &
0.78* &
0.56* &
0.92* &
0.54* &
0.35* &
0.5* &
0.54* &
0.05 &
0.78* &
----------------------------------


17
['MX']
Cookie (dessert) & corr avg &
0.67* &
0.83* &
0.68* &
0.51* &
0.

In [18]:
# Print the label and the rounded average correlation of the ten entities with
# the highest average, as LaTeX table cells.
for _, entity_row in (
    tmp.sort_values(by='corr_avg', ascending=False)
       .head(10)[['name', 'corr_avg', 'country', 'corr', 'p', 'category']]
       .iterrows()
):
    label = entity_row['name'] + ' (' + entity_row['category'] + ') & '
    print(label)
    print(round(entity_row['corr_avg'], 2), ' & ')
    

    

Pancake (dessert) & 
0.61  & 
Baking powder (pastry and bakery product) & 
0.58  & 
Bread (Literature Subject) (bread and flatbread) & 
0.58  & 
Baker's yeast (pastry and bakery product) & 
0.58  & 
Cookie (dessert) & 
0.57  & 
Chocolate brownie (dessert) & 
0.56  & 
Chicken meat (chicken dish) & 
0.55  & 
Chocolate cake (dessert) & 
0.55  & 
Biscuit (dessert) & 
0.54  & 
Pasta (pasta, pizza and noodle dish) & 
0.53  & 


In [19]:
tmp.sort_values(by = 'corr_avg', ascending =  True).head(10)[['name','corr_avg','category']]

Unnamed: 0,name,corr_avg,category
1260,Tapas,-0.337654,snack
447,Energy drink,-0.271715,soft drink
676,Korean barbecue,-0.262708,beef dish
225,Campanelle,-0.221799,"pasta, pizza and noodle dish"
1238,Sushi,-0.194677,rice dish
515,Gelato,-0.190158,dessert
282,Chewing gum,-0.174086,dessert
1040,Ramen,-0.173289,soup
12,Agnolotti,-0.173056,"pasta, pizza and noodle dish"
189,Burrata,-0.163816,cheese


In [20]:
# Names of the ten entities with the lowest average correlation.
# Note: this rebinds `top_entities`, which an earlier cell set to the highest-ranked names.
top_entities = tmp.sort_values(by='corr_avg').head(10)['name'].values

In [21]:
top_entities

array(['Tapas', 'Energy drink', 'Korean barbecue', 'Campanelle', 'Sushi',
       'Gelato', 'Chewing gum', 'Ramen', 'Agnolotti', 'Burrata'],
      dtype=object)

In [22]:
# Print, for every selected entity, its per-country correlations (sorted by
# country code) as LaTeX table cells, with a star on significant values.
for entity in top_entities:
    entity_rows = df_results[df_results['name'] == entity].sort_values(by='country')
    n_rows = len(entity_rows)
    print(n_rows)

    # Studied countries for which this entity has no result row.
    present_codes = entity_rows['country'].values
    missing = [code for code in full_names.keys() if code not in present_codes]
    # 18 is the number of entries in full_names.
    if n_rows < 18:
        print(missing)

    print(entity + ' (' + entity_rows.iloc[0]['category'] + ') & corr avg &')

    for _, result in entity_rows.iterrows():
        print(str(round(result['corr'], 2)) + make_stars(result['p']) + ' &')

    print('----------------------------------')
    print('\n')
    

18
Tapas (snack) & corr avg &
-0.51* &
-0.05 &
-0.49* &
-0.76* &
-0.45* &
0.02 &
-0.89* &
-0.83* &
-0.77* &
-0.38* &
0.09 &
-0.18 &
-0.11 &
-0.24 &
0.41* &
0.24 &
-0.46* &
-0.7* &
----------------------------------


17
['MX']
Energy drink (soft drink) & corr avg &
0.02 &
-0.71* &
-0.32* &
-0.22 &
-0.2 &
-0.14 &
-0.42* &
-0.13 &
-0.46* &
-0.13 &
-0.48* &
-0.24 &
-0.08 &
-0.22 &
-0.08 &
-0.29* &
-0.52* &
----------------------------------


16
['KE', 'NG']
Korean barbecue (beef dish) & corr avg &
-0.62* &
0.02 &
-0.69* &
-0.5* &
-0.17 &
0.05 &
-0.17 &
-0.47* &
-0.6* &
-0.48* &
0.09 &
-0.06 &
-0.02 &
0.03 &
0.12 &
-0.73* &
----------------------------------


12
['DE', 'IN', 'KE', 'NG', 'JP', 'EG']
Campanelle (pasta, pizza and noodle dish) & corr avg &
-0.19 &
-0.22 &
-0.1 &
nan &
-0.09 &
-0.39* &
-0.48* &
-0.18 &
-0.14 &
-0.22 &
-0.22 &
-0.23 &
----------------------------------


18
Sushi (rice dish) & corr avg &
-0.33* &
-0.1 &
-0.36* &
-0.43* &
-0.15 &
-0.02 &
0.02 &
-0.07 &
-0.12 &


In [23]:
# Print the label and the rounded average correlation of the ten entities with
# the lowest average, as LaTeX table cells.
for _, entity_row in (
    tmp.sort_values(by='corr_avg')
       .head(10)[['name', 'corr_avg', 'country', 'corr', 'p', 'category']]
       .iterrows()
):
    label = entity_row['name'] + ' (' + entity_row['category'] + ') & '
    print(label)
    print(round(entity_row['corr_avg'], 2), ' & ')

Tapas (snack) & 
-0.34  & 
Energy drink (soft drink) & 
-0.27  & 
Korean barbecue (beef dish) & 
-0.26  & 
Campanelle (pasta, pizza and noodle dish) & 
-0.22  & 
Sushi (rice dish) & 
-0.19  & 
Gelato (dessert) & 
-0.19  & 
Chewing gum (dessert) & 
-0.17  & 
Ramen (soup) & 
-0.17  & 
Agnolotti (pasta, pizza and noodle dish) & 
-0.17  & 
Burrata (cheese) & 
-0.16  & 


In [24]:
# Load the "modes" time series and keep only the studied countries.
# NOTE: this rebinds `df`, which held the food time series until here; cells
# above must be re-run from the top before they are executed again.
df = pd.read_parquet(DATA_DIR+'modes_timeseries.parquet')
df = df.loc[df['country_code'].isin(full_names.keys())]

In [25]:
# Correlate, per (mode name, country), the year-over-year relative change in
# search interest with the country's chunked residential-mobility series.
entry_list = []

for category, gr1 in df.groupby('name'):
    for country, gr2 in gr1.groupby('country_code'):
        # Mobility averages for this country, in dict insertion order.
        y = list(mobility_ts[country].values())

        # Interest values of the group's first row, converted once instead of
        # three times.
        ratios = list(gr2.iloc[0]['ts']['max_ratio'].values())
        recent = np.array(ratios[-46:])        # last 46 samples
        reference = np.array(ratios[-98:-52])  # same window shifted back by 52 samples
        # Relative change of `recent` against `reference`; the 0.00001 in the
        # denominator avoids division by zero.
        # NOTE(review): the extra "- 0.00001" in the numerator is kept as-is
        # to preserve results -- confirm it is intended.
        x = (recent - reference - 0.00001) / (reference + 0.00001)

        # Run the rank correlation once and unpack both outputs (the original
        # computed it twice, doubling runtime and emitted warnings).
        corr, p = spearmanr(x, y)

        entry_list.append({
            'name': category,
            'category': gr2.iloc[0]['category'],
            'country': country,
            'corr': corr,
            'p': p,
        })

In [26]:
df_results_modes = pd.DataFrame(entry_list)

In [27]:
# Collapse the per-(mode, country) results into one row per mode name.
# NOTE: this rebinds `tmp`, which previously held the per-entity food summary.
tmp = df_results_modes.groupby('name').apply(lambda x: pd.Series({
    # Mean correlation over the mode's countries (pandas skips NaN by default).
    'corr_avg': x['corr'].mean(), 
    # First category value found for this mode.
    'category': x['category'].unique()[0],
    # Distinct country codes with a result for this mode.
    'C': x['country'].unique(),
    # Per-country results kept nested inside the row; printed by a later cell.
    'all': x[['country','corr','p']]
    })).reset_index()


In [28]:
tmp.sort_values(by = 'corr_avg', ascending =  False)[['name','corr_avg','category']]

Unnamed: 0,name,corr_avg,category
12,Recipe,0.700203,Mode 1
0,Baking,0.621366,Mode 1
4,Cooking,0.575846,Mode 1
15,Take-out,0.496731,Mode 2
9,Grocery store,0.28486,Mode 1
14,Supermarket,0.170706,Mode 1
1,Barbecue,0.108432,Mode 4
7,Food delivery,0.085942,Mode 2
6,Drive-in,0.000244,Mode 2
11,Picnic,-0.177622,Mode 4


## Print correlations in LaTeX formatting

In [29]:
# Print every mode (highest average correlation first) followed by its
# per-country correlations as LaTeX table cells, starring significant values.
for _, mode in tmp.sort_values(by='corr_avg', ascending=False)[
        ['name', 'corr_avg', 'category', 'all']].iterrows():
    print(mode['name'])
    print(mode['category'])
    print(round(mode['corr_avg'], 2))

    # 'Lunchbox' is limited to its first 17 per-country rows; every other
    # mode prints all of its rows.
    per_country = mode['all'].iloc[:17] if mode['name'] == 'Lunchbox' else mode['all']
    for _, result in per_country.iterrows():
        print('&', str(round(result['corr'], 2)) + make_stars(result['p']))

    print('----------------------------------')
    print('\n')

Recipe
Mode 1
0.7
& 0.88*
& 0.87*
& 0.73*
& 0.74*
& 0.73*
& 0.45*
& 0.81*
& 0.72*
& 0.85*
& 0.76*
& 0.82*
& 0.85*
& 0.04
& 0.66*
& 0.95*
& 0.47*
& 0.61*
& 0.68*
----------------------------------


Baking
Mode 1
0.62
& 0.88*
& 0.8*
& 0.71*
& 0.55*
& 0.28
& 0.41*
& 0.64*
& 0.43*
& 0.88*
& 0.71*
& 0.91*
& 0.46*
& 0.17
& 0.77*
& 0.79*
& 0.47*
& 0.55*
& 0.77*
----------------------------------


Cooking
Mode 1
0.58
& 0.75*
& 0.67*
& 0.6*
& 0.39*
& 0.23
& 0.08
& 0.87*
& 0.61*
& 0.88*
& 0.63*
& 0.84*
& 0.75*
& 0.26
& 0.66*
& 0.83*
& 0.37*
& 0.34*
& 0.62*
----------------------------------


Take-out
Mode 2
0.5
& 0.8*
& 0.66*
& 0.64*
& 0.56*
& 0.73*
& 0.07
& 0.1
& 0.62*
& 0.33*
& 0.42*
& 0.31*
& 0.79*
& 0.8*
& 0.08
& 0.49*
& 0.03
& 0.63*
& 0.88*
----------------------------------


Grocery store
Mode 1
0.28
& 0.14
& 0.32*
& 0.64*
& 0.41*
& 0.32*
& 0.07
& 0.12
& -0.29*
& 0.64*
& 0.38*
& 0.8*
& 0.46*
& 0.15
& 0.36*
& -0.28
& 0.35*
& 0.16
& 0.39*
----------------------------------


Supermarket
