In [1]:
import pandas as pd
import os
import numpy as np
from scipy.stats import pearsonr
from scipy.stats import spearmanr

In [2]:
# Project root: two directory levels above the current working directory.
parentDirectory = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))

# Both constants end with '/' so that a file name can be appended directly.
DATA_DIR = '{}/data/'.format(parentDirectory)
FIGURES_DIR = '{}/figures/'.format(parentDirectory)

def make_stars(val, alpha=0.05):
    """Return a significance marker for a p-value.

    Parameters
    ----------
    val : float
        The p-value to annotate.
    alpha : float, optional
        Significance threshold. Defaults to 0.05.

    Returns
    -------
    str
        '*' when ``val < alpha``, otherwise ''. A NaN p-value compares
        false against the threshold and therefore yields ''.
    """
    # A graded scheme ('**' below 0.01, '***' below 0.001, '****' below
    # 0.0001) previously sat here as a disabled string block that doubled as
    # the docstring; only the single threshold is in effect.
    return '*' if val < alpha else ''

In [3]:
# Item-level table. The preview a few cells below shows the columns
# country_code, name, mid, category and ts, where `ts` holds a dict with a
# 'max_ratio' mapping keyed by date strings.
df = pd.read_parquet(DATA_DIR+'dk_new_food_timeseries_items1.parquet')

In [4]:
# Drop every row whose item name is 'Bánh mì', regardless of country.
# NOTE(review): the reason for excluding this item is not recorded here — confirm.
df = df.loc[df['name']!='Bánh mì']

In [5]:
df.head()

Unnamed: 0,country_code,name,mid,category,ts
0,FR,Cookie,/m/021mn,dessert,{'max_ratio': {'2019-01-06': 54.39198365723229...
1,FR,Coca-Cola,/m/01yvs,soft drink,{'max_ratio': {'2019-01-06': 27.31837379184492...
2,FR,Rice,/m/09759,rice dish,"{'max_ratio': {'2019-01-06': 130.39174164405, ..."
3,FR,Miso,/m/057z_,soup,{'max_ratio': {'2019-01-06': 3.470213996529786...
4,FR,Table salt,/m/05g0_z,spice,{'max_ratio': {'2019-01-06': 85.77197422871245...


In [6]:
# Mobility table. Later cells read its `country_region_code`, `date` and
# `residential_percent_change_from_baseline` columns.
df_mobility = pd.read_csv(DATA_DIR+'df_mobility.csv')

In [7]:
# Two-letter country code -> full country name.
# The keys also define which countries are kept in the analysis: the data
# frames are filtered with `.isin(full_names.keys())`.
full_names = {
    'AU': 'Australia',
    'BR': 'Brazil',
    'CA': 'Canada',
    'FR': 'France',
    'DE': 'Germany',
    'IN': 'India',
    'IT': 'Italy',
    'MX': 'Mexico',
    'ES': 'Spain',
    'GB': 'United Kingdom',
    'US': 'United States',
    'DK': 'Denmark'
}

def chunker(seq, size):
    """Lazily split ``seq`` into consecutive slices of at most ``size`` items.

    The last slice is shorter when ``len(seq)`` is not a multiple of ``size``.
    Returns a generator; nothing is sliced until it is iterated.
    """
    offsets = range(0, len(seq), size)
    return (seq[start:start + size] for start in offsets)

# One record per country with three date fields: 'start_md_1', 'end_md_1'
# and 'start_md_2' (np.nan where no value is recorded).
# NOTE(review): what "md" stands for is not stated here — presumably the start
# and end of a first mobility-restriction period and the start of a second;
# confirm against the data source.
# The list includes JP, KE and NG, which are not keys of `full_names`, and
# `full_names` is the filter applied to the analysed data.
event_dicts = [{'country': 'AU',
  'start_md_1': '2020-03-27',
  'end_md_1': '2020-06-07',
  'start_md_2': np.nan},
 {'country': 'BR',
  'start_md_1': '2020-03-23',
  'end_md_1': '2020-08-09',
  'start_md_2': np.nan},
 {'country': 'CA',
  'start_md_1': '2020-03-19',
  'end_md_1': '2020-06-21',
  'start_md_2': '2020-10-12'},
 {'country': 'DE',
  'start_md_1': '2020-03-21',
  'end_md_1': '2020-05-09',
  'start_md_2': '2020-12-18'},
 {'country': 'DK',
  'start_md_1': '2020-03-17',
  'end_md_1': '2020-05-07',
  'start_md_2': np.nan},
 {'country': 'ES',
  'start_md_1': '2020-03-17',
  'end_md_1': '2020-06-14',
  'start_md_2': '2020-11-07'},
 {'country': 'FR',
  'start_md_1': '2020-03-18',
  'end_md_1': '2020-06-08',
  'start_md_2': '2020-11-01'},
 {'country': 'GB',
  'start_md_1': '2020-03-23',
  'end_md_1': '2020-08-03',
  'start_md_2': '2020-10-21'},
 {'country': 'IN',
  'start_md_1': '2020-03-24',
  'end_md_1': '2020-10-29',
  'start_md_2': np.nan},
 {'country': 'IT',
  'start_md_1': '2020-03-11',
  'end_md_1': '2020-06-06',
  'start_md_2': '2020-11-06'},
 {'country': 'JP',
  'start_md_1': '2020-04-12',
  'end_md_1': '2020-05-30',
  'start_md_2': np.nan},
 {'country': 'KE',
  'start_md_1': '2020-03-24',
  'end_md_1': '2020-10-04',
  'start_md_2': np.nan},
 {'country': 'MX',
  'start_md_1': '2020-03-25',
  'end_md_1': '2020-10-06',
  'start_md_2': np.nan},
 {'country': 'NG',
  'start_md_1': '2020-03-27',
  'end_md_1': '2020-08-09',
  'start_md_2': np.nan},
 {'country': 'US',
  'start_md_1': '2020-03-21',
  'end_md_1': '2020-06-11',
  'start_md_2': '2020-11-26'}]

# Tabulate the event records and parse the three date columns as datetimes;
# np.nan entries become NaT.
df_events = pd.DataFrame(event_dicts)
for date_col in ('start_md_1', 'end_md_1', 'start_md_2'):
    df_events[date_col] = pd.to_datetime(df_events[date_col])

# NOTE(review): unpickling can execute arbitrary code — only load this file
# from a trusted source.
df_agg = pd.read_pickle(DATA_DIR+'df_agg_cats.pickle')

In [8]:
# Keep only the rows whose country code is a key of `full_names`.
df = df.loc[df['country_code'].isin(full_names.keys())]

In [9]:
# Restrict the aggregated table to the studied countries and take the last
# 46 index labels of the first row's weekly-volume series.
df_agg = df_agg.loc[df_agg['country'].isin(full_names.keys())]
studied_weeks = list(df_agg.iloc[0]['volume_weekly_total'].index)[-46:]

# country code -> {date of the chunk's first row: mean residential change}
mobility_ts = {}

for country, gr in df_mobility.groupby('country_region_code'):
    # Skip the first row and keep the next 322 rows, i.e. 46 chunks of 7.
    # NOTE(review): presumably one row per day in date order, so each chunk
    # is a week — confirm against df_mobility.
    gr = gr.iloc[1:323]
    mobility_ts[country] = {
        week.iloc[0]['date']: week['residential_percent_change_from_baseline'].mean()
        for week in chunker(gr, 7)
    }

In [10]:
# Spearman correlation between each item's interest series and the weekly
# residential-mobility series, computed per (item name, country).
entry_list = []

c = 0  # running count of processed (name, country) pairs; not read in this cell
for category, gr1 in df.groupby('name'):  # NOTE: `category` holds the item *name* here
    for country, gr2 in gr1.groupby('country_code'):
        c += 1
        first_row = gr2.iloc[0]  # only the first row of each (name, country) group is used
        y = list(mobility_ts[country].values())
        x = list(first_row['ts']['max_ratio'].values())[-46:]

        # Run the test once and unpack both results; the coefficient and the
        # p-value come from the same call instead of two identical ones.
        rho, p_value = spearmanr(x, y)

        entry = {
            'name': category,
            'category': first_row['category'],
            'country': country,
            'corr': rho,
            'p': p_value,
        }
        entry_list.append(entry)

  c /= stddev[:, None]
  c /= stddev[None, :]
  return (a < x) & (x < b)
  return (a < x) & (x < b)
  cond2 = cond0 & (x <= _a)


In [11]:
# One row per collected entry: name, category, country, corr and p.
df_results = pd.DataFrame(entry_list)

In [12]:
# Number of studied countries, derived from `full_names` instead of a
# hard-coded 12 so the completeness check follows the country list.
n_countries = len(full_names)


def _summarise_item(rows):
    """Collapse the per-country rows of one item into a single summary row."""
    return pd.Series({
        # pandas' mean skips NaN correlations by default.
        'corr_avg': rows['corr'].mean(),
        'category': rows['category'].unique()[0],
        'C': rows['country'].unique(),
        # There is one row per country, so a full set has n_countries rows.
        'exists_all_countries': len(rows) == n_countries,
        # Keep the per-country detail so it can be printed later.
        'all': rows[['country', 'corr', 'p']],
    })


tmp = df_results.groupby('name').apply(_summarise_item).reset_index()
# Only items observed in every studied country are kept.
tmp = tmp.loc[tmp['exists_all_countries']]

In [13]:
# Ten items with the highest average correlation.
tmp.sort_values(by = 'corr_avg', ascending =  False).head(10)[['name','corr_avg','category']]

Unnamed: 0,name,corr_avg,category
163,Bread (Literature Subject),0.84727,bread and flatbread
1178,Sourdough,0.792586,bread and flatbread
71,Baking powder,0.781995,pastry and bakery product
309,Chocolate chip cookie,0.74937,dessert
610,Icing,0.746298,dessert
306,Chocolate brownie,0.744708,dessert
1003,Powdered sugar,0.742503,dessert
289,Chicken meat,0.712443,chicken dish
1125,Sauces,0.707543,sauce
513,Garlic,0.700749,spice


In [21]:
# Print the ten items with the highest average correlation as '&'-separated
# lines (presumably for pasting into a LaTeX table).
top_rows = tmp.sort_values(by = 'corr_avg', ascending =  False).head(10)[['name','corr_avg','category','all']]
for i,row in top_rows.iterrows():
    header = row['name']+' ('+row['category']+') & '
    print(header)
    print(round(row['corr_avg'],2),' & ')

    # One line per country: rounded correlation followed by its star marker.
    per_country = row['all']
    for c,r in per_country.iterrows():
        rounded_corr = round(r['corr'],2)
        print( rounded_corr,make_stars(r['p']) + ' &')
    print('----------------------------------')
    print('\n')
    

Bread (Literature Subject) (bread and flatbread) & 
0.85  & 
0.86 * &
0.96 * &
0.9 * &
0.58 * &
0.73 * &
0.92 * &
0.92 * &
0.78 * &
0.93 * &
0.94 * &
0.79 * &
0.85 * &
----------------------------------


Sourdough (bread and flatbread) & 
0.79  & 
0.86 * &
0.94 * &
0.71 * &
0.78 * &
0.64 * &
0.87 * &
0.78 * &
0.86 * &
0.6 * &
0.87 * &
0.73 * &
0.87 * &
----------------------------------


Baking powder (pastry and bakery product) & 
0.78  & 
0.69 * &
0.86 * &
0.82 * &
0.73 * &
0.43 * &
0.73 * &
0.78 * &
0.73 * &
0.92 * &
0.93 * &
0.85 * &
0.9 * &
----------------------------------


Chocolate chip cookie (dessert) & 
0.75  & 
0.77 * &
0.76 * &
0.87 * &
0.65 * &
0.48 * &
0.74 * &
0.65 * &
0.82 * &
0.8 * &
0.86 * &
0.71 * &
0.89 * &
----------------------------------


Icing (dessert) & 
0.75  & 
0.59 * &
0.9 * &
0.85 * &
0.69 * &
0.47 * &
0.8 * &
0.72 * &
0.83 * &
0.82 * &
0.87 * &
0.59 * &
0.84 * &
----------------------------------


Chocolate brownie (dessert) & 
0.74  & 
0.83 * &
0

In [15]:
# Ten items with the lowest (most negative) average correlation.
tmp.sort_values(by = 'corr_avg', ascending =  True).head(10)[['name','corr_avg','category']]

Unnamed: 0,name,corr_avg,category
1257,Tapas,-0.459972,snack
674,Korean barbecue,-0.370761,beef dish
189,Burrata,-0.318461,cheese
108,Bento,-0.304061,rice dish
1363,Whopper,-0.292923,beef dish
1235,Sushi,-0.271824,rice dish
240,Carpaccio,-0.264037,beef dish
515,Gelato,-0.259161,dessert
616,Insalata Caprese,-0.258245,salad
778,McDonald's Chicken McNuggets,-0.239814,chicken dish


In [22]:
# Print the ten items with the lowest average correlation as '&'-separated
# lines (presumably for pasting into a LaTeX table).
bottom_rows = tmp.sort_values(by = 'corr_avg', ascending =  True).head(10)[['name','corr_avg','category','all']]
for i,row in bottom_rows.iterrows():
    header = row['name']+' ('+row['category']+') & '
    print(header)
    print(round(row['corr_avg'],2),' & ')

    # One line per country: rounded correlation followed by its star marker.
    per_country = row['all']
    for c,r in per_country.iterrows():
        rounded_corr = round(r['corr'],2)
        print( rounded_corr,make_stars(r['p']) + ' &')
    print('----------------------------------')
    print('\n')
    

Tapas (snack) & 
-0.46  & 
-0.46 * &
-0.16  &
-0.62 * &
-0.66 * &
-0.43 * &
-0.91 * &
-0.73 * &
-0.69 * &
0.23  &
-0.34 * &
-0.08  &
-0.67 * &
----------------------------------


Korean barbecue (beef dish) & 
-0.37  & 
-0.55 * &
-0.11  &
-0.77 * &
-0.56 * &
-0.2  &
-0.17  &
-0.53 * &
-0.72 * &
-0.09  &
-0.02  &
-0.03  &
-0.69 * &
----------------------------------


Burrata (cheese) & 
-0.32  & 
-0.25  &
0.25  &
-0.37 * &
-0.56 * &
-0.22  &
-0.66 * &
-0.39 * &
-0.41 * &
-0.2  &
-0.49 * &
-0.11  &
-0.41 * &
----------------------------------


Bento (rice dish) & 
-0.3  & 
-0.48 * &
0.05  &
-0.59 * &
-0.48 * &
-0.28  &
-0.15  &
-0.43 * &
-0.58 * &
-0.23  &
0.01  &
-0.08  &
-0.42 * &
----------------------------------


Whopper (beef dish) & 
-0.29  & 
-0.05  &
-0.62 * &
-0.06  &
-0.1  &
-0.29  &
-0.2  &
-0.02  &
-0.58 * &
-0.32 * &
-0.44 * &
-0.15  &
-0.69 * &
----------------------------------


Sushi (rice dish) & 
-0.27  & 
-0.49 * &
-0.28  &
-0.5 * &
-0.51 * &
-0.03  &
-0.34 * &
0

In [64]:
# Load the modes dataset and keep the countries listed in `full_names`.
# NOTE: this rebinds `df`, replacing the item-level frame loaded earlier, so
# cells above that use `df` give different results if re-run after this one.
df = pd.read_parquet(DATA_DIR+'modes_fine.parquet')
df = df.loc[df['country_code'].isin(full_names.keys())]

In [65]:
# Spearman correlation between each entry's interest series and the weekly
# residential-mobility series, computed per (name, country) on the frame
# currently bound to `df`.
entry_list = []

c = 0  # running count of processed (name, country) pairs; not read in this cell
for category, gr1 in df.groupby('name'):  # NOTE: `category` holds the *name* here
    for country, gr2 in gr1.groupby('country_code'):
        c += 1
        first_row = gr2.iloc[0]  # only the first row of each (name, country) group is used
        y = list(mobility_ts[country].values())
        x = list(first_row['ts']['max_ratio'].values())[-46:]

        # Run the test once and unpack both results; the coefficient and the
        # p-value come from the same call instead of two identical ones.
        rho, p_value = spearmanr(x, y)

        entry = {
            'name': category,
            'category': first_row['category'],
            'country': country,
            'corr': rho,
            'p': p_value,
        }
        entry_list.append(entry)

In [66]:
# One row per collected entry: name, category, country, corr and p.
df_results_modes = pd.DataFrame(entry_list)

In [67]:
def _summarise_mode(rows):
    """Collapse the per-country rows of one name into a single summary row."""
    summary = {
        'corr_avg': rows['corr'].mean(),
        'category': rows['category'].unique()[0],
        'C': rows['country'].unique(),
        # Keep the per-country detail so it can be printed later.
        'all': rows[['country','corr','p']],
    }
    return pd.Series(summary)


# Unlike the item-level summary, no "present in all countries" filter is applied here.
tmp = df_results_modes.groupby('name').apply(_summarise_mode).reset_index()


In [68]:
# All summarised names, ordered from highest to lowest average correlation.
tmp.sort_values(by = 'corr_avg', ascending =  False)[['name','corr_avg','category']]

Unnamed: 0,name,corr_avg,category
12,Recipe,0.874205,Mode 1
0,Baking,0.816163,Mode 1
4,Cooking,0.794334,Mode 1
15,Take-out,0.618634,Mode 2
9,Grocery store,0.317404,Mode 1
6,Drive-in,0.17725,Mode 2
7,Food delivery,0.148599,Mode 2
14,Supermarket,0.146737,Mode 1
1,Barbecue,-0.03263,Mode 4
11,Picnic,-0.188654,Mode 4


In [94]:
# Print every summarised name with its category, average correlation and one
# '&'-prefixed line per country.
ranked_modes = tmp.sort_values(by = 'corr_avg', ascending =  False)[['name','corr_avg','category','all']]
for i,row in ranked_modes.iterrows():
    print(row['name'])
    print(row['category'])
    print(round(row['corr_avg'],2))

    # 'Lunchbox' is special-cased: only its first 11 per-country rows are printed.
    per_country = row['all'].iloc[:11] if row['name']=='Lunchbox' else row['all']
    for c,r in per_country.iterrows():
        print( '&',str(round(r['corr'],2))+make_stars(r['p']))
    print('----------------------------------')
    print('\n')

Recipe
Mode 1
0.87
& 0.86*
& 0.9*
& 0.9*
& 0.77*
& 0.79*
& 0.94*
& 0.86*
& 0.9*
& 0.89*
& 0.96*
& 0.81*
& 0.9*
----------------------------------


Baking
Mode 1
0.82
& 0.86*
& 0.87*
& 0.89*
& 0.79*
& 0.64*
& 0.79*
& 0.78*
& 0.93*
& 0.92*
& 0.65*
& 0.78*
& 0.88*
----------------------------------


Cooking
Mode 1
0.79
& 0.78*
& 0.84*
& 0.73*
& 0.79*
& 0.33*
& 0.88*
& 0.8*
& 0.86*
& 0.92*
& 0.92*
& 0.81*
& 0.88*
----------------------------------


Take-out
Mode 2
0.62
& 0.92*
& 0.77*
& 0.7*
& 0.7*
& 0.84*
& 0.06
& 0.67*
& 0.51*
& 0.04
& 0.65*
& 0.64*
& 0.93*
----------------------------------


Grocery store
Mode 1
0.32
& 0.22
& 0.58*
& 0.49*
& 0.37*
& 0.38*
& -0.03
& -0.37*
& 0.64*
& 0.91*
& 0.28
& -0.34*
& 0.67*
----------------------------------


Drive-in
Mode 2
0.18
& -0.43*
& 0.58*
& -0.06
& 0.79*
& 0.65*
& -0.21
& 0.69*
& 0.05
& -0.3*
& -0.2
& 0.34*
& 0.22
----------------------------------


Food delivery
Mode 2
0.15
& 0.66*
& -0.07
& 0.68*
& -0.04
& -0.31*
& -0.1
& -0.14
& 0.3