In [1]:
import json
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
import numpy as np
from sklearn.manifold import TSNE
import inflect
import time
import gensim
import random
import pandas as pd
import pickle
import pycountry_convert as pyc

# Shared inflect engine instance (not referenced by the cells visible in this section).
engine = inflect.engine()

In [2]:
# Load the recipe corpus: a list of recipes, each a list of ingredient names.
# The context manager closes the file handle as soon as parsing is done.
with open('../resources/all_recipes_repr.json') as f:
    recipes = json.load(f)
recipes[0]

['romaine lettuce',
 'garlic',
 'pepper',
 'taco seasoning',
 'garbanzo bean',
 'feta']

In [3]:
# Vocabulary of distinct ingredient names across all recipes.
ingredients = set().union(*recipes)
len(ingredients)

1521

In [4]:
# Load the FAO table (presumably normalised, going by the file name); the file
# is decoded as ISO-8859-1. Later cells read its 'Area' and 'Item' columns.
data = pd.read_csv('../resources/fao_norm.csv', sep=',', encoding = "ISO-8859-1")
# Show (rows, columns) as a quick check of the load.
data.shape

(2425275, 11)

In [5]:
# Distinct values of the 'Area' column, in sorted order.
countries = list(set(data['Area'].values))
countries.sort()
(countries[:5], len(countries))

(['Afghanistan', 'Africa', 'Albania', 'Algeria', 'American Samoa'], 258)

In [6]:
def ref(ing, vocabulary=None):
    """Return the known ingredient names that occur as substrings of `ing`.

    Parameters
    ----------
    ing : str
        Text to search in (the notebook passes lower-cased 'Item' values).
    vocabulary : iterable of str, optional
        Candidate ingredient names. Defaults to the notebook-level
        `ingredients` set, which keeps existing `ref(ing)` calls unchanged.

    Returns
    -------
    list of str
        Every candidate `i` for which `i in ing` holds, in the iteration
        order of the vocabulary. Matching is plain substring containment,
        not whole-word matching.
    """
    if vocabulary is None:
        vocabulary = ingredients
    return [i for i in vocabulary if i in ing]

In [7]:
import os

In [8]:
# Build (or reload from the pickle cache) the mapping
#   area name -> sorted list of recipe ingredients matched against that area's items.
fname = '../resources/countries_to_ing.p'
if os.path.isfile(fname):
    # NOTE: unpickling can execute arbitrary code; only load a cache file that
    # was produced by this notebook.
    with open(fname, 'rb') as f:
        countries_to_ing = pickle.load(f)
else:
    countries_to_ing = dict()
    for c in tqdm(countries):
        c_data = data[data['Area']==c]
        c_ing = {i.lower() for i in set(c_data['Item'].values)}
        # Ingredient names that occur as substrings of this area's item names.
        c_ing_ref = {i_ for i in c_ing for i_ in ref(i)}
        # Widen to every ingredient whose name contains one of those matches.
        c_ing_ref = {i for i in ingredients if any(s_i in i for s_i in c_ing_ref)}
        countries_to_ing[c] = sorted(c_ing_ref)
    # Write the cache once, after the loop, straight to a file opened for
    # writing. The previous code called pickle.load() on a handle opened with
    # 'wb', which raises and leaves an empty cache file behind.
    with open(fname, 'wb') as f:
        pickle.dump(countries_to_ing, f)

In [9]:
# Every ingredient that was matched for at least one area.
c_all_ing = {i for ings in countries_to_ing.values() for i in ings}
# Invert the mapping: ingredient -> areas (in `countries` order) that list it.
# Per-area sets make each membership test O(1) instead of scanning a list.
_country_ing_sets = {c: set(ings) for c, ings in countries_to_ing.items()}
ing_to_country = {i:[c for c in countries if i in _country_ing_sets[c]] for i in tqdm(ingredients)}
with open('ing_to_country', 'wb') as f:
    pickle.dump(ing_to_country, f)

HBox(children=(IntProgress(value=0, max=1521), HTML(value='')))




In [10]:
# Reload the ingredient -> areas mapping from its pickle; the context manager
# closes the file handle. Only load pickles produced by this notebook.
with open('ing_to_country', 'rb') as f:
    ing_to_country = pickle.load(f)
def locality_score_bin(i, c='Switzerland'):
    """Binary locality score for ingredient `i` with respect to country `c`.

    Returns True when `c` appears in ``ing_to_country[i]`` and False otherwise.
    Raises KeyError if `i` is not a key of `ing_to_country`.
    """
    listed_countries = ing_to_country[i]
    return c in listed_countries

In [13]:
# Spot check: is Switzerland listed for 'potato'?
locality_score_bin('potato', 'Switzerland')

True

In [14]:
# Area name used to index countries_to_ing in the next cell.
country = 'Switzerland'

In [15]:
# Ingredients matched for the selected country, and how many there are.
c_ing = countries_to_ing[country]
len(c_ing)

544

In [16]:
# Pairwise country table indexed by the (iso_o, iso_d) code pair; later cells
# read its 'dist' column.
c_dist = pd.read_csv('../resources/dist_cepii.csv', index_col=['iso_o', 'iso_d'])
c_dist.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,contig,comlang_off,comlang_ethno,colony,comcol,curcol,col45,smctry,dist,distcap,distw,distwces
iso_o,iso_d,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
ABW,ABW,0,0,0,0,0,0,0,0,5.225315,5.225315,25.09354,23.04723
ABW,AFG,0,0,0,0,0,0,0,0,13257.81,13257.81,13168.22,13166.37
ABW,AGO,0,0,0,0,0,0,0,0,9516.913,9516.913,9587.316,9584.193
ABW,AIA,0,0,1,0,0,0,0,0,983.2682,983.2682,976.8974,976.8916
ABW,ALB,0,0,0,0,0,0,0,0,9091.742,9091.742,9091.576,9091.466


In [17]:
# Largest value in the 'dist' column.
c_dist['dist'].max()

19951.16

In [20]:
def alpha3_to_dist(c1, c2):
    """Look up the 'dist' value for the (c1, c2) code pair in `c_dist`.

    Identical codes short-circuit to 0 without consulting the table.
    """
    if c1 != c2:
        pair_row = c_dist.loc[c1, c2]
        return pair_row['dist']
    return 0

In [21]:
# Spot check the lookup for the CHE / USA pair.
alpha3_to_dist('CHE', 'USA')

6272.285

In [22]:
def c_to_dist(c1, c2):
    """Distance between two countries given by name.

    Both names are converted to alpha-3 codes with pycountry_convert and the
    pair is then handed to `alpha3_to_dist`.
    """
    code_1, code_2 = [pyc.country_name_to_country_alpha3(name) for name in (c1, c2)]
    return alpha3_to_dist(code_1, code_2)

In [23]:
# Split the area names by whether pycountry_convert can map them to an
# alpha-3 code (it raises KeyError for names it does not know).
recogni_c, unknown_c = [], []
for c in countries:
    try:
        pyc.country_name_to_country_alpha3(c)
    except KeyError:
        unknown_c.append(c)
    else:
        recogni_c.append(c)

len(recogni_c), len(unknown_c)

(204, 54)

In [24]:
# Keep only the areas that pycountry_convert recognises. A set makes each
# membership test O(1) instead of scanning the recogni_c list every time.
_recognised = set(recogni_c)
ing_to_country2 = {i:[c for c in cs if c in _recognised] for i, cs in ing_to_country.items()}
with open('../resources/ing_to_country2', 'wb') as f:
    pickle.dump(ing_to_country2, f)

In [37]:
# Distance table reduced to its 'dist' column, indexed by the (iso_o, iso_d) pair.
alpha3_dist = pd.read_csv('../resources/dist_cepii.csv', index_col=['iso_o', 'iso_d'])[['dist']]
# Codes that appear as an origin (iso_o) in the distance table.
alpha3_all = set(alpha3_dist.reset_index()['iso_o'].values)
# Reload the filtered ingredient -> countries mapping; the context manager
# closes the file handle. Only load pickles produced by this notebook.
with open('../resources/ing_to_country2', 'rb') as f:
    ing_to_country2 = pickle.load(f)
def locality_score(i, c='Switzerland'):
    """Score how close to country `c` ingredient `i` is produced.

    Parameters
    ----------
    i : str
        Ingredient name; must be a key of `ing_to_country2`.
    c : str
        Country name accepted by pycountry_convert.

    Returns
    -------
    (score, producer) : tuple
        ``score = 1 - (d / max_dist) ** 0.6``, where `d` is the 'dist' value
        between `c` and the nearest producer and `max_dist` is the largest
        'dist' in `alpha3_dist`. `producer` is that nearest producing country.
        If `c` itself produces the ingredient, `d` is 0, so the result is
        ``(1.0, c)``.
    -1
        When no producer with a code in `alpha3_all` exists. This scalar
        sentinel is kept for backward compatibility.

    Raises
    ------
    KeyError
        If `i` is unknown, or if `c` cannot be converted or is missing from
        the distance table.
    """
    to_alpha3 = pyc.country_name_to_country_alpha3
    max_dist = alpha3_dist['dist'].max()
    producers = [p for p in ing_to_country2[i] if to_alpha3(p) in alpha3_all]
    if c in producers:
        # Produced domestically. The previous code only set an unused `dist`
        # here and then failed on the undefined `min_dist` / `dist_idx`.
        min_dist, closest = 0.0, c
    else:
        if not producers:
            return -1
        origin = to_alpha3(c)  # convert once, not once per producer
        distances = [alpha3_dist.loc[(origin, to_alpha3(p)), 'dist'] for p in producers]
        dist_idx = np.argmin(distances)
        min_dist, closest = distances[dist_idx], producers[dist_idx]
    return 1 - np.power(min_dist / max_dist, 0.6), closest

In [46]:
# Spot check with the default country ('Switzerland').
locality_score('olive')

(0.8991310509591338, 'France')