In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import json
from geopy import Nominatim


In [2]:
# Read the chocolate bar ratings table, then show its dimensions and column labels.
ratings = pd.read_csv(filepath_or_buffer='../data/chocolate_ratings.csv')
(ratings.shape, ratings.columns)

((2530, 10),
 Index(['REF', 'Company (Manufacturer)', 'Company Location', 'Review Date',
        'Country of Bean Origin', 'Specific Bean Origin or Bar Name',
        'Cocoa Percent', 'Ingredients', 'Most Memorable Characteristics',
        'Rating'],
       dtype='object'))

In [3]:
# Numeric cocoa percentage. Strip the trailing '%' and parse the whole number:
# slicing the first two characters would turn '100%' into 10. The final cast to
# int truncates fractional values (e.g. '72.5%' -> 72), as the slice did.
ratings['Cocoa Percent Int'] = (
    ratings['Cocoa Percent'].str.rstrip('%').astype(float).astype(int)
)

# Abbreviations used after the '-' in the 'Ingredients' column (e.g. '3- B,S,C').
# The empty key covers rows whose ingredients are missing.
abbrev2ingredient = {'B':'Beans', 'S':'Sugar', 'S*': 'Sweetener other than white cane or beet sugar', 'C':'Cocoa Butter',
                    'V': 'Vanilla', 'L': 'Lecithin', 'Sa': 'Salt', '':''}

# Expand the abbreviations after the '-' into full ingredient names. Each token is
# trimmed on its own so stray spaces around a comma cannot cause a KeyError.
# Missing values are replaced by '0 - ' and therefore become [''].
ratings['Ingredients List'] = [
    [abbrev2ingredient[abbrev.strip()] for abbrev in parts[1].split(',')]
    for parts in ratings['Ingredients'].fillna('0 - ').str.split('-')
]

# Split the free-text characteristics on commas, with or without a following
# space ('nutty,chalky' becomes two terms), and trim each term. Empty terms are
# dropped, so a missing value yields an empty list.
ratings['Most Memorable Characteristics List'] = [
    [term.strip() for term in text.split(',') if term.strip()]
    for text in ratings['Most Memorable Characteristics'].fillna('')
]

In [4]:
# Preview the first five rows of the ratings table.
ratings.head(n=5)

Unnamed: 0,REF,Company (Manufacturer),Company Location,Review Date,Country of Bean Origin,Specific Bean Origin or Bar Name,Cocoa Percent,Ingredients,Most Memorable Characteristics,Rating,Cocoa Percent Int,Ingredients List,Most Memorable Characteristics List
0,2454,5150,U.S.A.,2019,Tanzania,"Kokoa Kamili, batch 1",76%,"3- B,S,C","rich cocoa, fatty, bready",3.25,76,"[Beans, Sugar, Cocoa Butter]","[rich cocoa, fatty, bready]"
1,2458,5150,U.S.A.,2019,Dominican Republic,"Zorzal, batch 1",76%,"3- B,S,C","cocoa, vegetal, savory",3.5,76,"[Beans, Sugar, Cocoa Butter]","[cocoa, vegetal, savory]"
2,2454,5150,U.S.A.,2019,Madagascar,"Bejofo Estate, batch 1",76%,"3- B,S,C","cocoa, blackberry, full body",3.75,76,"[Beans, Sugar, Cocoa Butter]","[cocoa, blackberry, full body]"
3,2542,5150,U.S.A.,2021,Fiji,"Matasawalevu, batch 1",68%,"3- B,S,C","chewy, off, rubbery",3.0,68,"[Beans, Sugar, Cocoa Butter]","[chewy, off, rubbery]"
4,2546,5150,U.S.A.,2021,Venezuela,"Sur del Lago, batch 1",72%,"3- B,S,C","fatty, earthy, moss, nutty,chalky",3.0,72,"[Beans, Sugar, Cocoa Butter]","[fatty, earthy, moss, nutty,chalky]"


In [5]:
def findGeocode(place, user_agent="chocolate-ratings-notebook"):
    '''Look up `place` with the Nominatim geocoder.

    Parameters
    ----------
    place : the query passed to ``Nominatim.geocode`` (a place name).
    user_agent : application identifier sent with the request. An empty value
        makes geopy fall back to its default agent, which recent geopy versions
        reject for the public Nominatim server. Replace the default with a
        string that identifies your own application.

    Returns
    -------
    Whatever ``Nominatim.geocode`` returns, or None if the lookup raised.
    Callers must check for None before reading ``.latitude`` / ``.longitude``.
    '''
    try:
        geolocator = Nominatim(user_agent=user_agent)
        return geolocator.geocode(place)
    except Exception as e:
        # Best effort: report the problem and signal failure with None.
        print(e)
        return None


In [6]:
# Distinct company locations, in order of first appearance in the ratings table.
places = ratings['Company Location'].unique().tolist()

In [7]:
lat_long = []
# Use geopy to extract coordinates from place names. Entries are appended in the
# same order as `places`, so lat_long[i] belongs to places[i].
for place in tqdm(places):
    loc = findGeocode(place)
    if loc is None:
        # findGeocode returns None when the lookup fails. Stop with an error that
        # names the place instead of an AttributeError on `None.latitude`.
        raise ValueError("Could not geocode {!r}".format(place))
    lat_long.append({'latitude': loc.latitude, 'longitude': loc.longitude})


100%|██████████| 67/67 [00:33<00:00,  2.01it/s]


In [13]:
# Persist the coordinate list as JSON.
with open('jsons/country_locations.json', mode='w') as cache_file:
    cache_file.write(json.dumps(lat_long))

In [7]:
# Load the cached coordinates written from the company-location geocoding run.
with open('jsons/country_locations.json', 'r') as infile:
    lat_long = json.load(infile)

print(lat_long[:2])

# The cache stores coordinates only, in the order of the unique company
# locations. Rebuild that key list from `ratings` instead of reading the global
# `places`, which other cells rebind to states and cities; running cells out of
# order would otherwise pair countries with the wrong coordinates.
company_locations = list(ratings['Company Location'].unique())
if len(company_locations) != len(lat_long):
    raise ValueError(
        "country_locations.json has {} entries but ratings has {} company locations; "
        "regenerate the cache".format(len(lat_long), len(company_locations))
    )
places_dic = dict(zip(company_locations, lat_long))

[{'latitude': 28.41769265, 'longitude': -81.58111071077724}, {'latitude': 46.603354, 'longitude': 1.8883335}]


In [15]:
# Per-column reducers for the groupby on company location.
# Order matters: the aggregated columns are renamed by position afterwards.
# The reducers stay anonymous lambdas so pandas labels them as before.
aggregations = dict([
    # number of reviews in the group (alternative considered: 'first')
    ('Company Location', ['count']),
    ('Company (Manufacturer)', lambda col: list(col)),
    # every review year, plus the mean year truncated to an int
    ('Review Date', [lambda col: list(col), lambda col: int(np.mean(col))]),
    ('Country of Bean Origin', lambda col: list(col)),
    ('Specific Bean Origin or Bar Name', lambda col: list(col)),
    ('Cocoa Percent', lambda col: list(col)),
    ('Cocoa Percent Int', lambda col: np.round(np.mean(col), 2)),
    # every rating, plus the mean rating rounded to two decimals
    ('Rating', [lambda col: list(col), lambda col: np.round(np.mean(col), 2)]),
    # 'sum' concatenates the per-review lists; the lambda keeps them nested
    ('Ingredients List', ['sum', lambda col: list(col)]),
    ('Most Memorable Characteristics List', ['sum', lambda col: list(col)]),
])

In [16]:
# One row per company location, reduced according to `aggregations`.
grouped_by_location = ratings.groupby('Company Location')
aggregated = grouped_by_location.agg(aggregations).reset_index()

# Replace the two-level column labels with flat names, assigned by position.
aggregated.columns = [
    'company_location_name',
    'company_location_count',
    'company_names_list',
    'review_date_list',
    'review_date_mean',
    'country_bean_origin_list',
    'specific_bean_origin_list',
    'cocoa_percent_list',
    'cocoa_percent_mean',
    'rating_list',
    'rating_mean',
    'ingredients_lists',
    'ingredients_list_of_list',
    'memorable_characteristics_lists',
    'memorable_characteristics_list_of_list',
]

# Attach the coordinates looked up for each location name.
aggregated['latitude'] = aggregated['company_location_name'].map(
    lambda name: places_dic[name]['latitude']
)
aggregated['longitude'] = aggregated['company_location_name'].map(
    lambda name: places_dic[name]['longitude']
)
aggregated.sample()

Unnamed: 0,company_location_name,company_location_count,company_names_list,review_date_list,review_date_mean,country_bean_origin_list,specific_bean_origin_list,cocoa_percent_list,cocoa_percent_mean,rating_list,rating_mean,ingredients_lists,ingredients_list_of_list,memorable_characteristics_lists,memorable_characteristics_list_of_list,latitude,longitude
24,Hungary,26,"[Chococard (Lapos), Chococard (Lapos), Rozsavo...","[2019, 2019, 2011, 2011, 2011, 2011, 2012, 201...",2013,"[Nicaragua, Dominican Republic, Venezuela, Pri...","[La Dalia, Zorzal, Porcelana, Principe, Carene...","[72%, 71%, 71%, 77%, 73%, 70%, 76%, 84%, 72%, ...",72.27,"[2.5, 3.25, 2.5, 2.75, 3.5, 3.5, 2.25, 2.5, 2....",3.22,"[Beans, Sugar, Cocoa Butter, Beans, Sugar, Coc...","[[Beans, Sugar, Cocoa Butter], [Beans, Sugar, ...","[grassy, rubbery, chunky, spicy, rubbery, inte...","[[grassy, rubbery], [chunky, spicy, rubbery], ...",47.181759,19.506094


In [19]:
# Columns whose name ends in the '_list' token: these hold one entry per review.
to_nest = list(filter(lambda name: name.rsplit('_', 1)[-1] == 'list', aggregated.columns))
to_nest

['company_names_list',
 'review_date_list',
 'country_bean_origin_list',
 'specific_bean_origin_list',
 'cocoa_percent_list',
 'rating_list',
 'ingredients_list_of_list',
 'memorable_characteristics_list_of_list']

In [30]:
json_agg = []

for idx, record in aggregated.iterrows():
    # Top-level fields: every column that is not a per-review list.
    entry = {}
    for col in aggregated.columns:
        if col not in to_nest:
            entry[col] = record[col]

    # 'card' is a flat list of single-key dicts: for each review index, one
    # dict per per-review column, in column order.
    cards = []
    for review_idx in range(record['company_location_count']):
        for col in aggregated.columns:
            if col in to_nest:
                cards.append({col: record[col][review_idx]})
    entry['card'] = cards

    json_agg.append(entry)

json_agg[1]

{'company_location_name': 'Argentina',
 'company_location_count': 9,
 'review_date_mean': 2008,
 'cocoa_percent_mean': 73.33,
 'rating_mean': 3.31,
 'ingredients_lists': ['',
  '',
  '',
  '',
  '',
  'Beans',
  'Sugar',
  'Cocoa Butter',
  'Lecithin',
  'Beans',
  'Sugar',
  'Cocoa Butter',
  'Lecithin',
  'Beans',
  'Sugar',
  'Cocoa Butter',
  'Lecithin',
  'Beans',
  'Sugar',
  'Cocoa Butter',
  'Lecithin'],
 'memorable_characteristics_lists': ['very sour',
  'mildly bitter',
  'slightly burnt',
  'mildly bitter',
  'dry',
  'coffee',
  'sour',
  'minty',
  'nutty',
  'nutmeg',
  'cocoa then roasted nut',
  'creamy',
  'vanilla',
  'rich',
  'nutty',
  'spicy',
  'floral',
  'nutty',
  'basic cocoa',
  'floral',
  'spicey'],
 'latitude': -34.9964963,
 'longitude': -64.9672817,
 'card': [{'company_names_list': 'Compania de Chocolate (Salgado)'},
  {'review_date_list': 2008},
  {'country_bean_origin_list': 'Venezuela'},
  {'specific_bean_origin_list': 'Carenero Superior'},
  {'cocoa_

In [None]:
# TODO: export one JSON for the bean origins
# TODO: export one JSON for the links from country to bean origin

In [31]:
# Serialize the per-location records to JSON.
with open('jsons/aggregated_dataset.json', mode='w') as dataset_file:
    dataset_file.write(json.dumps(json_agg))

In [15]:
# Read the chocolate makers table, then show its dimensions and column labels.
makers = pd.read_csv(filepath_or_buffer='../data/chocolate_makers.csv')
(makers.shape, makers.columns)

((373, 5),
 Index(['COMPANY NAME', 'CITY', 'STATE/PROVINCE', 'OWNER/MAKER', 'COUNTRY'], dtype='object'))

In [17]:
# NOTE: this rebinds `places` (previously the company locations) to the makers'
# states/provinces.
places = list(makers['STATE/PROVINCE'].unique())

lat_long = []
# Use geopy to extract coordinates from place names. Entries are appended in the
# same order as `places`.
for place in tqdm(places):
    loc = findGeocode(place)
    if loc is None:
        # findGeocode returns None when the lookup fails. Stop with an error that
        # names the place instead of an AttributeError on `None.latitude`.
        raise ValueError("Could not geocode {!r}".format(place))
    lat_long.append({'latitude': loc.latitude, 'longitude': loc.longitude})


100%|██████████| 59/59 [00:32<00:00,  1.82it/s]


In [18]:
# Persist the state/province coordinate list as JSON.
with open('state_locations.json', mode='w') as state_file:
    state_file.write(json.dumps(lat_long))

In [21]:
# NOTE: this rebinds `places` to the makers' cities.
places = list(makers['CITY'].unique())

lat_long = []
# Use geopy to extract coordinates from place names.
for place in tqdm(places):
    loc = findGeocode(place)
    if loc is None:
        # No result for this place: report it and move on. An explicit check
        # replaces the bare `except`, which would also have hidden unrelated
        # errors and keyboard interrupts.
        # NOTE: skipped places make `lat_long` shorter than `places`, so the two
        # lists are not index-aligned after a failure.
        print(place)
        continue
    lat_long.append({'latitude': loc.latitude, 'longitude': loc.longitude})

 21%|██        | 55/266 [00:27<01:43,  2.03it/s]

Ascata


 41%|████      | 108/266 [00:53<01:24,  1.87it/s]

Anahola (Kaua'I)


 71%|███████   | 189/266 [01:35<00:40,  1.90it/s]

Swamanoa


 82%|████████▏ | 217/266 [01:49<00:24,  2.03it/s]

Chattonooga


100%|██████████| 266/266 [02:14<00:00,  1.97it/s]


In [22]:
# Persist the city coordinate list as JSON.
with open('city_locations.json', mode='w') as city_file:
    city_file.write(json.dumps(lat_long))