In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import json
from geopy import Nominatim


In [8]:
ratings = pd.read_csv('../data/chocolate_ratings.csv')
ratings.shape, ratings.columns

((2530, 10),
 Index(['REF', 'Company (Manufacturer)', 'Company Location', 'Review Date',
        'Country of Bean Origin', 'Specific Bean Origin or Bar Name',
        'Cocoa Percent', 'Ingredients', 'Most Memorable Characteristics',
        'Rating'],
       dtype='object'))

In [9]:
ratings['Cocoa Percent Int'] = ratings['Cocoa Percent'].str[:2].astype(int)

abbrev2ingredient = {'B':'Beans', 'S':'Sugar', 'S*': 'Sweetener other than white cane or beet sugar', 'C':'Cocoa Butter', 
                    'V': 'Vanilla', 'L': 'Lecithin', 'Sa': 'Salt', '':''}

ratings['Ingredients List'] = [[abbrev2ingredient[x] for x in i[1].strip().split(',')] for i in ratings['Ingredients'].fillna('0 - ').str.split('-')]

ratings['Most Memorable Characteristics List'] = ratings['Most Memorable Characteristics'].fillna(',').str.split(', ')

In [10]:
ratings.head()

Unnamed: 0,REF,Company (Manufacturer),Company Location,Review Date,Country of Bean Origin,Specific Bean Origin or Bar Name,Cocoa Percent,Ingredients,Most Memorable Characteristics,Rating,Cocoa Percent Int,Ingredients List,Most Memorable Characteristics List
0,2454,5150,U.S.A.,2019,Tanzania,"Kokoa Kamili, batch 1",76%,"3- B,S,C","rich cocoa, fatty, bready",3.25,76,"[Beans, Sugar, Cocoa Butter]","[rich cocoa, fatty, bready]"
1,2458,5150,U.S.A.,2019,Dominican Republic,"Zorzal, batch 1",76%,"3- B,S,C","cocoa, vegetal, savory",3.5,76,"[Beans, Sugar, Cocoa Butter]","[cocoa, vegetal, savory]"
2,2454,5150,U.S.A.,2019,Madagascar,"Bejofo Estate, batch 1",76%,"3- B,S,C","cocoa, blackberry, full body",3.75,76,"[Beans, Sugar, Cocoa Butter]","[cocoa, blackberry, full body]"
3,2542,5150,U.S.A.,2021,Fiji,"Matasawalevu, batch 1",68%,"3- B,S,C","chewy, off, rubbery",3.0,68,"[Beans, Sugar, Cocoa Butter]","[chewy, off, rubbery]"
4,2546,5150,U.S.A.,2021,Venezuela,"Sur del Lago, batch 1",72%,"3- B,S,C","fatty, earthy, moss, nutty,chalky",3.0,72,"[Beans, Sugar, Cocoa Butter]","[fatty, earthy, moss, nutty,chalky]"


In [3]:
def findGeocode(place):  
    '''Finds the geocode of any location'''
    try:
        geolocator = Nominatim(user_agent="") ##user agent should be created
        return geolocator.geocode(place) 
    except Exception as e:
        print(e)
        return None


In [7]:
places = list(ratings['Company Location'].unique())

lat_long = []
# use geopy to extract coordinates from place names
for place in tqdm(places): 
        loc = findGeocode(place) 
        lat_long.append({'latitude': loc.latitude, 'longitude': loc.longitude})


100%|██████████| 67/67 [00:33<00:00,  2.01it/s]


In [8]:
lat_long[:2]

[{'latitude': 28.41769265, 'longitude': -81.58111071077724},
 {'latitude': 46.603354, 'longitude': 1.8883335}]

In [13]:
with open('country_locations.json', 'w') as outfile:
    json.dump(lat_long, outfile, )

In [3]:
with open('country_locations.json', 'r') as infile:
    lat_long = json.load(infile, )

In [31]:
aggregations = {'Company Location': ['count', ], #'first' 
                'Review Date': [lambda x: list(x), lambda x: int(np.mean(x))], 
                'Country of Bean Origin': lambda x: list(x), 
                'Specific Bean Origin or Bar Name': lambda x: list(x),
                'Cocoa Percent': lambda x: list(x),
                'Cocoa Percent Int': lambda x: np.round(np.mean(x), 2),
                'Rating': [lambda x: list(x), lambda x: np.round(np.mean(x), 2)],
                'Ingredients List': ['sum', lambda x: list(x)],
                'Most Memorable Characteristics List': ['sum', lambda x: list(x)],
                }

In [32]:
aggregated = ratings.groupby('Company Location').agg(aggregations).reset_index()
aggregated.columns = ['company_location_name', 'company_location_count', 'review_date_list', 'review_date_mean', 'country_bean_origin_list',
                      'specific_bean_origin_list', 'cocoa_percent_list', 'cocoa_percent_mean', 'rating_list', 'rating_mean', 'ingredients_list',
                      'ingredients_list_of_list', 'memorable_characteristics_list', 'memorable_characteristics_list_of_list']
aggregated['latitude'] = [it['latitude'] for it in lat_long]
aggregated['longitude'] = [it['longitude'] for it in lat_long]
aggregated

Unnamed: 0,company_location_name,company_location_count,review_date_list,review_date_mean,country_bean_origin_list,specific_bean_origin_list,cocoa_percent_list,cocoa_percent_mean,rating_list,rating_mean,ingredients_list,ingredients_list_of_list,memorable_characteristics_list,memorable_characteristics_list_of_list,latitude,longitude
0,Amsterdam,12,"[2015, 2015, 2015, 2017, 2017, 2017, 2018, 201...",2016,"[Dominican Republic, Congo, Peru, Belize, Viet...","[Tres Hombres, Congo, Gorilla bar, Peru, Awagu...","[75%, 68%, 80%, 70%, 70%, 70%, 70%, 70%, 70%, ...",70.75,"[3.25, 3.5, 3.75, 3.0, 3.0, 3.25, 3.0, 3.25, 3...",3.31,"[Beans, Sugar, Cocoa Butter, Beans, Sugar, Coc...","[[Beans, Sugar, Cocoa Butter], [Beans, Sugar, ...","[earthy, coffee, mild, tang, bold, rich cocoa,...","[[earthy, coffee, mild, tang], [bold, rich coc...",28.417693,-81.581111
1,Argentina,9,"[2008, 2008, 2008, 2008, 2010, 2008, 2008, 200...",2008,"[Venezuela, Ecuador, Bolivia, Venezuela, Venez...","[Carenero Superior, Esmeraldas, Moxos, Ocumare...","[80%, 88%, 72%, 70%, 70%, 70%, 70%, 70%, 70%]",73.33,"[2.75, 2.75, 3.25, 3.75, 3.25, 3.5, 3.5, 3.5, ...",3.31,"[, , , , , Beans, Sugar, Cocoa Butter, Lecithi...","[[], [], [], [], [], [Beans, Sugar, Cocoa Butt...","[very sour, mildly bitter, slightly burnt, mil...","[[very sour, mildly bitter], [slightly burnt, ...",46.603354,1.888334
2,Australia,53,"[2019, 2012, 2012, 2012, 2015, 2015, 2014, 201...",2014,"[Solomon Islands, Brazil, Blend, Madagascar, B...","[Solomon Islands, Bahia, Houseblend, Sambirano...","[70%, 70%, 70%, 70%, 80%, 70%, 68%, 72%, 70%, ...",70.19,"[3.0, 2.5, 2.5, 3.0, 3.25, 3.5, 2.75, 3.0, 3.5...",3.36,"[Beans, Sugar, Cocoa Butter, Lecithin, Beans, ...","[[Beans, Sugar, Cocoa Butter, Lecithin], [Bean...","[off aroma,vegetal,honey,sandy, chalky, intens...","[[off aroma,vegetal,honey,sandy], [chalky, int...",-18.123970,179.012274
3,Austria,30,"[2016, 2016, 2016, 2016, 2016, 2016, 2016, 201...",2013,"[Bolivia, Haiti, Colombia, Ecuador, Trinidad, ...","[Alto Beni, Pisa, Tumaco, UNOCACE, San Juan Es...","[76%, 82%, 74%, 70%, 85%, 85%, 70%, 70%, 78%, ...",72.13,"[2.75, 3.0, 3.25, 2.75, 2.75, 3.0, 3.5, 3.5, 3...",3.26,"[Beans, Sugar, Cocoa Butter, Beans, Sugar, Coc...","[[Beans, Sugar, Cocoa Butter], [Beans, Sugar, ...","[sticky, pungent, off note, sandy, earthy, swe...","[[sticky, pungent, off note], [sandy, earthy, ...",-16.525507,168.106915
4,Belgium,63,"[2010, 2010, 2010, 2010, 2011, 2011, 2013, 201...",2013,"[Costa Rica, Papua New Guinea, Peru, Ecuador, ...","[Costa Rica, Papua New Guinea, Peru, Ecuador, ...","[64%, 64%, 64%, 71%, 72%, 72%, 74%, 74%, 74%, ...",72.14,"[2.75, 2.75, 2.75, 3.5, 3.75, 4.0, 3.5, 3.5, 3...",3.10,"[Beans, Sugar, Cocoa Butter, Lecithin, Beans, ...","[[Beans, Sugar, Cocoa Butter, Lecithin], [Bean...","[pastey, nutty, intense, smokey, sticky, moss,...","[[pastey, nutty], [intense, smokey], [sticky, ...",-1.339767,-79.366697
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62,U.S.A.,1136,"[2019, 2019, 2019, 2021, 2021, 2021, 2021, 201...",2015,"[Tanzania, Dominican Republic, Madagascar, Fij...","[Kokoa Kamili, batch 1, Zorzal, batch 1, Bejof...","[76%, 76%, 76%, 68%, 72%, 80%, 68%, 70%, 70%, ...",71.39,"[3.25, 3.5, 3.75, 3.0, 3.0, 3.25, 3.5, 3.75, 3...",3.19,"[Beans, Sugar, Cocoa Butter, Beans, Sugar, Coc...","[[Beans, Sugar, Cocoa Butter], [Beans, Sugar, ...","[rich cocoa, fatty, bready, cocoa, vegetal, sa...","[[rich cocoa, fatty, bready], [cocoa, vegetal,...",-31.761336,-71.318770
63,Vanuatu,4,"[2019, 2019, 2019, 2019]",2019,"[Vanuatu, Vanuatu, Vanuatu, Vanuatu]","[Malekula Island, batch M10/19, Malo Island, b...","[70%, 70%, 70%, 70%]",70.00,"[2.5, 2.75, 2.75, 3.0]",2.75,"[Beans, Sugar, Beans, Sugar, Beans, Sugar, Bea...","[[Beans, Sugar], [Beans, Sugar], [Beans, Sugar...","[unrefined, sweet, metallic, basic, sweet, coc...","[[unrefined, sweet, metallic], [basic, sweet, ...",8.030028,-1.080027
64,Venezuela,31,"[2018, 2018, 2018, 2014, 2014, 2014, 2014, 201...",2014,"[Venezuela, Venezuela, Venezuela, Venezuela, V...","[Merida, Carabobo, Aragua, Agua Fria; Sucre re...","[70%, 70%, 70%, 75%, 74%, 75%, 70%, 74%, 75%, ...",70.00,"[3.0, 3.0, 3.5, 2.5, 3.25, 3.5, 3.75, 3.25, 3....",3.11,"[Beans, Sugar, Cocoa Butter, Lecithin, Beans, ...","[[Beans, Sugar, Cocoa Butter, Lecithin], [Bean...","[gritty, spicy, earthy, fatty, bland, nutty, s...","[[gritty, spicy, earthy], [fatty, bland, nutty...",64.984182,-18.105901
65,Vietnam,16,"[2018, 2021, 2011, 2012, 2012, 2012, 2012, 201...",2014,"[Vietnam, Vietnam, Vietnam, Vietnam, Vietnam, ...","[Lam Dong, Ben Tre, batch 70301, Ben Tre, Dong...","[75%, 70%, 72%, 80%, 76%, 72%, 70%, 78%, 74%, ...",74.06,"[3.25, 3.75, 3.0, 3.0, 3.5, 3.5, 3.5, 3.5, 3.5...",3.36,"[Beans, Sugar, Cocoa Butter, Beans, Sugar, Coc...","[[Beans, Sugar, Cocoa Butter], [Beans, Sugar, ...","[high acidity, nutty, roasty, dominate cocoa n...","[[high acidity, nutty, roasty], [dominate coco...",4.141303,-56.077119


In [40]:
json_agg = []

for row in aggregated.iterrows():
    json_agg.append(dict(row[1]))

json_agg[0]

{'company_location_name': 'Amsterdam',
 'company_location_count': 12,
 'review_date_list': [2015,
  2015,
  2015,
  2017,
  2017,
  2017,
  2018,
  2018,
  2018,
  2019,
  2015,
  2018],
 'review_date_mean': 2016,
 'country_bean_origin_list': ['Dominican Republic',
  'Congo',
  'Peru',
  'Belize',
  'Vietnam',
  'Mexico',
  'Madagascar',
  'Philippines',
  'Mexico',
  'Papua New Guinea',
  'Blend',
  'Sulawesi'],
 'specific_bean_origin_list': ['Tres Hombres',
  'Congo, Gorilla bar',
  'Peru, Awagum bar',
  'Belize',
  'Vietnam',
  'Xoconusco',
  "Mava Sa Ferme D'ottange",
  'Kablon Farm',
  'Xoconusco, triple turned',
  'Gazelle',
  'South America',
  'Sulawesi'],
 'cocoa_percent_list': ['75%',
  '68%',
  '80%',
  '70%',
  '70%',
  '70%',
  '70%',
  '70%',
  '70%',
  '70%',
  '70%',
  '66%'],
 'cocoa_percent_mean': 70.75,
 'rating_list': [3.25,
  3.5,
  3.75,
  3.0,
  3.0,
  3.25,
  3.0,
  3.25,
  3.75,
  3.25,
  3.5,
  3.25],
 'rating_mean': 3.31,
 'ingredients_list': ['Beans',
  'Sug

In [42]:
with open('aggregated_dataset.json', 'w') as outfile:
    json.dump(json_agg, outfile, )

In [15]:
makers = pd.read_csv('../data/chocolate_makers.csv')
makers.shape, makers.columns

((373, 5),
 Index(['COMPANY NAME', 'CITY', 'STATE/PROVINCE', 'OWNER/MAKER', 'COUNTRY'], dtype='object'))

In [17]:
places = list(makers['STATE/PROVINCE'].unique())

lat_long = []
# use geopy to extract coordinates from place names
for place in tqdm(places): 
        loc = findGeocode(place) 
        lat_long.append({'latitude': loc.latitude, 'longitude': loc.longitude})


100%|██████████| 59/59 [00:32<00:00,  1.82it/s]


In [18]:
with open('state_locations.json', 'w') as outfile:
    json.dump(lat_long, outfile, )

In [21]:
places = list(makers['CITY'].unique())

lat_long = []
# use geopy to extract coordinates from place names
for place in tqdm(places): 
        loc = findGeocode(place) 
        try:
                lat_long.append({'latitude': loc.latitude, 'longitude': loc.longitude})
        except:
                print(place)

 21%|██        | 55/266 [00:27<01:43,  2.03it/s]

Ascata


 41%|████      | 108/266 [00:53<01:24,  1.87it/s]

Anahola (Kaua'I)


 71%|███████   | 189/266 [01:35<00:40,  1.90it/s]

Swamanoa


 82%|████████▏ | 217/266 [01:49<00:24,  2.03it/s]

Chattonooga


100%|██████████| 266/266 [02:14<00:00,  1.97it/s]


In [22]:
with open('city_locations.json', 'w') as outfile:
    json.dump(lat_long, outfile, )