In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import json
from geopy import Nominatim


In [2]:
ratings = pd.read_csv('../data/chocolate_ratings.csv')
ratings.shape, ratings.columns

((2530, 10),
 Index(['REF', 'Company (Manufacturer)', 'Company Location', 'Review Date',
        'Country of Bean Origin', 'Specific Bean Origin or Bar Name',
        'Cocoa Percent', 'Ingredients', 'Most Memorable Characteristics',
        'Rating'],
       dtype='object'))

In [3]:
ratings['Cocoa Percent Int'] = ratings['Cocoa Percent'].str[:2].astype(int)

abbrev2ingredient = {'B':'Beans', 'S':'Sugar', 'S*': 'Sweetener other than white cane or beet sugar', 'C':'Cocoa Butter', 
                    'V': 'Vanilla', 'L': 'Lecithin', 'Sa': 'Salt', '':''}

ratings['Ingredients List'] = [[abbrev2ingredient[x] for x in i[1].strip().split(',')] for i in ratings['Ingredients'].fillna('0 - ').str.split('-')]

ratings['Most Memorable Characteristics List'] = ratings['Most Memorable Characteristics'].fillna(',').str.split(', ')

In [4]:
ratings.head()

Unnamed: 0,REF,Company (Manufacturer),Company Location,Review Date,Country of Bean Origin,Specific Bean Origin or Bar Name,Cocoa Percent,Ingredients,Most Memorable Characteristics,Rating,Cocoa Percent Int,Ingredients List,Most Memorable Characteristics List
0,2454,5150,U.S.A.,2019,Tanzania,"Kokoa Kamili, batch 1",76%,"3- B,S,C","rich cocoa, fatty, bready",3.25,76,"[Beans, Sugar, Cocoa Butter]","[rich cocoa, fatty, bready]"
1,2458,5150,U.S.A.,2019,Dominican Republic,"Zorzal, batch 1",76%,"3- B,S,C","cocoa, vegetal, savory",3.5,76,"[Beans, Sugar, Cocoa Butter]","[cocoa, vegetal, savory]"
2,2454,5150,U.S.A.,2019,Madagascar,"Bejofo Estate, batch 1",76%,"3- B,S,C","cocoa, blackberry, full body",3.75,76,"[Beans, Sugar, Cocoa Butter]","[cocoa, blackberry, full body]"
3,2542,5150,U.S.A.,2021,Fiji,"Matasawalevu, batch 1",68%,"3- B,S,C","chewy, off, rubbery",3.0,68,"[Beans, Sugar, Cocoa Butter]","[chewy, off, rubbery]"
4,2546,5150,U.S.A.,2021,Venezuela,"Sur del Lago, batch 1",72%,"3- B,S,C","fatty, earthy, moss, nutty,chalky",3.0,72,"[Beans, Sugar, Cocoa Butter]","[fatty, earthy, moss, nutty,chalky]"


In [5]:
def findGeocode(place):  
    '''Finds the geocode of any location'''
    try:
        geolocator = Nominatim(user_agent="") ##user agent should be created
        return geolocator.geocode(place) 
    except Exception as e:
        print(e)
        return None


## Companies json

In [6]:
places = list(ratings['Company Location'].unique())

In [76]:
# lat_long = []
# # use geopy to extract coordinates from place names
# for place in tqdm(places): 
#         loc = findGeocode(place) 
#         lat_long.append({'latitude': loc.latitude, 'longitude': loc.longitude})


In [77]:
# with open('jsons/country_locations.json', 'w') as outfile:
#     json.dump(lat_long, outfile, )

In [7]:
with open('jsons/country_locations.json', 'r') as infile:
    lat_long = json.load(infile, )

print(lat_long[:2])
places_dic = {places[i]:lat_long[i] for i in range(len(lat_long))}

[{'latitude': 28.41769265, 'longitude': -81.58111071077724}, {'latitude': 46.603354, 'longitude': 1.8883335}]


In [8]:
aggregations = {'Company Location': ['count', ], #'first' 
                'Company (Manufacturer)': lambda x: list(x),
                'Review Date': [lambda x: list(x), lambda x: int(np.mean(x))], 
                'Country of Bean Origin': lambda x: list(x), 
                'Specific Bean Origin or Bar Name': lambda x: list(x),
                'Cocoa Percent': lambda x: list(x),
                'Cocoa Percent Int': lambda x: np.round(np.mean(x), 2),
                'Rating': [lambda x: list(x), lambda x: np.round(np.mean(x), 2)],
                'Ingredients List': ['sum', lambda x: list(x)],
                'Most Memorable Characteristics List': ['sum', lambda x: list(x)],
                }

In [16]:
aggregated = ratings.groupby('Company Location').agg(aggregations).reset_index()
aggregated.columns = ['location_name', 'location_count', 'company_names_list', 'review_date_list', 'review_date_mean', 'country_bean_origin_list',
                      'specific_bean_origin_list', 'cocoa_percent_list', 'cocoa_percent_mean', 'rating_list', 'rating_mean', 'ingredients_lists',
                      'ingredients_list_list', 'memorable_characteristics_lists', 'memorable_characteristics_list_list']
aggregated['latitude'] = [places_dic[loc]['latitude'] for loc in aggregated['location_name']]
aggregated['longitude'] = [places_dic[loc]['longitude'] for loc in aggregated['location_name']]
aggregated.sample()

Unnamed: 0,location_name,location_count,company_names_list,review_date_list,review_date_mean,country_bean_origin_list,specific_bean_origin_list,cocoa_percent_list,cocoa_percent_mean,rating_list,rating_mean,ingredients_lists,ingredients_list_list,memorable_characteristics_lists,memorable_characteristics_list_list,latitude,longitude
14,Ecuador,58,"[Aequare (Gianduja), Aequare (Gianduja), Bouga...","[2009, 2009, 2009, 2009, 2008, 2008, 2008, 200...",2010,"[Ecuador, Ecuador, Ecuador, Ecuador, Ecuador, ...","[Los Rios, Quevedo, Arriba, Los Rios, Quevedo,...","[55%, 70%, 100%, 77%, 91%, 82%, 63%, 71%, 77%,...",69.28,"[2.75, 3.0, 1.5, 3.25, 1.5, 2.5, 3.5, 3.5, 2.0...",3.04,"[Beans, Sugar, Cocoa Butter, Vanilla, Beans, S...","[[Beans, Sugar, Cocoa Butter, Vanilla], [Beans...","[sandy, sweet, banana cream, roasty, nutty, co...","[[sandy, sweet, banana cream], [roasty, nutty,...",-1.339767,-79.366697


In [17]:
to_nest = [col for col in aggregated.columns if col.split('_')[-1] == 'list']
to_nest

['company_names_list',
 'review_date_list',
 'country_bean_origin_list',
 'specific_bean_origin_list',
 'cocoa_percent_list',
 'rating_list',
 'ingredients_list_list',
 'memorable_characteristics_list_list']

In [18]:
json_agg = []

for row in aggregated.iterrows():

    dic_add = {col:row[1][col] for col in aggregated.columns if col not in to_nest}
    dic_add['card'] = [{'_'.join(col.split('_')[:-1]) : row[1][col][i] for col in aggregated.columns if col in to_nest} for i in range(row[1]['location_count']) ]
    json_agg.append(dic_add)

In [19]:
json_agg[0]

{'location_name': 'Amsterdam',
 'location_count': 12,
 'review_date_mean': 2016,
 'cocoa_percent_mean': 70.75,
 'rating_mean': 3.31,
 'ingredients_lists': ['Beans',
  'Sugar',
  'Cocoa Butter',
  'Beans',
  'Sugar',
  'Cocoa Butter',
  'Beans',
  'Sugar',
  'Cocoa Butter',
  'Beans',
  'Sugar',
  'Beans',
  'Sugar',
  'Beans',
  'Sugar',
  'Beans',
  'Sugar',
  'Beans',
  'Sugar',
  'Beans',
  'Sugar',
  'Beans',
  'Sugar',
  'Beans',
  'Sugar',
  'Vanilla',
  'Beans',
  'Sweetener other than white cane or beet sugar',
  'Cocoa Butter'],
 'memorable_characteristics_lists': ['earthy',
  'coffee',
  'mild',
  'tang',
  'bold',
  'rich cocoa',
  'coffee',
  'nuts',
  'coffee',
  'peanut butter',
  'gritty',
  'sour',
  'berries',
  'sticky',
  'molasses',
  'sour',
  'medium roast',
  'nutty',
  'roasty',
  'nutty',
  'nuts',
  'dried fruit',
  'grassy',
  'intense',
  'high acidity',
  'astringent',
  'smomkey cocoa',
  'accesible',
  'simple cocoa note',
  'nuts',
  'fruit',
  'cocoa'],

In [20]:
with open('jsons/aggregated_dataset.json', 'w') as outfile:
    json.dump(json_agg, outfile, )

## Beans json

In [21]:
# one json for bean origin
# one for links from country to bean origin

places = list(ratings['Country of Bean Origin'].unique())

In [85]:
# lat_long = []
# # use geopy to extract coordinates from place names
# for place in tqdm(places): 
#         loc = findGeocode(place) 
#         lat_long.append({'latitude': loc.latitude, 'longitude': loc.longitude})

In [86]:
# with open('jsons/beans_locations.json', 'w') as outfile:
#     json.dump(lat_long, outfile, )

In [22]:
with open('jsons/beans_locations.json', 'r') as infile:
    lat_long_beans = json.load(infile, )

print(lat_long_beans[:2])
places_dic_beans = {places[i]:lat_long_beans[i] for i in range(len(lat_long_beans))}

[{'latitude': -6.5247123, 'longitude': 35.7878438}, {'latitude': 19.0974031, 'longitude': -70.3028026}]


In [23]:
aggregations_beans = {'Country of Bean Origin': ['count', ], #'first' 
                'Company (Manufacturer)': lambda x: list(x),
                'Review Date': [lambda x: list(x), lambda x: int(np.mean(x))], 
                'Company Location': lambda x: list(x), 
                'Specific Bean Origin or Bar Name': lambda x: list(x),
                'Cocoa Percent': lambda x: list(x),
                'Cocoa Percent Int': lambda x: np.round(np.mean(x), 2),
                'Rating': [lambda x: list(x), lambda x: np.round(np.mean(x), 2)],
                'Ingredients List': ['sum', lambda x: list(x)],
                'Most Memorable Characteristics List': ['sum', lambda x: list(x)],
                }

In [24]:
aggregated_beans = ratings.groupby('Country of Bean Origin').agg(aggregations_beans).reset_index()
aggregated_beans.columns = ['location_name', 'location_count', 'company_names_list', 'review_date_list', 'review_date_mean', 'company_location_list',
                      'specific_bean_origin_list', 'cocoa_percent_list', 'cocoa_percent_mean', 'rating_list', 'rating_mean', 'ingredients_lists',
                      'ingredients_list_list', 'memorable_characteristics_lists', 'memorable_characteristics_list_list']
aggregated_beans['latitude'] = [places_dic_beans[loc]['latitude'] for loc in aggregated_beans['location_name']]
aggregated_beans['longitude'] = [places_dic_beans[loc]['longitude'] for loc in aggregated_beans['location_name']]
aggregated_beans.sample()

Unnamed: 0,location_name,location_count,company_names_list,review_date_list,review_date_mean,company_location_list,specific_bean_origin_list,cocoa_percent_list,cocoa_percent_mean,rating_list,rating_mean,ingredients_lists,ingredients_list_list,memorable_characteristics_lists,memorable_characteristics_list_list,latitude,longitude
49,Sumatra,1,[Love Brown],[2018],2018,[Taiwan],[Sumatra],[70%],70.0,[3.0],3.0,"[Beans, Sugar]","[[Beans, Sugar]]","[smokey, earthy, cocoa]","[[smokey, earthy, cocoa]]",-0.143294,101.624102


In [25]:
to_nest = [col for col in aggregated_beans.columns if col.split('_')[-1] == 'list']

json_agg_beans = []

for row in aggregated_beans.iterrows():

    dic_add = {col:row[1][col] for col in aggregated_beans.columns if col not in to_nest}
    dic_add['card'] = [{'_'.join(col.split('_')[:-1]) : row[1][col][i] for col in aggregated_beans.columns if col in to_nest} for i in range(row[1]['location_count']) ]
    json_agg_beans.append(dic_add)


In [26]:
json_agg_beans[0]

{'location_name': 'Australia',
 'location_count': 3,
 'review_date_mean': 2014,
 'cocoa_percent_mean': 69.0,
 'rating_mean': 3.25,
 'ingredients_lists': ['Beans',
  'Sugar',
  'Cocoa Butter',
  'Vanilla',
  'Lecithin',
  'Beans',
  'Sugar',
  'Cocoa Butter',
  'Beans',
  'Sugar',
  'Cocoa Butter'],
 'memorable_characteristics_lists': ['vanilla',
  'banana',
  'citrus',
  'sandy',
  'candy like vanilla',
  'off',
  'delicate',
  'smooth',
  'dairy'],
 'latitude': -24.7761086,
 'longitude': 134.755,
 'card': [{'company_names': 'Daintree',
   'review_date': 2011,
   'company_location': 'Australia',
   'specific_bean_origin': 'Daintree Estates, N. Queensland',
   'cocoa_percent': '70%',
   'rating': 3.25,
   'ingredients_list': ['Beans',
    'Sugar',
    'Cocoa Butter',
    'Vanilla',
    'Lecithin'],
   'memorable_characteristics_list': ['vanilla', 'banana', 'citrus']},
  {'company_names': 'Daintree',
   'review_date': 2015,
   'company_location': 'Australia',
   'specific_bean_origin': '

In [27]:
with open('jsons/bean_dataset.json', 'w') as outfile:
    json.dump(json_agg_beans, outfile, )

## Links json

In [128]:
aggregations_link = {
                'Company Location': 'first', 
                'Country of Bean Origin': 'first', #'first' 
                'Company (Manufacturer)': lambda x: list(x),
                }

In [129]:
aggregated_links = ratings.groupby(['Company Location', 'Country of Bean Origin']).agg(aggregations_link)#.reset_index()
aggregated_links.columns = ['company_location_name', 'bean_location_name', 'company_names_list',  ]
aggregated_links = aggregated_links.reset_index() 
aggregated_links['latitude'] = [places_dic[loc]['latitude'] for loc in aggregated_links['company_location_name']]
aggregated_links['longitude'] = [places_dic[loc]['longitude'] for loc in aggregated_links['company_location_name']]
aggregated_links['latitude_beans'] = [places_dic_beans[loc]['latitude'] for loc in aggregated_links['bean_location_name']]
aggregated_links['longitude_beans'] = [places_dic_beans[loc]['longitude'] for loc in aggregated_links['bean_location_name']]

aggregated_links['link_name'] = aggregated_links['Company Location'] + '_' + aggregated_links['Country of Bean Origin']
aggregated_links = aggregated_links.drop(columns=['Company Location', 'Country of Bean Origin'])
aggregated_links['link_count'] = aggregated_links['company_names_list'].str.len()
aggregated_links.sample()

Unnamed: 0,company_location_name,bean_location_name,company_names_list,latitude,longitude,latitude_beans,longitude_beans,link_name,link_count
181,Ghana,Ghana,[Omanhene],8.030028,-1.080027,8.030028,-1.080027,Ghana_Ghana,1


In [132]:
to_nest = [col for col in aggregated_links.columns if col.split('_')[-1] == 'list']

json_agg_links = []

for row in aggregated_links.iterrows():

    dic_add = {col:row[1][col] for col in aggregated_links.columns if col not in to_nest}
    dic_add['card'] = [{'_'.join(col.split('_')[:-2]) : row[1][col][i] for col in aggregated_links.columns if col in to_nest} for i in range(row[1]['link_count']) ]
    json_agg_links.append(dic_add)


In [133]:
with open('jsons/link_dataset.json', 'w') as outfile:
    json.dump(json_agg_links, outfile, )

## For the future: makers locations

In [92]:
makers = pd.read_csv('../data/chocolate_makers.csv')
makers.shape, makers.columns

((373, 5),
 Index(['COMPANY NAME', 'CITY', 'STATE/PROVINCE', 'OWNER/MAKER', 'COUNTRY'], dtype='object'))

In [93]:
places = list(makers['STATE/PROVINCE'].unique())

# lat_long = []
# # use geopy to extract coordinates from place names
# for place in tqdm(places): 
#         loc = findGeocode(place) 
#         lat_long.append({'latitude': loc.latitude, 'longitude': loc.longitude})


In [94]:
# with open('state_locations.json', 'w') as outfile:
#     json.dump(lat_long, outfile, )

In [95]:
places = list(makers['CITY'].unique())

# lat_long = []
# # use geopy to extract coordinates from place names
# for place in tqdm(places): 
#         loc = findGeocode(place) 
#         try:
#                 lat_long.append({'latitude': loc.latitude, 'longitude': loc.longitude})
#         except:
#                 print(place)

In [96]:
# with open('city_locations.json', 'w') as outfile:
#     json.dump(lat_long, outfile, )