## Importing Key Packages

In [2]:
import pandas as pd
import numpy as np

## Reading In the Data

In [3]:
df = pd.read_csv('../../../data/chocolate.csv')
df

Unnamed: 0,REF,Company (Manufacturer),Company Location,Review Date,Country of Bean Origin,Specific Bean Origin or Bar Name,Cocoa Percent,Ingredients,Most Memorable Characteristics,Rating
0,5,Jacque Torres,U.S.A.,2006,Ghana,Trinatario Treasure,71%,"5- B,S,C,V,L","gritty, unrefined, off notes",2.00
1,15,Neuhaus (Callebaut),Belgium,2006,Sao Tome,Sao Tome,75%,"5- B,S,C,V,L","grassy, earthy, burnt",2.75
2,15,Neuhaus (Callebaut),Belgium,2006,Blend,West Africa,73%,"5- B,S,C,V,L","non descript, poor aftertaste",2.00
3,15,Scharffen Berger,U.S.A.,2006,Blend,Bittersweet,70%,"5- B,S,C,V,L","cherry, mild bitter",3.50
4,15,Scharffen Berger,U.S.A.,2006,Blend,Extra Dark,82%,"5- B,S,C,V,L","dry, bitter, poor aftertaste",2.00
...,...,...,...,...,...,...,...,...,...,...
2357,995,Bahen & Co.,Australia,2012,Madagascar,Sambirano,70%,"2- B,S","unrefined, flat, grassy",3.00
2358,999,Mast Brothers,U.S.A.,2012,Venezuela,Chuao,73%,"2- B,S","gritty, sour, sweet",2.75
2359,999,Mast Brothers,U.S.A.,2012,Blend,Madagascar,72%,"2- B,S","gritty, overly tart, sour",2.50
2360,999,Bahen & Co.,Australia,2012,Brazil,Bahia,70%,"2- B,S","chalky, intense, chemical",2.50


## Data Cleaning

Next, we dropped an unecessary column from our dataframe

In [5]:
df = df.dropna(subset = ['Ingredients'])

We then checked the dataframe for null values.


Luckily, there were not any.

In [4]:
df.isna().sum()

REF                                 0
Company (Manufacturer)              0
Company Location                    0
Review Date                         0
Country of Bean Origin              0
Specific Bean Origin or Bar Name    0
Cocoa Percent                       0
Ingredients                         0
Most Memorable Characteristics      0
Rating                              0
dtype: int64

To make the Review Data column easier to manage, we converted it to datetime.

In [6]:
df.loc[:,'Review Date'] = pd.to_datetime(df['Review Date'])

In [7]:
df.dtypes

REF                                          int64
Company (Manufacturer)                      object
Company Location                            object
Review Date                         datetime64[ns]
Country of Bean Origin                      object
Specific Bean Origin or Bar Name            object
Cocoa Percent                               object
Ingredients                                 object
Most Memorable Characteristics              object
Rating                                     float64
dtype: object

In [8]:
df

Unnamed: 0,REF,Company (Manufacturer),Company Location,Review Date,Country of Bean Origin,Specific Bean Origin or Bar Name,Cocoa Percent,Ingredients,Most Memorable Characteristics,Rating
0,5,Jacque Torres,U.S.A.,1970-01-01 00:00:00.000002006,Ghana,Trinatario Treasure,71%,"5- B,S,C,V,L","gritty, unrefined, off notes",2.00
1,15,Neuhaus (Callebaut),Belgium,1970-01-01 00:00:00.000002006,Sao Tome,Sao Tome,75%,"5- B,S,C,V,L","grassy, earthy, burnt",2.75
2,15,Neuhaus (Callebaut),Belgium,1970-01-01 00:00:00.000002006,Blend,West Africa,73%,"5- B,S,C,V,L","non descript, poor aftertaste",2.00
3,15,Scharffen Berger,U.S.A.,1970-01-01 00:00:00.000002006,Blend,Bittersweet,70%,"5- B,S,C,V,L","cherry, mild bitter",3.50
4,15,Scharffen Berger,U.S.A.,1970-01-01 00:00:00.000002006,Blend,Extra Dark,82%,"5- B,S,C,V,L","dry, bitter, poor aftertaste",2.00
...,...,...,...,...,...,...,...,...,...,...
2357,995,Bahen & Co.,Australia,1970-01-01 00:00:00.000002012,Madagascar,Sambirano,70%,"2- B,S","unrefined, flat, grassy",3.00
2358,999,Mast Brothers,U.S.A.,1970-01-01 00:00:00.000002012,Venezuela,Chuao,73%,"2- B,S","gritty, sour, sweet",2.75
2359,999,Mast Brothers,U.S.A.,1970-01-01 00:00:00.000002012,Blend,Madagascar,72%,"2- B,S","gritty, overly tart, sour",2.50
2360,999,Bahen & Co.,Australia,1970-01-01 00:00:00.000002012,Brazil,Bahia,70%,"2- B,S","chalky, intense, chemical",2.50


However, the conversion to datetime tacked on an uneccessary default month and day of January 1st to the Review Date values. We fixed this by using dt.year and creating a new column, Year Reviewed.

In [10]:
df["Year Reviewed"] = df['Review Date'].dt.year

Next, we dropped the old Review Date column.

In [11]:
df = df.drop('Review Date', axis=1)

To incorporate various climate data into our chocolate dataset, we wrote multiple functions using dictionaries. These functions merged datasets on the basis of Country of Bean Origin.

In [17]:
def lat(country):
    lat_dict = {'Ghana':7.946527, 'Sao Tome':0.18636, 'Blend':np.NAN, 'Colombia':4.570868, 'Dominican Republic':18.735693, 'Madagascar':-18.766947,
 'Papua New Guinea':-6.314993, 'Venezuela':6.42375, 'U.S.A.':19.8968, 'Jamaica':18.109581, 'Vanuatu':-15.376706, 'Ecuador':-1.831239,
 'Bolivia':-16.290154, 'Trinidad':10.691803, 'Mexico':23.634501, 'Sri Lanka':7.873054, 'Peru':-9.189967, 'Indonesia':-0.789275, 'Cuba':21.521757,
 'Liberia':6.428055, 'Panama':8.537981, 'Brazil':-14.235004, 'Burma':21.913965, 'Belize':17.189877, 'St. Lucia':13.909444, 'Nicaragua':12.865416,
 'Vietnam':14.058324, 'Gabon':-0.803689, 'Congo':-0.228021, 'Puerto Rico':18.220833, 'Costa Rica':9.748917, 'Haiti':18.971187,
 'Philippines':12.879721, 'Tobago':10.691803, 'Martinique':14.641528, 'Grenada':12.262776, 'Guatemala':15.783471, 'Honduras':15.199999,
 'Fiji':-16.578193, 'Uganda':1.373333, 'Tanzania':-6.369028, 'El Salvador':13.794185, 'Ivory Coast':7.5400, 'Australia':-25.274398,
 'Cameroon':7.369722, 'Togo':8.619543, 'Malaysia':4.210484, 'Solomon Islands':-9.64571, 'Samoa':-13.759029, 'India':20.593684, 'Suriname':3.919305,
 'Nigeria':9.081999, 'Sulawesi':1.8479, 'Taiwan':23.69781, 'Sierra Leone':8.460555, 'Sumatra':0.5897, 'Thailand':15.870032,
 'DR Congo':4.0383, 'St.Vincent-Grenadines':12.984305, 'Sao Tome & Principe':0.18636, 'Principe':0.18636}
    if country in lat_dict.keys():
        return lat_dict[country]
    else:
        return np.nan

df['Latitude'] = df["Country of Bean Origin"].apply(lat)

In [18]:
def long(country):
    long_dict = {'Ghana':-1.023194, 'Sao Tome':6.613081, 'Blend':np.NAN, 'Colombia':-74.297333, 'Dominican Republic':-70.162651, 'Madagascar':46.869107,
 'Papua New Guinea':143.95555, 'Venezuela':-66.58973, 'U.S.A.':155.5828, 'Jamaica':-77.297508, 'Vanuatu':166.959158, 'Ecuador':-78.183406,
 'Bolivia':-63.588653, 'Trinidad':-61.222503, 'Mexico':-102.552784, 'Sri Lanka':80.771797, 'Peru':-75.015152, 'Indonesia':113.921327, 'Cuba':-77.781167,
 'Liberia':-9.429499, 'Panama':-80.782127, 'Brazil':-51.92528, 'Burma':95.956223, 'Belize':-88.49765, 'St. Lucia':-60.978893, 'Nicaragua':-85.207229,
 'Vietnam':108.277199, 'Gabon':11.609444, 'Congo':15.827659, 'Puerto Rico':-66.590149, 'Costa Rica':-83.753428, 'Haiti':-72.285215,
 'Philippines':121.774017, 'Tobago':-61.222503, 'Martinique':-61.024174, 'Grenada':-61.604171, 'Guatemala':-90.230759, 'Honduras':-86.241905,
 'Fiji':179.414413, 'Uganda':32.290275, 'Tanzania':34.888822, 'El Salvador':-88.89653, 'Ivory Coast':5.5471, 'Australia':133.775136,
 'Cameroon':12.354722, 'Togo':0.824782, 'Malaysia':101.975766, 'Solomon Islands':160.156194, 'Samoa':-172.104629, 'India':78.96288, 'Suriname':-56.027783,
 'Nigeria':8.675277, 'Sulawesi':120.5279, 'Taiwan':120.960515, 'Sierra Leone':-11.779889, 'Sumatra':101.3431, 'Thailand':100.992541,
 'DR Congo':21.758664, 'St.Vincent-Grenadines':-61.287228, 'Sao Tome & Principe':6.613081, 'Principe':6.613081}
    if country in long_dict.keys():
        return long_dict [country]
    else:
        return np.nan

df['Longitude'] = df["Country of Bean Origin"].apply(long)

In [19]:
def temp(country):
    temp_dict = {'Ghana':27.25, 'Sao Tome':27, 'Blend':np.NAN, 'Colombia':24.40, 'Dominican Republic':23.93, 'Madagascar':22.01,
 'Papua New Guinea':25.05, 'Venezuela':25.32, 'U.S.A.':27.5, 'Jamaica':24.47, 'Vanuatu':23.84, 'Ecuador':21.33,
 'Bolivia':20.98, 'Trinidad':25.86, 'Mexico':20.54, 'Sri Lanka':26.76, 'Peru':19.46, 'Indonesia':25.72, 'Cuba':25.13,
 'Liberia':25.25, 'Panama':24.72, 'Brazil':24.92, 'Burma':22.96, 'Belize':25.06, 'St. Lucia':27, 'Nicaragua':24.59,
 'Vietnam':24.07, 'Gabon':25.00, 'Congo':24.52, 'Puerto Rico':24.28, 'Costa Rica':23.85, 'Haiti':24.48,
 'Philippines':25.33, 'Tobago':25.86, 'Martinique':26.7, 'Grenada':28, 'Guatemala':23.07, 'Honduras':23.39,
 'Fiji':23.37, 'Uganda':22.60, 'Tanzania':22.31, 'El Salvador':24.77, 'Ivory Coast':26.30, 'Australia':21.51,
 'Cameroon':24.53, 'Togo':26.80, 'Malaysia':25.14, 'Solomon Islands':25.68, 'Samoa':28.5, 'India':23.95, 'Suriname':25.77,
 'Nigeria':26.78, 'Sulawesi':21.4, 'Taiwan':22, 'Sierra Leone':26.04, 'Sumatra':25.2, 'Thailand':26.25,
 'DR Congo':24.04, 'St.Vincent-Grenadines':26.82, 'Sao Tome & Principe':24.7, 'Principe':24.7}
    if country in temp_dict.keys():
        return temp_dict[country]
    else:
        return np.nan

df['Temperature (Celsius)'] = df["Country of Bean Origin"].apply(temp)

In [20]:
def precipitation(country):
    precipitation_dict = {'Ghana':1184.94, 'Sao Tome':1382, 'Blend':np.NAN, 'Colombia':2618.59, 'Dominican Republic':1417.07, 'Madagascar':1475.68,
 'Papua New Guinea':3106.06, 'Venezuela':1957.65, 'U.S.A.':433, 'Jamaica':2114.18, 'Vanuatu':2662.11, 'Ecuador':1945.27,
 'Bolivia':1095.56, 'Trinidad':1831.41, 'Mexico':738.14, 'Sri Lanka':1699.41, 'Peru':1513.24, 'Indonesia':2801.54, 'Cuba':1329.49,
 'Liberia':2460.08, 'Panama':2487.03, 'Brazil':1762.97, 'Burma':2007.70, 'Belize':2089.26, 'St. Lucia':1400, 'Nicaragua':2406.97,
 'Vietnam':1835.16, 'Gabon':1819.44, 'Congo':1635.05, 'Puerto Rico':2126.02, 'Costa Rica':3268.27, 'Haiti':1480.20,
 'Philippines':2317.59, 'Tobago':1831.41, 'Martinique':1159, 'Grenada':1208, 'Guatemala':2606.33, 'Honduras':1945.76,
 'Fiji':2905.11, 'Uganda':1207.90, 'Tanzania':1052.38, 'El Salvador':1672.81, 'Ivory Coast':1356.08, 'Australia':473.09,
 'Cameroon':1613.07, 'Togo':1189.11, 'Malaysia':2992.79, 'Solomon Islands':3157.37, 'Samoa':4500, 'India':1072.22, 'Suriname':2316.81,
 'Nigeria':1138.53, 'Sulawesi':3835, 'Taiwan':2590, 'Sierra Leone':2471.72, 'Sumatra':4000, 'Thailand':1532.48,
 'DR Congo':1535.04, 'St.Vincent-Grenadines':1585.97, 'Sao Tome & Principe':1382, 'Principe':1382}
    if country in precipitation_dict.keys():
        return precipitation_dict[country]
    else:
        return np.nan

df['Precipitation (millimeters)'] = df["Country of Bean Origin"].apply(precipitation)

In [21]:
df

Unnamed: 0,REF,Company (Manufacturer),Company Location,Country of Bean Origin,Specific Bean Origin or Bar Name,Cocoa Percent,Ingredients,Most Memorable Characteristics,Rating,Year Reviewed,Latitude,Longitude,Temperature (Celsius),Precipitation (millimeters)
0,5,Jacque Torres,U.S.A.,Ghana,Trinatario Treasure,71%,"5- B,S,C,V,L","gritty, unrefined, off notes",2.00,1970,7.946527,-1.023194,27.25,1184.94
1,15,Neuhaus (Callebaut),Belgium,Sao Tome,Sao Tome,75%,"5- B,S,C,V,L","grassy, earthy, burnt",2.75,1970,0.186360,6.613081,27.00,1382.00
2,15,Neuhaus (Callebaut),Belgium,Blend,West Africa,73%,"5- B,S,C,V,L","non descript, poor aftertaste",2.00,1970,,,,
3,15,Scharffen Berger,U.S.A.,Blend,Bittersweet,70%,"5- B,S,C,V,L","cherry, mild bitter",3.50,1970,,,,
4,15,Scharffen Berger,U.S.A.,Blend,Extra Dark,82%,"5- B,S,C,V,L","dry, bitter, poor aftertaste",2.00,1970,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2357,995,Bahen & Co.,Australia,Madagascar,Sambirano,70%,"2- B,S","unrefined, flat, grassy",3.00,1970,-18.766947,46.869107,22.01,1475.68
2358,999,Mast Brothers,U.S.A.,Venezuela,Chuao,73%,"2- B,S","gritty, sour, sweet",2.75,1970,6.423750,-66.589730,25.32,1957.65
2359,999,Mast Brothers,U.S.A.,Blend,Madagascar,72%,"2- B,S","gritty, overly tart, sour",2.50,1970,,,,
2360,999,Bahen & Co.,Australia,Brazil,Bahia,70%,"2- B,S","chalky, intense, chemical",2.50,1970,-14.235004,-51.925280,24.92,1762.97
