In [1]:
import pandas as pd

In [2]:
# Load the products export; despite the .csv extension the file is tab-separated
food_facts = pd.read_csv(
    'en.openfoodfacts.org.products.csv',
    sep='\t',
    low_memory=False,
)

In [4]:
# Display the dataset dimensions as a (rows, columns) tuple
food_facts.shape

(665697, 174)

In [25]:
# Keep only the identifying columns and the country-related columns
country_columns = ['code', 'url', 'countries', 'countries_en']
countries_info = food_facts[country_columns]
countries_info.head()

Unnamed: 0,code,url,countries,countries_en,countries_tags
0,17,http://world-en.openfoodfacts.org/product/0000...,en:france,France,en:france
1,31,http://world-en.openfoodfacts.org/product/0000...,en:FR,France,en:france
2,123,http://world-en.openfoodfacts.org/product/0000...,en:france,France,en:france
3,291,http://world-en.openfoodfacts.org/product/0000...,en:france,France,en:france
4,949,http://world-en.openfoodfacts.org/product/0000...,en:france,France,en:france


In [41]:
# Rows carrying tags such as `to-be-completed` mostly have 'Milk and yogurt' in
# the `countries` column; collect those rows so they can be dropped later
is_milk_and_yogurt = countries_info['countries'] == 'Milk and yogurt'
rows_to_be_dropped = countries_info.loc[is_milk_and_yogurt]

In [42]:
# Drop the 'Milk and yogurt' rows from both the full dataset and the country subset.
# The index labels are captured once, up front: looking them up positionally via
# `food_facts.index[...]` after `food_facts` has been reassigned would return
# shifted labels and remove the wrong rows from `countries_info`.
labels_to_drop = rows_to_be_dropped.index
# errors='ignore' keeps this cell safe to re-run once the rows are already gone
food_facts = food_facts.drop(index=labels_to_drop, errors='ignore')
countries_info = countries_info.drop(index=labels_to_drop, errors='ignore')

In [59]:
# Count distinct product codes per `countries_en` value to check whether the
# data is skewed towards a particular country
products_by_unique_countries = (
    countries_info
    .groupby('countries_en')['code']
    .nunique()
    .reset_index()
)

In [66]:
# Split each comma-separated `countries_en` value into one column per country
countries_en_values = products_by_unique_countries['countries_en']
countries_en_values.str.split(',', expand=True)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,Afghanistan,,,,,,,,,,...,,,,,,,,,,
1,Afghanistan,France,,,,,,,,,...,,,,,,,,,,
2,Albania,,,,,,,,,,...,,,,,,,,,,
3,Albania,Bosnia and Herzegovina,Bulgaria,Croatia,Czech Republic,Finland,France,Greece,Hungary,Netherlands,...,Slovenia,Spain,Sweden,United Kingdom,,,,,,
4,Albania,Bosnia and Herzegovina,Croatia,Czech Republic,Hungary,Republic of Macedonia,Republika Srpska,Romania,,,...,,,,,,,,,,
5,Albania,Denmark,Italy,,,,,,,,...,,,,,,,,,,
6,Albania,France,,,,,,,,,...,,,,,,,,,,
7,Albania,France,Italy,,,,,,,,...,,,,,,,,,,
8,Albania,Italy,,,,,,,,,...,,,,,,,,,,
9,Albania,Republic of Macedonia,Romania,Serbia,Slovenia,,,,,,...,,,,,,,,,,


## Observations about the country information of products

- The `countries` column can contain different tags for the same country: in the first two rows it holds `en:france` and `en:FR`, both of which correspond to `France` in the `countries_en` column.
- For some rows the country tags are malformed and contain values such as `to-be-completed`. Most of these appear to be test entries: they are just about `Milk and yogurt` and have no `url` or `code` defined. So, we will drop them from the dataset.
- Another issue is that the countries are sometimes written in other languages, such as `fr:Allemagne`, and sometimes in Chinese characters.

In [67]:
# Preview the first rows of the full dataset
food_facts.head()

Unnamed: 0,code,url,creator,created_t,created_datetime,last_modified_t,last_modified_datetime,product_name,generic_name,quantity,...,carbon-footprint_100g,nutrition-score-fr_100g,nutrition-score-uk_100g,glycemic-index_100g,water-hardness_100g,choline_100g,phylloquinone_100g,beta-glucan_100g,inositol_100g,carnitine_100g
0,17,http://world-en.openfoodfacts.org/product/0000...,kiliweb,1529059080,2018-06-15T10:38:00Z,1529059204,2018-06-15T10:40:04Z,Vitória crackers,,,...,,,,,,,,,,
1,31,http://world-en.openfoodfacts.org/product/0000...,isagoofy,1539464774,2018-10-13T21:06:14Z,1539464817,2018-10-13T21:06:57Z,Cacao,,130 g,...,,,,,,,,,,
2,123,http://world-en.openfoodfacts.org/product/0000...,kiliweb,1535737982,2018-08-31T17:53:02Z,1535737986,2018-08-31T17:53:06Z,Sauce Sweety chili 0%,,,...,,,,,,,,,,
3,291,http://world-en.openfoodfacts.org/product/0000...,kiliweb,1534239669,2018-08-14T09:41:09Z,1534239732,2018-08-14T09:42:12Z,Mendiants,,,...,,,,,,,,,,
4,949,http://world-en.openfoodfacts.org/product/0000...,kiliweb,1523440813,2018-04-11T10:00:13Z,1523440823,2018-04-11T10:00:23Z,Salade de carottes râpées,,,...,,,,,,,,,,


In [84]:
# Select the products that have a product name and a carbon footprint value
# that is neither missing nor zero
carbon_footprint_food = food_facts[['code', 'product_name', 'url', 'carbon-footprint_100g']]

has_product_name = carbon_footprint_food['product_name'].notnull()
footprint = carbon_footprint_food['carbon-footprint_100g']
mask = has_product_name & footprint.notnull() & (footprint != 0.0)

carbon_footprint_valid_data = carbon_footprint_food[mask]

valid_count = len(carbon_footprint_valid_data)
print('Total number of data points for carbon footprint are: {}'.format(valid_count))

Total number of data points for carbon footprint are: 259


In [85]:
# Preview the first rows of the filtered carbon footprint data
carbon_footprint_valid_data.head()

Unnamed: 0,code,product_name,url,carbon-footprint_100g
142698,613008722654,Green tea,http://world-en.openfoodfacts.org/product/0613...,68.0
181485,819377000120,châtaignes pelées et rôties cesares,http://world-en.openfoodfacts.org/product/0819...,300.0
210127,2000000074609,Terrine de Chevreuil,http://world-en.openfoodfacts.org/product/2000...,0.05
210135,2000000074660,TERRINE AUX CHÂTAIGNES,http://world-en.openfoodfacts.org/product/2000...,0.05
218577,20386658,Emotionali Multi Fruit Lollies,http://world-en.openfoodfacts.org/product/2038...,135.0
