# Dataset notebook

Import all the different chunks of the dataset, put into a single file and remove unneeded columns (first two)

In [2]:
import pandas as pd

In [4]:
# Load the raw dataset file; its fields are tab-separated.
# low_memory=False turns off pandas' chunked parsing, which avoids mixed-type
# column inference on a file this wide.
df = pd.read_csv(
    "openfood_dataset.tsv",
    sep="\t",
    low_memory=False,
)

In [8]:
# List every column name available in the raw frame.
df.columns.to_numpy()

array(['code', 'url', 'creator', 'created_t', 'created_datetime',
       'last_modified_t', 'last_modified_datetime', 'product_name',
       'generic_name', 'quantity', 'packaging', 'packaging_tags',
       'brands', 'brands_tags', 'categories', 'categories_tags',
       'categories_en', 'origins', 'origins_tags', 'manufacturing_places',
       'manufacturing_places_tags', 'labels', 'labels_tags', 'labels_en',
       'emb_codes', 'emb_codes_tags', 'first_packaging_code_geo',
       'cities', 'cities_tags', 'purchase_places', 'stores', 'countries',
       'countries_tags', 'countries_en', 'ingredients_text', 'allergens',
       'allergens_en', 'traces', 'traces_tags', 'traces_en',
       'serving_size', 'no_nutriments', 'additives_n', 'additives',
       'additives_tags', 'additives_en', 'ingredients_from_palm_oil_n',
       'ingredients_from_palm_oil', 'ingredients_from_palm_oil_tags',
       'ingredients_that_may_be_from_palm_oil_n',
       'ingredients_that_may_be_from_palm_oil',
   

## Extract just the columns needed

- product_name
- fat_100g
- carbohydrates_100g
- proteins_100g
- energy_100g (kJ)
- saturated-fat_100g
- sugars_100g
- fiber_100g
- salt_100g

In [11]:
# Columns kept from the raw dataset: the product name plus its nutritional
# values per 100 g. The order here is the column order of the projected frame.
COLUMNS = [
    "product_name",
    "fat_100g",
    "carbohydrates_100g",
    "proteins_100g",
    "energy_100g",
    "saturated-fat_100g",  # hyphenated in the source data, unlike the other names
    "sugars_100g",
    "fiber_100g",
    "salt_100g",
]

In [12]:
# Project the raw frame onto the selected columns (all rows are kept).
df_projected = df.loc[:, COLUMNS]

In [13]:
# Preview the projected frame; many rows still contain missing values at this stage.
df_projected

Unnamed: 0,product_name,fat_100g,carbohydrates_100g,proteins_100g,energy_100g,saturated-fat_100g,sugars_100g,fiber_100g,salt_100g
0,Farine de blé noir,,,,,,,,
1,Banana Chips Sweetened (Whole),28.57,64.29,3.57,2243.0,28.57,14.29,3.6,0.00000
2,Peanuts,17.86,60.71,17.86,1941.0,0.00,17.86,7.1,0.63500
3,Organic Salted Nut Mix,57.14,17.86,17.86,2540.0,5.36,3.57,7.1,1.22428
4,Organic Polenta,1.43,77.14,8.57,1552.0,,,5.7,
...,...,...,...,...,...,...,...,...,...
356022,"Mint Melange Tea A Blend Of Peppermint, Lemon ...",0.00,0.00,0.00,0.0,0.00,0.00,0.0,0.00000
356023,乐吧泡菜味薯片,,,,,,,,
356024,Biscottes bio,,,,,,,,
356025,Tomates aux Vermicelles,,,,,,,,


## Sanitize dataset

It seems that some rows do not follow the rule of values per 100 g of product, since some foods have a protein, fat or carbohydrate value above 100.
For this reason, those rows are pruned from the dataset.

The dataset is then sorted alphabetically by food name.

In [17]:
# Discard every row that has a missing value in at least one of the kept columns.
df_without_nulls = df_projected.dropna(
    axis=0,
    how='any',
)

In [20]:
# Preview the frame after rows with missing values have been dropped.
df_without_nulls

Unnamed: 0,product_name,fat_100g,carbohydrates_100g,proteins_100g,energy_100g,saturated-fat_100g,sugars_100g,fiber_100g,salt_100g
1,Banana Chips Sweetened (Whole),28.57,64.29,3.57,2243.0,28.57,14.29,3.6,0.00000
2,Peanuts,17.86,60.71,17.86,1941.0,0.00,17.86,7.1,0.63500
3,Organic Salted Nut Mix,57.14,17.86,17.86,2540.0,5.36,3.57,7.1,1.22428
7,Organic Muesli,18.75,57.81,14.06,1833.0,4.69,15.62,9.4,0.13970
12,Zen Party Mix,36.67,36.67,16.67,2230.0,5.00,3.33,6.7,1.60782
...,...,...,...,...,...,...,...,...,...
355981,"Biscuits aux céréales, aux pépites de chocolat...",22.00,60.00,8.00,2008.0,9.00,27.00,5.0,0.28000
355985,Natural Cassava,0.00,87.06,1.18,1477.0,0.00,2.35,4.7,0.03048
356005,Tartines craquantes bio au sarrasin,2.80,74.80,13.00,1643.0,0.60,2.60,5.9,0.68000
356017,Thé vert Earl grey,0.20,0.50,0.50,21.0,0.20,0.50,0.2,0.02540


In [23]:
# Values are expressed per 100 g of product, so none of the three macronutrients
# can legitimately exceed 100. A row is kept only when ALL three are in range.
# The range checks must be combined with "all" (logical AND): combining them
# with `|` keeps any row where just one macro is in range, so a row such as
# fat_100g == 500 would survive as long as its protein or carbs were below 100.
# Exactly 100 is allowed (e.g. a pure oil or pure sugar); only values above 100
# are treated as invalid.
macro_columns = ["proteins_100g", "carbohydrates_100g", "fat_100g"]
macros = df_without_nulls[macro_columns]
macros_in_range = (macros <= 100).all(axis=1)

# Rows where protein, carbs and fat are all exactly zero carry no macronutrient
# information, so they are removed as well.
macros_all_zero = (macros == 0.0).all(axis=1)

df_sanitized = df_without_nulls[macros_in_range & ~macros_all_zero]


In [24]:
# Preview the sanitized frame.
df_sanitized

Unnamed: 0,product_name,fat_100g,carbohydrates_100g,proteins_100g,energy_100g,saturated-fat_100g,sugars_100g,fiber_100g,salt_100g
1,Banana Chips Sweetened (Whole),28.57,64.29,3.57,2243.0,28.57,14.29,3.6,0.00000
2,Peanuts,17.86,60.71,17.86,1941.0,0.00,17.86,7.1,0.63500
3,Organic Salted Nut Mix,57.14,17.86,17.86,2540.0,5.36,3.57,7.1,1.22428
7,Organic Muesli,18.75,57.81,14.06,1833.0,4.69,15.62,9.4,0.13970
12,Zen Party Mix,36.67,36.67,16.67,2230.0,5.00,3.33,6.7,1.60782
...,...,...,...,...,...,...,...,...,...
355979,Fairy tail,12.00,12.00,12.00,50.0,12.00,12.00,12.0,12.00000
355981,"Biscuits aux céréales, aux pépites de chocolat...",22.00,60.00,8.00,2008.0,9.00,27.00,5.0,0.28000
355985,Natural Cassava,0.00,87.06,1.18,1477.0,0.00,2.35,4.7,0.03048
356005,Tartines craquantes bio au sarrasin,2.80,74.80,13.00,1643.0,0.60,2.60,5.9,0.68000


In [25]:
# Order the rows by product name (ascending, the sort_values default) and show
# the result. The original row labels are kept, so the index is no longer sequential.
final_dataset = df_sanitized.sort_values(by='product_name')
final_dataset

Unnamed: 0,product_name,fat_100g,carbohydrates_100g,proteins_100g,energy_100g,saturated-fat_100g,sugars_100g,fiber_100g,salt_100g
331606,18 marrons glacés,0.8,76.00,0.80,1378.0,0.1,56.00,4.8,0.050038
219939,"6 carrés fourrés, saveur amande",14.3,59.10,5.50,1636.0,6.2,34.30,2.6,0.530000
259660,8 Beignet de crevette,10.8,28.70,5.10,977.0,0.8,9.90,0.4,1.750000
354064,Beef Madras,7.8,5.40,12.40,601.0,4.5,4.00,1.2,0.838200
287758,Beeren schokoliert,36.5,45.60,8.30,2296.0,22.9,45.30,3.6,0.300000
...,...,...,...,...,...,...,...,...,...
293401,超大紅寶石葡萄乾,0.0,77.50,2.50,1339.0,0.0,72.50,5.0,0.063500
293825,道地百果園柑桔檸檬果汁飲品,0.0,6.20,0.00,439.0,0.0,6.10,0.0,0.071000
313397,頂好牌花生醬,48.7,25.30,22.50,2431.0,11.5,10.60,12.2,0.988060
350849,초고추장,0.0,47.37,5.26,883.0,0.0,21.05,5.3,4.813300


## Save final dataset

In [26]:
# Write the final dataset to disk as CSV. The index is written too (to_csv
# default), so the first, unnamed column of the file holds the original row labels.
final_dataset.to_csv(path_or_buf="projected_food_dataset.csv")