In [360]:
# imports
import pandas as pd
import numpy as np
from helpers import select_entries, clean_categories

In [361]:
data_folder = "./data/"

In [362]:
france = pd.read_csv(data_folder + 'france_data.csv.zip')
usa = pd.read_csv(data_folder + 'us_data.csv.zip')

# Category: Snacks

First we select all rows that is associated with snacks. Then we clean out the category to smaller categories.

In [363]:
snacks = [
    'snack', 'bar', 'ice cream', 'chips', 'popcorn', 'cake', 'cookie', 'choco', 'sugary', 'salty'
]

In [364]:
snacks_france = select_entries(france, snacks)
snacks_usa = select_entries(usa, snacks)

In [365]:
print("There are %d entries that are classified as snacks for France." % len(snacks_france))
print("There are %d entries that are classified as snacks for USA." % len(snacks_usa))

There are 56325 entries that are classified as snacks for France.
There are 31544 entries that are classified as snacks for USA.


## Sugary snacks

We create one category called `sugary_snacks`. This category will have all snacks defined as sugary and not salty. Then we will devide this category in smaller categories.

In [366]:
sugary = [
    'choco', 'coco', 'cookie', 'candy', 'candies', 'candi', 'bonbon', 'fudge', 'caramel', 'bar', 'nougat',
    'sugar', 'sucr', 'cake', 'gâteux', 'ice cream'
]

In [367]:
sugary_snacks_france = select_entries(snacks_france, sugary)
sugary_snacks_usa = select_entries(snacks_usa, sugary)

In [368]:
not_sugary_snacks = [
    'popcorn', 'chips', 'patate', 'potato', 'Snacks salés'
]
not_sugary_snacks_ingredients = ['patate', 'salé', 'salt']

In [369]:
sugary_snacks_france = clean_categories(sugary_snacks_france, not_sugary_snacks, not_sugary_snacks_ingredients)
sugary_snacks_usa = clean_categories(sugary_snacks_usa, not_sugary_snacks, not_sugary_snacks_ingredients)

In [370]:
print("Number of sugary snacks in France: %d" % len(sugary_snacks_france))
print("Number of sugary snacks in USA: %d" % len(sugary_snacks_usa))

Number of sugary snacks in France: 45757
Number of sugary snacks in USA: 11177


The category contains a lot of rows. We will divide the category into sub-categories `chocolates`, `bars`, `candy` and `cookies`. We start with chocolates.

In [371]:
chocolate = [
    'chocolate', 'chocolat', 'choco', 'cacao', 'coco'
]

In [372]:
chocolates_france = select_entries(sugary_snacks_france, chocolate)
chocolates_usa = select_entries(sugary_snacks_usa, chocolate)

Looking at the frames we do not find any values that clearly should not be there and therefore we skip cleaning the category.

In [373]:
print("Sugary snacks that are made with chocolate in France: %d" % len(chocolates_france))
print("Sugary snacks that are made with chocolate in USA: %d" % len(chocolates_usa))

Sugary snacks that are made with chocolate in France: 26840
Sugary snacks that are made with chocolate in USA: 5742


We then move forward to `bars`.

In [374]:
bars = ['bar', 'bars']

In [375]:
bars_france = select_entries(sugary_snacks_france, bars)
bars_usa = select_entries(sugary_snacks_usa, bars)

We look at the dataframes to decide what should not be classified as bars and then remove based on words that are not associated with bars. Note that we keep all bars that are made with chocolate even though these are also in the `chocolates` category.

In [376]:
not_bars = ['barbecue', 'rhubarb', 'barbe', 'baratte', 'barley']

In [377]:
bars_france = clean_categories(bars_france, not_bars)
bars_usa = clean_categories(bars_usa, not_bars)

In [378]:
print('Number of bars in France: %d' % len(bars_france))
print('Number of bars in the USA: %d' % len(bars_usa))

Number of bars in France: 5608
Number of bars in the USA: 2261


Next up is `candy`.

In [379]:
candy = [
    'candy', 'candies', 'bonbon', 'candi', 'confectioneries', 'pastilles', 'mints', 'caramels', 'gummies',
    'lollipop'
]

In [380]:
candy_france = select_entries(sugary_snacks_france, candy)
candy_usa = select_entries(sugary_snacks_usa, candy)

Looking at the frames no values seem to be out of place.

In [381]:
print("Rows classified as candy in France: %d" % len(candy_france))
print("Rows classified as candy in the USA: %d" % len(candy_usa))

Rows classified as candy in France: 4898
Rows classified as candy in the USA: 419


Lastly we create the category `cookies`.

In [382]:
cookies_france = select_entries(sugary_snacks_france, ['cookie', 'biscuit'])
cookies_usa = select_entries(sugary_snacks_usa, ['cookie', 'biscuit'])

In [383]:
not_cookies = [
    'brownie', 'muffin', 'candy', 'candies', 'candi', 'kit kat', 'tourteaux fromagés', 'gaufres', 'gâteau à la',
    'cakes ', 'fondants ', 'pancake', 'ice cream'
]

In [384]:
cookies_france = clean_categories(cookies_france, not_cookies)
cookies_usa = clean_categories(cookies_usa, not_cookies)

In [385]:
print("Number of cookies in France: %d" % len(cookies_france))
print("Number of cookies in USA: %d" % len(cookies_usa))

Number of cookies in France: 10735
Number of cookies in USA: 1822


## Salty snacks

The second big category will be `salty_snacks`. We will then divide this category into smaller categories.

In [405]:
salty = [
    'salt', 'salé', 'chips', 'popcorn', 'sodium', 'crisps'
]

In [406]:
salty_snacks_france = select_entries(snacks_france, salty)
salty_snacks_usa = select_entries(snacks_usa, salty)

In [411]:
not_salty = [
    'sucr', 'sugar', 'cake', 'caramel'
]

In [412]:
len(salty_snacks_france)

6956

In [413]:
salty_snacks_france = clean_categories(salty_snacks_france, not_salty)
salty_snacks_usa = clean_categories(salty_snacks_usa, not_salty)

We divide the category into `chips` and `popcorn`, starting out with `chips`.

In [415]:
chips_france = select_entries(salty_snacks_france, ['chips', 'crisps'])
chips_usa = select_entries(salty_snacks_usa, ['chips', 'crisps'])

We then proceed to `popcorn`.

In [419]:
popcorn_france = select_entries(salty_snacks_france, ['popcorn'])
popcorn_usa = select_entries(salty_snacks_usa, ['popcorn'])

In [423]:
popcorn_france

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,product_name,brands,brands_tags,ingredients_text,serving_size,categories,categories_tags,categories_en,...,zinc_100g,copper_100g,manganese_100g,fluoride_100g,selenium_100g,chromium_100g,molybdenum_100g,iodine_100g,nutrition-score-fr_100g,nutrition-score-uk_100g
133,133,133,Maïs popcorn,,,,,,,,...,,,,,,,,,,
452,452,452,Popcorn salés,,,,,,,,...,,,,,,,,,,
1368,1368,1368,Popcorn bites,M&S,m-s,,,,,,...,,,,,,,,,,
2350,2350,2350,"Premium Popcorn, Movie Theater Butter",Pop-Secret,pop-secret,"Whole grain popcorn, partially hydrogenated so...",32 g (2 Tbsp),,,,...,,,,,,,,,,
2724,2724,2724,Popcorn Américain : Maïs Pour Micro-ondes,,,,,,,,...,,,,,,,,,,
2937,2937,2937,Golden Fluff 100 Cal Microwave Popcorn,Golden seafood,golden-seafood,"EDIENTS: Popcorn, Palm Oil, Salt",,,,,...,,,,,,,,,,
2938,2938,2938,Golden Fluff Microwave Popcorn Light,Golden Fluff,golden-fluff,"Popcorn, Palm Oil, Salt. INGREDIENTS: Maïs, Hu...",,,,,...,,,,,,,,,,
2939,2939,2939,Golden Fluff Microwave Popcorn,,,,,,,,...,,,,,,,,,,
4264,4264,4264,Sweet popcorn,,,,,,,,...,,,,,,,,,,
5863,5863,5863,Popcorn,,,,,,,,...,,,,,,,,,,
