In [1]:
import pandas as pd
import numpy as np

In [10]:
# Read the tab-separated product export, keep only the target score and the
# two text columns, and discard every row where any of the three is missing.
df = (
    pd.read_csv("../data/en.openfoodfacts.org.products.tsv", sep="\t")
    .loc[:, ["nutrition-score-uk_100g", "additives", "ingredients_text"]]
    .dropna()
)

In [20]:
import sklearn.feature_extraction.text as text
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
import re

# Show the first five rows of the filtered frame, then its (rows, columns) shape.
display(df.iloc[:5])
print(df.shape)

Unnamed: 0,nutrition-score-uk_100g,additives,ingredients_text
1,14.0,[ bananas -> en:bananas ] [ vegetable-oil -...,"Bananas, vegetable oil (coconut oil, corn oil ..."
2,0.0,[ peanuts -> en:peanuts ] [ wheat-flour -> ...,"Peanuts, wheat flour, sugar, rice flour, tapio..."
3,12.0,[ organic-hazelnuts -> en:organic-hazelnuts ...,"Organic hazelnuts, organic cashews, organic wa..."
7,7.0,[ org-oats -> en:org-oats ] [ oats -> en:oa...,"Org oats, org hemp granola (org oats, evaporat..."
12,12.0,[ roasted-peanuts -> en:roasted-peanuts ] [...,"Roasted peanuts (peanuts, peanut or canola oil..."


(230076, 3)


In [24]:
# One-time setup if the NLTK stop-word corpus is not installed locally:
#   import nltk
#   nltk.download('stopwords')
#
# English stop words extended with a few extra short tokens.
# NOTE(review): no later cell shown here references `stop_words`; the
# vectorizers pass stop_words="english" instead -- confirm whether this set
# was meant to be used there.
stop_words = {
    *stopwords.words('english'),
    'de', 'contains', 's', 'et', 'of', 'en', "fr", 'less', 'p',
}

def clean_additives(s):
    """Reduce a raw ``additives`` string to lowercase, space-separated tokens.

    The text is lowercased, every ``en:`` prefix and ``->`` arrow is removed,
    and each run of characters other than ``a-z``, hyphen or space is replaced
    by a single space. Leading and trailing whitespace is stripped; interior
    runs of spaces are left as they are.
    """
    lowered = s.lower()
    without_markup = lowered.replace("en:", "").replace("->", "")
    return re.sub(r'[^a-z- ]+', " ", without_markup).strip()


print(clean_additives("[ peanuts -> en:peanuts ] [ wheat-flour ->"))

def clean_ingre(s):
    """Turn a free-text ingredient list into space-separated hyphenated tokens.

    The text is lowercased, periods are removed, and parentheses are treated
    as commas. The result is split on commas; each piece is stripped and its
    inner spaces become hyphens, then the pieces are joined with single
    spaces. Empty pieces are kept, so consecutive separators produce
    consecutive spaces in the output.
    """
    flattened = s.lower().replace(".", "").replace("(", ",").replace(")", ",")
    tokens = [piece.strip().replace(" ", "-") for piece in flattened.split(",")]
    return " ".join(tokens)


print(clean_ingre("Bananas, vegetable oil (coconut oil, corn oil and/or palm oil) sugar, natural banana flavor."))


peanuts  peanuts     wheat-flour
bananas vegetable-oil coconut-oil corn-oil-and/or-palm-oil sugar natural-banana-flavor


In [26]:
# Build a cleaned copy of `df`: both text columns are replaced by their
# normalised versions, the score column and the index are carried over as-is.
df_clean = df.assign(
    additives=df["additives"].apply(clean_additives),
    ingredients_text=df["ingredients_text"].apply(clean_ingre),
)

In [35]:
# TF-IDF over the cleaned additive strings, keeping at most 50 vocabulary terms.
vectorizer = text.TfidfVectorizer(strip_accents="unicode", stop_words="english", max_features=50)
vectors = vectorizer.fit_transform(df_clean["additives"])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back on older installations.
try:
    feature_names = vectorizer.get_feature_names_out()
except AttributeError:
    feature_names = vectorizer.get_feature_names()
print(len(feature_names))

# toarray() gives a plain ndarray; todense() returns the legacy np.matrix type.
dense = vectors.toarray()

# Prefix the columns so they stay distinguishable from the ingredient features.
feature_names = ["ADD:" + fn for fn in feature_names]
# Note: this frame gets a fresh 0..n-1 index, not the index of df_clean.
df_additive_features = pd.DataFrame(dense, columns=feature_names)

display(df_additive_features)

50


Unnamed: 0,ADD:acid,ADD:amidon,ADD:arome,ADD:artificial,ADD:au,ADD:ble,ADD:cacao,ADD:cheese,ADD:concentrate,ADD:corn,...,ADD:sirop,ADD:sodium,ADD:soy,ADD:soybean,ADD:starch,ADD:sucre,ADD:sugar,ADD:syrup,ADD:water,ADD:wheat
0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.00000,0.0,0.000000,0.079931,...,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.452283,0.000000,0.000000,0.000000
1,0.0,0.0,0.000000,0.0,0.0,0.000000,0.00000,0.0,0.000000,0.000000,...,0.0,0.0,0.172045,0.0,0.669338,0.000000,0.115601,0.000000,0.126469,0.308226
2,0.0,0.0,0.000000,0.0,0.0,0.000000,0.00000,0.0,0.000000,0.000000,...,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,0.0,0.0,0.000000,0.0,0.0,0.000000,0.00000,0.0,0.344383,0.302178,...,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,0.0,0.0,0.000000,0.0,0.0,0.000000,0.00000,0.0,0.000000,0.084189,...,0.0,0.0,0.101281,0.0,0.197016,0.000000,0.000000,0.281325,0.148902,0.362899
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
230071,0.0,0.0,0.000000,0.0,0.0,0.000000,0.00000,0.0,0.000000,0.000000,...,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
230072,0.0,0.0,0.000000,0.0,0.0,0.000000,0.00000,0.0,0.000000,0.000000,...,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
230073,0.0,0.0,0.231059,0.0,0.0,0.027886,0.23858,0.0,0.000000,0.000000,...,0.0,0.0,0.000000,0.0,0.000000,0.048347,0.000000,0.000000,0.000000,0.000000
230074,0.0,0.0,0.000000,0.0,0.0,0.000000,0.00000,0.0,0.000000,0.000000,...,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [36]:
# TF-IDF over the cleaned ingredient strings, keeping at most 50 vocabulary terms.
vectorizer = text.TfidfVectorizer(strip_accents="unicode", stop_words="english", max_features=50)
vectors = vectorizer.fit_transform(df_clean["ingredients_text"])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back on older installations.
try:
    feature_names = vectorizer.get_feature_names_out()
except AttributeError:
    feature_names = vectorizer.get_feature_names()

# toarray() gives a plain ndarray; todense() returns the legacy np.matrix type.
dense = vectors.toarray()

# Prefix the columns so they stay distinguishable from the additive features.
feature_names = ["ING:" + fn for fn in feature_names]
# Note: this frame gets a fresh 0..n-1 index, not the index of df_clean.
df_ingre_features = pd.DataFrame(dense, columns=feature_names)

display(df_ingre_features)

Unnamed: 0,ING:_lait_,ING:acid,ING:acide,ING:arome,ING:artificial,ING:butter,ING:cacao,ING:cheese,ING:citric,ING:cocoa,...,ING:sodium,ING:soy,ING:soybean,ING:starch,ING:sucre,ING:sugar,ING:syrup,ING:vitamin,ING:water,ING:wheat
0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.000000,0.000000,0.190175,0.000000,0.000000,0.000000,0.000000
1,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.272963,0.0,0.530024,0.000000,0.183021,0.000000,0.000000,0.200230,0.489559
2,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.181586,0.000000,0.000000
4,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.176693,0.0,0.171546,0.000000,0.000000,0.163247,0.000000,0.259224,0.475348
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
230071,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
230072,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
230073,0.0,0.0,0.0,0.253776,0.0,0.0,0.598967,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.000000,0.212391,0.000000,0.000000,0.000000,0.000000,0.000000
230074,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [37]:
# Place the additive and ingredient TF-IDF columns side by side; both frames
# share the same 0..n-1 index, so the column-wise concat lines up row for row.
df_features = pd.concat([df_additive_features, df_ingre_features], axis=1)

# `df` still carries the original row labels that survived dropna(), whereas
# the feature frames are indexed 0..n-1. Assigning the Series directly would
# align on those labels and attach scores to the wrong rows (or NaN), so the
# target is attached by position instead.
if len(df) != len(df_features):
    raise ValueError(
        "row count mismatch between df (%d) and df_features (%d)"
        % (len(df), len(df_features))
    )
df_features["nutrition-score-uk_100g"] = df["nutrition-score-uk_100g"].to_numpy()
display(df_features)
df_features.to_csv("../data/tfidf_features.csv")

Unnamed: 0,ADD:acid,ADD:amidon,ADD:arome,ADD:artificial,ADD:au,ADD:ble,ADD:cacao,ADD:cheese,ADD:concentrate,ADD:corn,...,ING:sodium,ING:soy,ING:soybean,ING:starch,ING:sucre,ING:sugar,ING:syrup,ING:vitamin,ING:water,ING:wheat
0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.00000,0.0,0.000000,0.079931,...,0.0,0.000000,0.0,0.000000,0.000000,0.190175,0.000000,0.000000,0.000000,0.000000
1,0.0,0.0,0.000000,0.0,0.0,0.000000,0.00000,0.0,0.000000,0.000000,...,0.0,0.272963,0.0,0.530024,0.000000,0.183021,0.000000,0.000000,0.200230,0.489559
2,0.0,0.0,0.000000,0.0,0.0,0.000000,0.00000,0.0,0.000000,0.000000,...,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,0.0,0.0,0.000000,0.0,0.0,0.000000,0.00000,0.0,0.344383,0.302178,...,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.181586,0.000000,0.000000
4,0.0,0.0,0.000000,0.0,0.0,0.000000,0.00000,0.0,0.000000,0.084189,...,0.0,0.176693,0.0,0.171546,0.000000,0.000000,0.163247,0.000000,0.259224,0.475348
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
230071,0.0,0.0,0.000000,0.0,0.0,0.000000,0.00000,0.0,0.000000,0.000000,...,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
230072,0.0,0.0,0.000000,0.0,0.0,0.000000,0.00000,0.0,0.000000,0.000000,...,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
230073,0.0,0.0,0.231059,0.0,0.0,0.027886,0.23858,0.0,0.000000,0.000000,...,0.0,0.000000,0.0,0.000000,0.212391,0.000000,0.000000,0.000000,0.000000,0.000000
230074,0.0,0.0,0.000000,0.0,0.0,0.000000,0.00000,0.0,0.000000,0.000000,...,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
