# Get product's foodgroup

**Objective** : <br>
Predict the food group (one of 12 groups) of a Phenix product. This is the online version: product data is fetched from Open Food Facts through the `openfoodfacts` package. <br>

**Outputs** : <br>
- `data/foodgroup_statut.csv` (11.7 MB; assigns a food group and a status, `statut`, to each Phenix food product; built from the complete models `clf_nutrients_rf_groupeAlim_2` and `clf_names_nb`)


In [2]:
import numpy as np
import pandas as pd
import time
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
#import unidecode
#from collections import Counter
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, GradientBoostingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
import pickle
import itertools

import openfoodfacts

%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.legend_handler import HandlerLine2D

pd.options.display.max_rows = 100

## Load models

In [3]:
# Model to get the foodgroup of a product which is in the Open Food Facts database
# NOTE: unpickling can execute arbitrary code - only load model files coming from a trusted source.
# The `with` block guarantees that the file handle is closed once the model is loaded.
with open('../../data/clf_nutrients_rf_groupeAlim_2_light.sav', 'rb') as model_file:
    clf_nutrients_rf = pickle.load(model_file)
print(clf_nutrients_rf)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=20, max_features=3, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=4, min_samples_split=4,
            min_weight_fraction_leaf=0.0, n_estimators=15, n_jobs=-1,
            oob_score=False, random_state=50, verbose=0, warm_start=False)


In [4]:
# Model to get the foodgroup of a product which is not in the Open Food Facts database
# NOTE: unpickling can execute arbitrary code - only load model files coming from a trusted source.
# The `with` block guarantees that the file handle is closed once the model is loaded.
with open('../../data/clf_names_nb_light.sav', 'rb') as model_file:
    clf_names_nb = pickle.load(model_file)
print(clf_names_nb)

Pipeline(memory=None,
     steps=[('count_vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=2,
        ngram_range=(1, 2), preprocessor=None,
        stop_words...e, vocabulary=None)), ('naive_bayes', MultinomialNB(alpha=0.001, class_prior=None, fit_prior=True))])


In [5]:
# Table of group mappings, stored as a ';'-separated UTF-8 file
mapping_groups = pd.read_csv(
    "../../data/mapping_off_ideal.csv",
    sep = ';',
    encoding = 'UTF-8'
)

In [6]:
# Turn the mapping table into a dictionary : pnns_groups_2 value -> groupeAlim_2 value
# (if a pnns_groups_2 value appears several times, the last row wins)
dict_mapping = dict(zip(mapping_groups['pnns_groups_2'], mapping_groups['groupeAlim_2']))

## Function to get foodgroup

In [7]:
def get_foodGroup(EAN, Produit_Nom, convert_groups = dict_mapping,
                  model_classifier = clf_nutrients_rf, 
                  model_matching = clf_names_nb) : 
    '''
    Returns the foodgroup of a product together with the way it was obtained.

    Three strategies are tried in order, the first one that succeeds wins :
        1 - mapping of the `pnns_groups_2` field of Open Food Facts (OFF)
        2 - prediction from the nutrients stored in OFF
        3 - prediction from the name of the product
    The function is best-effort : a failing strategy never raises, the next
    one is simply tried.

    -- Input --
    EAN : EAN code, string (passed through str() before the OFF lookup)
    Produit_Nom : name of the product, string
    convert_groups : dictionary which enables to create a family of food from OpenFoodFacts' groups
    model_classifier : model which predicts the foodgroups from nutrients
    model_matching : model which predicts the foodgroups from names
    
    -- Output --
    [foodgroup, statut] : list of two elements
        foodgroup : the group of the product, string (None if every strategy failed)
        statut : how the foodgroup has been obtained
            1 - the product is in OFF and belongs to a well defined foodgroup 
            2 - the product is in OFF and its foodgroup is predicted from nutrients
            3 - the product is not in OFF and its foodgroup is predicted from its name
            None - no strategy succeeded
    '''

    # Nutrient fields, in the order in which they are given to model_classifier
    nutrient_columns = ['salt_100g', 'sugars_100g', 'carbohydrates_100g',
                        'fat_100g', 'proteins_100g', 'saturated-fat_100g']

    # -- Lookup in Open Food Facts --
    # `except Exception` (and not a bare `except`) so that KeyboardInterrupt still
    # stops a long batch run; any lookup failure just means "product unknown".
    product = None
    try :
        response = openfoodfacts.products.get_product(str(EAN))
        candidate = response.get('product') if isinstance(response, dict) else None
        if isinstance(candidate, dict) :
            product = candidate
    except Exception :
        product = None

    if product is not None :

        # -- Statut 1 : the OFF group maps to one of our foodgroups --
        off_group = product.get('pnns_groups_2')
        try :
            if off_group is not None and off_group in convert_groups :
                return [convert_groups[off_group], 1]
        except TypeError : # unhashable value : it cannot be a key of convert_groups
            pass

        # -- Statut 2 : prediction from the nutrients --
        nutriments = product.get('nutriments')
        if isinstance(nutriments, dict) :
            try :
                # Only the six needed fields are converted to numbers : the other
                # entries of `nutriments` (units, ...) are not numeric and must not
                # make the conversion fail. Missing / non numeric values become NaN.
                df_nutrients = pd.DataFrame([[nutriments.get(column) for column in nutrient_columns]],
                                            columns = nutrient_columns)
                df_nutrients = df_nutrients.apply(pd.to_numeric, errors = 'coerce').astype('float64')

                # We predict if and only if every value is present and at most 100
                # (a comparison with NaN is False, so missing values are rejected too)
                if (df_nutrients <= 100).values.all() :
                    foodgroup = model_classifier.predict(df_nutrients)[0]
                    return [foodgroup, 2]
            except Exception : # best effort : fall back on the name of the product
                pass

    # -- Statut 3 : prediction from the name --
    try :
        foodgroup = model_matching.predict([Produit_Nom])[0]
        return [foodgroup, 3]
    except Exception : # e.g. missing name : nothing more can be tried
        return [None, None]

In [8]:
# Product known by OFF with a mapped group -> expected ['Produits gras sucrés salés', 1]
EAN = "4260436322114"
get_foodGroup(EAN, None)

['Produits gras sucrés salés', 1]

In [9]:
%%timeit
get_foodGroup("4260436322114", Produit_Nom = None) #100ms for statut 1

10 loops, best of 3: 106 ms per loop


In [10]:
# Product known by OFF but without a mapped group -> expected ['Produits gras sucrés salés', 2]
EAN = "5410233710105"
get_foodGroup(EAN, None)

['Produits gras sucrés salés', 2]

In [11]:
%%timeit
get_foodGroup("5410233710105", Produit_Nom = None) #250ms for statut 2

1 loop, best of 3: 268 ms per loop


In [12]:
# Code unknown to OFF : the foodgroup comes from the name -> expected ['Plats préparés', 3]
EAN, Produit_Nom = 'hgbjnklhgcvbjh6e5fgr', "Pizza"
get_foodGroup(EAN, Produit_Nom)

['Plats préparés', 3]

In [13]:
%%timeit
get_foodGroup('hgbjnklhgcvbjh6e5fgr', "Pizza") #100ms for statut 3

10 loops, best of 3: 99.1 ms per loop


In [14]:
# Neither EAN nor name is given -> expected [None, None]
get_foodGroup(EAN = None, Produit_Nom = None)

[None, None]

## Prediction for multiple products

In [15]:
# Phenix database ; EAN and Produit_Nom are read as strings
all_foodsCommandes_2017 = pd.read_csv(
    "../../data/all_foodsCommandes_2017.csv",
    sep = ';',
    encoding = 'UTF-8',
    dtype = {'EAN': str, 'Produit_Nom' : str}
)

# One row per distinct (EAN, Produit_Nom) pair
unique_foodsCommandes_2017 = all_foodsCommandes_2017.loc[:, ['EAN', 'Produit_Nom']].drop_duplicates()

print(unique_foodsCommandes_2017.shape)

(184660, 2)


In [16]:
# Work on a small random sample ; the fixed random_state makes the draw
# (and therefore the table displayed below) reproducible from one run to the next.
# For the full run, use the whole table instead :
#unique_foodsCommandes_2017_sample = unique_foodsCommandes_2017[['Produit_Nom', 'EAN']]
unique_foodsCommandes_2017_sample = unique_foodsCommandes_2017[['Produit_Nom', 'EAN']].sample(10, random_state = 50)

In [17]:
# Predict [foodgroup, statut] for every sampled product.
# The two-column frame is built explicitly instead of relying on DataFrame.apply
# expanding the returned lists : that expansion depends on the pandas version.
# Only EAN and Produit_Nom are read, so the cell can safely be run again.
predictions = pd.DataFrame(
    [get_foodGroup(ean, name)
     for ean, name in zip(unique_foodsCommandes_2017_sample['EAN'],
                          unique_foodsCommandes_2017_sample['Produit_Nom'])],
    columns = ['foodgroup', 'statut'],
    index = unique_foodsCommandes_2017_sample.index)
unique_foodsCommandes_2017_sample[['foodgroup', 'statut']] = predictions
unique_foodsCommandes_2017_sample

Unnamed: 0,Produit_Nom,EAN,foodgroup,statut
2760811,"SDW LE MEGA BUN,ARIZONA,200G",3242272951056,Plats préparés,1
73516,MOULIN MEL.POIV.SEL DUC 30G,3166291748421,Plats préparés,2
311016,"NOIX DE BRESIL,200G,OPTIMYS",7640149390360,Produits gras sucrés salés,3
1054759,"SAUCE BEURRE BLC,RUSTICA,300ML",3564700445518,Matières grasses ajoutées,3
1357257,COUSCOUS MAROCAINE,3302741843104,Plats préparés,1
2118831,"JBON DD SEL REDUIT X2,TRADI,80G",1056241,"Viande, oeufs",3
1540608,"ESPRESSO LUNGO LIEVE 16CAP,112G",8000070086456,Produits gras sucrés salés,3
1986667,"PREPA MOELLX CHOCO,NESTLE,344G",7613034060679,Produits gras sucrés salés,1
441575,GRAND MIE DUROI 700G,3029330001829,Féculents raffinés,1
132289,HARICOT BEURRE E.F 4/4 460G,3017800022016,Légumes,1


In [None]:
# Saving
#unique_foodsCommandes_2017_sample.to_csv("../data/_output/foodgroup_statut.csv", sep = ';', encoding = 'UTF-8', index = False)