# Very short notebook to add one column in the JOW quantity dataframe + add missing ingredient quantities

The input dataframe is `95_perc_kg_unit_ingredients.csv` created by Ugo.  
- We add one column to change the format of ingredients so as to be consistent with the format used to match Jow ingredients with Agribalyse ingredients:
    - we remove ligatures
    - we remove the information given in parentheses
- We add missing ingredient quantities


In [15]:
import pandas as pd
import re

In [16]:
# Folder holding the recipe data (local Windows path).
# NOTE: this is a raw string, so every "\\" below is kept as two literal backslashes.
path = r"C:\\Users\\genef\\Documents\\Projets\\carbondiet\\from_gdrive\\data\\Recipes\\"

# Load the unit -> kg conversion table and preview its first rows.
input_csv = path + "95_perc_kg_unit_ingredients.csv"
df = pd.read_csv(input_csv)
df.head()


Unnamed: 0.1,Unnamed: 0,ingredient,name_unit,unit_kg
0,0,bœuf (carpaccio),g,0.001
1,1,emmental,g,0.001
2,2,sauce teriyaki,càs,0.015
3,3,riz,g,0.001
4,5,gnocchi,g,0.001


Add ingredients

In [17]:
# Append a missing quantity as a new last row: "epinard" measured in "poignée".
# The first value fills the leftover "Unnamed: 0" column.
# unit_kg is passed as a float (not the string "0.025") so the column does not
# end up holding a mix of floats and strings.
# NOTE: re-running this cell appends the row again.
df.loc[len(df)] = [len(df)+1, "epinard", "poignée", 0.025]
df.tail()

Unnamed: 0.1,Unnamed: 0,ingredient,name_unit,unit_kg
732,7498,taco kit,unitaire,0.4
733,7505,crousti' taco kit,unitaire,0.5
734,7513,sirop de pêche,cl,0.01
735,7514,sirop de menthe,cl,0.01
736,737,epinard,poignée,0.025


Remove ligatures

In [18]:
# (pattern, replacement) pairs applied in order by remove_ligatures.
# There is one entry per case so that the case of the ligature is preserved.
REGEX_REPLACEMENTS = [
    (r"\u0153", "oe"),  # lowercase ligature (œ)
    (r"\u0152", "Oe"),  # uppercase ligature (Œ)
]

def remove_ligatures(transcript):
    """Replace the "oe" ligatures of a string by their two-letter spelling.

    Parameters
    ----------
    transcript : str
        Text that may contain the characters U+0153 (œ) or U+0152 (Œ).

    Returns
    -------
    str
        The text with every œ replaced by "oe" and every Œ replaced by "Oe".
    """
    for old, new in REGEX_REPLACEMENTS:
        # Case-sensitive on purpose: with re.IGNORECASE the lowercase pattern
        # also matched "Œ", so the "Oe" replacement was never applied.
        transcript = re.sub(old, new, transcript)
    return transcript

# New column: the ingredient name (cast to str) with its ligatures spelled out.
ligature_free = df['ingredient'].apply(lambda name: remove_ligatures(str(name)))
df['simple_ingredient'] = ligature_free
df.head()

Unnamed: 0.1,Unnamed: 0,ingredient,name_unit,unit_kg,simple_ingredient
0,0,bœuf (carpaccio),g,0.001,boeuf (carpaccio)
1,1,emmental,g,0.001,emmental
2,2,sauce teriyaki,càs,0.015,sauce teriyaki
3,3,riz,g,0.001,riz
4,5,gnocchi,g,0.001,gnocchi


Remove the information given in parentheses

In [19]:
def remove_parenthesis(ingredient):
    """Strip bracketed details from an ingredient name.

    Every span that starts with "(" or "[" and runs to the nearest ")" or "]"
    is removed, then a single trailing space (left behind when the removed
    span was at the end of the name) is dropped.

    Parameters
    ----------
    ingredient : str
        Ingredient name, e.g. "boeuf (carpaccio)".

    Returns
    -------
    str
        The name without its bracketed parts, e.g. "boeuf". An input that is
        empty, or made only of a bracketed span, yields "".
    """
    # remove parenthesis (raw string: "\(" is an invalid escape in a plain literal)
    ingredient = re.sub(r"[\(\[].*?[\)\]]", "", ingredient)
    # remove blank at the end of the string that remained when parenthesis have been removed;
    # endswith() is safe on an empty string, whereas ingredient[-1] raised IndexError
    if ingredient.endswith(" "):
        ingredient = ingredient[:-1]

    return ingredient

# Overwrite the simplified name with its version stripped of bracketed details.
without_parenthesis = df['simple_ingredient'].apply(remove_parenthesis)
df['simple_ingredient'] = without_parenthesis
df.head()

Unnamed: 0.1,Unnamed: 0,ingredient,name_unit,unit_kg,simple_ingredient
0,0,bœuf (carpaccio),g,0.001,boeuf
1,1,emmental,g,0.001,emmental
2,2,sauce teriyaki,càs,0.015,sauce teriyaki
3,3,riz,g,0.001,riz
4,5,gnocchi,g,0.001,gnocchi


Save file

In [20]:
# Write the enriched table next to the input file, without the row index.
output_csv = path + "95_perc_kg_unit_ingredients_v2.csv"
df.to_csv(output_csv, index=False)

Notice that several different ingredients share the same `simple_ingredient` name (for example, `amandes (effilées)` and `amandes (entières)` both become `amandes`)

In [8]:
# Show every row of the result instead of a truncated preview.
pd.set_option('display.max_rows', None)
# For each simplified name, collect the values of every other column into lists.
by_simple_name = df.groupby('simple_ingredient')
by_simple_name.agg(list)

Unnamed: 0_level_0,Unnamed: 0,ingredient,name_unit,unit_kg
simple_ingredient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
abricot,"[1833, 3372]","[abricot (frais), abricot (frais)]","[g, unitaire]","[0.001, 0.048076923076923]"
abricots,[1736],[abricots (secs)],[g],[0.001]
ail,"[12, 1108, 6893]","[ail, ail, ail (semoule)]","[gou., unitaire, càc]","[0.0833333333333333, 0.0833333333333333, 0.005]"
algues nori,[6528],[algues nori (feuille)],[unitaire],[0.003]
aligot,[6438],[aligot],[g],[0.001]
amandes,"[188, 2060]","[amandes (effilées), amandes (entières)]","[g, g]","[0.001, 0.001]"
amaretto,[3427],[amaretto],[càs],[0.015]
ananas,[5353],[ananas (entier)],[unitaire],[1.0389610389610389]
ananas au sirop,"[2387, 2825]","[ananas au sirop (tranches), ananas au sirop (...","[g, tran.]","[0.001, 0.025]"
anchois,"[1152, 3264]","[anchois, anchois]","[unitaire, g]","[0.015, 0.001]"


In [9]:
# Show every row of the result instead of a truncated preview.
pd.set_option('display.max_rows', None)
# Count the rows for each (simplified name, unit) pair.
group_keys = ['simple_ingredient', 'name_unit']
df.groupby(group_keys).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 0,ingredient,unit_kg
simple_ingredient,name_unit,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
abricot,g,1,1,1
abricot,unitaire,1,1,1
abricots,g,1,1,1
ail,càc,1,1,1
ail,gou.,1,1,1
ail,unitaire,1,1,1
algues nori,unitaire,1,1,1
aligot,g,1,1,1
amandes,g,2,2,2
amaretto,càs,1,1,1


In [10]:
# List the unit_kg values of the rows whose simplified name is 'boeuf' and whose unit is 'g'.
ingredient = 'boeuf'
same_name = df['simple_ingredient'] == ingredient
in_grams = df['name_unit'] == 'g'
condition = same_name & in_grams
df.loc[condition, 'unit_kg'].values.tolist()

[0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001]