In [30]:
import pandas as pd
import numpy as np
import re

#### Load menu

In [14]:
import os

def relativeFilePaths(directory):
    list_paths = []
    for root, dirs, files in os.walk(os.path.relpath(directory)):
        for file in files:
            list_paths.append(os.path.join(root, file))
    return list_paths

In [None]:
menus = relativeFilePaths('../data/Menus')

In [16]:
menus

['..\\data\\Menus\\burgart_205074_c70.jpg',
 '..\\data\\Menus\\Carte-boissons-les-Ambassades-de-Montmartre-Paris-18-Restaurant-bar-terrasse.jpg.crdownload',
 '..\\data\\Menus\\carte-du-restaurant.jpg',
 '..\\data\\Menus\\Carte-in-seoul.png',
 '..\\data\\Menus\\Carte-Restaurant-Nelsons-Paris-Jour-2-1024x724.jpg',
 '..\\data\\Menus\\Carte-Restaurant-Nelsons-Paris-Jour-3-1014x1024.jpg',
 '..\\data\\Menus\\fabbrezza_202084_665.jpg',
 '..\\data\\Menus\\feaa6c7b-9b16-4d51-b71d-75518f16cb08.jpg',
 '..\\data\\Menus\\in-bocca-al-lupo_42356_293.jpg',
 '..\\data\\Menus\\menu-In-Bocca-Al-Lupo.jpg',
 '..\\data\\Menus\\menu-PNY-BURGER-GAITE.jpg',
 '..\\data\\Menus\\menu_bobby.jpg',
 '..\\data\\Menus\\pasta.jpg',
 '..\\data\\Menus\\roberta_131760_635.jpg']

# Computer vision task - detect meals and extract description

### V1 - OpenCV & Pytesseract

In [None]:
import cv2

In [None]:
image = cv2.imread('../data/Menus/menu-PNY-BURGER-GAITE.jpg')

In [None]:
# transform image
#resized_image = cv2.resize(image, (800,1200))
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
#blur = cv2.GaussianBlur(gray, (7,7), 0)
thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

# Create rectangular structuring element and dilate
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5,5))
dilate = cv2.dilate(thresh, kernel, iterations=4)

In [None]:
cv2.imshow('dilated image', dilate)
cv2.waitKey(0)
cv2.destroyAllWindows()

##### Spot the meals

In [21]:
# Find contours and draw rectangle
contoured_image = image.copy()
cnts = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
cnts = cnts[0] if len(cnts) == 2 else cnts[1]
for c in cnts:
    x,y,w,h = cv2.boundingRect(c)
    cv2.rectangle(contoured_image, (x, y), (x + w, y + h), (36,255,12), 2)

In [22]:
cv2.imshow('contoured image', contoured_image)
cv2.waitKey(0)
cv2.destroyAllWindows()

##### Extract text from image portions

In [None]:
import pytesseract
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files (x86)\Tesseract-OCR\tesseract.exe'

In [117]:
# number of image portion
nb = 8

x,y,w,h = cv2.boundingRect(cnts[nb])
ROI = image[y:y+h,x:x+w]

text = pytesseract.image_to_string(ROI, lang='eng+fra+ita', config='--psm 6')

In [180]:
menu_list = []
for c in cnts:
    x,y,w,h = cv2.boundingRect(c)
    ROI = image[y:y+h,x:x+w]

    text = pytesseract.image_to_string(ROI, lang='eng+fra+ita', config='--psm 6')
    menu_list.append( re.sub('[\n|\x0c]', '', text).lower())

### V2 - PaddleOCR

In [8]:
from paddleocr import PaddleOCR, draw_ocr

In [None]:
ocr = PaddleOCR(use_angle_cls=True, lang='en') # need to run only once to download and load model into memory

In [18]:
result = ocr.ocr(menus[7], cls=True)
for line in result:
    print(line)

[2022/08/02 15:24:06] ppocr DEBUG: dt_boxes num : 56, elapse : 0.8240160942077637
[2022/08/02 15:24:07] ppocr DEBUG: cls num  : 56, elapse : 1.0329232215881348
[2022/08/02 15:24:30] ppocr DEBUG: rec_res num  : 56, elapse : 22.855818033218384
[[[493.0, 265.0], [740.0, 265.0], [740.0, 291.0], [493.0, 291.0]], ('LUNCH & DINNER', 0.9743348360061646)]
[[[552.0, 343.0], [683.0, 343.0], [683.0, 375.0], [552.0, 375.0]], ('STARTERS', 0.9963060617446899)]
[[[203.0, 396.0], [605.0, 396.0], [605.0, 420.0], [203.0, 420.0]], ('GASPACHO TOMATE, PASTEQUE & CRABE.13', 0.9851629734039307)]
[[[624.0, 395.0], [781.0, 395.0], [781.0, 418.0], [624.0, 418.0]], ('TARTARE TRUFFE', 0.976279079914093)]
[[[984.0, 398.0], [1028.0, 398.0], [1028.0, 418.0], [984.0, 418.0]], ('14/25', 0.9957812428474426)]
[[[201.0, 428.0], [587.0, 428.0], [587.0, 451.0], [201.0, 451.0]], ('Tomato, watermelon, coriander and chive oil', 0.9763725399971008)]
[[[626.0, 426.0], [984.0, 426.0], [984.0, 449.0], [626.0, 449.0]], ('French bee

In [20]:
from PIL import Image
image = Image.open(menus[7]).convert('RGB')
boxes = [line[0] for line in result]
txts = [line[1][0] for line in result]
scores = [line[1][1] for line in result]
im_show = draw_ocr(image, boxes, txts, scores, font_path='../ressources/fonts/simfang.ttf')
im_show = Image.fromarray(im_show)
im_show.show()

# NLP Task - compute carbon footprint

In [None]:
# Get Ademe agribalyse data
data_ademe = pd.read_excel("../data/Agribalyse/data_food.xlsx")

# Keep only first ingredients in product description name
ingredients = list(dict.fromkeys([x.split(",")[0].lower() for x in data_ademe["LCI Name"]]))

##### Version 1 - Transformer chaque plat en embedding et aller cherche son plus proche voisin dans la base Ademe

##### Version 2 - Localiser chaque ingrédient dans un plat et aller chercher dans la base Ademe son empreinte

In [None]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [93]:
def ingredient_check(string, list_ingredients, tresh):
    res = []
    for ingredient in list_ingredients:
        if fuzz.token_sort_ratio(string, ingredient) > tresh:
            res.append(ingredient)
        else:
            pass
    return res

def most_similar_ingredient(string, list_ingredients):
    res = "no match"
    value = 60
    for ingredient in list_ingredients:
        if fuzz.token_sort_ratio(string, ingredient) > value:
            res = ingredient
            value = fuzz.token_sort_ratio(string, ingredient)
        else:
            pass
    return res

In [94]:
meal = re.sub('[\n|\x0c]', '', result[0][-1][0]).split(",")
meal

['LUNCH & DINNER']

In [95]:
menu_list = []
for r in result:
    meal = re.sub('[\n|\x0c]', '', r[-1][0]).split(",")
    meal_list = []
    for m in meal:
        meal_list.append(most_similar_ingredient(m, ingredients))
    menu_list.append(meal_list)
menu_list

[['no match'],
 ['water'],
 ['tomato sauce', 'no match'],
 ['stuffed cabbage'],
 ['no match'],
 ['tomato', 'watermelon', 'no match'],
 ['french bean', 'apples', 'no match', 'no match'],
 ['salmon carpaccio'],
 ['no match'],
 ['hazelnut', 'pastry cream puff'],
 ['salmon carpaccio', 'miso', 'no match', 'no match'],
 ['no match'],
 ['sweet and sour sauce'],
 ['no match'],
 ['no match', 'roman rocket', 'mint', 'no match'],
 ['strawberry', 'gouda cheese', 'roman rocket', 'cucumber', 'no match'],
 ['feta cheese', 'almond'],
 ['shallot', 'sunflower seed'],
 ['mint'],
 ['no match', 'onion tart', 'no match', 'green pepper sauce'],
 ['no match', 'artichoke base'],
 ['no match'],
 ['chestnut flour', 'no match', 'mustard sauce prepacked', 'almond'],
 ['no match'],
 ['chocolate sauce',
  'no match',
  'coriander',
  'no match',
  'sauce',
  'red endive'],
 ['no match'],
 ['no match'],
 ['no match', 'cheddar cheese', 'no match', 'fritter', 'salami'],
 ['no match'],
 ['no match', 'haricot bean', 'avo

In [96]:
pd.set_option('display.max_colwidth', None)

df_results = pd.DataFrame({"text from OCR":[x[-1][0] for x in result], "extracted components":menu_list})
df_results

Unnamed: 0,text from OCR,extracted components
0,LUNCH & DINNER,[no match]
1,STARTERS,[water]
2,"GASPACHO TOMATE, PASTEQUE & CRABE.13","[tomato sauce, no match]"
3,TARTARE TRUFFE,[stuffed cabbage]
4,14/25,[no match]
5,"Tomato, watermelon, coriander and chive oil","[tomato, watermelon, no match]"
6,"French beef fillet, capers, shimeji pickles,","[french bean, apples, no match, no match]"
7,CARPACCIO THON SESAME,[salmon carpaccio]
8,15,[no match]
9,"hazelnut, truffle cream","[hazelnut, pastry cream puff]"
