In [1]:
!sudo apt install tesseract-ocr
!pip install pytesseract
!pip install inflect
!pip install gensim

import os
import pytesseract as pt 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import inflect
import regex

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'sudo apt autoremove' to remove it.
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 3 newly installed, 0 to remove and 34 not upgraded.
Need to get 4,795 kB of archives.
After this operation, 15.8 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 tesseract-ocr-eng all 4.00~git24-0e00fe6-1.2 [1,588 kB]
Get:2 http://archive.ubuntu.com/ubuntu bionic/universe amd64 tesseract-ocr-osd all 4.00~git24-0e00fe6-1.2 [2,989 kB]
Get:3 http://archive.ubuntu.com/ubuntu bionic/universe amd64 tesseract-ocr amd64 4.00~git2288-10f4998a-2 [218 kB]
Fetched 4,795 kB in 1s (4,448 kB/s)
debconf: unable to initi

In [111]:
allergies = pd.read_csv("FoodData.csv")
allergy_types = allergies.groupby("Allergy", as_index = False).agg(list)[['Allergy','Food']]

ingredient_coor = pd.read_csv('ingredient_coor.csv').drop('Unnamed: 0', axis = 1)
ingredient_coor['name'] = ingredient_coor['name'].str.replace('[_]', ' ')


#takes a string input of a food/ingredient name and returns the corresponding 
#allergy information if it is found in the allergies table.
def search_food(food):
  engine = inflect.engine()
  if engine.singular_noun(food) != False:
    food = engine.singular_noun(food)
  if allergies['Food'].str.contains(food.capitalize()).any() == False:
    return False
  return True

def lookup(menu_string):
  split = menu_string.split('\n\n')
  item_names = []
  ingredient_matches = []
  potential_matches = []
  for item in split:
    item_names.append(item.split('\n')[0])
    #print(item.split('\n'))
    ingredients = []
    potential_found = []
    for string in item.split('\n'):
      for word in string.split(' '):
        #print(word)
        word = regex.sub("[^a-zA-Z]+", "", word.lower())
        if search_food(word) == True:
          ingredients.append(word)
        else:
          potential = most_similar(word, 50)
          if potential.all() != False:
            for i in potential:
              if search_food(i) == True:
                potential_found.append(i)
    ingredient_matches.append(np.unique(np.array(ingredients)))
    potential_matches.append(np.unique(np.array(potential_found)))
  lookup_table = pd.DataFrame(data = {'Item Name': item_names, 'Ingredient Matches': ingredient_matches, 'Potential Allergen Matches' : potential_matches})
  return lookup_table

def edible(allergy_type):
  if type(allergy_type) == str:
    if np.array([allergy_types['Allergy'].str.contains(x) for x in allergy_type]).any() == True:
      allergens = allergy_types[allergy_types['Allergy'] == allergy_type]['Food'].iloc[0]
  else:
    allergens = np.array(allergy_type)
  return allergens


def minus_allergens(menu_image, allergens):
  menu_string = pt.image_to_string(menu_image, timeout=10)
  engine = inflect.engine()
  allergies = edible(allergens)
  lookup_df = lookup(menu_string)
  allergen_included = []
  allergens_detected = []
  allergen_matches = []
  potential_included = []
  potential_detected = []
  for dish in lookup_df['Ingredient Matches']:
    alg = []
    inc = False
    for food in dish:
      if engine.singular_noun(food) != False:
        current_food = engine.singular_noun(food).capitalize()
      else:
        current_food = food.capitalize()
      if current_food in allergies:
        inc = True
        alg.append(current_food)
    allergen_included.append(inc)
    allergens_detected.append(alg)
  for dish in lookup_df['Potential Allergen Matches']:
    alg2 = []
    inc2 = False
    for food in dish:
      if engine.singular_noun(food) != False:
        current_food2 = engine.singular_noun(food).capitalize()
      else:
        current_food2 = food.capitalize()
      if current_food2 in allergies:
        inc2 = True
        alg2.append(current_food2)
    potential_included.append(inc2)
    potential_detected.append(alg2)
  lookup_df['Allergen Included?'] = allergen_included
  lookup_df['Allergens Detected'] = allergens_detected
  lookup_df['Potential Allergen Included?'] = potential_included
  lookup_df['Potential Allergen Detected'] = potential_detected
  return lookup_df
  

def euc_dist(food1, food2):
  food1_x = ingredient_coor[ingredient_coor['name'] == food1]['x'].values[0]
  food1_y = ingredient_coor[ingredient_coor['name'] == food1]['y'].values[0]
  food2_x = ingredient_coor[ingredient_coor['name'] == food2]['x'].values[0]
  food2_y = ingredient_coor[ingredient_coor['name'] == food2]['y'].values[0]
  return np.sqrt((food1_x - food2_x)**2 + (food1_y - food2_y)**2)

#takes in word and returns n most similar ingredients
def most_similar(word, n):
  word = word.lower()
  if word not in np.array(ingredient_coor['name']):
    return np.array([False])
  #print(np.array(ingredient_coor['name']))
  ingredient_coor['dist'] = [euc_dist(word, i) for i in np.array(ingredient_coor['name'])]
  ingredients = ingredient_coor.sort_values('dist', ascending = True)
  return ingredients.iloc[1:n+1]['name'].values


In [93]:
'hummus' in np.array(ingredient_coor['name'])

True

In [101]:
o = 'onion'
o.capitalize() in ['Potato', 'Bean', 'Onion']

True

In [86]:
food1_x = ingredient_coor[ingredient_coor['name'] == 'hummus']['x'].values[0]
food1_x

-30.48619

In [87]:
most_similar('hummus', 20)


array(['greek style vinaigrette', 'pita rounds', 'pita chips',
       'balsamico bianco', 'greek seasoning', 'flatbread', 'pitas',
       'tzatziki', 'crackers', 'hearts of romaine',
       'vinaigrette dressing', 'sumac', 'breadstick', 'golden beets',
       'caesar salad dressing', 'peperoncini', 'hoagie rolls', 'crostini',
       'mixed greens', 'flounder fillets'], dtype=object)

In [112]:
processed_df = minus_allergens('lazy dog menu.PNG', ['Potato', 'Bean', 'Onion'])
processed_df

Unnamed: 0,Item Name,Ingredient Matches,Potential Allergen Matches,Allergen Included?,Allergens Detected,Potential Allergen Included?,Potential Allergen Detected
0,STARTER,[],[],False,[],False,[]
1,CAJUN FRIES,[],"[chicken, garlic, kidney beans, lime, pepper, ...",False,[],False,[]
2,BRICK OVEN SPINACH & SUNDRIED TOMATO CHEESE DIP,"[, cheese, cheeses, corn, spinach, tomato]","[avocado, lettuce, onion, onions, tomatoes]",False,[],True,"[Onion, Onion]"
3,HUMMUS TRIOZP,[],[horseradish],False,[],False,[]
4,"Atrio of walnut-pesto, sundried tomato and tra...","[, tomato]",[horseradish],False,[],False,[]
5,"with garlic flatbread, sliced cucumbers and to...","[, cucumbers, garlic, tomatoes]","[broad beans, horseradish, lemon, shells]",False,[],False,[]
6,Tender calamari strips tossed with sweet soy p...,"[, green, onions, peanuts, peppers, rice, sesa...","[broad beans, chicken, garlic, kidney beans, l...",True,[Onion],True,"[Onion, Onion]"
7,HAND-BREADED CHICKEN TENDERS,"[, buttermilk, chicken, honey]",[],False,[],False,[]
8,HAWAIIAN AHI POKE,[],"[mitsuba, sesame seeds]",False,[],False,[]
9,Sashimi grade ahi tuna tossed with sesame pean...,"[, a, avocado, green, on, onions, peanut, sesa...","[chicken, cranberry, garlic, horseradish, lima...",True,[Onion],True,"[Onion, Onion]"
