# Product matching for recipes

In [2]:
# Uncomment to install the dependencies
!pip install -r requirements.txt
!pip install bedrock_fm-0.1.0-py3-none-any.whl

[0mProcessing ./bedrock_fm-0.1.0-py3-none-any.whl
bedrock-fm is already installed with the same version as the provided wheel. Use --force-reinstall to force an installation of the wheel.
[0m

In [33]:
#!pip uninstall --yes bedrock_fm-0.1.0-py3-none-any.whl

Found existing installation: bedrock-fm 0.1.0
Uninstalling bedrock-fm-0.1.0:
  Successfully uninstalled bedrock-fm-0.1.0
[0m

In [3]:
from bs4 import BeautifulSoup
import requests
from bedrock_fm import AI21Mid, ClaudeInstantV1, TitanLarge
import boto3
import  backoff
import logging
import json
logger = logging.getLogger()

In [4]:
#recipe_url = "https://www.hemkop.se/recept/aggburgare"
recipe_url= "https://www.hemkop.se/recept/flaskkarre_porchetta"

In [5]:
resp = requests.get(recipe_url)
bd = BeautifulSoup(resp.content)

Let's find the element with "Ingredienser" and get the ancestors

In [6]:
doc = bd.find(string="Ingredienser").parent.parent

Get all the items

In [7]:
items=[]
for p in doc.select('p'):
    items += p.getText().split('\n')

items


['2 msk fänkålsfrön, rostade i torr panna',
 '1 msk grovsalt',
 '2 tsk grovkrossade svartpepparkorn',
 '1 tsk chiliflakes',
 '2 kg benfri fläskkarré',
 'finrivet skal från 1 citron',
 '6 hackade vitlöksklyftor',
 '2 msk olivolja',
 '2 dl vitt vin',
 '2 dl utspädd kycklingbuljong',
 'bindgarn']

Let's use an LLM to get the ingredient name (and if we want also quantity), from the ingredient list

In [13]:
session = boto3.Session(region_name='us-west-2')
bedrock = session.client('bedrock-runtime')
client = boto3.client( service_name='bedrock',region_name='us-west-2')

In [19]:
import requests

In [20]:
URL = f"https://59s1etw4kf.execute-api.eu-west-1.amazonaws.com/api/get_products?url={recipe_url}"

In [None]:
# Send a GET request to the URL to list all indices
response = requests.get(URL)

print(response.status_code )
# Check if the request was successful
if response.status_code == 200:
    # Log the response text (list of indices)
    #logger.info(response)
    print(response.json())
else:
    # Log an error message if the request was unsuccessful
    logger.error(f'Failed to list indices. Status code: {response.status_code}, Response: {response.text}')

In [9]:
ai21mid = AI21Mid(client=bedrock)
ci = ClaudeInstantV1(client=bedrock, token_count=2000, temperature=0.2)


We are going to use Claude since we need multilingual capabilities

In [10]:
@backoff.on_exception(backoff.expo, bedrock.exceptions.ThrottlingException)
def get_ingredients_from_list(ingredient_list):
    return ci.generate("""From the following ingredient list in swedish extract the swedish name of the ingredient, the quantity and the unit of measure for each item. 
Create a JSON document containing: the simple ingredient name in swedish, the quantity and the unit of measure. Use the fields: ingredient, quantity, unit. 
Quantity must be a number. Ingredients are nouns only. The answer contains only the JSON document.

{}""".format("\n".join([i for i in ingredient_list if i.upper() != i])))[0].strip()


A naive approach would be to call the LLM for every ingredient, but this is going to encounter throttling

In [11]:
resp = get_ingredients_from_list(items)
resp

'[\n  {\n    "ingredient": "fänkålsfrön",\n    "quantity": 2,\n    "unit": "msk"\n  },\n  {\n    "ingredient": "grovsalt", \n    "quantity": 1,\n    "unit": "msk"\n  }, \n  {\n    "ingredient": "svartpepparkorn",\n    "quantity": 2,\n    "unit": "tsk"\n  },\n  {\n    "ingredient": "chiliflakes",\n    "quantity": 1,\n    "unit": "tsk"\n  },\n  {\n    "ingredient": "fläskkarré",\n    "quantity": 2,\n    "unit": "kg"\n  },\n  {\n    "ingredient": "citron",\n    "quantity": "finrivet skal från 1",\n    "unit": ""\n  },\n  {\n    "ingredient": "vitlöksklyftor",\n    "quantity": 6,\n    "unit": "hackade"\n  },\n  {\n    "ingredient": "olivolja",\n    "quantity": 2,\n    "unit": "msk" \n  },\n  {\n    "ingredient": "vin",\n    "quantity": 2,\n    "unit": "dl"\n  },\n  {\n    "ingredient": "kycklingbuljong",\n    "quantity": 2,\n    "unit": "dl utspädd"\n  },\n  {\n    "ingredient": "bindgarn",\n    "quantity": "",\n    "unit": ""\n  }\n]'

In [14]:


client.list_foundation_models()



{'ResponseMetadata': {'RequestId': '4535a063-d7b1-4000-89b8-b544dfa6269f',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'date': 'Sat, 07 Oct 2023 22:19:14 GMT',
   'content-type': 'application/json',
   'content-length': '5729',
   'connection': 'keep-alive',
   'x-amzn-requestid': '4535a063-d7b1-4000-89b8-b544dfa6269f'},
  'RetryAttempts': 0},
 'modelSummaries': [{'customizationsSupported': ['FINE_TUNING'],
   'inferenceTypesSupported': ['ON_DEMAND'],
   'inputModalities': ['TEXT'],
   'modelArn': 'arn:aws:bedrock:us-west-2::foundation-model/amazon.titan-tg1-large',
   'modelId': 'amazon.titan-tg1-large',
   'modelName': 'Titan Text Large',
   'outputModalities': ['TEXT'],
   'providerName': 'Amazon',
   'responseStreamingSupported': True},
  {'customizationsSupported': [],
   'inferenceTypesSupported': ['ON_DEMAND'],
   'inputModalities': ['TEXT'],
   'modelArn': 'arn:aws:bedrock:us-west-2::foundation-model/amazon.titan-e1t-medium',
   'modelId': 'amazon.titan-e1t-medium',
   'modelNam

In [15]:
prompt = """From the following ingredient list in swedish extract the swedish name of the ingredient, the quantity and the unit of measure for each item. 
Create a JSON document containing: the simple ingredient name in swedish, the quantity and the unit of measure. Use the fields: ingredient, quantity, unit. 
Quantity must be a number. Ingredients are nouns only. The answer contains only the JSON document.

{}""".format("\n".join([i for i in items if i.upper() != i]))

In [16]:

body =json.dumps( {
    "prompt": f"Human: {prompt}\n\nAssistant:",
    "max_tokens_to_sample": 2000,
    "stop_sequences": [],
    "temperature": 0.2,
    "top_p": 1,
    "anthropic_version": "bedrock-2023-05-31",
})

resp = bedrock.invoke_model(
modelId="anthropic.claude-instant-v1",
body=body,
contentType="application/json",
accept="*/*",
)

In [19]:
response_body = json.loads(resp.get("body").read())

print(response_body.get("completion"))

 [
  {
    "ingredient": "fänkålsfrön",
    "quantity": 2,
    "unit": "msk"
  },
  {
    "ingredient": "grovsalt", 
    "quantity": 1,
    "unit": "msk"
  }, 
  {
    "ingredient": "svartpepparkorn",
    "quantity": 2,
    "unit": "tsk"
  },
  {
    "ingredient": "chiliflakes",
    "quantity": 1,
    "unit": "tsk"
  },
  {
    "ingredient": "fläskkarré",
    "quantity": 2,
    "unit": "kg"
  },
  {
    "ingredient": "vitlök",
    "quantity": 6,
    "unit": "klyftor"
  },
  {
    "ingredient": "olivolja",
    "quantity": 2,
    "unit": "msk"
  },
  {
    "ingredient": "vin",
    "quantity": 2,
    "unit": "dl" 
  },
  {
    "ingredient": "kycklingbuljong",
    "quantity": 2,
    "unit": "dl"
  }
]


In [17]:
json.loads(resp)

TypeError: the JSON object must be str, bytes or bytearray, not dict

We create instead to pass the full list of ingredients and tell the LLM to extract the information we want:

In [78]:
ingredient_names = [i.strip() for i in get_ingredients_from_list(items).split("\n")]
ingredient_names

['hamburgerbröd',
 'ägg',
 'salladsmix',
 'spenat',
 'tomat',
 'ost',
 'rostad lök',
 'inlagd gurka',
 'nacho chips',
 'chilimajonnäs',
 'ketchup',
 'krossade tomater',
 'äppelcidervinäger',
 'socker',
 'lagerblad',
 'kanel',
 'kryddnejlikor',
 'tomatpuré',
 'salt',
 'potatismjöl',
 'vatten']

Next, we use the site search functionality to get the matching products from the shop. This is a naive approach, and could be enhanced by using some additional logic to match ingredients to the recipe for example by "learning" from other recipes. For example instead of Hushallsost one would prefer som cheese that better matches the recipe of hamburgers

In [103]:
import requests
import json

search = "https://www.hemkop.se/search?size=30&page=0&q={query}&sort=relevance"

match_prod = []
for i in ingredient_names:
    resp = requests.get(search.format(query=i))
    prod = json.loads(resp.content)["results"][0]
    match_prod.append({"ingredient": i, "product": prod})

#match_prod

In [91]:
for m in match_prod:
    print(f"{m['ingredient']:20}{m['product']['name']:40} {(m['product']['manufacturer'] or ''):15} {m['product']['code']}")

hamburgerbröd       Hamburgerbröd Brioche 4-pack             Garant          101237094_ST
ägg                 Ägg 10p Frigående Utomhus Medium/large   Garant          101218667_ST
salladsmix          Salladsmix Roman Lollo Klass 1           Garant Eko      101281470_ST
spenat              Babyspenat Ekologisk Klass 1             Garant Eko      101240156_ST
tomat               Tomater Kvist Sverige Klass 1                            101215340_KG
ost                 Hushållsost 26%                          Arla            101197477_KG
rostad lök          Lök Rostad                               Eldorado        101260936_ST
inlagd gurka        Smörgåsgurka Skivad                      Felix           100532210_ST
nacho chips         Nacho Chips Original                     Santa Maria     101247212_ST
chilimajonnäs       Majonnäs Korean Chili                    Uma             101514798_ST
ketchup             Tomatketchup                             Heinz           100365418_ST
krossade t

Let's put all in a single function

In [22]:
def get_matching_products(recipe_url):
    resp = requests.get(recipe_url)
    bd = BeautifulSoup(resp.content)
    doc = bd.find(string="Ingredienser").parent.parent
    items=[]
    for p in doc.select('p'):
        items += p.getText().split('\n')
    ingredient_names = [i.strip() for i in get_ingredients_from_list(items).split("\n")]
    search = "https://www.hemkop.se/search?size=30&page=0&q={query}&sort=relevance"

    match_prod = []
    logger.debug(ingredient_names)
    print(ingredient_names)
    for i in ingredient_names:
        resp = requests.get(search.format(query=i))
        results = json.loads(resp.content)["results"]
        if results is not None:
            prod = results[0] 
            match_prod.append({"ingredient": i, "product": prod})
        else: 
            match_prod.append({"ingredient": i, "product": None})
    
    return match_prod

def print_ingredients(matches):
    for m in matches:
        if m['product'] is not None:
            print(f"{m['ingredient']:20}{m['product']['name']:40} {(m['product']['manufacturer'] or ''):15} {m['product']['code']}")
        else:
            print(f"{m['ingredient']:20}{'NO MATCH':40}")

In [20]:
print_ingredients(get_matching_products("https://www.hemkop.se/recept/snabb_jalapeno"))

['picklad jalapeño', 'majonnäs', 'crème fraiche', 'salt', 'svartpeppar']
picklad jalapeño    Green Jalapeño Hot                       Santa Maria     100295878_ST
majonnäs            Majonnäs Äkta                            Kavli           100451529_ST
crème fraiche       Crème Fraiche 32%                        Garant          101245382_ST
salt                Salt med Jod                             Jozo            101227778_ST
svartpeppar         Svartpeppar Hel Påse                     Eldorado        101197966_ST


In [23]:
print_ingredients(get_matching_products("https://www.hemkop.se/recept/sallad_tomater"))

['• lättkokta haricots verts', '• blandade tomater', '• anjovis', '• selleriblad', '• gräslök', '• rödvinsvinäger', '• olivolja', '• flingsalt', '• peppar']
• lättkokta haricots vertsHaricots Verts 3-pack                    Dáucy           100242417_ST
• blandade tomater  Tomater Cocktail Klass 1                                 100814709_ST
• anjovis           Sill Matjes i Bitar                      Eldorado        101353781_ST
• selleriblad       
• gräslök           Västkustchips Vitlök Gräslök Chili       Estrella        101264352_ST
• rödvinsvinäger    Rödvinsvinäger                           Garant          101281863_ST
• olivolja          Olivolja Gentile D-vitaminberikad        Zeta            101426939_ST
• flingsalt         Flingsalt                                Garant          101213737_ST
• peppar            Fem Peppar Bistro                        Blå Band        101205325_ST
