<a href="https://colab.research.google.com/github/darshan-hindocha/lab/blob/main/search_engine_for_recipes_dictionary_using_python.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Dataset is available [here](https://drive.google.com/file/d/18ygiW-A8BYmwEOs2-O5shh1JHPW9Sg6r/view?usp=sharing)

### Libraries for studying and chilling

In [None]:
import numpy
import json
import string
import time
import math

### Tokenisation

In [None]:
# Translation table mapping every ASCII punctuation mark and digit to a space.
# Built once so each call is a single pass over the text instead of a
# character-by-character string concatenation.
_SEPARATOR_TABLE = str.maketrans(
    {ch: ' ' for ch in string.punctuation + string.digits}
)


def tokenisation(strin):
    """Split arbitrary input into lower-case word tokens.

    The input is converted with ``str()`` first, so lists, dict views, numbers
    and ``None`` are accepted and tokenised via their string representation.

    Args:
        strin: Any object; its ``str()`` form is tokenised.

    Returns:
        A list of lower-case tokens in order of appearance. ASCII punctuation
        and digits act as separators, and tokens of one or two characters are
        dropped.
    """
    text = str(strin).translate(_SEPARATOR_TABLE)
    # Short fragments (e.g. "a", "of", stray letters left by removed digits)
    # are discarded.
    return [t for t in text.lower().split() if len(t) > 2]

### Parse Dataset

 * `title` : Name of recipe; you can assume these are unique
 * `categories` : A list of tags assigned to the recipe
 * `ingredients` : What is in it, as a list
 * `directions` : List of steps to make the recipe
 * `rating` : A rating, out of 5, of how good it is
 * `calories` : How many calories it has
 * `protein` : How much protein is in it
 * `fat` : How much fat is in it

In [None]:
# Load the recipe collection (a list of dicts, one per recipe). The encoding is
# stated explicitly so the result does not depend on the platform default.
with open('recipes.json', encoding='utf-8') as json_file:
    frecipebook=json.load(json_file)

# checkall[i]    : every token found anywhere in recipe i; used to test whether
#                  a recipe contains all words of a query.
# checknormal[i] : four token lists for recipe i, in the fixed order
#                  [title, categories, ingredients, directions].
checkall=[]
checknormal=[]

for recipe in frecipebook:
    # Tokenise a plain list of the values. str() of a dict_values view starts
    # with "dict_values(", which would add the bogus tokens "dict" and
    # "values" to every single recipe.
    checkall.append(tokenisation(list(recipe.values())))

    # Fill in missing list fields in place so later code can read them
    # unconditionally.
    for field in ('categories', 'ingredients', 'directions'):
        recipe.setdefault(field, [])

    checknormal.append([
        tokenisation(recipe['title']),
        tokenisation(recipe['categories']),
        tokenisation(recipe['ingredients']),
        tokenisation(recipe['directions']),
    ])
        

    
    



### Ordering

#### Normal

The score sums the following terms (repeated words are counted multiple times, i.e. "cheese cheese cheese" is $3$ matches to "cheese"):

 * 8x Number of times a query word appears in the title
 * 4x Number of times a query word appears in the categories
 * 2x Number of times a query word appears in the ingredients
 * 1x Number of times a query word appears in the directions
 * The `rating` of the recipe (if not available assume 0)

In [None]:
def queryscore(query,ind):
    """Score recipe number ``ind`` against a tokenised query ('normal' ordering).

    Args:
        query: Iterable of query tokens (as produced by ``tokenisation``).
        ind: Index of the recipe in ``frecipebook`` / ``checkall`` /
            ``checknormal``.

    Returns:
        0 if any query word is absent from the recipe's tokens in
        ``checkall``. Otherwise the sum, over the query words, of
        8x title occurrences + 4x category occurrences + 2x ingredient
        occurrences + 1x direction occurrences, plus the recipe's ``rating``.
        A missing or ``None`` rating contributes 0.
    """
    recipe_tokens = checkall[ind]
    if any(q not in recipe_tokens for q in query):
        return 0

    fields = checknormal[ind]  # [title, categories, ingredients, directions]
    out = 0
    for q in query:
        out += 8 * fields[0].count(q)
        out += 4 * fields[1].count(q)
        out += 2 * fields[2].count(q)
        out += fields[3].count(q)

    # A JSON null rating loads as None; treat it like a missing rating rather
    # than failing on ``int + None``.
    rating = frecipebook[ind].get('rating')
    if rating is not None:
        out += rating

    return out


def normalorder(query,count):
    """Rank the recipes matching ``query`` by their ``queryscore``, best first.

    Args:
        query: Iterable of query tokens.
        count: Maximum number of rows to return.

    Returns:
        An integer numpy array of shape (k, 2), k <= count, one row per
        selected recipe: column 0 is the score truncated to an integer and
        column 1 is the recipe's index in ``frecipebook``. Rows are ordered by
        descending (untruncated) score; ties keep ascending index order.
        The array has shape (0, 2) when nothing matches.
    """
    matches = []
    for ind in range(len(frecipebook)):
        # Decide membership explicitly instead of treating a score of 0 as
        # "no match": a recipe can contain every query word and still score 0
        # (words only in unscored fields, rating 0 or missing).
        if all(q in checkall[ind] for q in query):
            matches.append((queryscore(query, ind), ind))

    # reshape keeps the (0, 2) shape when there are no matches.
    score = numpy.array(matches, dtype=float).reshape(-1, 2)

    # Stable sort on the negated score gives descending order with
    # deterministic tie-breaking.
    order = (-score[:, 0]).argsort(kind='stable')
    out = score[order][:count].astype(int)

    return out



#### Simple

In [None]:


def simpleorder(recipes,count):
    """Rank recipes by simplicity: fewest (ingredients x steps) first.

    Recipes with fewer than two ingredients or fewer than two directions are
    excluded from the result.

    Args:
        recipes: Sequence of recipe dicts; the ``'ingredients'`` and
            ``'directions'`` entries are measured with ``len``. A missing or
            ``None`` entry counts as empty.
        count: Maximum number of rows to return.

    Returns:
        An integer numpy array of shape (k, 2), k <= count. Column 0 is
        ``len(ingredients) * len(directions)`` and column 1 is the recipe's
        position in ``recipes``. Rows are in ascending score order; ties keep
        their input order. Shape is (0, 2) when no recipe qualifies.
    """
    scored = []
    for ind, r in enumerate(recipes):
        numing = len(r.get('ingredients') or [])
        numstep = len(r.get('directions') or [])
        if numing > 1 and numstep > 1:
            # Collect qualifying recipes directly rather than marking the rest
            # with a magic score, which would also hide genuinely large
            # products.
            scored.append((numing * numstep, ind))

    # reshape keeps the (0, 2) shape when nothing qualifies.
    score = numpy.array(scored, dtype=int).reshape(-1, 2)
    order = score[:, 0].argsort(kind='stable')
    out = score[order][:count]

    return out
            
            
            

#### Healthy
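 Each recipe is scored by the smallest value, over whole numbers $n$ from 1 to 10, of the expression below; recipes are listed from the lowest score upwards. Recipes missing any of `calories`, `protein` or `fat` are left out.

 $$\frac{|\texttt{calories} - 510n|}{510} + 2\frac{|\texttt{protein} - 18n|}{18} + 4\frac{|\texttt{fat} - 150n|}{150}$$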

In [None]:

def healthyorder(recipes,count):
    """Rank recipes by the 'healthy' cost, lowest cost first.

    For each recipe the cost is the minimum over n = 1..10 of

        |calories - 510n| / 510 + 2|protein - 18n| / 18 + 4|fat - 150n| / 150

    Recipes lacking any of ``'calories'``, ``'protein'`` or ``'fat'`` (key
    missing or value ``None``) are excluded.

    Args:
        recipes: Sequence of recipe dicts.
        count: Maximum number of rows to return.

    Returns:
        An integer numpy array of shape (k, 2), k <= count. Column 0 is the
        cost truncated to an integer and column 1 is the recipe's position in
        ``recipes``. Rows are ordered by ascending (untruncated) cost; ties
        keep their input order. Shape is (0, 2) when no recipe qualifies.
    """
    scored = []
    for ind, r in enumerate(recipes):
        cal = r.get('calories')
        prot = r.get('protein')
        fat = r.get('fat')
        # A JSON null loads as None; treat it the same as a missing field.
        if cal is None or prot is None or fat is None:
            continue

        # Take the true minimum over n instead of starting from a fixed
        # ceiling, which would cap the cost of extreme recipes.
        # Dividing by 9 is the 2 * (... / 18) protein term of the formula.
        metric = min(
            abs(cal - 510 * n) / 510
            + abs(prot - 18 * n) / 9
            + 4 * (abs(fat - 150 * n) / 150)
            for n in range(1, 11)
        )
        scored.append((metric, ind))

    # reshape keeps the (0, 2) shape when nothing qualifies.
    score = numpy.array(scored, dtype=float).reshape(-1, 2)
    order = score[:, 0].argsort(kind='stable')
    out = score[order][:count].astype(int)

    return out
                
            

### Search Function
 It `print`s out the results of the search, subject to the following rules:
 1. It selects from the set of all recipes that contain __all__ of the words in the query (the positions/order of the words in the recipe are to be ignored).
 2. It orders them based on the provided ordering (a string; its meaning is defined in the Ordering section above).
 3. It `print`s the top `count` matches only, preserving the order from best to worst. It must `print` just their titles, one per line. Fewer than `count` titles are printed if the search is specific enough that fewer than `count` recipes match.

In [None]:

def search(query, ordering = 'normal', count = 10):
    """Print the titles of the best recipes matching ``query``, one per line.

    Args:
        query: Free-text query; it is split into tokens with ``tokenisation``.
        ordering: ``'normal'``, ``'simple'`` or ``'healthy'``; selects which
            ordering function ranks the matching recipes.
        count: Maximum number of titles to print.

    Returns:
        None. Output is written with ``print``; nothing is printed when no
        recipe matches.

    Raises:
        ValueError: If ``ordering`` is not one of the supported names.
    """
    # Reject unknown orderings up front; otherwise the ranking result would
    # never be assigned and the failure would surface as an unrelated error.
    if ordering not in ('normal', 'simple', 'healthy'):
        raise ValueError(
            "ordering must be 'normal', 'simple' or 'healthy', got %r" % (ordering,)
        )

    query=tokenisation(query)

    if ordering == 'normal':
        # normalorder does its own matching; its indices refer to frecipebook.
        for o in normalorder(query, count):
            print(frecipebook[o[1]]['title'])
        return

    # Keep only the recipes that contain every query word.
    recipes = [
        rec
        for ind, rec in enumerate(frecipebook)
        if all(q in checkall[ind] for q in query)
    ]
    if not recipes:
        return

    if ordering == 'simple':
        out = simpleorder(recipes, count)
    else:
        out = healthyorder(recipes, count)

    # Indices returned here refer to the filtered ``recipes`` list.
    for o in out:
        print(recipes[o[1]]['title'])

    return
