In [2]:
import nltk 
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity
import config

In [3]:
data = pd.read_csv("Cleaned_Indian_Food_Dataset.csv")
data.head()

Unnamed: 0,TranslatedRecipeName,TranslatedIngredients,TotalTimeInMins,Cuisine,TranslatedInstructions,URL,Cleaned-Ingredients,image-url,Ingredient-count
0,Masala Karela Recipe,"1 tablespoon Red Chilli powder,3 tablespoon Gr...",45,Indian,"To begin making the Masala Karela Recipe,de-se...",https://www.archanaskitchen.com/masala-karela-...,"salt,amchur (dry mango powder),karela (bitter ...",https://www.archanaskitchen.com/images/archana...,10
1,Spicy Tomato Rice (Recipe),"2 teaspoon cashew - or peanuts, 1/2 Teaspoon ...",15,South Indian Recipes,"To make tomato puliogere, first cut the tomato...",https://www.archanaskitchen.com/spicy-tomato-r...,"tomato,salt,chickpea lentils,green chilli,rice...",https://www.archanaskitchen.com/images/archana...,12
2,Ragi Semiya Upma Recipe - Ragi Millet Vermicel...,"1 Onion - sliced,1 teaspoon White Urad Dal (Sp...",50,South Indian Recipes,"To begin making the Ragi Vermicelli Recipe, fi...",https://www.archanaskitchen.com/ragi-vermicell...,"salt,rice vermicelli noodles (thin),asafoetida...",https://www.archanaskitchen.com/images/archana...,12
3,Gongura Chicken Curry Recipe - Andhra Style Go...,"1/2 teaspoon Turmeric powder (Haldi),1 tablesp...",45,Andhra,To begin making Gongura Chicken Curry Recipe f...,https://www.archanaskitchen.com/gongura-chicke...,"tomato,salt,ginger,sorrel leaves (gongura),fen...",https://www.archanaskitchen.com/images/archana...,15
4,Andhra Style Alam Pachadi Recipe - Adrak Chutn...,"oil - as per use, 1 tablespoon coriander seed...",30,Andhra,"To make Andhra Style Alam Pachadi, first heat ...",https://www.archanaskitchen.com/andhra-style-a...,"tomato,salt,ginger,red chillies,curry,asafoeti...",https://www.archanaskitchen.com/images/archana...,12


In [4]:
data.columns

Index(['TranslatedRecipeName', 'TranslatedIngredients', 'TotalTimeInMins',
       'Cuisine', 'TranslatedInstructions', 'URL', 'Cleaned-Ingredients',
       'image-url', 'Ingredient-count'],
      dtype='object')

In [5]:
def get_and_sort_corpus(data):
  corpus_sorted = []
  for doc in data['Cleaned-Ingredients'].values:
    doc = sorted(doc.split(sep = ','))
    corpus_sorted.append(doc)
  #print(corpus_sorted)
  return corpus_sorted
  
def get_window(corpus):
  lengths = [len(doc) for doc in corpus]
  avg_len = float(sum(lengths))/ len(lengths)
  return round(avg_len)

if __name__ == "__main__":
  corpus = get_and_sort_corpus(data)
  print(f"Length of corpus", data['Ingredient-count'])
  #training the word2vec model for parsing ingredients 
  model_cbow = Word2Vec(corpus, sg = 0,workers = 8, window = get_window(corpus), min_count = 1, vector_size = 100, )
  model_cbow.save('model_cbow.model')
  print('Word2Vec model trained successfully!')

Length of corpus 0       10
1       12
2       12
3       15
4       12
        ..
5933     7
5934    14
5935     8
5936    13
5937    13
Name: Ingredient-count, Length: 5938, dtype: int64
Word2Vec model trained successfully!


In [6]:
print(model_cbow.wv.most_similar(u'tomato'))

[('potato (aloo)', 0.9976038932800293), ('onion', 0.9956145882606506), ('green chilli', 0.9936761260032654), ('red chilli powder', 0.9931477904319763), ('ginger', 0.9898263812065125), ('sunflower oil', 0.9880262017250061), ('green chillies', 0.9876503348350525), ('methi leaves (fenugreek leaves)', 0.9874364733695984), ('kala chana (brown chickpeas)', 0.9865450263023376), ('coriander (dhania)', 0.9850648045539856)]


In [7]:
class MeanEmbeddingVectoriser(object):
  def __init__(self,model_cbow):
    self.model_cbow = model_cbow
    self.vector_size = model_cbow.wv.vector_size

  def fit(self):
      return self
  
  def transform(self,docs):
      doc_vector = self.doc_average_list(docs)
      return doc_vector

  def doc_average(self,doc):
      mean = []
      for word in doc:
        if word in self.model_cbow.wv.index_to_key:
            mean.append(self.model_cbow.wv.get_vector(word))

      if not mean:
        return np.zeros(self.vector_size)

      else:
        mean = np.array(mean).mean(axis = 0)
        return mean

  def doc_average_list(self, docs):
      return np.vstack([self.doc_average(doc) for doc in docs])

In [8]:
class tfidfEmbeddingVectorizer(object):

    def __init__(self, model_cbow):

      self.model_cbow = model_cbow
      self.vector_size = model_cbow.vector_size
      self.word_idf_weight = None

    def fit(self, docs):
    #building a tfidf model to compute each words idf as its weight

      text_docs = []
      for doc in docs:
        text_docs.append(" ".join(doc))

      tfidf = TfidfVectorizer()
      tfidf.fit(text_docs)

      #if a word was never seen before, it is given idf of max of known idf values
      max_idf = max(tfidf.idf_)
      self.word_idf_weight = defaultdict(lambda: max_idf, [(word, tfidf.idf_[i]) for word, i in tfidf.vocabulary_.items() ],
                                       )
      return self

    def transform(self, docs):
      doc_word_vector = self.doc_average_list(docs)
      return doc_word_vector

    def doc_average(self, doc):
      # compute the weighted mean of document's word embeddings 
      mean = []
      for word in doc:
        if word in self.model_cbow.wv.index_to_key:
            mean.append(self.model_cbow.wv.get_vector(word) * self.word_idf_weight[word])

      if not mean:
        return np.zeros(self.vector_size)

      else:
        mean = np.array(mean).mean(axis = 0)
        return mean

    def doc_average_list(self,docs):
      return np.vstack([self.doc_average(doc) for doc in docs])

In [9]:
def get_recommendations(N, scores):
    """
    Rank scores and output a pandas data frame containing all the details of the top N recipes.
    :param scores: list of cosine similarities
    """
    # load in recipe dataset
    df_recipes = pd.read_csv("Cleaned_Indian_Food_Dataset.csv")
    # order the scores with and filter to get the highest N scores
    top = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:N]
    # create dataframe to load in recommendations
    recommendation = pd.DataFrame(columns=["recipe", "ingredients", "url"])
    count = 0
    for i in top:
        recommendation.at[count, "recipe"] = df_recipes["TranslatedRecipeName"][i]
        recommendation.at[count, "ingredients"] = df_recipes["Cleaned-Ingredients"][i]
        recommendation.at[count, "url"] = df_recipes["URL"][i]
        count += 1
    return recommendation

def get_recs(ingredients,mean=False, N=5):
  #load word2vec model
  model = Word2Vec.load("model_cbow.model")

  #normalize embeddings
  model.init_sims(replace = True)
  if model:
    print("Successfully loaded model")
  
  #create corpus
  corpus = get_and_sort_corpus(data)
  

  if mean:

    mean_vec_tr = MeanEmbeddingVectoriser(model)
    doc_vec = mean_vec_tr.transform(corpus)
    doc_vec = [doc.reshape(1,-1) for doc in doc_vec]
    assert len(doc_vec) == len(corpus)

  else:
    tfidf_vec_tr = tfidfEmbeddingVectorizer(model)
    tfidf_vec_tr.fit(corpus)
    doc_vec = tfidf_vec_tr.transform(corpus)
    doc_vec = [doc.reshape(1,-1) for doc in doc_vec]
    assert len(doc_vec) == len(corpus)

  #create embeddings for input
  input =  ingredients
  input = input.split(",")

  if mean:
    input_embedding = mean_vec_tr.transform([input])[0].reshape(1,-1)

  else:
    input_embedding = tfidf_vec_tr.transform([input])[0].reshape(1,-1)

      
      # get cosine similarity between input embedding and all the document embeddings
  cos_sim = map(lambda x: cosine_similarity(input_embedding, x)[0][0], doc_vec)
  scores = list(cos_sim)
    # Filter top N recommendations
  recommendations = get_recommendations(N, scores)
  return recommendations

if __name__ == "__main__":
  input = "potato, okra, bell pepper"
  rec = get_recs(input)
  print(rec)

  model.init_sims(replace = True)


Successfully loaded model
                                              recipe  \
0  Homemade Multigrain Baby Cereal Recipe (Baby P...   
1                    Apple And Raisin Chutney Recipe   
2                  Pineapple and Dill Granita Recipe   
3                         Kiwi Basil Lemonade Recipe   
4                        Spiced Prune Chutney Recipe   

                                         ingredients  \
0  barley (seeds),brown rice,instant oats (oatmea...   
1  salt,sunflower oil,apples,raisins,lemon,sugar,...   
2        pineapple,ice cubes,water,sugar,dill leaves   
3  black salt,ice cubes,water,sugar syrup,basil l...   
4  tomato,salt,prune,nutmeg powder,celery,red chi...   

                                                 url  
0  https://www.archanaskitchen.com/homemade-multi...  
1  https://www.archanaskitchen.com/apple-and-rais...  
2  https://www.archanaskitchen.com/pineapple-and-...  
3  https://www.archanaskitchen.com/kiwi-basil-lem...  
4  https://www.archanaski