<a href="https://colab.research.google.com/github/dudaholandah/NLP/blob/main/Projeto/IndianFood_Text_Retrieval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Google Drive

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [2]:
!cp -r '/content/drive/MyDrive/NLP-Project/indian-food' 'indian-food'

# Imports

In [3]:
!pip install gensim --upgrade
!pip install glove-python-binary

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gensim
  Downloading gensim-4.2.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (24.1 MB)
[K     |████████████████████████████████| 24.1 MB 50.7 MB/s 
Installing collected packages: gensim
  Attempting uninstall: gensim
    Found existing installation: gensim 3.6.0
    Uninstalling gensim-3.6.0:
      Successfully uninstalled gensim-3.6.0
Successfully installed gensim-4.2.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting glove-python-binary
  Downloading glove_python_binary-0.2.0-cp37-cp37m-manylinux1_x86_64.whl (948 kB)
[K     |████████████████████████████████| 948 kB 37.4 MB/s 
Installing collected packages: glove-python-binary
Successfully installed glove-python-binary-0.2.0


In [4]:
#WORD2VEC
from gensim.models import Word2Vec
from tensorflow.keras.preprocessing.text import Tokenizer
#GLOVE
from glove import Corpus, Glove
#Data
import json
import os
import pandas as pd
import re
import numpy as np

## Download

# Data

In [7]:
path = 'indian-food/indian_food.csv'
df_indian = pd.read_csv(path)

In [8]:
df_indian.head(5)

Unnamed: 0,name,ingredients,diet,prep_time,cook_time,flavor_profile,course,state,region
0,Balu shahi,"Maida flour, yogurt, oil, sugar",vegetarian,45,25,sweet,dessert,West Bengal,East
1,Boondi,"Gram flour, ghee, sugar",vegetarian,80,30,sweet,dessert,Rajasthan,West
2,Gajar ka halwa,"Carrots, milk, sugar, ghee, cashews, raisins",vegetarian,15,60,sweet,dessert,Punjab,North
3,Ghevar,"Flour, ghee, kewra, milk, clarified butter, su...",vegetarian,15,30,sweet,dessert,Rajasthan,West
4,Gulab jamun,"Milk powder, plain flour, baking powder, ghee,...",vegetarian,15,40,sweet,dessert,West Bengal,East


## Pre-Processing

In [9]:
def pre_processing(text):
  new = ""
  for word in text.split(','):
    word = re.sub(r'[^\w\s]', '', word.lower())
    word = re.sub(r'[0-9]+', '', word)
    word = re.sub(r'\boz\b', '', word)
    word = re.sub(r'\ba taste of\b', '', word)
    new = new + word.strip() + " "

  return new[:-1]

In [10]:
df_indian['ingredients'] = [pre_processing(x) for x in df_indian['ingredients']]

In [12]:
df_ingredients = df_indian['ingredients']
df_diet = df_indian['diet']
df_flavor_profile = df_indian['flavor_profile']
df_course = df_indian['course']

# Different Embeddings

## Word2Vec

In [13]:
def word_embedding_w2v(tokenizer, model, vocab_size, embedding_dim):
  embedding_w2v = np.zeros((vocab_size, embedding_dim))
  for word, i in tokenizer.word_index.items():
    if word in model.wv:
      embedding_w2v[i] = model.wv.get_vector(word)
    else:
      embedding_w2v[i]=np.random.normal(0,np.sqrt(0.25), embedding_dim)
  return embedding_w2v

In [14]:
def sentence_embedding_w2v(model, doc_tokens, embedding_dim):
  embeddings = []
  for tok in doc_tokens:
    if tok in model.wv:
      embeddings.append(model.wv.get_vector(tok))
    else:
      embeddings.append(np.random.normal(0,np.sqrt(0.25), embedding_dim))
  return np.mean(embeddings, axis=0)

In [15]:
# Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_ingredients)
vocab_size_cuisine = len(tokenizer.word_index) + 1

In [16]:
# EMBEDDING LAYER

EMBEDDING_DIM_W2V = 100

sentences = [sentence.split() for sentence in df_ingredients]
model_word2vec = Word2Vec(sentences, vector_size=EMBEDDING_DIM_W2V, min_count=1, window=5, sg=1,workers=4)

embedding_w2v = word_embedding_w2v(tokenizer, model_word2vec, vocab_size_cuisine, EMBEDDING_DIM_W2V)
sentences_w2v = df_ingredients.apply(lambda x : sentence_embedding_w2v(model_word2vec, x.split(), EMBEDDING_DIM_W2V))

## GloVe

In [17]:
def create_embedding_glove(glove, vocab_size, embedding_dim):
  embedding_glove = np.zeros((vocab_size, embedding_dim))
  for i, word in enumerate(glove.dictionary):
    embedding_glove[i+1] = glove.word_vectors[i]
  return embedding_glove

In [18]:
def sentence_embedding_glove(model, doc_tokens, embedding_dim):
  embeddings = []
  for tok in doc_tokens:
    if tok in glove.dictionary:
      embeddings.append(glove.word_vectors[glove.dictionary[tok]])
    else:
      embeddings.append(np.random.normal(0,np.sqrt(0.25), embedding_dim))
  return np.mean(embeddings, axis=0)

In [19]:
corpus = Corpus() 
 
sentences = [sentence.split() for sentence in df_ingredients]
corpus.fit(sentences, window=10)

In [20]:
# EMBEDDING LAYER

EMBEDDING_DIM_GLV = 100

glove = Glove(no_components=EMBEDDING_DIM_GLV, learning_rate=0.05)
glove.fit(corpus.matrix, epochs=30, no_threads=2)
glove.add_dictionary(corpus.dictionary)

embedding_glove = create_embedding_glove(glove, vocab_size_cuisine, EMBEDDING_DIM_GLV)
sentences_glove = df_ingredients.apply(lambda x : sentence_embedding_glove(glove, x.split(), EMBEDDING_DIM_GLV))

# Text Retrieval

In [33]:
from sklearn.metrics.pairwise import cosine_similarity

In [34]:
def text_retrieval(dataset, embedding):
  top10 = [0] * 10
  size = len(dataset)

  for i in range(size):
    similarity = cosine_similarity([embedding[i]], embedding[:size])
    similarity = [(similarity, idx) for idx,similarity in enumerate(similarity[0])]

    similarity.sort(reverse=True)
    top = similarity[1:11]

    cat_orig = dataset.iloc[i]

    for idx, [sim, j] in enumerate(top):
      top10[idx] += cat_orig in dataset.iloc[j]

  return top10

Word2Vec

In [35]:
top10_diet_word2vec = text_retrieval(df_diet, sentences_w2v.tolist())

In [36]:
top10_diet_word2vec

[233, 235, 233, 230, 230, 232, 228, 228, 234, 228]

In [37]:
top10_flavor_profile_word2vec = text_retrieval(df_flavor_profile, sentences_w2v.tolist())

In [38]:
top10_flavor_profile_word2vec

[168, 164, 164, 166, 155, 156, 154, 154, 157, 130]

In [39]:
top10_course_word2vec = text_retrieval(df_course, sentences_w2v.tolist())

In [40]:
top10_course_word2vec

[164, 164, 169, 169, 166, 157, 151, 150, 152, 144]

GloVe

In [41]:
top10_diet_glove = text_retrieval(df_diet, sentences_glove.tolist())

In [42]:
top10_diet_glove

[236, 240, 233, 235, 236, 231, 234, 231, 230, 227]

In [43]:
top10_flavor_profile_glove = text_retrieval(df_flavor_profile, sentences_glove.tolist())

In [44]:
top10_flavor_profile_glove

[168, 177, 167, 166, 176, 171, 170, 162, 153, 148]

In [45]:
top10_course_glove = text_retrieval(df_course, sentences_glove.tolist())

In [46]:
top10_course_glove

[166, 174, 168, 161, 172, 170, 175, 160, 152, 144]