<a href="https://colab.research.google.com/github/dudaholandah/NLP/blob/main/Projeto/Cuisine_Text_Retrieval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Google Drive

In [1]:
# Colab-only: mount Google Drive at /content/drive so the dataset folder stored
# there can be copied into the working directory by the next cell.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Copy the dataset folder from the mounted Drive into the working directory;
# the data-loading cell reads train.json from this local copy.
!cp -r '/content/drive/MyDrive/NLP-Project/cuisine-classification-ingredients' 'cuisine-classification-ingredients'

# Imports

In [None]:
!pip install gensim --upgrade
!pip install glove-python-binary

In [4]:
#WORD2VEC
from gensim.models import Word2Vec
from tensorflow.keras.preprocessing.text import Tokenizer
#GLOVE
from glove import Corpus, Glove
#Data
import json
import os
import pandas as pd
import re
import numpy as np

# Data

In [5]:
path = 'cuisine-classification-ingredients/train.json'

# The name `json` is rebound to the parsed data below because a later cell
# reads it. Importing the module under a private alias keeps this cell
# re-runnable: calling json.load() directly would raise AttributeError on a
# second run, once `json` no longer refers to the module.
import json as _json

# JSON text is UTF-8; state it explicitly instead of relying on the locale.
with open(path, 'r', encoding='utf-8') as f:
  json = _json.load(f)

In [None]:
# Build the working DataFrame from the parsed records. Note that `json` is the
# loaded data at this point (the loading cell rebinds the name), not the module.
# Only the id, cuisine and ingredients columns are selected.
df_cuisine = pd.DataFrame(json, columns=['id', 'cuisine', 'ingredients'])

In [7]:
# The id column is not used by any later cell, so drop it.
df_cuisine = df_cuisine.drop(columns=['id'])

In [8]:
# Keep only the first 10,000 recipes (by position) for the experiments below.
df_cuisine = df_cuisine.iloc[:10000]

In [9]:
# Preview the first five rows (head() defaults to n=5).
df_cuisine.head()

Unnamed: 0,cuisine,ingredients
0,greek,"[romaine lettuce, black olives, grape tomatoes..."
1,southern_us,"[plain flour, ground pepper, salt, tomatoes, g..."
2,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,indian,"[water, vegetable oil, wheat, salt]"
4,indian,"[black pepper, shallots, cornflour, cayenne pe..."


## Pre-Processing

In [10]:
def pre_processing(text):
  """Normalise a comma-separated ingredient string into space-separated words.

  Each comma-separated ingredient is lower-cased and stripped of punctuation,
  digits, the stand-alone token "oz" and the phrase "a taste of". The
  remaining words of all ingredients are joined with exactly one space, so
  removals never leave runs of spaces and ingredients that become empty
  contribute nothing.

  Args:
    text: Ingredients joined by commas, e.g. "Plain Flour,2 oz. Salt".

  Returns:
    The cleaned words as one string separated by single spaces ("" when
    nothing is left).
  """
  words = []
  for ingredient in text.split(','):
    ingredient = re.sub(r'[^\w\s]', '', ingredient.lower())
    ingredient = re.sub(r'[0-9]+', '', ingredient)
    ingredient = re.sub(r'\boz\b', '', ingredient)
    ingredient = re.sub(r'\ba taste of\b', '', ingredient)
    # split() collapses the whitespace gaps left by the removals above and
    # yields nothing for an ingredient that was reduced to an empty string.
    words.extend(ingredient.split())

  return " ".join(words)

In [11]:
# Flatten each recipe's ingredient list into one comma-separated string.
# Values that are already strings are left untouched so the cell is safe to
# re-run: joining a string would insert a comma between every character.
df_cuisine['ingredients'] = [
    x if isinstance(x, str) else ','.join(x).strip()
    for x in df_cuisine['ingredients']
]

In [12]:
# Clean every joined ingredient string with pre_processing.
df_cuisine['ingredients'] = df_cuisine['ingredients'].map(pre_processing)

In [13]:
# Preview the first five cleaned rows (head() defaults to n=5).
df_cuisine.head()

Unnamed: 0,cuisine,ingredients
0,greek,romaine lettuce black olives grape tomatoes ga...
1,southern_us,plain flour ground pepper salt tomatoes ground...
2,filipino,eggs pepper salt mayonaise cooking oil green c...
3,indian,water vegetable oil wheat salt
4,indian,black pepper shallots cornflour cayenne pepper...


# Different Embeddings

## Word2Vec

In [14]:
def word_embedding_w2v(tokenizer, model, vocab_size, embedding_dim):
  """Build an embedding matrix indexed by the tokenizer's word ids.

  Row i receives the vector of the word whose tokenizer.word_index value is
  i. A word that is missing from model.wv gets a random vector drawn from a
  normal distribution with mean 0 and standard deviation sqrt(0.25). Rows
  that no word maps to stay all-zero.

  Args:
    tokenizer: Object exposing `word_index`, a mapping word -> row index.
    model: Object exposing `wv`, which supports `in` and `get_vector(word)`.
    vocab_size: Number of rows of the matrix.
    embedding_dim: Number of columns of the matrix.

  Returns:
    A (vocab_size, embedding_dim) numpy array.
  """
  matrix = np.zeros((vocab_size, embedding_dim))
  keyed_vectors = model.wv
  for token, row in tokenizer.word_index.items():
    matrix[row] = (
        keyed_vectors.get_vector(token)
        if token in keyed_vectors
        else np.random.normal(0, np.sqrt(0.25), embedding_dim)
    )
  return matrix

In [15]:
def sentence_embedding_w2v(model, doc_tokens, embedding_dim):
  """Embed a document as the mean of its token vectors.

  Tokens found in model.wv contribute their stored vector; any other token
  contributes a random vector drawn from a normal distribution with mean 0
  and standard deviation sqrt(0.25).

  Args:
    model: Object exposing `wv`, which supports `in` and `get_vector(tok)`.
    doc_tokens: Iterable of token strings.
    embedding_dim: Length of the vectors.

  Returns:
    A numpy array of length embedding_dim. A document without tokens yields
    the zero vector, so every document maps to a vector of the same shape
    (np.mean of an empty list would give a scalar NaN instead).
  """
  embeddings = [
      model.wv.get_vector(tok) if tok in model.wv
      else np.random.normal(0, np.sqrt(0.25), embedding_dim)
      for tok in doc_tokens
  ]
  if not embeddings:
    return np.zeros(embedding_dim)
  return np.mean(embeddings, axis=0)

In [16]:
# Tokenizer: learn the word -> id mapping from the cleaned ingredient strings.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_cuisine['ingredients'])
# One row more than there are words: word_embedding_w2v uses the word ids
# directly as row numbers of a (vocab_size, dim) matrix.
vocab_size_cuisine = 1 + len(tokenizer.word_index)

In [17]:
# EMBEDDING LAYER

EMBEDDING_DIM_W2V = 100

# Whitespace-tokenise every recipe and train Word2Vec on the token lists.
# NOTE(review): workers=4 trains multi-threaded, which may make the vectors
# differ between runs; confirm whether exact reproducibility matters here.
sentences = [recipe.split() for recipe in df_cuisine['ingredients']]
model_word2vec = Word2Vec(
    sentences,
    vector_size=EMBEDDING_DIM_W2V,
    window=5,
    min_count=1,
    sg=1,
    workers=4,
)

# Per-word matrix (rows follow the tokenizer ids) and per-recipe mean vectors.
embedding_w2v = word_embedding_w2v(tokenizer, model_word2vec, vocab_size_cuisine, EMBEDDING_DIM_W2V)
sentences_w2v = df_cuisine['ingredients'].apply(
    lambda recipe: sentence_embedding_w2v(model_word2vec, recipe.split(), EMBEDDING_DIM_W2V)
)

## GloVe

In [18]:
def create_embedding_glove(glove, vocab_size, embedding_dim):
  """Copy the GloVe word vectors into a zero-initialised embedding matrix.

  For each of the len(glove.dictionary) entries, row i + 1 of the result
  receives glove.word_vectors[i]; row 0 and any rows past the dictionary
  size stay all-zero.

  NOTE(review): rows follow the order of glove.word_vectors, not the ids of
  the Keras tokenizer used for the Word2Vec matrix - confirm this is what
  downstream consumers expect.

  Args:
    glove: Object exposing `dictionary` (sized) and `word_vectors`
      (indexable by row).
    vocab_size: Number of rows of the matrix.
    embedding_dim: Number of columns of the matrix.

  Returns:
    A (vocab_size, embedding_dim) numpy array.
  """
  embedding_glove = np.zeros((vocab_size, embedding_dim))
  n_words = len(glove.dictionary)
  for row in range(n_words):
    embedding_glove[row + 1] = glove.word_vectors[row]
  return embedding_glove

In [19]:
def sentence_embedding_glove(model, doc_tokens, embedding_dim):
  """Embed a document as the mean of its GloVe token vectors.

  Tokens present in model.dictionary contribute the matching row of
  model.word_vectors; any other token contributes a random vector drawn from
  a normal distribution with mean 0 and standard deviation sqrt(0.25).

  Args:
    model: Object exposing `dictionary` (token -> row index) and
      `word_vectors` (indexable by row). The function reads this argument,
      not a global variable, so it works with any fitted model passed in.
    doc_tokens: Iterable of token strings.
    embedding_dim: Length of the vectors.

  Returns:
    A numpy array of length embedding_dim. A document without tokens yields
    the zero vector, so every document maps to a vector of the same shape
    (np.mean of an empty list would give a scalar NaN instead).
  """
  embeddings = []
  for tok in doc_tokens:
    if tok in model.dictionary:
      embeddings.append(model.word_vectors[model.dictionary[tok]])
    else:
      embeddings.append(np.random.normal(0, np.sqrt(0.25), embedding_dim))
  if not embeddings:
    return np.zeros(embedding_dim)
  return np.mean(embeddings, axis=0)

In [20]:
# Tokenise the recipes on whitespace and fit a glove Corpus on them; its
# .matrix and .dictionary are consumed by the GloVe training cell.
sentences = [recipe.split() for recipe in df_cuisine['ingredients']]

corpus = Corpus()
corpus.fit(sentences, window=10)

In [21]:
# EMBEDDING LAYER

EMBEDDING_DIM_GLV = 100

# Train GloVe on the fitted corpus matrix, then attach the corpus dictionary
# so tokens can be looked up in the trained model.
glove = Glove(learning_rate=0.05, no_components=EMBEDDING_DIM_GLV)
glove.fit(corpus.matrix, no_threads=2, epochs=30)
glove.add_dictionary(corpus.dictionary)

# Per-word matrix and per-recipe mean vectors.
embedding_glove = create_embedding_glove(glove, vocab_size_cuisine, EMBEDDING_DIM_GLV)
sentences_glove = df_cuisine['ingredients'].apply(
    lambda recipe: sentence_embedding_glove(glove, recipe.split(), EMBEDDING_DIM_GLV)
)

# Text Retrieval

In [22]:
from sklearn.metrics.pairwise import cosine_similarity

In [23]:
def text_retrieval(dataset, embedding, k=10):
  """Count, per neighbour rank, how often a neighbour shares the query's label.

  Every item is used once as a query. Its k most cosine-similar *other*
  items are ranked (rank 0 = most similar) and a hit is recorded at a rank
  when that neighbour's label equals the query's label.

  Args:
    dataset: Sequence of labels (e.g. a pandas Series of cuisine names),
      read by position.
    embedding: Sequence of equal-length vectors; embedding[i] belongs to
      item i of dataset. Only the first len(dataset) vectors are used.
    k: Number of neighbours ranked per query (default 10).

  Returns:
    A list of k ints; entry r is the number of queries whose rank-r
    neighbour has the same label as the query.
  """
  top_k = [0] * k
  labels = np.asarray(dataset, dtype=object)
  size = len(labels)
  if size == 0:
    return top_k

  # Normalise the vectors once; afterwards each cosine similarity is a plain
  # dot product, so nothing is re-normalised inside the loop.
  vectors = np.asarray(embedding[:size], dtype=float)
  norms = np.linalg.norm(vectors, axis=1, keepdims=True)
  norms[norms == 0] = 1.0  # an all-zero vector gets similarity 0, not NaN
  unit = vectors / norms

  for i in range(size):
    similarity = unit @ unit[i]
    # Stable sort: equal similarities are ordered by ascending index.
    ranked = np.argsort(-similarity, kind='stable')
    # Remove the query by index instead of assuming it sorts first; with
    # duplicate vectors the query can tie with other items.
    neighbours = ranked[ranked != i][:k]

    for rank, j in enumerate(neighbours):
      # Exact label equality (`in` on two strings would be a substring test).
      top_k[rank] += int(labels[j] == labels[i])

  return top_k

## Word2Vec

In [24]:
# Score the Word2Vec recipe vectors: for each neighbour rank, count how many
# recipes have a neighbour of the same cuisine at that rank.
top10_word2vec = text_retrieval(df_cuisine['cuisine'], sentences_w2v.tolist())

In [25]:
# Per-rank hit counts returned by text_retrieval for Word2Vec (rank 0 first).
top10_word2vec

[6335, 5951, 5889, 5800, 5707, 5576, 5623, 5537, 5421, 5465]

## GloVe

In [26]:
# Score the GloVe recipe vectors the same way: per neighbour rank, count how
# many recipes have a neighbour of the same cuisine at that rank.
top10_glove = text_retrieval(df_cuisine['cuisine'], sentences_glove.tolist())

In [27]:
# Per-rank hit counts returned by text_retrieval for GloVe (rank 0 first).
top10_glove

[5615, 5394, 5234, 5142, 4972, 5021, 4911, 4881, 4859, 4779]