<a href="https://colab.research.google.com/github/dudaholandah/NLP/blob/main/Projeto/Text_Retrieval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Google Drive

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!cp -r '/content/drive/MyDrive/Datasets/cuisine-classification-ingredients' 'cuisine-classification-ingredients'

# Imports

In [None]:
!pip install -q -U "tensorflow-text==2.8.*"
!pip install -q -U tf-models-official==2.7.0
!pip install -U tfds-nightly
!pip install -U sentence-transformers

In [6]:
import json
import os
import pandas as pd
import re
import numpy as np

In [5]:
#TFIDF
from sklearn.feature_extraction.text import TfidfVectorizer
#WORD2VEC
from gensim.models import Word2Vec
import nltk
nltk.download('punkt')
#BERT
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_datasets as tfds
import tensorflow_text as text
import tensorflow_addons as tfa
from official.nlp import optimization
from sentence_transformers import SentenceTransformer
tf.get_logger().setLevel('ERROR')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


# Data

In [7]:
train_path = 'cuisine-classification-ingredients/train.json'
test_path = 'cuisine-classification-ingredients/test.json'

with open(train_path, 'r') as f:
  train_json = json.load(f)

In [8]:
train_df = pd.DataFrame(train_json, columns=['id', 'cuisine', 'ingredients'])

In [9]:
train_df = train_df.drop(['id'], axis=1)

## Pre-Processing

In [10]:
def pre_processing(text):
  new = ""
  for word in text.split(','):
    word = re.sub(r'[^\w\s]', '', word.lower())
    word = re.sub(r'[0-9]+', '', word)
    word = re.sub(r'\boz\b', '', word)
    word = re.sub(r'\ba taste of\b', '', word)
    new = new + word.strip() + " "

  return new[:-1]

In [11]:
train_df['ingredients'] = [','.join(x).strip() for x in train_df['ingredients']]

In [12]:
train_df['ingredients'] = [pre_processing(x) for x in train_df['ingredients']]

In [13]:
train_df['ingredients']

0        romaine lettuce black olives grape tomatoes ga...
1        plain flour ground pepper salt tomatoes ground...
2        eggs pepper salt mayonaise cooking oil green c...
3                           water vegetable oil wheat salt
4        black pepper shallots cornflour cayenne pepper...
                               ...                        
39769    light brown sugar granulated sugar butter warm...
39770    kraft zesty italian dressing purple onion broc...
39771    eggs citrus fruit raisins sourdough starter fl...
39772    boneless chicken skinless thigh minced garlic ...
39773    green chile jalapeno chilies onions ground bla...
Name: ingredients, Length: 39774, dtype: object

# Different Embeddings

## TFIDF

In [14]:
tfidf_vectorizer = TfidfVectorizer() #tokenizer=(lambda x : x.split(',')))

In [15]:
ingredients = train_df['ingredients']
train_tfidf = tfidf_vectorizer.fit_transform(ingredients)

In [16]:
print(tfidf_vectorizer.get_feature_names_out())

['abalone' 'abbamele' 'absinthe' ... 'ziti' 'zucchini' 'épices']


In [17]:
tfidf_df = pd.DataFrame(train_tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
#tfidf_df = tfidf_df.drop([''], axis=1)

In [18]:
len(tfidf_vectorizer.get_feature_names_out())

3057

## Word2Vec

In [19]:
ingredients = '. '.join(train_df['ingredients'])
sentences = nltk.sent_tokenize(ingredients)
sentences = [nltk.word_tokenize(sentence) for sentence in sentences]

In [20]:
model_word2vec = Word2Vec(sentences, size=300, min_count=1, window=5, sg=1,workers=4)

In [None]:
model_word2vec.wv.vocab

In [22]:
def get_embedding_w2v(doc_tokens):
    embeddings = []
    for tok in doc_tokens:
        if tok in model_word2vec.wv.vocab:
                embeddings.append(model_word2vec.wv.word_vec(tok))
        else:
            embeddings.append(np.random.rand(300))
    return np.mean(embeddings, axis=0)

In [23]:
senteces_word2vec = train_df['ingredients'].apply(lambda x : get_embedding_w2v(x.split()))

## BERT

In [24]:
bert_model_name = 'small_bert/bert_en_uncased_L-2_H-512_A-8' 
tfhub_handle_encoder = 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-512_A-8/1'
tfhub_handle_preprocess = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'

print('BERT model selected           :', tfhub_handle_encoder)
print('Preprocessing model auto-selected:', tfhub_handle_preprocess)

BERT model selected           : https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-512_A-8/1
Preprocessing model auto-selected: https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3


In [25]:
bert_preprocess_model = hub.load(tfhub_handle_preprocess)
text_test = ['this is such an amazing movie!']
text_preprocessed = bert_preprocess_model(text_test)

In [26]:
bert_model = hub.KerasLayer(tfhub_handle_encoder)

### https://www.tensorflow.org/text/tutorials/classify_text_with_bert
### https://colab.research.google.com/github/tensorflow/text/blob/master/docs/tutorials/bert_glue.ipynb#scrollTo=aksj743St9ga

In [None]:
bert_model_sentence = SentenceTransformer('bert-base-nli-mean-tokens')

In [28]:
sentence_vecs = bert_model_sentence.encode(train_df['cuisine'])

# Different Classificators

# Text Retrieval

In [29]:
def cosine_similarity(vetor1,vetor2):

    prod_interno = 0

    for i in range(len(vetor1)):
        prod_interno += vetor1[i]*vetor2[i]

    norma_vetor1 = sum([x**2 for x in vetor1])**0.5
    norma_vetor2 = sum([x**2 for x in vetor2])**0.5

    return prod_interno/(norma_vetor1*norma_vetor2)

In [None]:
categ = train_df['cuisine'][:1000]
top10_tfidf = [0] * 10

for i in range(len(categ)):
    best = []
    for j in range(len(categ)):
        if i == j: continue
        best.append( (cosine_similarity(tfidf_df.iloc[j], tfidf_df.iloc[i]), j))

    best.sort(reverse=True)
    top = best[:10]

    cat_orig = train_df['cuisine'].iloc[i]

    for idx, [sim, j] in enumerate(top):
        top10_tfidf[idx] += cat_orig in train_df['cuisine'].iloc[j]

In [None]:
top10_tfidf

[543, 467, 486, 461, 431, 427, 424, 417, 425, 387]

In [30]:
categ = train_df['cuisine'][:1000]
top10_bert = [0] * 10

for i in range(len(categ)):
    best = []
    for j in range(len(categ)):
        if i == j: continue
        best.append( (cosine_similarity(sentence_vecs[i], sentence_vecs[j]), j))

    best.sort(reverse=True)
    top = best[:10]

    cat_orig = train_df['cuisine'].iloc[i]

    for idx, [sim, j] in enumerate(top):
        top10_bert[idx] += cat_orig in train_df['cuisine'].iloc[j]

In [31]:
top10_bert

[1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000]

In [32]:
categ = train_df['cuisine'][:1000]
top10_word2vec = [0] * 10

for i in range(len(categ)):
    best = []
    for j in range(len(categ)):
        if i == j: continue
        best.append( (cosine_similarity(senteces_word2vec[i], senteces_word2vec[j]), j))

    best.sort(reverse=True)
    top = best[:10]

    cat_orig = train_df['cuisine'].iloc[i]

    for idx, [sim, j] in enumerate(top):
        top10_word2vec[idx] += cat_orig in train_df['cuisine'].iloc[j]

In [33]:
top10_word2vec

[595, 556, 551, 538, 508, 506, 486, 484, 474, 480]