### Text Preprocessing

In [35]:
#load datas
import pandas as pd
#df = pd.read_csv('data/winemag-data_first150k.csv')
#df = pd.read_csv('data/winemag-data-130k-v2.csv')
df = pd.read_json('data/winemag-data-130k-v2.json')
print(df.head())

   points                                              title  \
0      87                  Nicosia 2013 Vulkà Bianco  (Etna)   
1      87      Quinta dos Avidagos 2011 Avidagos Red (Douro)   
2      87      Rainstorm 2013 Pinot Gris (Willamette Valley)   
3      87  St. Julian 2013 Reserve Late Harvest Riesling ...   
4      87  Sweet Cheeks 2012 Vintner's Reserve Wild Child...   

                                         description         taster_name  \
0  Aromas include tropical fruit, broom, brimston...       Kerin O’Keefe   
1  This is ripe and fruity, a wine that is smooth...          Roger Voss   
2  Tart and snappy, the flavors of lime flesh and...        Paul Gregutt   
3  Pineapple rind, lemon pith and orange blossom ...  Alexander Peartree   
4  Much like the regular bottling from 2012, this...        Paul Gregutt   

  taster_twitter_handle  price                         designation  \
0          @kerinokeefe    NaN                        Vulkà Bianco   
1            @voss

In [36]:
print(df.shape)

(129971, 13)


In [37]:
corpus = df['description']

In [38]:
#delete stopwords
from nltk.corpus import stopwords
import string

stop_words = set(stopwords.words('english'))
punctuations = list(string.punctuation)
stop_words.update(punctuations)


In [39]:
def clean_text(text):
    # Convertir el texto a minúsculas
    text = text.lower()
    # Eliminar signos de puntuación
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Eliminar stopwords
    text = ' '.join(word for word in text.split() if word not in stop_words)
    return text

In [40]:
df['cleaned_description'] = corpus.apply(clean_text)
print(df['cleaned_description'])

0         aromas include tropical fruit broom brimstone ...
1         ripe fruity wine smooth still structured firm ...
2         tart snappy flavors lime flesh rind dominate g...
3         pineapple rind lemon pith orange blossom start...
4         much like regular bottling 2012 comes across r...
                                ...                        
129966    notes honeysuckle cantaloupe sweeten delicious...
129967    citation given much decade bottle age prior re...
129968    welldrained gravel soil gives wine crisp dry c...
129969    dry style pinot gris crisp acidity also weight...
129970    big rich offdry powered intense spiciness roun...
Name: cleaned_description, Length: 129971, dtype: object


In [41]:
textos = df['cleaned_description'].tolist()
len(textos)

129971

### Bert

In [69]:
from transformers import BertTokenizer, TFBertModel

# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [70]:
import numpy as np 

def generate_bert_embeddings(texts):
    embeddings = []
    for text in texts:
        inputs = tokenizer(text, return_tensors='tf', padding=True, truncation=True)
        outputs = bert_model(**inputs)
        embeddings.append(outputs.last_hidden_state[:, 0, :])  # Use [CLS] token representation
    return np.array(embeddings).transpose(0,2,1)

corpus_bert = generate_bert_embeddings(df['cleaned_description'][:10])

In [71]:
corpus_bert.shape

(10, 768, 1)

Step 7: Generate Embeddings for the Query

Generate embeddings for the query using the same model used for the documents.

In [72]:
query = ['two words']
query_bert = generate_bert_embeddings(query)
#query_bert.shape

Step 8: Compute Similarity Between Query and Documents

Compute the similarity between the query embedding and each document embedding.

In [73]:
from sklearn.metrics.pairwise import cosine_similarity
# Calcular la similitud coseno entre la consulta y los embeddings del corpus
cos_similarities = cosine_similarity(query_bert.reshape(1,768), corpus_bert.reshape(10,768))
cos_similarities_df = pd.DataFrame(cos_similarities.T, columns=['sim'])
# Agregar la columna 'designation' del DataFrame original
cos_similarities_df['Wine'] = df['title']
# Mostrar el DataFrame resultante
cos_similarities_df

Unnamed: 0,sim,Wine
0,0.755257,Nicosia 2013 Vulkà Bianco (Etna)
1,0.77296,Quinta dos Avidagos 2011 Avidagos Red (Douro)
2,0.735794,Rainstorm 2013 Pinot Gris (Willamette Valley)
3,0.777312,St. Julian 2013 Reserve Late Harvest Riesling ...
4,0.809283,Sweet Cheeks 2012 Vintner's Reserve Wild Child...
5,0.815357,Tandem 2011 Ars In Vitro Tempranillo-Merlot (N...
6,0.80068,Terre di Giurfo 2013 Belsito Frappato (Vittoria)
7,0.824024,Trimbach 2012 Gewurztraminer (Alsace)
8,0.847462,Heinz Eifel 2013 Shine Gewürztraminer (Rheinhe...
9,0.766631,Jean-Baptiste Adam 2012 Les Natures Pinot Gris...


Step 9: Retrieve and Rank Documents Based on Similarity Scores

Retrieve and rank the documents based on their similarity scores to the query.

In [74]:
def systemRI(query):
    query_bert = generate_bert_embeddings(query)
    cos_similarities = cosine_similarity(query_bert.reshape(1,768), corpus_bert.reshape(10,768))
    cos_similarities_df = pd.DataFrame(cos_similarities.T, columns=['sim'])
    # Agregar la columna 'designation' del DataFrame original
    cos_similarities_df['Wine'] = df['title']
    # Mostrar el DataFrame resultante
    return cos_similarities_df

In [75]:
df_RI_Bert = systemRI(['guerrerin cabra'])
df_RI_Bert

Unnamed: 0,sim,Wine
0,0.773238,Nicosia 2013 Vulkà Bianco (Etna)
1,0.78675,Quinta dos Avidagos 2011 Avidagos Red (Douro)
2,0.776004,Rainstorm 2013 Pinot Gris (Willamette Valley)
3,0.81266,St. Julian 2013 Reserve Late Harvest Riesling ...
4,0.827445,Sweet Cheeks 2012 Vintner's Reserve Wild Child...
5,0.853319,Tandem 2011 Ars In Vitro Tempranillo-Merlot (N...
6,0.832088,Terre di Giurfo 2013 Belsito Frappato (Vittoria)
7,0.837732,Trimbach 2012 Gewurztraminer (Alsace)
8,0.866765,Heinz Eifel 2013 Shine Gewürztraminer (Rheinhe...
9,0.782025,Jean-Baptiste Adam 2012 Les Natures Pinot Gris...


### Word2Vec

In [49]:
import gensim

In [50]:
model_path = ('data/GoogleNews-vectors-negative300.bin.gz')
# Cargar el modelo de Word2Vec
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(model_path, binary=True)

In [51]:
def generate_word2vec_embeddings(texts):
    embeddings = []
    for text in texts:
        tokens = text.lower().split()
        word_vectors = [word2vec_model[word] for word in tokens if word in word2vec_model]
        if word_vectors:
            embeddings.append(np.mean(word_vectors, axis=0))
        else:
            embeddings.append(np.zeros(word2vec_model.vector_size))
    return np.array(embeddings)

Step 7.1: Generate Embeddings for the Query

Generate embeddings for the query using the same model used for the documents.

In [52]:
corpus_word2vec = generate_word2vec_embeddings(textos[:10])

In [53]:
corpus_word2vec.shape

(10, 300)

In [54]:
query_word2vec = generate_word2vec_embeddings(query)
query_word2vec.shape

(1, 300)

Step 8.1: Compute Similarity Between Query and Documents

Compute the similarity between the query embedding and each document embedding.

In [58]:
from sklearn.metrics.pairwise import cosine_similarity
# Calcular la similitud coseno entre la consulta y los embeddings del corpus
cos_similarities = cosine_similarity(query_word2vec.reshape(1,300), corpus_word2vec.reshape(10,300))
cos_similarities_df = pd.DataFrame(cos_similarities.T, columns=['sim'])
# Agregar la columna 'designation' del DataFrame original
cos_similarities_df['Wine'] = df['title']
# Mostrar el DataFrame resultante
cos_similarities_df

Unnamed: 0,sim,Wine
0,0.235284,Nicosia 2013 Vulkà Bianco (Etna)
1,0.179255,Quinta dos Avidagos 2011 Avidagos Red (Douro)
2,0.205595,Rainstorm 2013 Pinot Gris (Willamette Valley)
3,0.250051,St. Julian 2013 Reserve Late Harvest Riesling ...
4,0.260209,Sweet Cheeks 2012 Vintner's Reserve Wild Child...
5,0.236145,Tandem 2011 Ars In Vitro Tempranillo-Merlot (N...
6,0.205847,Terre di Giurfo 2013 Belsito Frappato (Vittoria)
7,0.166919,Trimbach 2012 Gewurztraminer (Alsace)
8,0.221394,Heinz Eifel 2013 Shine Gewürztraminer (Rheinhe...
9,0.182069,Jean-Baptiste Adam 2012 Les Natures Pinot Gris...


Step 9: Retrieve and Rank Documents Based on Similarity Scores

Retrieve and rank the documents based on their similarity scores to the query.

In [64]:
def systemRI(query):
    query_word2vec = generate_word2vec_embeddings(query)
    cos_similarities = cosine_similarity(query_word2vec.reshape(1,300), corpus_word2vec.reshape(10,300))
    cos_similarities_df = pd.DataFrame(cos_similarities.T, columns=['sim'])
    # Agregar la columna 'designation' del DataFrame original
    cos_similarities_df['Wine'] = df['title']
    # Mostrar el DataFrame resultante
    return cos_similarities_df

In [68]:
df_RI_word2Vec = systemRI(['messi best'])
df_RI_word2Vec

Unnamed: 0,sim,Wine
0,0.18279,Nicosia 2013 Vulkà Bianco (Etna)
1,0.24426,Quinta dos Avidagos 2011 Avidagos Red (Douro)
2,0.206796,Rainstorm 2013 Pinot Gris (Willamette Valley)
3,0.239908,St. Julian 2013 Reserve Late Harvest Riesling ...
4,0.273934,Sweet Cheeks 2012 Vintner's Reserve Wild Child...
5,0.224171,Tandem 2011 Ars In Vitro Tempranillo-Merlot (N...
6,0.228074,Terre di Giurfo 2013 Belsito Frappato (Vittoria)
7,0.179272,Trimbach 2012 Gewurztraminer (Alsace)
8,0.186118,Heinz Eifel 2013 Shine Gewürztraminer (Rheinhe...
9,0.234555,Jean-Baptiste Adam 2012 Les Natures Pinot Gris...
