### Text Preprocessing

In [1]:
# Load the wine-review dataset into `df`.
import pandas as pd
# Alternative CSV sources, kept disabled:
#df = pd.read_csv('data/winemag-data_first150k.csv')
#df = pd.read_csv('data/winemag-data-130k-v2.csv')
df = pd.read_json('data/winemag-data-130k-v2.json')
# Preview the first rows.
print(df.head())

   points                                              title  \
0      87                  Nicosia 2013 Vulkà Bianco  (Etna)   
1      87      Quinta dos Avidagos 2011 Avidagos Red (Douro)   
2      87      Rainstorm 2013 Pinot Gris (Willamette Valley)   
3      87  St. Julian 2013 Reserve Late Harvest Riesling ...   
4      87  Sweet Cheeks 2012 Vintner's Reserve Wild Child...   

                                         description         taster_name  \
0  Aromas include tropical fruit, broom, brimston...       Kerin O’Keefe   
1  This is ripe and fruity, a wine that is smooth...          Roger Voss   
2  Tart and snappy, the flavors of lime flesh and...        Paul Gregutt   
3  Pineapple rind, lemon pith and orange blossom ...  Alexander Peartree   
4  Much like the regular bottling from 2012, this...        Paul Gregutt   

  taster_twitter_handle  price                         designation  \
0          @kerinokeefe    NaN                        Vulkà Bianco   
1            @voss

In [2]:
# Report the dataset dimensions as (rows, columns).
print(df.shape)

(129971, 13)


In [3]:
# The review text column is the corpus that gets cleaned and embedded below.
corpus = df['description']

In [4]:
# Build the stopword set that clean_text filters against.
from nltk.corpus import stopwords
import string

# NLTK's English stopword list, stored as a set.
stop_words = set(stopwords.words('english'))
# Also treat every ASCII punctuation character as a one-character stopword.
punctuations = list(string.punctuation)
stop_words.update(punctuations)


In [5]:
def clean_text(text):
    """Normalize one description for embedding.

    The text is lowercased, every character in ``string.punctuation`` is
    deleted, and whitespace-separated words found in the module-level
    ``stop_words`` set are dropped. The surviving words are re-joined with
    single spaces.

    Parameters
    ----------
    text : str
        Raw description text.

    Returns
    -------
    str
        The cleaned text.
    """
    # Translation table that deletes ASCII punctuation characters.
    punctuation_table = str.maketrans('', '', string.punctuation)
    normalized = text.lower().translate(punctuation_table)
    # Stopwords are matched after punctuation has already been stripped.
    kept_words = [token for token in normalized.split() if token not in stop_words]
    return ' '.join(kept_words)

In [28]:
# Store the cleaned version of every description in a new column, then print it.
df['cleaned_description'] = corpus.apply(clean_text)
print(df['cleaned_description'])

0    aromas include tropical fruit broom brimstone ...
1    ripe fruity wine smooth still structured firm ...
2    tart snappy flavors lime flesh rind dominate g...
3    pineapple rind lemon pith orange blossom start...
4    much like regular bottling 2012 comes across r...
5    blackberry raspberry aromas show typical navar...
6    heres bright informal red opens aromas candied...
7    dry restrained wine offers spice profusion bal...
8    savory dried thyme notes accent sunnier flavor...
9    great depth flavor fresh apple pear fruits tou...
Name: cleaned_description, dtype: object


In [29]:
# Plain Python list of the cleaned descriptions; the parallel section splits
# this list into chunks.
textos = df['cleaned_description'].tolist()
len(textos)

10

### BERT

In [7]:
from transformers import BertTokenizer, TFBertModel

# Load the pre-trained 'bert-base-uncased' tokenizer and TensorFlow BERT model.
# Both are read as module-level globals by generate_bert_embeddings.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on,

In [11]:
import numpy as np 

def generate_bert_embeddings(texts):
    """Embed each text with BERT and return the [CLS] vectors as one array.

    Uses the module-level ``tokenizer`` and ``bert_model``. Each text is
    tokenized and run through the model on its own; the hidden state at
    position 0 (the [CLS] token) is kept as that text's embedding.

    Parameters
    ----------
    texts : str or iterable of str
        Texts to embed. A single string is treated as one text instead of
        being iterated character by character.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_texts, hidden_size, 1)``. For empty input the
        result has shape ``(0, hidden_size, 1)``.
    """
    # Iterating a bare string would embed every character separately.
    if isinstance(texts, str):
        texts = [texts]

    embeddings = []
    for text in texts:
        inputs = tokenizer(text, return_tensors='tf', padding=True, truncation=True)
        outputs = bert_model(**inputs)
        # Convert to NumPy right away so the final stacking step works on
        # plain arrays rather than on a list of TensorFlow tensors.
        embeddings.append(outputs.last_hidden_state[:, 0, :].numpy())

    if not embeddings:
        # transpose(0, 2, 1) needs a 3-D array, which an empty list cannot give.
        return np.empty((0, bert_model.config.hidden_size, 1), dtype=np.float32)

    # (n_texts, 1, hidden) -> (n_texts, hidden, 1), the layout callers reshape from.
    return np.stack(embeddings).transpose(0, 2, 1)

# Embed only the first 10 cleaned descriptions (a small sample of the corpus).
corpus_bert = generate_bert_embeddings(df['cleaned_description'][:10])

In [12]:
# Inspect the shape of the corpus embedding array.
corpus_bert.shape

(10, 768, 1)

Step 7: Generate Embeddings for the Query

Generate embeddings for the query using the same model used for the documents.

In [13]:
# Embed a sample query with the same function used for the corpus.
query = ['two words']
query_bert = generate_bert_embeddings(query)
#query_bert.shape

In [14]:
from sklearn.metrics.pairwise import cosine_similarity
# Cosine similarity between the query embedding and every corpus embedding.
# Row counts come from the arrays themselves, so the cell keeps working when
# the corpus sample size or the model's hidden size changes.
cos_similarities = cosine_similarity(
    query_bert.reshape(len(query_bert), -1),
    corpus_bert.reshape(len(corpus_bert), -1),
)
cos_similarities_df = pd.DataFrame(cos_similarities.T, columns=['sim'])
# Attach the 'title' column of the original DataFrame (aligned on the row index).
cos_similarities_df['Wine'] = df['title']
# Display the resulting DataFrame.
cos_similarities_df

Unnamed: 0,sim,Wine
0,0.755257,Nicosia 2013 Vulkà Bianco (Etna)
1,0.77296,Quinta dos Avidagos 2011 Avidagos Red (Douro)
2,0.735794,Rainstorm 2013 Pinot Gris (Willamette Valley)
3,0.777312,St. Julian 2013 Reserve Late Harvest Riesling ...
4,0.809283,Sweet Cheeks 2012 Vintner's Reserve Wild Child...
5,0.815357,Tandem 2011 Ars In Vitro Tempranillo-Merlot (N...
6,0.80068,Terre di Giurfo 2013 Belsito Frappato (Vittoria)
7,0.824024,Trimbach 2012 Gewurztraminer (Alsace)
8,0.847462,Heinz Eifel 2013 Shine Gewürztraminer (Rheinhe...
9,0.766631,Jean-Baptiste Adam 2012 Les Natures Pinot Gris...


In [15]:
def systemRI(query):
    """Score every embedded corpus document against a query.

    Reads the module-level ``corpus_bert`` (corpus embeddings) and ``df``
    (wine dataset, used for its ``title`` column), so ``df`` must still be
    the dataset when this runs.

    Parameters
    ----------
    query : list of str
        Query text(s) passed to ``generate_bert_embeddings``. One query is
        expected, because the scores are stored in a single ``sim`` column.

    Returns
    -------
    pandas.DataFrame
        One row per corpus embedding with columns ``sim`` (cosine similarity
        to the query) and ``Wine`` (the title taken from ``df``).
    """
    query_bert = generate_bert_embeddings(query)
    # np.asarray accepts both an ndarray and a list of per-text arrays, and
    # the row count is taken from the data instead of hard-coding (1, 768)
    # and (10, 768).
    query_matrix = np.asarray(query_bert).reshape(len(query_bert), -1)
    corpus_matrix = np.asarray(corpus_bert).reshape(len(corpus_bert), -1)
    cos_similarities = cosine_similarity(query_matrix, corpus_matrix)
    cos_similarities_df = pd.DataFrame(cos_similarities.T, columns=['sim'])
    # Attach the 'title' column of the dataset (aligned on the row index).
    cos_similarities_df['Wine'] = df['title']
    return cos_similarities_df

In [16]:
# Keep the ranking in its own variable. Binding it to `df` would replace the
# wine dataset that systemRI (df['title']) and the preprocessing cells read.
ranking_df = systemRI(['guerrerin cabra'])
ranking_df

Unnamed: 0,sim,Wine
0,0.773238,Nicosia 2013 Vulkà Bianco (Etna)
1,0.78675,Quinta dos Avidagos 2011 Avidagos Red (Douro)
2,0.776004,Rainstorm 2013 Pinot Gris (Willamette Valley)
3,0.81266,St. Julian 2013 Reserve Late Harvest Riesling ...
4,0.827445,Sweet Cheeks 2012 Vintner's Reserve Wild Child...
5,0.853319,Tandem 2011 Ars In Vitro Tempranillo-Merlot (N...
6,0.832088,Terre di Giurfo 2013 Belsito Frappato (Vittoria)
7,0.837732,Trimbach 2012 Gewurztraminer (Alsace)
8,0.866765,Heinz Eifel 2013 Shine Gewürztraminer (Rheinhe...
9,0.782025,Jean-Baptiste Adam 2012 Les Natures Pinot Gris...


# Computación Paralela

In [17]:
# Función para dividir el corpus en fragmentos
def chunks(lst, n):
    """Yield consecutive slices of ``lst`` holding at most ``n`` items each.

    Parameters
    ----------
    lst : sequence
        Any object supporting ``len()`` and slicing.
    n : int
        Maximum number of items per chunk; must be at least 1.

    Yields
    ------
    sequence
        ``lst[i:i + n]`` for ``i = 0, n, 2n, ...``; the last chunk may be shorter.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1. Because this is a generator, the error
        surfaces when iteration starts.
    """
    # A negative step would silently yield nothing and a zero step makes
    # range() fail with an unrelated-looking message, so reject both up front.
    if n < 1:
        raise ValueError(f"chunk size n must be a positive integer, got {n!r}")
    for start in range(0, len(lst), n):
        yield lst[start:start + n]

In [19]:
# Función para generar embeddings paralelo BERT
def generate_bert_embeddings(texts):
    """Return the [CLS] embedding of each text as a list of NumPy arrays.

    This definition replaces the earlier function of the same name once the
    cell runs. Unlike that version it returns a plain list (one array per
    text) rather than a single transposed array, so the per-chunk results
    can be combined afterwards.

    Parameters
    ----------
    texts : iterable of str
        Texts to embed, each processed on its own.

    Returns
    -------
    list of numpy.ndarray
        One array per text, taken from position 0 of ``last_hidden_state``.
    """
    def cls_vector(text):
        encoded = tokenizer(text, return_tensors='tf', padding=True, truncation=True)
        hidden_states = bert_model(**encoded).last_hidden_state
        # Position 0 holds the [CLS] token representation.
        return hidden_states[:, 0, :].numpy()

    return [cls_vector(text) for text in texts]

In [30]:
#Definir el número de fragmentos y de procesos paralelos
num_chunks = 4  # Ajusta según tu hardware
chunk_size = len(textos) // num_chunks
text_chunks = list(chunks(textos, chunk_size))

In [32]:
from multiprocessing import Pool
# Generate embeddings in parallel: start num_chunks worker processes and map
# generate_bert_embeddings over the chunks. pool.map returns the per-chunk
# results in the same order as text_chunks.
# NOTE(review): worker processes need their own usable copy of the TensorFlow
# model and a picklable function; this may hang or fail depending on the
# platform's process start method - confirm the cell completes where it runs.
with Pool(num_chunks) as pool:
    result = pool.map(generate_bert_embeddings, text_chunks)


In [None]:
# Combine the per-chunk results into a single array.
# Each chunk result is a list of (1, hidden) arrays, so stacking all chunks
# along the first axis gives shape (n_texts, 1, hidden).
corpus_bert_paralelo = np.vstack(result)
# Transpose the parallel result itself (not corpus_bert) so it ends up in the
# same (n_texts, hidden, 1) layout as corpus_bert.
corpus_bert_paralelo = corpus_bert_paralelo.transpose(0, 2, 1)