### Imports

In [9]:
import torch
import pandas as pd
from nltk import word_tokenize
from datasets import Dataset
from transformers import AutoTokenizer
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import semantic_search
import customized_headlines.data.load as load

### Load model [`gtr-t5-large`](https://huggingface.co/sentence-transformers/gtr-t5-large) form Hugging Face

In [2]:
# Load model
model = SentenceTransformer("sentence-transformers/gtr-t5-large")

### Load data

In [6]:
news = load.news
data_raw = load.data_raw
data_processed = load.data_processed

# Get the csv files from the path provided
files = load.get_csv_dir(data_raw)

### Read data

In [7]:
df = load.read_csv(news)

# Show dataframe
df.head(1)

Unnamed: 0,archived_url,article_url,id,language,media_name,media_url,publish_date,title,url,category,text,text_cleanned
0,https://web.archive.org/web/20230307062430/htt...,https://wayback-api.archive.org/colsearch/v1/m...,20230307062430,es,24-horas.mx,http://24-horas.mx,2023-03-06 00:00:00,AMLO; apoya a Samuel García ante conflicto con...,https://www.24-horas.mx/2023/03/06/amlo-apoya-...,negocios,AMLO; apoya a Samuel García ante conflicto con...,AMLO; apoya a Samuel García ante conflicto con...


### Get tokens

In [None]:
max_tokens = 512

# Remove reviews that are too long to embed
df["n_tokens"] = df.text_cleanned.apply(lambda x: len(encoding.encode(x)))
df_filtered = df[df.n_tokens <= max_tokens]
len(df_filtered)

In [14]:
from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("bert-base-uncased")

# Tokenize text in PyTorch with model
df['tokens-bert'] = df.apply(lambda row: model(tokenizer(row['text_cleanned'], return_tensors='pt')), axis=1)
df

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Token indices sequence length is longer than the specified maximum sequence length for this model (12670 > 512). Running this sequence through the model wi

AttributeError: 

In [4]:
# Tokenize column
df['tokens'] = df.apply(lambda row: word_tokenize(row['text_cleanned']), axis=1)

# Clean tokens and count them
df['token_count'] = df.tokens.replace(',','').str.len()

# Show dataframe
df.head(1)

Unnamed: 0,archived_url,article_url,id,language,media_name,media_url,publish_date,title,url,category,text,text_cleanned,tokens,token_count
0,https://web.archive.org/web/20230307062430/htt...,https://wayback-api.archive.org/colsearch/v1/m...,20230307062430,es,24-horas.mx,http://24-horas.mx,2023-03-06 00:00:00,AMLO; apoya a Samuel García ante conflicto con...,https://www.24-horas.mx/2023/03/06/amlo-apoya-...,negocios,AMLO; apoya a Samuel García ante conflicto con...,AMLO; apoya a Samuel García ante conflicto con...,"[AMLO, ;, apoya, a, Samuel, García, ante, conf...",7295


In [5]:
# Calculate mean of tokens from column
tokens_mean = df['token_count'].mean()
tokens_mean

1013.3

In [19]:
# Create a list o column values
texts = df['text_cleanned'].to_list()

### Create embeddings

In [20]:
# Create embeddings
embeddings = model.encode(texts)

In [26]:
# Convert embeddings to dataframe
df_emb = pd.DataFrame(embeddings)

# Save dataframe as csv file
df_emb.to_csv(f'{data_processed}/embeddings-t5.csv', index=False)

### Semantic search

In [27]:
# Load dataset from dataframe
dataset = Dataset.from_pandas(df_emb)

# Create a tensor
dataset_embeddings = torch.from_numpy(dataset.to_pandas().to_numpy()).to(torch.float)

In [28]:
interests = ["Deportes, tecnología"]
output = model.encode(interests)

query_embeddings = torch.FloatTensor(output)

In [29]:
hits = semantic_search(query_embeddings, dataset_embeddings, top_k=3)
hits

[[{'corpus_id': 187, 'score': 0.7427448630332947},
  {'corpus_id': 141, 'score': 0.7378556132316589},
  {'corpus_id': 89, 'score': 0.7344669699668884}]]

In [30]:
texts[hits[0][0]['corpus_id']]

'Danza y teatro en la CDMX: ¿Qué ver del 3 al 5 de marzo?  Plan B de la Electoral Población Económicamente Activa Situación en Nicaragua Guerra Rusia - Ucrania Temas del día Home Opinión Secciones México Ciudad Estados Mundo Negocios Deportes Entretenimiento Cultura Salud Tecnología Autos Estilos Virales Videos Infografías El Cultural Suplementos Versión impresa Home Opinión Secciones México Ciudad Estados Negocios Mundo Deportes Entretenimiento Cultura Salud Tecnología Autos Estilos Virales Infografías El Cultural Suplementos Versión impresa Actividades culturales Danza y teatro en la CDMX: ¿Qué ver del 3 al 5 de marzo? Te decimos cuáles son las actividades más destacadas en la CDMX este fin de semana Danza y teatro en la CDMX, ¿Qué ver del 3 al 5 de marzo? Foto: Especial Por: Daniel Lugo - 03/03/2023 15:27 Instagram: @lugo_photographer Ciudad de México.- El mes de marzo llegó con una serie de acontecimientos artísticos y contemporáneos que no querrás perderte y de seguro desearás dis