In [1]:
!pip install -r requirements.txt



In [2]:
import random
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
seed = 777
random.seed(seed)

torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

In [4]:
import torch
 
print(torch.cuda.is_available())

True


In [5]:
import torch

print(torch.cuda.is_available())
print(f"CUDA version: {torch.version.cuda}")

# Storing ID of current CUDA device
cuda_id = torch.cuda.current_device()
print(torch.cuda.current_device())
	
print(torch.cuda.get_device_name(cuda_id))


True
CUDA version: 11.8
0
NVIDIA GeForce RTX 3050 Laptop GPU


In [6]:
model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [7]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)

In [8]:
df = pd.read_csv('datasets/DfCleaned.csv')

df.head()

Unnamed: 0.1,Unnamed: 0,resp_text,clarity,Class,tokens,lemma,ngram2,ngram3
0,0,Prezado Sr Jose Taunai Em atenção ao seu pe...,c5,2,"['prezado', 'sr', 'jose', 'taunai', 'em', 'ate...",prezar sr jose taunai em atenção a o seu pedir...,"[('prezar', 'sr'), ('sr', 'jose'), ('jose', 't...","[('prezar', 'sr', 'jose'), ('sr', 'jose', 'tau..."
1,1,"""A pedido do Pró-Reitor de Graduação, informa...",c5,2,"['a', 'pedido', 'do', 'próreitor', 'de', 'grad...",o pedir de o próreitor de graduação informamo ...,"[('o', 'pedir'), ('pedir', 'de'), ('de', 'o'),...","[('o', 'pedir', 'de'), ('pedir', 'de', 'o'), (..."
2,2,"""Prezado (a) Sr. (a), Agradecemos o contato e...",c234,1,"['prezado', 'a', 'sr', 'a', 'agradecemos', 'o'...",prezar o sr o agradecer o contato e informamo ...,"[('prezar', 'o'), ('o', 'sr'), ('sr', 'o'), ('...","[('prezar', 'o', 'sr'), ('o', 'sr', 'o'), ('sr..."
3,3,"""Prezado (a) Sr. (a), Agradecemos o contato e...",c234,1,"['prezado', 'a', 'sr', 'a', 'agradecemos', 'o'...",prezar o sr o agradecer o contato e informamo ...,"[('prezar', 'o'), ('o', 'sr'), ('sr', 'o'), ('...","[('prezar', 'o', 'sr'), ('o', 'sr', 'o'), ('sr..."
4,4,"""Prezado Prof. Gilberto Tadeu Reis da Silva ...",c234,1,"['prezado', 'prof', 'gilberto', 'tadeu', 'reis...",prezar prof gilberto tader rei de o Silva em a...,"[('prezar', 'prof'), ('prof', 'gilberto'), ('g...","[('prezar', 'prof', 'gilberto'), ('prof', 'gil..."


In [9]:
encoding = tokenizer.batch_encode_plus(
    df['tokens'].head(100),                    # List of input texts
    padding=True,              # Pad to the maximum sequence length
    truncation=True,           # Truncate to the maximum sequence length if necessary
    return_tensors='pt',      # Return PyTorch tensors
    add_special_tokens=True    # Add special tokens CLS and SEP
).to(device)

input_ids = encoding['input_ids'] 
print(f"Input ID: {input_ids}")
attention_mask = encoding['attention_mask']  # Attention mask
# print attention mask
print(f"Attention mask: {attention_mask}")

Input ID: tensor([[  101,  1031,  1005,  ...,     0,     0,     0],
        [  101,  1031,  1005,  ...,  1005, 23384,   102],
        [  101,  1031,  1005,  ...,     0,     0,     0],
        ...,
        [  101,  1031,  1005,  ...,  2229,  1005,   102],
        [  101,  1031,  1005,  ...,     0,     0,     0],
        [  101,  1031,  1005,  ...,     0,     0,     0]], device='cuda:0')
Attention mask: tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')


In [10]:
# Generate embeddings using BERT model
with torch.no_grad():
	outputs = model(input_ids, attention_mask=attention_mask)
	word_embeddings = outputs.last_hidden_state # This contains the embeddings

# Output the shape of word embeddings
print(f"Shape of Word Embeddings: {word_embeddings.shape}")


Shape of Word Embeddings: torch.Size([100, 512, 768])
