## BERT Embedding

In [1]:
import warnings

# Ignore every warning category globally so library notices do not clutter the
# cell outputs. NOTE(review): this also hides deprecation warnings.
warnings.filterwarnings("ignore")

In [2]:
import random
import torch
from transformers import BertTokenizer, BertModel
import pandas as pd

In [3]:
# Fix the seed of every random number generator used here so reruns repeat.
seed = 777

# Python's stdlib generator first, then torch's default generator.
for seed_fn in (random.seed, torch.manual_seed):
    seed_fn(seed)

# Seed the CUDA generators too, but only when a CUDA device is usable.
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

In [4]:
# Report whether CUDA is usable and which CUDA version this torch build targets.
print(torch.cuda.is_available(), f"CUDA version: {torch.version.cuda}", sep="\n")

False
CUDA version: 12.1


In [None]:
# Store and report the ID and name of the current CUDA device.
# torch.cuda.current_device() raises when no CUDA device is usable, so the
# query is guarded; on a CPU-only machine `cuda_id` is None and the notebook
# keeps running instead of stopping at this cell.
if torch.cuda.is_available():
    cuda_id = torch.cuda.current_device()
    print(cuda_id)
    print(torch.cuda.get_device_name(cuda_id))
else:
    cuda_id = None
    print("CUDA is not available; running on CPU")

### Dataset

In [6]:
# Load the preprocessed requests table; its `lemma` column is the text that
# gets embedded further down.
csv_path = '../data/enc_tok_nopunct_lemm.csv'
df = pd.read_csv(csv_path)

df.head()

Unnamed: 0,req_text,age,age_encoded,word_tokens,lemma
0,quantos empregados em cada um dos atuais níve...,a2,1,"['quantos', 'empregados', 'em', 'cada', 'um', ...",quanto empregado em cada um de o atual nível s...
1,solicito cópia das atas do conselho de admini...,a2,1,"['solicito', 'cópia', 'das', 'atas', 'do', 'co...",solicito cópia de o ata de o conselho de admin...
2,solicito informar a norma lei decreto portari...,a2,1,"['solicito', 'informar', 'a', 'norma', 'lei', ...",solicito informar o norma lei decreto portaria...
3,solicito por gentileza a informação sobre a q...,a2,1,"['solicito', 'por', 'gentileza', 'a', 'informa...",solicito por gentileza o informação sobre o qu...
4,solicito por gentileza a informação sobre a q...,a2,1,"['solicito', 'por', 'gentileza', 'a', 'informa...",solicito por gentileza o informação sobre o qu...


### Model

In [7]:
# Checkpoint identifier handed to `from_pretrained`. The tokenizer and the
# encoder are loaded from the same checkpoint so the vocabulary matches the
# weights. (The captured output shows the files being downloaded on this run.)
model_name = "neuralmind/bert-base-portuguese-cased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

Downloading vocab.txt:   0%|          | 0.00/210k [00:00<?, ?B/s]

Downloading added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/43.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/647 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [8]:
# Use the GPU when one is usable, otherwise stay on the CPU, then move the
# model onto that device.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
model = model.to(device)

In [9]:
def get_bert_embeddings(text):
    """Embed ``text`` with the notebook-level BERT ``model``.

    The text is tokenized with special tokens added and truncated to at most
    512 tokens, run through the model without gradient tracking, and the last
    hidden state is summed over dimension 1 (presumably the token axis --
    confirm against the transformers docs) to give one vector per input.

    Relies on the notebook globals ``tokenizer``, ``model`` and ``device``.

    Args:
        text: The string to embed.

    Returns:
        A list holding the components of the pooled embedding vector.
    """
    encoded = tokenizer(
        text,
        add_special_tokens=True,
        max_length=512,
        truncation=True,
        padding=True,
        return_tensors="pt",
    )
    encoded = encoded.to(device)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        model_output = model(**encoded)

    # Sum-pool the hidden states, drop size-1 dimensions, and move to numpy.
    pooled = model_output.last_hidden_state.sum(dim=1).squeeze()
    return list(pooled.cpu().numpy())

In [11]:
# One embedding per row, computed from the lemmatized text column.
embeddings = [get_bert_embeddings(text) for text in df['lemma']]

In [None]:
# One column label per embedding dimension: emb_0, emb_1, ...
labels = [f'emb_{i}' for i in range(len(embeddings[0]))]

In [None]:
# Tabulate the embeddings: one row per text, one `emb_i` column per dimension.
df_embeddings = pd.DataFrame(embeddings, columns=labels)

### Embedding scaling and export

In [None]:
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler 
import numpy as np 

In [None]:
# Rescale each embedding column with MinMaxScaler (default settings).
# The scaled frame is rebuilt with the original column labels and row index:
# constructing it from the bare array would replace the `emb_i` labels with
# integer positions and make the columns unidentifiable after the later concat.
x = df_embeddings.values  # raw embeddings as a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
df_embeddings = pd.DataFrame(
    x_scaled,
    columns=df_embeddings.columns,
    index=df_embeddings.index,
)

In [None]:
# Preview the first rows of the scaled embeddings.
df_embeddings.head()

In [None]:
# (rows, columns) of the embedding table.
df_embeddings.shape

In [None]:
# Put the embedding columns side by side with the source table; rows are
# matched on their index labels.
final = pd.concat((df, df_embeddings), axis="columns")

In [None]:
# Drop the leftover CSV index column (`Unnamed: 0`, the first column of `df`).
# Dropping by label instead of by position keeps this cell idempotent:
# re-running it no longer strips the next real column (`req_text`, `age`, ...).
final = final.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
# Display the first 3000 rows, the same slice that is later saved as the first CSV.
final.iloc[0:3000]

In [None]:
# Preview the first rows of the combined table.
final.head()

In [None]:
# Total count of missing cells across the whole table (per-column counts, then summed).
final.isnull().sum().sum()

In [None]:
# Split the table into two parts at a fixed row position.
split_at = 3000
f1, f2 = final.iloc[:split_at], final.iloc[split_at:]

In [None]:
# Write each part to its own CSV without the index column.
# NOTE(review): the directory is spelled `emebeddings` -- confirm it matches
# the folder name on disk before renaming anything.
for part, out_path in (
    (f1, '../emebeddings/BERT_Embeddings_1.csv'),
    (f2, '../emebeddings/BERT_Embeddings_2.csv'),
):
    part.to_csv(out_path, index=False)

In [None]:
# Final look at the first rows of the table that was exported.
final.head()