## BERT Embedding

In [1]:
import warnings

# Install a blanket "ignore" filter: every warning raised after this point is
# suppressed (presumably to keep cell output readable).
warnings.filterwarnings("ignore")

In [2]:
import random
import torch
from transformers import BertTokenizer, BertModel
import pandas as pd

In [3]:
# Reproducibility: pin every random number generator this notebook touches.
seed = 777

random.seed(seed)        # Python's built-in generator
torch.manual_seed(seed)  # PyTorch's default generator

if torch.cuda.is_available():
    # CUDA generators are only seeded when a GPU runtime is actually present.
    torch.cuda.manual_seed_all(seed)

In [4]:
# Report whether a CUDA device is usable, then the CUDA version string
# exposed by this PyTorch build (one item per output line).
print(
    torch.cuda.is_available(),
    f"CUDA version: {torch.version.cuda}",
    sep="\n",
)

False
CUDA version: 12.1


In [6]:
# Storing ID of current CUDA device
# cuda_id = torch.cuda.current_device()
# print(torch.cuda.current_device())
	
# print(torch.cuda.get_device_name(cuda_id))


### Dataset

In [7]:
# Load the preprocessed requests table from the project's data directory.
df = pd.read_csv('../data/enc_tok_nopunct_lemm.csv')

# Preview the first rows to confirm the expected columns are present.
df.head()

Unnamed: 0,req_text,age,age_encoded,word_tokens,lemma
0,quantos empregados em cada um dos atuais níve...,a2,1,"['quantos', 'empregados', 'em', 'cada', 'um', ...",quanto empregado em cada um de o atual nível s...
1,solicito cópia das atas do conselho de admini...,a2,1,"['solicito', 'cópia', 'das', 'atas', 'do', 'co...",solicito cópia de o ata de o conselho de admin...
2,solicito informar a norma lei decreto portari...,a2,1,"['solicito', 'informar', 'a', 'norma', 'lei', ...",solicito informar o norma lei decreto portaria...
3,solicito por gentileza a informação sobre a q...,a2,1,"['solicito', 'por', 'gentileza', 'a', 'informa...",solicito por gentileza o informação sobre o qu...
4,solicito por gentileza a informação sobre a q...,a2,1,"['solicito', 'por', 'gentileza', 'a', 'informa...",solicito por gentileza o informação sobre o qu...


### Model

In [10]:
# Checkpoint identifier shared by the tokenizer and the encoder, so both are
# loaded from the same pretrained Portuguese BERT release.
model_name = "neuralmind/bert-base-portuguese-cased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

In [11]:
#device = 'cuda' if torch.cuda.is_available() else 'cpu'
#model = model.to(device)

In [12]:
def get_bert_embeddings(text):
    """Return a sum-pooled BERT embedding for ``text``.

    The text is tokenised with the notebook-level ``tokenizer`` (special tokens
    added, truncated to at most 512 tokens), passed through the notebook-level
    ``model`` without gradient tracking, and the final-layer hidden states are
    summed over the token axis.

    Parameters
    ----------
    text : str or list of str
        Text to embed. The notebook passes one string at a time.

    Returns
    -------
    list
        For a single text: the pooled vector as a list of NumPy scalars, one
        per hidden dimension. For several texts: one NumPy row per text.
    """
    # Follow the model's own device instead of a separate global: the cell that
    # used to define `device` is commented out, so referencing it raised
    # NameError on a fresh kernel.
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512,
        add_special_tokens=True,
    ).to(model.device)

    with torch.no_grad():
        outputs = model(**inputs)

    hidden = outputs.last_hidden_state
    # Zero out padded positions before pooling. A single string is never
    # padded, so its mask is all ones and the sum is unchanged; for a batch of
    # texts this keeps padding vectors out of the shorter texts' embeddings.
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    pooled = (hidden * mask).sum(dim=1)

    return list(pooled.squeeze().cpu().numpy())

In [13]:
# Embed every lemmatised request, one text at a time, preserving row order.
embeddings = [get_bert_embeddings(lemma_text) for lemma_text in df['lemma']]

In [14]:
# One column label per embedding dimension: emb_0, emb_1, ...
labels = [f"emb_{dim}" for dim in range(len(embeddings[0]))]

In [15]:
# Tabulate the vectors: one row per embedded text, one labelled column per dimension.
df_embeddings = pd.DataFrame(embeddings, columns=labels)

### Embedding scaling and export

In [16]:
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler 
import numpy as np 

In [17]:
# Min-max scale each embedding dimension independently (scaler defaults).
x = df_embeddings.to_numpy()
min_max_scaler = MinMaxScaler()
x_scaled = min_max_scaler.fit(x).transform(x)
# Rebuilding the frame from the bare array replaces the column labels with
# plain integer positions (0, 1, 2, ...).
df_embeddings = pd.DataFrame(data=x_scaled)

In [18]:
# Preview the scaled embeddings.
df_embeddings.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,0.564466,0.815525,0.107565,0.319572,0.101842,0.1051,0.209152,0.239094,0.322012,0.626018,...,0.275671,0.21938,0.934174,0.90927,0.090046,0.89756,0.590714,0.291341,0.416531,0.900556
1,0.52097,0.815053,0.120955,0.300525,0.079995,0.149684,0.218145,0.244269,0.318568,0.669064,...,0.282352,0.235483,0.916027,0.947861,0.102334,0.876205,0.603262,0.254797,0.394609,0.841805
2,0.535018,0.845178,0.10662,0.330766,0.061033,0.094648,0.204396,0.249104,0.32117,0.648407,...,0.288288,0.232071,0.927528,0.953746,0.102037,0.927873,0.533769,0.266102,0.429789,0.89574
3,0.548547,0.784378,0.057764,0.275971,0.123137,0.096042,0.181221,0.243667,0.323316,0.618222,...,0.306542,0.244469,0.954206,0.941051,0.075034,0.899374,0.585249,0.279545,0.380809,0.939743
4,0.559062,0.799921,0.060454,0.274691,0.135161,0.079634,0.183316,0.250197,0.31749,0.6165,...,0.304113,0.247853,0.955085,0.938762,0.081348,0.894296,0.595016,0.278718,0.383074,0.934677


In [19]:
# (rows, columns) of the scaled embedding table.
df_embeddings.shape

(8200, 768)

In [20]:
# Place the scaled embedding columns beside the original table's columns.
# With axis=1 pandas matches rows on index label, not on position.
final = pd.concat([df, df_embeddings], axis=1)

In [21]:
# Drop the raw request text (the first column of the loaded table) before
# export. Dropping by name instead of `iloc[:, 1:]` keeps this cell idempotent:
# re-running it is a no-op rather than stripping one more column each time.
final = final.drop(columns=["req_text"], errors="ignore")

In [22]:
# Inspect the first 3000 rows (the rendered output elides the middle rows).
final.iloc[0:3000]

Unnamed: 0,age,age_encoded,word_tokens,lemma,0,1,2,3,4,5,...,758,759,760,761,762,763,764,765,766,767
0,a2,1,"['quantos', 'empregados', 'em', 'cada', 'um', ...",quanto empregado em cada um de o atual nível s...,0.564466,0.815525,0.107565,0.319572,0.101842,0.105100,...,0.275671,0.219380,0.934174,0.909270,0.090046,0.897560,0.590714,0.291341,0.416531,0.900556
1,a2,1,"['solicito', 'cópia', 'das', 'atas', 'do', 'co...",solicito cópia de o ata de o conselho de admin...,0.520970,0.815053,0.120955,0.300525,0.079995,0.149684,...,0.282352,0.235483,0.916027,0.947861,0.102334,0.876205,0.603262,0.254797,0.394609,0.841805
2,a2,1,"['solicito', 'informar', 'a', 'norma', 'lei', ...",solicito informar o norma lei decreto portaria...,0.535018,0.845178,0.106620,0.330766,0.061033,0.094648,...,0.288288,0.232071,0.927528,0.953746,0.102037,0.927873,0.533769,0.266102,0.429789,0.895740
3,a2,1,"['solicito', 'por', 'gentileza', 'a', 'informa...",solicito por gentileza o informação sobre o qu...,0.548547,0.784378,0.057764,0.275971,0.123137,0.096042,...,0.306542,0.244469,0.954206,0.941051,0.075034,0.899374,0.585249,0.279545,0.380809,0.939743
4,a2,1,"['solicito', 'por', 'gentileza', 'a', 'informa...",solicito por gentileza o informação sobre o qu...,0.559062,0.799921,0.060454,0.274691,0.135161,0.079634,...,0.304113,0.247853,0.955085,0.938762,0.081348,0.894296,0.595016,0.278718,0.383074,0.934677
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,a3,2,"['boa', 'tarde', 'solicito', 'informação', 'a'...",bom tarde solicito informação o respeito de o ...,0.542148,0.822603,0.077989,0.320832,0.112735,0.119918,...,0.303343,0.245750,0.940113,0.952605,0.084993,0.914505,0.611352,0.264548,0.330339,0.946582
2996,a3,2,"['boa', 'tarde', 'solicito', 'informação', 'de...",bom tarde solicito informação detalhar sobre q...,0.563591,0.813597,0.108228,0.270326,0.133398,0.099736,...,0.297511,0.239185,0.936234,0.941488,0.080255,0.880642,0.667738,0.240480,0.409088,0.905547
2997,a2,1,"['boa', 'tarde', 'solicito', 'informação', 'qu...",bom tarde solicito informação quanto a o númer...,0.629978,0.801672,0.105231,0.332028,0.118094,0.114269,...,0.310733,0.266573,0.917570,0.917114,0.103660,0.881889,0.567379,0.244934,0.423801,0.902122
2998,a3,2,"['boa', 'tarde', 'solicito', 'informação', 'so...",bom tarde solicito informação sobre como proce...,0.531621,0.624915,0.209710,0.437029,0.235883,0.346450,...,0.345820,0.203675,0.877608,0.806585,0.230891,0.827337,0.613719,0.341670,0.361410,0.825245


In [23]:
# Preview the combined table after the text column was removed.
final.head()

Unnamed: 0,age,age_encoded,word_tokens,lemma,0,1,2,3,4,5,...,758,759,760,761,762,763,764,765,766,767
0,a2,1,"['quantos', 'empregados', 'em', 'cada', 'um', ...",quanto empregado em cada um de o atual nível s...,0.564466,0.815525,0.107565,0.319572,0.101842,0.1051,...,0.275671,0.21938,0.934174,0.90927,0.090046,0.89756,0.590714,0.291341,0.416531,0.900556
1,a2,1,"['solicito', 'cópia', 'das', 'atas', 'do', 'co...",solicito cópia de o ata de o conselho de admin...,0.52097,0.815053,0.120955,0.300525,0.079995,0.149684,...,0.282352,0.235483,0.916027,0.947861,0.102334,0.876205,0.603262,0.254797,0.394609,0.841805
2,a2,1,"['solicito', 'informar', 'a', 'norma', 'lei', ...",solicito informar o norma lei decreto portaria...,0.535018,0.845178,0.10662,0.330766,0.061033,0.094648,...,0.288288,0.232071,0.927528,0.953746,0.102037,0.927873,0.533769,0.266102,0.429789,0.89574
3,a2,1,"['solicito', 'por', 'gentileza', 'a', 'informa...",solicito por gentileza o informação sobre o qu...,0.548547,0.784378,0.057764,0.275971,0.123137,0.096042,...,0.306542,0.244469,0.954206,0.941051,0.075034,0.899374,0.585249,0.279545,0.380809,0.939743
4,a2,1,"['solicito', 'por', 'gentileza', 'a', 'informa...",solicito por gentileza o informação sobre o qu...,0.559062,0.799921,0.060454,0.274691,0.135161,0.079634,...,0.304113,0.247853,0.955085,0.938762,0.081348,0.894296,0.595016,0.278718,0.383074,0.934677


In [24]:
# Total number of missing values across every column of the table.
final.isnull().sum().sum()

0

In [25]:
# f1 = final.iloc[0:3000]
# f2 = final.iloc[3000:]
# f1.to_csv('../embeddings/BERT_Embeddings_1.csv', index=False)
# f2.to_csv('../embeddings/BERT_Embeddings_2.csv', index=False)

In [26]:
# Persist the full table as CSV; index=False leaves the row index out of the file.
final.to_csv('../embeddings/BERT_Embeddings_full.csv', index=False)

In [27]:
# Last look at the table that was just written to disk.
final.head()

Unnamed: 0,age,age_encoded,word_tokens,lemma,0,1,2,3,4,5,...,758,759,760,761,762,763,764,765,766,767
0,a2,1,"['quantos', 'empregados', 'em', 'cada', 'um', ...",quanto empregado em cada um de o atual nível s...,0.564466,0.815525,0.107565,0.319572,0.101842,0.1051,...,0.275671,0.21938,0.934174,0.90927,0.090046,0.89756,0.590714,0.291341,0.416531,0.900556
1,a2,1,"['solicito', 'cópia', 'das', 'atas', 'do', 'co...",solicito cópia de o ata de o conselho de admin...,0.52097,0.815053,0.120955,0.300525,0.079995,0.149684,...,0.282352,0.235483,0.916027,0.947861,0.102334,0.876205,0.603262,0.254797,0.394609,0.841805
2,a2,1,"['solicito', 'informar', 'a', 'norma', 'lei', ...",solicito informar o norma lei decreto portaria...,0.535018,0.845178,0.10662,0.330766,0.061033,0.094648,...,0.288288,0.232071,0.927528,0.953746,0.102037,0.927873,0.533769,0.266102,0.429789,0.89574
3,a2,1,"['solicito', 'por', 'gentileza', 'a', 'informa...",solicito por gentileza o informação sobre o qu...,0.548547,0.784378,0.057764,0.275971,0.123137,0.096042,...,0.306542,0.244469,0.954206,0.941051,0.075034,0.899374,0.585249,0.279545,0.380809,0.939743
4,a2,1,"['solicito', 'por', 'gentileza', 'a', 'informa...",solicito por gentileza o informação sobre o qu...,0.559062,0.799921,0.060454,0.274691,0.135161,0.079634,...,0.304113,0.247853,0.955085,0.938762,0.081348,0.894296,0.595016,0.278718,0.383074,0.934677
