In [None]:
# Uncomment to install the project dependencies into the active kernel's environment:
# %pip install -r ../requirements.txt

In [44]:
import warnings

warnings.filterwarnings("ignore")

In [45]:
import random
import torch
from transformers import BertTokenizer, BertModel
import pandas as pd

In [46]:
# Single seed shared by every random number generator seeded below.
seed = 777

# Seed Python's built-in RNG and PyTorch's default generator.
random.seed(seed)
torch.manual_seed(seed)

# Seed all CUDA generators; PyTorch documents this call as silently ignored
# when CUDA is not available, so no availability guard is required.
torch.cuda.manual_seed_all(seed)

In [47]:
import torch

# Print whether PyTorch can see a usable CUDA device (True/False).
print(torch.cuda.is_available())

True


In [48]:
import torch

# Report the CUDA environment PyTorch is running with.
cuda_available = torch.cuda.is_available()
print(cuda_available)
# CUDA toolkit version PyTorch was built against (None for CPU-only builds).
print(f"CUDA version: {torch.version.cuda}")

if cuda_available:
    # Storing ID of current CUDA device
    cuda_id = torch.cuda.current_device()
    print(cuda_id)
    print(torch.cuda.get_device_name(cuda_id))
else:
    # The device queries above raise when no CUDA device is usable, so skip
    # them here and let the rest of the notebook fall back to the CPU.
    cuda_id = None
    print("No CUDA device available; running on CPU.")


True
CUDA version: 11.8
0
NVIDIA GeForce RTX 3050 Laptop GPU


In [49]:
# Identifier of the pretrained checkpoint to load (by its name, a cased
# Portuguese BERT-base model).
model_name = "neuralmind/bert-base-portuguese-cased"
# Load the tokenizer and the encoder weights that belong to that checkpoint.
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

In [50]:
# Prefer the GPU when PyTorch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Move the model onto the selected device.
model = model.to(device)

In [51]:
# Load the cleaned dataset; the path is relative to the notebook's working
# directory.
df = pd.read_csv('../data/esic2023_cleaned.csv')

# Preview the first rows to check the columns that were read.
df.head()

Unnamed: 0.1,Unnamed: 0,resp_text,clarity,Class,tokens,lemma,ngram2,ngram3
0,0,"Prezada Senhora, Informa-se que o DNIT é uma ...",c1,0,"['prezada', 'senhora', 'informase', 'que', 'o'...",prezar senhor informase que o dnit ser um auta...,"[('prezar', 'senhor'), ('senhor', 'informase')...","[('prezar', 'senhor', 'informase'), ('senhor',..."
1,1,"Prezada, Os dados se referem somente aos na...",c1,0,"['prezada', 'os', 'dados', 'se', 'referem', 's...",prezar o dar se referir somente a o nascido vi...,"[('prezar', 'o'), ('o', 'dar'), ('dar', 'se'),...","[('prezar', 'o', 'dar'), ('o', 'dar', 'se'), (..."
2,2,"Prezado Usuário, Inicialmente agradecemos o s...",c1,0,"['prezado', 'usuário', 'inicialmente', 'agrade...",prezar usuário inicialmente agradecer o seu co...,"[('prezar', 'usuário'), ('usuário', 'inicialme...","[('prezar', 'usuário', 'inicialmente'), ('usuá..."
3,3,"Prezada, Referimo-nos a sua demanda, regist...",c1,0,"['prezada', 'referimonos', 'a', 'sua', 'demand...",prezar referimono o seu demanda registrar em o...,"[('prezar', 'referimono'), ('referimono', 'o')...","[('prezar', 'referimono', 'o'), ('referimono',..."
4,4,"Prezada, Segue anexa a resposta da área com...",c1,0,"['prezada', 'segue', 'anexa', 'a', 'resposta',...",prezar seguir anexo o resposta de o área compe...,"[('prezar', 'seguir'), ('seguir', 'anexo'), ('...","[('prezar', 'seguir', 'anexo'), ('seguir', 'an..."


In [52]:
def get_bert_embeddings(text):
    """Return a sum-pooled BERT embedding for ``text``.

    The input is tokenized with the module-level ``tokenizer`` (truncation
    enabled, ``max_length=512``, special tokens added), run through the
    module-level ``model`` without gradient tracking, and the final-layer
    hidden states are summed over the token axis.

    Hidden states at padding positions are zeroed with the attention mask
    before summing, so padding added when several texts are batched together
    does not contribute to any text's vector. For a single text the attention
    mask is all ones and the result is the plain sum over its tokens.

    Args:
        text: A single string, or a list of strings to embed as one batch.

    Returns:
        For a single text, a list with one NumPy scalar per hidden dimension.
        For a batch of several texts, a list with one NumPy vector per text.
    """
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512,
        add_special_tokens=True,
    ).to(device)

    with torch.no_grad():
        outputs = model(**inputs)

    hidden = outputs.last_hidden_state
    # Broadcast the (batch, tokens) mask over the hidden axis so padded
    # positions are multiplied by zero and real tokens by one.
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    pooled = (hidden * mask).sum(dim=1)

    # Drop only the batch axis (a no-op when the batch holds several texts).
    embeddings = pooled.squeeze(0).cpu().numpy()
    return list(embeddings)

In [53]:
#df['bert_embeddings'] = df['resp_text'].apply(get_bert_embeddings)
#df.head()

In [54]:
# Embed each response text one at a time, in DataFrame row order.
embeddings = [get_bert_embeddings(text) for text in df['resp_text']]

In [55]:
# One column label per embedding dimension: emb_0, emb_1, ...
labels = [f'emb_{i}' for i in range(len(embeddings[0]))]

In [56]:
# One row per text and one column per embedding dimension, using the emb_*
# labels as column names.
df_embeddings = pd.DataFrame(embeddings, columns=labels)

In [57]:
from sklearn.preprocessing import MinMaxScaler 
import numpy as np 

In [58]:
from sklearn import preprocessing

# Min-max scale the embedding columns with scikit-learn's default settings.
# NOTE(review): the scaler is fit on every row; if these features later feed
# a train/test split, confirm that fitting on the full dataset is intended.
x = df_embeddings.values #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
# The frame is rebuilt from the bare array without column names, so the
# emb_* labels are replaced by integer column labels starting at 0.
df_embeddings = pd.DataFrame(x_scaled)

In [59]:
# Preview the scaled embeddings.
df_embeddings.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,0.505127,0.766378,0.1053,0.285969,0.092293,0.249332,0.465746,0.324952,0.691307,0.839414,...,0.275737,0.716399,0.905849,0.88919,0.045048,0.879967,0.528071,0.09311,0.53186,0.877069
1,0.469561,0.688681,0.113576,0.264515,0.116232,0.267609,0.468771,0.290847,0.706276,0.769807,...,0.262989,0.694362,0.917148,0.832956,0.092697,0.904497,0.501777,0.160684,0.560638,0.876449
2,0.532808,0.769794,0.053227,0.283687,0.03809,0.184949,0.500581,0.308323,0.679207,0.865095,...,0.264708,0.72642,0.949025,0.940053,0.048876,0.917774,0.479433,0.100289,0.538197,0.934336
3,0.351271,0.637699,0.287832,0.25746,0.198257,0.396646,0.468053,0.28673,0.582105,0.683174,...,0.274973,0.694385,0.649727,0.706238,0.181229,0.676903,0.572161,0.224631,0.527368,0.72708
4,0.407822,0.682736,0.366654,0.400153,0.280517,0.355127,0.448908,0.301572,0.707264,0.858857,...,0.461279,0.629783,0.594298,0.649282,0.243664,0.698317,0.506059,0.309669,0.537712,0.584884


In [60]:
# (number of texts, number of embedding dimensions)
df_embeddings.shape

(6000, 768)

In [61]:
# Put the embedding columns next to the original columns. A column-wise
# concat aligns rows on the index, so this relies on both frames carrying the
# same default 0..N-1 index.
final = pd.concat([df, df_embeddings], axis=1)

In [62]:
# Drop the stale index column that was read from the CSV ('Unnamed: 0').
# Dropping by name instead of by position keeps this cell idempotent:
# re-running it no longer strips 'resp_text' and subsequent columns.
final = final.drop(columns=['Unnamed: 0'], errors='ignore')

In [63]:
# Show the first 3000 rows (the rendered table is truncated by pandas).
final.iloc[0:3000]

Unnamed: 0,resp_text,clarity,Class,tokens,lemma,ngram2,ngram3,0,1,2,...,758,759,760,761,762,763,764,765,766,767
0,"Prezada Senhora, Informa-se que o DNIT é uma ...",c1,0,"['prezada', 'senhora', 'informase', 'que', 'o'...",prezar senhor informase que o dnit ser um auta...,"[('prezar', 'senhor'), ('senhor', 'informase')...","[('prezar', 'senhor', 'informase'), ('senhor',...",0.505127,0.766378,0.105300,...,0.275737,0.716399,0.905849,0.889190,0.045048,0.879967,0.528071,0.093110,0.531860,0.877069
1,"Prezada, Os dados se referem somente aos na...",c1,0,"['prezada', 'os', 'dados', 'se', 'referem', 's...",prezar o dar se referir somente a o nascido vi...,"[('prezar', 'o'), ('o', 'dar'), ('dar', 'se'),...","[('prezar', 'o', 'dar'), ('o', 'dar', 'se'), (...",0.469561,0.688681,0.113576,...,0.262989,0.694362,0.917148,0.832956,0.092697,0.904497,0.501777,0.160684,0.560638,0.876449
2,"Prezado Usuário, Inicialmente agradecemos o s...",c1,0,"['prezado', 'usuário', 'inicialmente', 'agrade...",prezar usuário inicialmente agradecer o seu co...,"[('prezar', 'usuário'), ('usuário', 'inicialme...","[('prezar', 'usuário', 'inicialmente'), ('usuá...",0.532808,0.769794,0.053227,...,0.264708,0.726420,0.949025,0.940053,0.048876,0.917774,0.479433,0.100289,0.538197,0.934336
3,"Prezada, Referimo-nos a sua demanda, regist...",c1,0,"['prezada', 'referimonos', 'a', 'sua', 'demand...",prezar referimono o seu demanda registrar em o...,"[('prezar', 'referimono'), ('referimono', 'o')...","[('prezar', 'referimono', 'o'), ('referimono',...",0.351271,0.637699,0.287832,...,0.274973,0.694385,0.649727,0.706238,0.181229,0.676903,0.572161,0.224631,0.527368,0.727080
4,"Prezada, Segue anexa a resposta da área com...",c1,0,"['prezada', 'segue', 'anexa', 'a', 'resposta',...",prezar seguir anexo o resposta de o área compe...,"[('prezar', 'seguir'), ('seguir', 'anexo'), ('...","[('prezar', 'seguir', 'anexo'), ('seguir', 'an...",0.407822,0.682736,0.366654,...,0.461279,0.629783,0.594298,0.649282,0.243664,0.698317,0.506059,0.309669,0.537712,0.584884
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,"Prezada, O organizador é o Centro de Seleção ...",c234,1,"['prezada', 'o', 'organizador', 'é', 'o', 'cen...",prezar o organizador ser o centro de seleção d...,"[('prezar', 'o'), ('o', 'organizador'), ('orga...","[('prezar', 'o', 'organizador'), ('o', 'organi...",0.530569,0.774251,0.071944,...,0.266448,0.734691,0.925980,0.882472,0.092065,0.895249,0.543895,0.089107,0.523779,0.891611
2996,"Prezada, os dados estão disponíveis na Plataf...",c234,1,"['prezada', 'os', 'dados', 'estão', 'disponíve...",prezar o dar estar disponívil em o plataforma ...,"[('prezar', 'o'), ('o', 'dar'), ('dar', 'estar...","[('prezar', 'o', 'dar'), ('o', 'dar', 'estar')...",0.505773,0.757485,0.078151,...,0.271834,0.697474,0.941125,0.917829,0.089754,0.931985,0.514931,0.142427,0.555219,0.890816
2997,"Prezada, pedimos desculpas pela demora... A ...",c234,1,"['prezada', 'pedimos', 'desculpas', 'pela', 'd...",prezar pedimos desculpa por o demor o nomencla...,"[('prezar', 'pedimos'), ('pedimos', 'desculpa'...","[('prezar', 'pedimos', 'desculpa'), ('pedimos'...",0.498600,0.829107,0.183060,...,0.284812,0.732802,0.866494,0.838716,0.168953,0.825449,0.566021,0.164663,0.512016,0.919914
2998,O Serviço de Informações ao Cidadão (SIC) da ...,c234,1,"['o', 'serviço', 'de', 'informações', 'ao', 'c...",o serviço de informação a o cidadão sic de o a...,"[('o', 'serviço'), ('serviço', 'de'), ('de', '...","[('o', 'serviço', 'de'), ('serviço', 'de', 'in...",0.185752,0.593819,0.419010,...,0.391033,0.534651,0.462995,0.513996,0.315993,0.554416,0.705462,0.325962,0.527642,0.486246


In [64]:
# Preview the combined frame of original columns plus embedding columns.
final.head()

Unnamed: 0,resp_text,clarity,Class,tokens,lemma,ngram2,ngram3,0,1,2,...,758,759,760,761,762,763,764,765,766,767
0,"Prezada Senhora, Informa-se que o DNIT é uma ...",c1,0,"['prezada', 'senhora', 'informase', 'que', 'o'...",prezar senhor informase que o dnit ser um auta...,"[('prezar', 'senhor'), ('senhor', 'informase')...","[('prezar', 'senhor', 'informase'), ('senhor',...",0.505127,0.766378,0.1053,...,0.275737,0.716399,0.905849,0.88919,0.045048,0.879967,0.528071,0.09311,0.53186,0.877069
1,"Prezada, Os dados se referem somente aos na...",c1,0,"['prezada', 'os', 'dados', 'se', 'referem', 's...",prezar o dar se referir somente a o nascido vi...,"[('prezar', 'o'), ('o', 'dar'), ('dar', 'se'),...","[('prezar', 'o', 'dar'), ('o', 'dar', 'se'), (...",0.469561,0.688681,0.113576,...,0.262989,0.694362,0.917148,0.832956,0.092697,0.904497,0.501777,0.160684,0.560638,0.876449
2,"Prezado Usuário, Inicialmente agradecemos o s...",c1,0,"['prezado', 'usuário', 'inicialmente', 'agrade...",prezar usuário inicialmente agradecer o seu co...,"[('prezar', 'usuário'), ('usuário', 'inicialme...","[('prezar', 'usuário', 'inicialmente'), ('usuá...",0.532808,0.769794,0.053227,...,0.264708,0.72642,0.949025,0.940053,0.048876,0.917774,0.479433,0.100289,0.538197,0.934336
3,"Prezada, Referimo-nos a sua demanda, regist...",c1,0,"['prezada', 'referimonos', 'a', 'sua', 'demand...",prezar referimono o seu demanda registrar em o...,"[('prezar', 'referimono'), ('referimono', 'o')...","[('prezar', 'referimono', 'o'), ('referimono',...",0.351271,0.637699,0.287832,...,0.274973,0.694385,0.649727,0.706238,0.181229,0.676903,0.572161,0.224631,0.527368,0.72708
4,"Prezada, Segue anexa a resposta da área com...",c1,0,"['prezada', 'segue', 'anexa', 'a', 'resposta',...",prezar seguir anexo o resposta de o área compe...,"[('prezar', 'seguir'), ('seguir', 'anexo'), ('...","[('prezar', 'seguir', 'anexo'), ('seguir', 'an...",0.407822,0.682736,0.366654,...,0.461279,0.629783,0.594298,0.649282,0.243664,0.698317,0.506059,0.309669,0.537712,0.584884


In [65]:
# Total number of missing cells across the whole frame; a non-zero value
# after the concat would indicate rows that failed to line up.
final.isnull().sum().sum()

0

In [66]:
# Split the frame into two parts: the first 3000 rows and everything after.
f1, f2 = final.iloc[:3000], final.iloc[3000:]

In [67]:
import os

# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before writing.
# NOTE(review): 'emebeddings' looks like a typo of 'embeddings'; the spelling
# is kept because other files may read from this exact path.
os.makedirs('../emebeddings', exist_ok=True)

# Write each part to its own CSV without the row index.
f1.to_csv('../emebeddings/BERT_Embeddings_1.csv', index=False)
f2.to_csv('../emebeddings/BERT_Embeddings_2.csv', index=False)

In [68]:
# Preview the combined frame again after the export.
final.head()

Unnamed: 0,resp_text,clarity,Class,tokens,lemma,ngram2,ngram3,0,1,2,...,758,759,760,761,762,763,764,765,766,767
0,"Prezada Senhora, Informa-se que o DNIT é uma ...",c1,0,"['prezada', 'senhora', 'informase', 'que', 'o'...",prezar senhor informase que o dnit ser um auta...,"[('prezar', 'senhor'), ('senhor', 'informase')...","[('prezar', 'senhor', 'informase'), ('senhor',...",0.505127,0.766378,0.1053,...,0.275737,0.716399,0.905849,0.88919,0.045048,0.879967,0.528071,0.09311,0.53186,0.877069
1,"Prezada, Os dados se referem somente aos na...",c1,0,"['prezada', 'os', 'dados', 'se', 'referem', 's...",prezar o dar se referir somente a o nascido vi...,"[('prezar', 'o'), ('o', 'dar'), ('dar', 'se'),...","[('prezar', 'o', 'dar'), ('o', 'dar', 'se'), (...",0.469561,0.688681,0.113576,...,0.262989,0.694362,0.917148,0.832956,0.092697,0.904497,0.501777,0.160684,0.560638,0.876449
2,"Prezado Usuário, Inicialmente agradecemos o s...",c1,0,"['prezado', 'usuário', 'inicialmente', 'agrade...",prezar usuário inicialmente agradecer o seu co...,"[('prezar', 'usuário'), ('usuário', 'inicialme...","[('prezar', 'usuário', 'inicialmente'), ('usuá...",0.532808,0.769794,0.053227,...,0.264708,0.72642,0.949025,0.940053,0.048876,0.917774,0.479433,0.100289,0.538197,0.934336
3,"Prezada, Referimo-nos a sua demanda, regist...",c1,0,"['prezada', 'referimonos', 'a', 'sua', 'demand...",prezar referimono o seu demanda registrar em o...,"[('prezar', 'referimono'), ('referimono', 'o')...","[('prezar', 'referimono', 'o'), ('referimono',...",0.351271,0.637699,0.287832,...,0.274973,0.694385,0.649727,0.706238,0.181229,0.676903,0.572161,0.224631,0.527368,0.72708
4,"Prezada, Segue anexa a resposta da área com...",c1,0,"['prezada', 'segue', 'anexa', 'a', 'resposta',...",prezar seguir anexo o resposta de o área compe...,"[('prezar', 'seguir'), ('seguir', 'anexo'), ('...","[('prezar', 'seguir', 'anexo'), ('seguir', 'an...",0.407822,0.682736,0.366654,...,0.461279,0.629783,0.594298,0.649282,0.243664,0.698317,0.506059,0.309669,0.537712,0.584884
