In [None]:

import pandas as pd
import numpy as np
import re

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

from unidecode import unidecode
## Read the exported JSON file into a pandas DataFrame.
## NOTE(review): the path is absolute and machine-specific; judging by the path this
## looks like a Label Studio export -- confirm and consider a configurable data dir.
df = pd.read_json('/root/projects/personal/comitiva_esperanca/label-studio/data/export/project-1-at-2023-05-07-03-57-20fa509e.json')

## Drop the top-level 'id' column; the expansions below introduce their own columns.
df = df.drop(['id'], axis=1)
## Expand the 'annotations' column into separate columns (one per element).
## The next step reads column 0, so each cell is presumably a list -- TODO confirm.
df = pd.concat([df.drop(['annotations'], axis=1), df['annotations'].apply(pd.Series)], axis=1)
## Expand column 0 (the first element of 'annotations') into its own columns.
## Nothing is renamed here; the new column names come from the expanded values.
df = pd.concat([df.drop([0], axis=1), df[0].apply(pd.Series)], axis=1)
## Expand the 'result' column produced by the previous step (again one column per element).
df = pd.concat([df.drop(['result'], axis=1), df['result'].apply(pd.Series)], axis=1)
## Drop the 'id' column again -- presumably the one introduced by expanding column 0.
df = df.drop(['id'], axis=1)

## Expand column 0 (the first element of 'result') into its own columns.
df = pd.concat([df.drop([0], axis=1), df[0].apply(pd.Series)], axis=1)
## Expand the 'value' column into its own columns (this is where 'choices' appears).
df = pd.concat([df.drop(['value'], axis=1), df['value'].apply(pd.Series)], axis=1)
## Drop rows that have no 'choices' value.
df = df.dropna(subset=['choices'])
## Keep only the first entry of each 'choices' value as the label.
df['choices'] = df['choices'].apply(lambda x: x[0])

## Expand the 'data' column into its own columns
## (later code relies on 'noticia' and 'titulo' existing after this step).
df = pd.concat([df.drop(['data'], axis=1), df['data'].apply(pd.Series)], axis=1)

## Snapshot of the flattened frame before any text cleaning is applied.
df_noticia_original = df.copy()


# Regexes applied, in this order, to the 'noticia' text:
#   1. the "Cepea, dd/mm/yyyy - " dateline prefix (case-sensitive),
#   2. any remaining occurrence of the word "cepea" (case-insensitive),
#   3. every run of digits.
padrao_data_cepea = r"Cepea, \d{2}/\d{2}/\d{4} - "
padrao_cepea = r"Cepea"
padrao_numeros = r'[0-9]+'

# Vectorised .str.replace instead of a per-row re.sub: same result for string
# cells, but missing values pass through as NaN instead of raising TypeError,
# so they can be dropped explicitly later.
df['noticia'] = (
    df['noticia']
    .str.replace(padrao_data_cepea, '', regex=True)
    .str.replace(padrao_cepea, '', flags=re.IGNORECASE, regex=True)
    .str.replace(padrao_numeros, '', regex=True)
)

# Keep only news whose title mentions "soja" (any casing).
# na=False: a missing title counts as "no match" rather than producing a NaN
# mask, which boolean indexing rejects.
df = df[df['titulo'].str.contains('soja', flags=re.IGNORECASE, na=False)]

# Discard news labelled 'Desclassificar'.
df = df[df['choices'] != 'Desclassificar']

# Non-null count per column, displayed as the cell output.
df.count()

In [None]:
# Keep only the columns needed downstream.
columns_to_select = ['id', 'data', 'noticia', 'titulo', 'choices', 'unique_id']

# dropna returns a new frame, so the result must be assigned (the original call
# discarded it and was a no-op). The .copy() detaches the selection from the
# filtered frame so the later `df['noticia'] = ...` assignment writes to an
# independent object instead of a potential view.
df = df[columns_to_select].dropna(subset=['noticia']).copy()

# Text pre-processing: Portuguese stopword set.
# preprocess_text strips accents (unidecode) *before* filtering stopwords, so
# accented entries could never match the already-unaccented tokens. Store each
# stopword in both its original and its accent-stripped form.
stop_words = {
    form
    for word in stopwords.words('portuguese')
    for form in (word, unidecode(word))
}

def preprocess_text(text):
    """Normalise one news text for the downstream tokenizer.

    Steps, in order: transliterate to plain ASCII with ``unidecode``, delete
    every character that is neither a word character nor whitespace, lowercase,
    tokenize with ``word_tokenize``, and drop tokens found in the module-level
    ``stop_words`` set (looked up at call time).

    Args:
        text: The raw text to clean.

    Returns:
        The surviving tokens joined by single spaces.
    """
    ascii_text = unidecode(text)
    without_punctuation = re.sub(r'[^\w\s]', '', ascii_text)
    tokens = word_tokenize(without_punctuation.lower())
    return ' '.join(filter(lambda token: token not in stop_words, tokens))

# Clean every news text, then split by row position: the first 208 rows are
# used for training, the remaining rows are held out.
df['noticia'] = df['noticia'].map(preprocess_text)
df_treino = df.iloc[:208]
df_validacao = df.iloc[208:]


In [None]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer  # Or BertTokenizer
from transformers import AutoModelForPreTraining  # Or BertForPreTraining for loading pretraining heads
from transformers import AutoModel  # or BertModel, for BERT without pretraining heads
import numpy as np

from typing import List, Optional, Tuple, Union

# Load the pre-trained encoder and its matching tokenizer.
# NOTE(review): the checkpoint loaded here is multilingual BERT
# ('bert-base-multilingual-cased'), not BERTimbau as the earlier comment said
# -- confirm which one is intended. Also note the checkpoint is cased while
# preprocess_text lowercases and strips accents.
from transformers import BertTokenizer, BertModel

modelo_soja = BertModel.from_pretrained("bert-base-multilingual-cased")
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# `model` and `modelo_soja` refer to the same encoder object.
model = modelo_soja

# Display the encoder configuration as the cell output.
modelo_soja.config

In [None]:
class LIABertClassifier(nn.Module):
    """Sequence classifier: a wrapped encoder plus a two-layer linear head.

    The head reads the encoder output at sequence position 0 and maps it
    ``hidden_size -> 200 -> num_labels`` with dropout (p=0.5) in between.

    NOTE(review): there is no non-linearity between ``cls`` and ``cls2``, so in
    eval mode (dropout disabled) the head is a composition of two affine maps.
    Left unchanged to preserve existing behaviour.

    Args:
        model: Encoder module. It must expose ``config.hidden_size`` and, when
            called with ``input_ids``, ``attention_mask`` and
            ``token_type_ids``, return a sequence whose first element is
            indexable as ``[batch, position, hidden]``.
        num_labels: Number of output classes.
    """

    def __init__(self,model,num_labels):
        super().__init__()
        self.bert = model
        self.config = model.config
        self.num_labels = num_labels
        # Classification head: hidden_size -> 200 -> num_labels.
        self.cls = nn.Linear(self.config.hidden_size,200)
        self.dropout = nn.Dropout(p=0.5)
        self.cls2 = nn.Linear(200,num_labels)

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        ) -> torch.Tensor:
        """Run the encoder and the head.

        Args:
            input_ids: Token ids, passed positionally to the encoder.
            attention_mask: Passed through to the encoder unchanged.
            token_type_ids: Passed through to the encoder unchanged.

        Returns:
            A single tensor of unnormalised class scores with ``num_labels``
            entries per input row (not a tuple).
        """
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )

        # First encoder output, position 0 of every sequence (for the BERT
        # encoder used in this notebook that is presumably the [CLS] vector).
        sequence_output = outputs[0][:,0,:]
        prediction = self.cls(sequence_output)
        prediction = self.dropout(prediction)
        prediction = self.cls2(prediction)
        return prediction
        

In [None]:
torch.cuda.empty_cache()

from torch.utils.data import DataLoader
from transformers import AdamW, get_scheduler
import sklearn.model_selection as model_selection
from torch.optim.lr_scheduler import OneCycleLR

# String label -> integer class id. A label missing from this map raises KeyError.
label_map = {'Negativa': 0, 'Positiva': 1, 'Neutra': 2}

# Texts and integer labels of the training portion, as NumPy arrays.
xtrain_global = np.array(df_treino['noticia'])
ytrain_global = np.array(list(map(label_map.__getitem__, df_treino['choices'])))

# Shuffled 70/30 split with a fixed seed.
xtrain, xval, ytrain, yval = model_selection.train_test_split(
    xtrain_global,
    ytrain_global,
    test_size=0.30,
    random_state=42,
    shuffle=True,
)

# Tokenize both splits with the same settings: truncate to at most 512 tokens,
# pad, and return PyTorch tensors.
_tokenizer_kwargs = dict(truncation=True, padding=True, max_length=512, return_tensors='pt')
train_encodings = tokenizer(xtrain.tolist(), **_tokenizer_kwargs)
val_encodings = tokenizer(xval.tolist(), **_tokenizer_kwargs)

class MyDataset(torch.utils.data.Dataset):
    """Pairs tokenizer encodings with integer class labels.

    Args:
        encodings: Mapping from field name to an indexable batch of values;
            only ``.items()`` and per-field ``[idx]`` indexing are used.
        labels: Indexable sequence of integer class ids, one per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return ``(features, label)`` for example ``idx``.

        ``features`` is a dict with the same keys as ``encodings``; ``label``
        is a 0-dim ``torch.long`` tensor.
        """
        item = {key: val[idx] for key, val in self.encodings.items()}
        # Force int64: the dtype would otherwise be inherited from the label
        # array (e.g. int32), and nn.CrossEntropyLoss rejects class-index
        # targets that are not torch.long.
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return (item,label)

    def __len__(self):
        """Number of examples, taken from the labels."""
        return len(self.labels)

# Wrap the tokenized splits so a DataLoader can batch them.
train_dataset = MyDataset(train_encodings, ytrain)
val_dataset = MyDataset(val_encodings, yval)

# NOTE(review): training is pinned to the CPU -- confirm this is intentional.
device = torch.device('cpu')

# One output unit per entry in label_map (3 classes), instead of a hard-coded 3.
# modelo_soja is wrapped by reference, so re-running this cell continues from
# whatever weights the encoder currently holds.
model = LIABertClassifier(modelo_soja, len(label_map))
model.to(device)

# Reshuffle the training examples every epoch; the evaluation order stays fixed.
dl_train = DataLoader(train_dataset,batch_size=8,shuffle=True)
dl_eval  = DataLoader(val_dataset,batch_size=8)

# Pull one batch as a quick sanity check of the loader output.
x,y = next(iter(dl_train))
batch = {k: v.to(device) for k, v in x.items()}

# NOTE(review): AdamW is imported from transformers in this cell; that
# implementation has been deprecated upstream in favour of torch.optim.AdamW
# (whose default weight_decay differs) -- verify against the installed version.
optim = AdamW(model.parameters(), lr=0.000050)

loss_fct = nn.CrossEntropyLoss()

num_epochs = 20

# Linear decay of the learning rate to zero over all optimizer steps, no warm-up.
scheduler = get_scheduler(
    "linear",
    optim,
    num_warmup_steps=0,
    num_training_steps=num_epochs*len(dl_train)
)

# Fine-tuning loop: one optimizer step and one scheduler step per batch.
for epoch in range(num_epochs):
    # Enable dropout for the whole epoch (previously re-set on every batch).
    model.train()
    lepochs = []  # per-batch losses of the current epoch
    for batch,y in dl_train:
        batch = {k: v.to(device) for k, v in batch.items()}
        y = y.to(device)
        outputs = model(**batch)
        loss = loss_fct(outputs,y)
        lepochs.append(loss.item())
        loss.backward()
        optim.step()
        scheduler.step()
        # Clear gradients so they do not accumulate into the next batch.
        optim.zero_grad()

    # Report once per epoch with the mean batch loss. The original collected
    # lepochs but never used it, and printed the "Epoch" line for every batch.
    current_lr = optim.param_groups[0]['lr']
    mean_loss = sum(lepochs) / len(lepochs) if lepochs else float('nan')
    print(f"Epoch {epoch}, Loss={mean_loss:.4f}, LR={current_lr:.6f}")

# Evaluate on the validation loader: collect true and predicted class ids.
model.eval()
ytrue = []
ypred = []
# Gradients are not needed for inference, so the whole pass runs under no_grad.
with torch.no_grad():
    for batch,y in dl_eval:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        y = y.to(device)
        outputs = model(**batch)
        ytrue += y.cpu().tolist()
        # Predicted class = index of the highest score in each row.
        ypred += outputs.argmax(dim=1).cpu().tolist()

print(classification_report(ytrue,ypred))

