In [None]:

import pandas as pd
import numpy as np
import re

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

from unidecode import unidecode
## Location of the Label Studio JSON export that feeds this notebook.
# NOTE(review): this is an absolute, machine-specific path; consider deriving it
# from a configurable data directory so the notebook runs on other machines.
EXPORT_PATH = 'C:\\Users\\mayco\\Documents\\projetos\\data-pantanaldev\\label-studio\\data\\export\\project-1-at-2023-04-30-05-08-956f4e3c.json'


def expand_column(frame, column):
    """Replace `column` by the columns obtained from expanding each of its cells.

    Every cell of `column` is converted with `pd.Series` (dict keys or list
    positions become the new column labels) and the expanded columns are
    appended to the remaining columns of `frame`.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame holding the nested column.
    column : hashable
        Label of the column to expand.

    Returns
    -------
    pandas.DataFrame
        A new frame; `frame` itself is not modified.
    """
    expanded = frame[column].apply(pd.Series)
    return pd.concat([frame.drop(columns=[column]), expanded], axis=1)


## Read the JSON export into pandas
df = pd.read_json(EXPORT_PATH)

# Drop the top-level id before unnesting.
df = df.drop(['id'], axis=1)
## Expand the 'annotations' column (its expansion yields positional columns, 0 is used next)
df = expand_column(df, 'annotations')
## Expand positional column 0 produced by the previous step
df = expand_column(df, 0)
## Expand the 'result' column
df = expand_column(df, 'result')
# The expansion above exposes an 'id' column again; drop it as well.
df = df.drop(['id'], axis=1)

## Expand positional column 0 produced by expanding 'result'
df = expand_column(df, 0)
## Expand the 'value' column
df = expand_column(df, 'value')
## Drop rows without choices
df = df.dropna(subset=['choices'])
## Keep only the first choice of each row
df['choices'] = df['choices'].apply(lambda x: x[0])

## Expand the 'data' column
df = expand_column(df, 'data')

# Snapshot taken before any text cleaning is applied to 'noticia'.
df_noticia_original = df.copy()


padrao_data_cepea = r"Cepea, \d{2}/\d{2}/\d{4} - "
padrao_cepea = r"Cepea"
padrao_numeros = r'[0-9]+'

# Vectorised string clean-up. Unlike calling re.sub on every cell, the .str
# accessor propagates missing values instead of raising TypeError on them.
df['noticia'] = (
    df['noticia']
    # strip the "Cepea, dd/mm/yyyy - " dateline
    .str.replace(padrao_data_cepea, '', regex=True)
    # remove the word 'cepea' (any casing) from the news text
    .str.replace(padrao_cepea, '', regex=True, flags=re.IGNORECASE)
    # remove digits from the news text
    .str.replace(padrao_numeros, '', regex=True)
)

## Keep only news whose title mentions 'soja'.
# na=False: a missing title counts as "no match" instead of producing an NA
# entry in the boolean mask, which cannot be used for row selection.
df = df[df['titulo'].str.contains('soja', flags=re.IGNORECASE, na=False)]

## Remove news labelled 'Desclassificar'
df = df[df['choices'] != 'Desclassificar']
df.count()

In [None]:
# Keep only the columns needed downstream
columns_to_select = ['id', 'data', 'noticia', 'titulo', 'choices', 'unique_id']

# dropna returns a new frame, so its result must be assigned (the bare call was a
# no-op). The explicit copy makes later column assignments act on an independent
# frame rather than on a selection of the previous one.
df = df[columns_to_select].dropna(subset=['noticia']).copy()

# Data pre-processing
stop_words = set(stopwords.words('portuguese'))

# preprocess_text strips accents before filtering stopwords, so the comparison
# set must be accent-free too; otherwise accented stopwords could never match
# the unidecoded tokens and would stay in the text.
_stop_words_sem_acento = {unidecode(palavra) for palavra in stop_words}


def preprocess_text(text):
    """Normalise a text for bag-of-words style models.

    Steps, in order: strip accents with `unidecode`, remove every character
    that is neither a word character nor whitespace, lowercase, tokenise with
    `word_tokenize`, and drop Portuguese stopwords (compared accent-free).

    Parameters
    ----------
    text : str
        Raw text.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    # Remove accents
    text = unidecode(text)
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Tokenise the lowercased text
    words = word_tokenize(text.lower())
    # Drop stopwords
    return ' '.join(word for word in words if word not in _stop_words_sem_acento)

# Run every news text through preprocess_text.
df['noticia'] = df['noticia'].map(preprocess_text)

# Positional hold-out split: the first rows go to training, the remainder to validation.
n_treino = 210
df_treino = df.iloc[:n_treino]
df_validacao = df.iloc[n_treino:]

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from transformers import AutoModel, AutoTokenizer

# Sentiment classifier: averaged word embeddings followed by a small MLP
vocab_size = 1000
embedding_dim = 100
max_length = 125

# Numeric target assigned to each annotated sentiment.
# NOTE(review): a single sigmoid unit with binary cross-entropy is fitted against
# the in-between target 0.5 for 'Neutra'; the thresholded 'accuracy' metric can
# presumably never count such a row as correct — consider a 3-class softmax head.
SENTIMENT_SCORES = {'Positiva': 1, 'Negativa': 0, 'Neutra': 0.5}


def encode_sentiment(choices):
    """Convert a Series of sentiment names into float32 targets.

    Parameters
    ----------
    choices : pandas.Series
        Sentiment names; each must be a key of `SENTIMENT_SCORES`.

    Returns
    -------
    numpy.ndarray
        One float32 score per row.

    Raises
    ------
    ValueError
        If any value has no entry in `SENTIMENT_SCORES` (it would otherwise
        become a NaN target and silently turn the loss into NaN).
    """
    scores = choices.map(SENTIMENT_SCORES)
    missing = scores.isna()
    if missing.any():
        unknown = sorted(set(choices[missing].astype(str)))
        raise ValueError(f"Unmapped values in 'choices': {unknown}")
    return scores.to_numpy(dtype='float32')


model = keras.Sequential([
    keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    keras.layers.GlobalAveragePooling1D(),
    keras.layers.Dense(50, activation='relu'),
    keras.layers.Dense(25, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Pre-process the data.
# The vocabulary and the model are fitted on df_treino only: df_validacao is a
# slice of df, so fitting on the whole df would evaluate on rows already seen
# during training and inflate the reported test accuracy.
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(df_treino['noticia'])
sequences = tokenizer.texts_to_sequences(df_treino['noticia'])
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')
labels = encode_sentiment(df_treino['choices'])

# Train the model (validation_split holds out part of the training rows for monitoring)
history = model.fit(padded_sequences, labels, epochs=100, validation_split=0.3)

# Evaluate on the held-out validation rows
test_sequences = tokenizer.texts_to_sequences(df_validacao['noticia'])
padded_test_sequences = pad_sequences(test_sequences, maxlen=max_length, padding='post', truncating='post')
test_labels = encode_sentiment(df_validacao['choices'])
test_loss, test_accuracy = model.evaluate(padded_test_sequences, test_labels)
print('Test accuracy:', test_accuracy)

# Save the model
model.save('modelo.h5')

In [None]:
## Build a table of predictions for the validation rows
def score_to_label(score):
    """Return the sentiment whose training target (0, 0.5 or 1) is closest to `score`.

    Comparing a sigmoid output for exact equality with 0.5 practically never
    succeeds, so 'Neutra' is assigned to the whole middle band instead.
    """
    if score >= 0.75:
        return 'Positiva'
    if score <= 0.25:
        return 'Negativa'
    return 'Neutra'


df_predicoes = df_validacao.copy()
# The model has a single output unit, so predict yields one column; flatten it
# to one score per row before storing it.
df_predicoes['predicao'] = model.predict(padded_test_sequences).ravel()
df_predicoes['predicao'] = df_predicoes['predicao'].apply(score_to_label)

## How many news items were classified as positive, negative and neutral.
# Only the last expression of a cell is rendered, so intermediate results are printed.
print(df_predicoes['predicao'].value_counts())

## How many news items were classified incorrectly
erros = df_predicoes['predicao'] != df_predicoes['choices']
print('Misclassified:', int(erros.sum()), 'of', len(df_predicoes))

df_predicoes

In [None]:
## Predict the sentiment of a single news item
df_validacao = df_validacao.reset_index(drop=True)
noticia = df_validacao['noticia'][0]

# Each stage gets its own name instead of rebinding `noticia` to a list,
# then to token ids, then to an array.
texto_limpo = preprocess_text(noticia)
sequencia = tokenizer.texts_to_sequences([texto_limpo])
entrada = pad_sequences(sequencia, maxlen=max_length, padding='post', truncating='post')
val = model.predict(entrada)[0][0]

# Pick the label whose training target (0, 0.5 or 1) is closest to the score.
# Testing for a score of exactly 0.5 would practically never select 'Neutra'.
if val >= 0.75:
    print('Positiva')
elif val <= 0.25:
    print('Negativa')
else:
    print('Neutra')



In [None]:
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification

# Load the pre-trained BERTimbau model with a 3-class classification head
model = TFBertForSequenceClassification.from_pretrained('neuralmind/bert-base-portuguese-cased', num_labels=3)
tokenizer = BertTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased')

# A head with num_labels=3 is trained against integer class ids (0..2);
# fractional targets such as 0.5 are not valid class indices.
BERT_LABEL_IDS = {'Negativa': 0, 'Neutra': 1, 'Positiva': 2}

# Compile the model. The head outputs raw logits, hence from_logits=True; the
# metric is spelled out so it matches integer labels.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')],
)

# Pre-process the data.
# Train on df_treino only: df_validacao is a slice of df, so training on the
# whole df would leak the evaluation rows into training.
# return_tensors='np' + dict(...) hands Keras plain arrays instead of a
# BatchEncoding holding Python lists.
# NOTE(review): 'noticia' was lowercased and stripped of accents earlier in the
# notebook, while this checkpoint is a cased model — consider feeding raw text.
sequences = dict(tokenizer(df_treino['noticia'].tolist(), padding=True, truncation=True, max_length=125, return_tensors='np'))
# to_numpy(dtype='int32') fails on NaN, so an unmapped label surfaces here
# instead of reaching the loss.
labels = df_treino['choices'].map(BERT_LABEL_IDS).to_numpy(dtype='int32')

# Train the model
# NOTE(review): 100 epochs is kept from the original; fine-tuning usually needs far fewer.
history = model.fit(sequences, labels, epochs=100)

# Evaluate on the validation rows
test_sequences = dict(tokenizer(df_validacao['noticia'].tolist(), padding=True, truncation=True, max_length=125, return_tensors='np'))
test_labels = df_validacao['choices'].map(BERT_LABEL_IDS).to_numpy(dtype='int32')
test_loss, test_accuracy = model.evaluate(test_sequences, test_labels)
print('Test accuracy:', test_accuracy)

# Save the model
# NOTE(review): save_pretrained writes a directory, so despite the '.h5' suffix
# this is presumably a folder rather than a single HDF5 file — confirm the name.
model.save_pretrained('bertimbau.h5')
