In [399]:

import pandas as pd
import numpy as np
import re

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

from unidecode import unidecode
## ler arquivo json em pandas
df = pd.read_json('C:\\Users\\mayco\\Documents\\projetos\\data-pantanaldev\\label-studio\\data\\export\\project-1-at-2023-04-27-03-26-58dd1708.json')
df = df.drop(['id'], axis=1)
## expandir coluna annotations
df = pd.concat([df.drop(['annotations'], axis=1), df['annotations'].apply(pd.Series)], axis=1)
## expandir coluna 0 e renomear para annotations
df = pd.concat([df.drop([0], axis=1), df[0].apply(pd.Series)], axis=1)
## expandir coluna result e renomear para result
df = pd.concat([df.drop(['result'], axis=1), df['result'].apply(pd.Series)], axis=1)
df = df.drop(['id'], axis=1)

## expandir coluna 0 e renomear para result
df = pd.concat([df.drop([0], axis=1), df[0].apply(pd.Series)], axis=1)
## expandir coluna value e renomear para value
df = pd.concat([df.drop(['value'], axis=1), df['value'].apply(pd.Series)], axis=1)
## dropar choices nulos
df = df.dropna(subset=['choices'])
## obter choices 
df['choices'] = df['choices'].apply(lambda x: x[0])

## expandir coluna data
df = pd.concat([df.drop(['data'], axis=1), df['data'].apply(pd.Series)], axis=1)

df_noticia_original = df.copy()


padrao_data_cepea = r"Cepea, \d{2}/\d{2}/\d{4} - "
df['noticia'] = df['noticia'].apply(lambda x: re.sub(padrao_data_cepea, '', x))

## remover a palavra 'cepea' das noticias
padrao_cepea = r"Cepea"
df['noticia'] = df['noticia'].apply(lambda x: re.sub(padrao_cepea, '', x, flags=re.IGNORECASE))

## remover numeros das noticias
padrao_numeros = r'[0-9]+'
df['noticia'] = df['noticia'].apply(lambda x: re.sub(padrao_numeros, '', x))

## noticia que contem a palavra 'soja'
df = df[df['noticia'].str.contains('soja', flags=re.IGNORECASE)]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mayco\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [400]:
# Selecionar apenas as colunas necessárias
columns_to_select = ['id', 'data', 'noticia', 'titulo', 'choices', 'unique_id']
df = df[columns_to_select]
df.dropna(subset=['noticia'])

# Pré-processamento dos dados
stop_words = set(stopwords.words('portuguese'))

def preprocess_text(text):
    # remover acentuação
    text = unidecode(text)
    # Remover pontuações
    text = re.sub(r'[^\w\s]', '', text)
    # Tokenização
    words = word_tokenize(text.lower())
    # Remover stopwords
    words = [word for word in words if word not in stop_words]
    return ' '.join(words)

df['noticia'] = df['noticia'].apply(preprocess_text)

df_treino = df[:119]
df_validacao = df[119:]


# Criar modelo de classificação
X_train, X_test, y_train, y_test = train_test_split(df_treino['noticia'], df_treino['choices'], test_size=0.3)

# Definir pipeline com CountVectorizer e MultinomialNB
pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', None)
])

# Definir grade de parâmetros a serem testados
parameters = {'vectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)],
              'classifier': [MultinomialNB(), DecisionTreeClassifier()],
              }

# Criar objeto GridSearchCV
grid_search = GridSearchCV(pipeline, parameters, cv=5, scoring='accuracy')

# Treinar modelos em diferentes combinações de hiperparâmetros
grid_search.fit(X_train, y_train)

# Selecionar o melhor modelo com base na pontuação de validação cruzada
best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_test)
y_probs = best_model.predict_proba(X_test)

## avalia o modelo, gerando report, matriz de confusão e acurácia
print(best_model.score(X_test, y_test))

## printa o nome do melhor modelo escolhido pelo gridsearch
print(best_model.named_steps['classifier'])

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

print(accuracy_score(y_test, y_pred))

## salva o modelo

import pickle
pickle.dump(best_model, open('modelo.pkl', 'wb'))


0.6666666666666666
MultinomialNB()
              precision    recall  f1-score   support

    Negativa       0.64      0.88      0.74         8
      Neutra       1.00      0.10      0.18        10
    Positiva       0.67      0.89      0.76        18

    accuracy                           0.67        36
   macro avg       0.77      0.62      0.56        36
weighted avg       0.75      0.67      0.60        36

[[ 7  0  1]
 [ 2  1  7]
 [ 2  0 16]]
0.6666666666666666
0.6666666666666666


In [401]:
## treina no conjunto de validação

model = pickle.load(open('modelo.pkl', 'rb'))

df_validacao = df_validacao.dropna(subset=['noticia']).reset_index(drop=True)
y_pred = model.predict_proba(df_validacao['noticia'])

## eliminar o e+01 do valor numerico em pandas
pd.options.display.float_format = '{:.2f}'.format

df_predict = pd.DataFrame(y_pred, columns=clf.classes_)
df_predict['predict_final'] = df_predict.idxmax(axis=1)
print(df_predict.columns)

df_pred_concatenado = pd.concat([df_validacao, df_predict], axis=1)
df_pred_concatenado[['id', 'data', 'titulo', 'noticia', 'choices', 'predict_final', 'Desclassificar', 'Negativa', 'Positiva']].to_excel('validacao.xlsx', index=False)

Index(['Desclassificar', 'Negativa', 'Neutra', 'Positiva', 'predict_final'], dtype='object')
