[Classificação de frases por setor](https://handtalk.notion.site/Classifica-o-de-frases-por-setor-18c80adbbf874c519c9efe19678ac4c1)

In [23]:
import pickle
import re

import nltk
import numpy as np
import pandas as pd
import spacy
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, f1_score, hamming_loss,
                             jaccard_score, precision_score, recall_score)
from sklearn.model_selection import KFold, train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.tree import DecisionTreeClassifier

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
/kaggle/input/text-classification/dataset.csv


References:

- [Must Known Techniques for text preprocessing in NLP](https://www.analyticsvidhya.com/blog/2021/06/must-known-techniques-for-text-preprocessing-in-nlp/)
</br>
- [Quick Introduction to Bag-of-Words (BoW) and TF-IDF for Creating Features from Text](https://www.analyticsvidhya.com/blog/2020/02/quick-introduction-bag-of-words-bow-tf-idf/)
</br>
- [Multilabel Classification: An Introduction with Python’s Scikit-Learn](https://www.kdnuggets.com/2023/08/multilabel-classification-introduction-python-scikitlearn.html)

In [None]:
!python -m spacy download pt_core_news_sm

In [25]:
df = pd.read_csv("./data/dataset.csv")

In [26]:
df

Unnamed: 0,sentence,category
0,"Auxílio-Doença Previdenciário, Benefícios em E...",orgão público
1,"PAGAR TODAS AS CONTAS EM ATRASO R$1.290,90.",finanças
2,Então encontraremos na próxima aula.,educação
3,Veja os resultados da categoria de ofertas do ...,indústrias
4,"Além disso, a embalagem é reutilizável e 100% ...","indústrias,varejo"
...,...,...
516,"Selecione o local de estudo, curso sem encontr...",educação
517,ESTUDANTES DA REDE MUNICIPAL VOLTAM ÀS AULAS E...,"educação,orgão público"
518,Empresas e órgãos públicos,orgão público
519,DGE – Departamento de Gestão Estratégica Metas...,orgão público


In [27]:
train, test = train_test_split(df, test_size=0.2)
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

# Preprocessing

In [28]:
train["category"] = train["category"].str.split(",")
test["category"] = test["category"].str.split(",")

mlb = MultiLabelBinarizer()

one_hot_encoded_train = mlb.fit_transform(train['category'])
one_hot_train_df = pd.DataFrame(one_hot_encoded_train, columns=mlb.classes_)
train = pd.concat([train, one_hot_train_df], axis=1).drop('category', axis=1)

one_hot_encoded_test = mlb.transform(test['category'])
one_hot_test_df = pd.DataFrame(one_hot_encoded_test, columns=mlb.classes_)
test = pd.concat([test, one_hot_test_df], axis=1).drop('category', axis=1)

In [29]:
nlp = spacy.load("pt_core_news_sm")
def process_string(string):
    string = string.strip().lower()
    string = re.sub(r'[^\w\s]', '', string)
    string = " ".join([token.lemma_ for token in nlp(string)])
    string = re.sub(' +', ' ', string)
    return string

In [30]:
train["sentence"] = train["sentence"].apply(process_string)
test["sentence"] = test["sentence"].apply(process_string)

In [31]:
train

Unnamed: 0,sentence,educação,finanças,indústrias,orgão público,varejo
0,o união entre o dois banco ser aprovar por o c...,0,1,0,0,0
1,o estudante dever estudar sozinho sem contato ...,1,0,0,0,0
2,prefeitura superar o marca de 25 milhão de mar...,0,0,0,1,0
3,como fazer para concorrer a o vaga de estágio ...,1,0,0,0,0
4,político de desconto curso por área e conheça ...,1,0,0,0,0
...,...,...,...,...,...,...
411,tubo de combustível,0,0,1,0,0
412,o reunião de o comissão ser a o quartasfeira a...,0,0,0,1,0
413,prefeitura de São José de o Campos edital 0120...,0,0,0,1,0
414,ter ciência que o documento descrito acima dev...,1,0,0,0,0


In [36]:
labels = ['educação', 'finanças', 'indústrias',
          'orgão público', 'varejo']
X_train = train.sentence
X_test = test.sentence
y_train = train[labels]
y_test = test[labels]

In [37]:
stop_words_pt = stopwords.words('portuguese')
vectorizer = TfidfVectorizer(stop_words=stop_words_pt)

X_train_matrix = vectorizer.fit_transform(X_train)
X_test_matrix = vectorizer.transform(X_test)

In [38]:
X_train_matrix

<416x1467 sparse matrix of type '<class 'numpy.float64'>'
	with 2716 stored elements in Compressed Sparse Row format>

In [39]:
X_test_matrix

<105x1467 sparse matrix of type '<class 'numpy.float64'>'
	with 439 stored elements in Compressed Sparse Row format>

# Training

In [48]:
kf = KFold(n_splits=5)
accuracy_list = []
hamming_list = []
for i, (train_index, test_index) in enumerate(kf.split(X_train_matrix)):
    X_tr, X_te = X_train_matrix[train_index], X_train_matrix[test_index]
    y_tr, y_te = y_train.loc[train_index], y_train.loc[test_index]
    print(f"Training in Fold: {i}")
    clf = MultiOutputClassifier(DecisionTreeClassifier()).fit(X_tr, y_tr)
    y_pred = clf.predict(X_te)
    accuracy = round(accuracy_score(y_te, y_pred), 2)
    hamming = round(hamming_loss(y_te, y_pred), 2)
    accuracy_list.append(accuracy)
    hamming_list.append(hamming)
    final_model = clf
    if hamming < min(hamming_list):
        final_model = clf
    print('Accuracy Score: ', accuracy)
    print('Hamming Loss: ', hamming)
    print("*" * 25)

Training in Fold: 0
Accuracy Score:  0.48
Hamming Loss:  0.13
*************************
Training in Fold: 1
Accuracy Score:  0.48
Hamming Loss:  0.17
*************************
Training in Fold: 2
Accuracy Score:  0.54
Hamming Loss:  0.11
*************************
Training in Fold: 3
Accuracy Score:  0.37
Hamming Loss:  0.15
*************************
Training in Fold: 4
Accuracy Score:  0.36
Hamming Loss:  0.16
*************************


In [49]:
final_model

# Validation

In [51]:
predictions = final_model.predict(X_test_matrix)
accuracy = round(accuracy_score(y_test, predictions), 2)
hamming = round(hamming_loss(y_test, predictions), 2)
print('Accuracy Score on the validation set: ', accuracy)
print('Hamming Loss on the validation set: ', hamming)

Accuracy Score on the validation set:  0.48
Hamming Loss on the validation set:  0.13


In [53]:
accuracy = np.round(accuracy_score(y_test, predictions), 4)
hamming = np.round(hamming_loss(y_test, predictions), 4)
precision = np.round(precision_score(y_test, predictions, average='macro'), 4)
recall = np.round(recall_score(y_test, predictions, average='macro'), 4)
f1 = np.round(f1_score(y_test, predictions, average='macro'), 4)
jaccard = np.round(jaccard_score(y_test, predictions, average='macro'), 4)

print(f"Accuracy: {accuracy}\nHamming Loss: {hamming}\nPrecision: {precision}\nRecall: {recall}\nF1 Score: {f1}\nJaccard Score: {jaccard}")

Accuracy: 0.4762
Hamming Loss: 0.1333
Precision: 0.7426
Recall: 0.5707
F1 Score: 0.6419
Jaccard Score: 0.478


# Prediction

In [92]:
# sample = "Melhor política industrial é acabar com isenção para compras internacionais, diz presidente da Fiemg"
# sample = "Chevrolet Spin recebe mudanças, mas mantém antigo motor 1.8"
# sample = "Bancos estão mudando datas de fechamento das faturas dos cartões?"
# sample = "Reddit: ação fecha com disparada de 48% em sessão de estreia na Bolsa de NY"
sample = "Quanto ganhou quem investiu R$ 100 mil nos FIIs que mais pagaram dividendos no ano? "

In [93]:
sample = process_string(sample)
sample_matrix = vectorizer.transform([sample])
predictions = final_model.predict(sample_matrix)
bool_mask = predictions.astype(bool)
predicted_categories = np.array(labels)[predictions.astype(bool)[0]]
print(f"Segundo nosso modelo o texto digitado possui maior probabilidade de ser da(s) categoria(s): {predicted_categories}")

Segundo nosso modelo o texto digitado possui maior probabilidade de ser da(s) categoria(s): ['finanças']


In [None]:
with open('./model/text_classification_model.pkl', 'wb') as file:
    pickle.dump(final_model, file)