In [1]:
from config import set_path

In [2]:
set_path()

In [3]:
import sys
print(sys.version)


3.12.3 (main, Feb  4 2025, 14:48:35) [GCC 13.3.0]


In [4]:
from core import load_data

In [5]:
data = load_data('parquet')

In [6]:
data.head()

Unnamed: 0,excerto,categoria
0,A gente sente falta também de metas mais espec...,Meio Ambiente
1,A segunda preocupação é com a questão da área ...,Meio Ambiente
2,"Acho, que a primeira manifestação dos coletivo...",Outros temas
3,"E para finaliza gostaria de dizer, que nós esp...",Outros temas
4,"Em tempos de pandemias, como disse o primeiro ...",Outros temas


In [7]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer
from sklearn.metrics import accuracy_score, classification_report

In [8]:
X = data['excerto'].values
y = data['categoria']

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
# Define a pipeline with preprocessing and Naive Bayes
pipeline = Pipeline([
    ("vectorizer", TfidfVectorizer()),  # Convert text to TF-IDF features
    ("normalizer", Normalizer()),       # Normalize data
    ("classifier", MultinomialNB())     # Train Multinomial Naïve Bayes
])

In [11]:
# Train the model
pipeline.fit(X_train, y_train)

In [12]:
# Predictions
y_pred = pipeline.predict(X_test)

In [13]:
# Evaluation
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.4456
Classification Report:
                                         precision    recall  f1-score   support

                    Assistência Social       0.44      0.07      0.12        59
                               Cultura       0.00      0.00      0.00        39
  Desenvolvimento Econômico e Trabalho       0.36      0.71      0.48        70
          Direitos Humanos e Cidadania       0.00      0.00      0.00        45
                              Educação       1.00      0.02      0.04        56
                       Esporte e Lazer       0.00      0.00      0.00        21
                             Habitação       1.00      0.26      0.42        34
         Infraestrutura Urbana e Obras       0.00      0.00      0.00        22
                 Inovação e Tecnologia       0.00      0.00      0.00        13
                         Meio Ambiente       0.68      0.29      0.40        66
                          Outros temas       0.64      0.42      0.51        7

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [14]:
from sklearn.tree import DecisionTreeClassifier

In [15]:
# Define a pipeline with preprocessing and Naive Bayes
pipeline = Pipeline([
    ("vectorizer", TfidfVectorizer()),  # Convert text to TF-IDF features
    ("normalizer", Normalizer()),       # Normalize data
    ("classifier", DecisionTreeClassifier(max_depth=5))     # Train Decision Tree

])

In [16]:
pipeline.fit(X_train, y_train)

In [17]:
# Predictions
y_pred = pipeline.predict(X_test)

In [18]:
# Evaluation
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.2356
Classification Report:
                                         precision    recall  f1-score   support

                    Assistência Social       0.00      0.00      0.00        59
                               Cultura       0.00      0.00      0.00        39
  Desenvolvimento Econômico e Trabalho       0.50      0.03      0.05        70
          Direitos Humanos e Cidadania       0.00      0.00      0.00        45
                              Educação       0.00      0.00      0.00        56
                       Esporte e Lazer       0.00      0.00      0.00        21
                             Habitação       0.90      0.26      0.41        34
         Infraestrutura Urbana e Obras       0.00      0.00      0.00        22
                 Inovação e Tecnologia       0.00      0.00      0.00        13
                         Meio Ambiente       0.59      0.33      0.43        66
                          Outros temas       0.00      0.00      0.00        7

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [19]:
from sklearn.ensemble import RandomForestClassifier

In [20]:
# Define a pipeline with preprocessing and Naive Bayes
pipeline = Pipeline([
    ("vectorizer", TfidfVectorizer()),  # Convert text to TF-IDF features
    ("normalizer", Normalizer()),       # Normalize data
    ("classifier", RandomForestClassifier())     # Train Random Forest

])

In [21]:
pipeline.fit(X_train, y_train)

In [22]:
y_pred = pipeline.predict(X_test)

In [23]:
# Evaluation
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.5611
Classification Report:
                                         precision    recall  f1-score   support

                    Assistência Social       0.43      0.41      0.42        59
                               Cultura       0.58      0.64      0.61        39
  Desenvolvimento Econômico e Trabalho       0.48      0.59      0.53        70
          Direitos Humanos e Cidadania       0.40      0.18      0.25        45
                              Educação       0.62      0.45      0.52        56
                       Esporte e Lazer       0.60      0.14      0.23        21
                             Habitação       0.64      0.79      0.71        34
         Infraestrutura Urbana e Obras       0.27      0.14      0.18        22
                 Inovação e Tecnologia       0.38      0.23      0.29        13
                         Meio Ambiente       0.59      0.62      0.60        66
                          Outros temas       0.46      0.51      0.48        7

In [24]:
from sklearn.svm import SVC

In [25]:
# Define a pipeline with preprocessing and SVC
pipeline = Pipeline([
    ("vectorizer", TfidfVectorizer()),  # Convert text to TF-IDF features
    ("normalizer", Normalizer()),       # Normalize data
    ("classifier", SVC(kernel="linear", probability=True))     # Train SVC

])

In [26]:
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

In [27]:
# Evaluation
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.6100
Classification Report:
                                         precision    recall  f1-score   support

                    Assistência Social       0.57      0.54      0.56        59
                               Cultura       0.66      0.64      0.65        39
  Desenvolvimento Econômico e Trabalho       0.54      0.59      0.56        70
          Direitos Humanos e Cidadania       0.40      0.22      0.29        45
                              Educação       0.75      0.54      0.62        56
                       Esporte e Lazer       0.83      0.48      0.61        21
                             Habitação       0.77      0.79      0.78        34
         Infraestrutura Urbana e Obras       0.38      0.23      0.29        22
                 Inovação e Tecnologia       0.78      0.54      0.64        13
                         Meio Ambiente       0.72      0.65      0.68        66
                          Outros temas       0.45      0.69      0.54        7