In [10]:
import json
import multiprocessing
import os
import pickle

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier

In [11]:
# Number of CPUs reported for this machine; xgboost_train derives its
# worker count from it.
CPU_COUNT = multiprocessing.cpu_count()

# Fraction of the samples held out as the test set by split_dataset.
TEST_SIZE = 0.2

In [12]:
def transform_data(texts):
    """Fit a default TfidfVectorizer on `texts` and vectorize them.

    Parameters
    ----------
    texts : iterable of str
        Raw documents; they are used both to fit the vectorizer and to
        produce the returned matrix.

    Returns
    -------
    tuple
        ``(fitted_vectorizer, tfidf_matrix)`` where ``tfidf_matrix`` is
        whatever ``fit_transform`` returns for `texts`.
    """
    tfidf = TfidfVectorizer()
    tfidf_matrix = tfidf.fit_transform(texts)
    return tfidf, tfidf_matrix

In [13]:
def split_dataset(X, y):
    """Split features and labels into stratified train and test parts.

    The held-out fraction comes from the module-level TEST_SIZE, the split
    is seeded with 1234 so it is repeatable, and it is stratified on `y`.

    Returns
    -------
    tuple
        ``(X_train, X_test, Y_train, Y_test)``.
    """
    split_options = {
        "test_size": TEST_SIZE,
        "random_state": 1234,
        "stratify": y,
    }
    train_features, test_features, train_labels, test_labels = train_test_split(
        X, y, **split_options
    )
    return train_features, test_features, train_labels, test_labels

In [14]:
def xgboost_train(X_train, Y_train, X_test):
    """Fit an XGBClassifier on the training data and predict the test set.

    Parameters
    ----------
    X_train : training feature matrix passed to ``fit``.
    Y_train : training labels passed to ``fit``.
        NOTE(review): this notebook passes string class names; recent
        XGBoost releases may require integer-encoded labels - confirm
        against the installed version.
    X_test : feature matrix passed to ``predict``.

    Returns
    -------
    tuple
        ``(model, pred_)``: the fitted classifier and its predictions for
        `X_test`.
    """
    # Keep four CPUs free for the rest of the machine, but never request
    # fewer than one worker: on hosts with four or fewer CPUs the plain
    # subtraction is zero or negative and no longer means "all but four".
    worker_count = max(1, CPU_COUNT - 4)
    model = XGBClassifier(n_jobs=worker_count)
    model.fit(X_train, Y_train)
    pred_ = model.predict(X_test)
    return model, pred_

In [15]:
# Load the labelled training frame; later cells read its `y` (label) and
# `texto` (document text) columns.
df = pd.read_parquet(path='./Data/training_data/df_v1.parquet.gzip')

In [16]:
# Keep only the rows whose label is one of the classes modelled here.
# NOTE(review): the original list named 'identidade e território' twice.
# The duplicate had no effect on the filter and was removed; confirm it
# was not meant to be a different label.
df = df[df.y.isin([
    'Território',
    'Quilombolas',
    'Território;Quilombolas',
    'identidade',
    'Conflito',
    'identidade e território',
])]

In [17]:
# Drop rows that have no label.
# NOTE(review): when this runs after the label whitelist filter no null
# labels should remain, so this is presumably a safeguard - confirm.
df = df[df.y.notna()]

In [18]:
# Plain Python list of labels, in the same row order as `df`.
y = df.y.tolist()

In [19]:
# Class balance check: the recorded output shows a strongly imbalanced
# label set (148 rows for the largest class, 4 for the smallest).
df['y'].value_counts()

Território                 148
Quilombolas                 81
Território;Quilombolas      70
Conflito                    19
identidade e território      8
identidade                   4
Name: y, dtype: int64

In [20]:
# Turn every document into a TF-IDF row.
# NOTE(review): the vectorizer is fitted on all rows before the
# train/test split, so its vocabulary and IDF weights also see the test
# documents. Consider fitting on the training texts only.
vectorizer, X = transform_data(texts=df['texto'].tolist())

In [13]:
# Stratified hold-out split of the TF-IDF matrix and its labels.
X_train, X_test, Y_train, Y_test = split_dataset(X=X, y=y)

In [14]:
# Fit the classifier on the training part and predict the held-out rows.
model, predict = xgboost_train(
    X_train=X_train,
    Y_train=Y_train,
    X_test=X_test,
)





In [15]:
# Checkpoint the fitted model, its test-set predictions and the true test
# labels so the evaluation can be rerun without retraining.
# Each file is opened in a `with` block so the handle is flushed and
# closed as soon as the dump finishes.
with open("./model.p", "wb") as fh:
    pickle.dump(model, fh)
with open("./predict.p", "wb") as fh:
    pickle.dump(predict, fh)
with open("./Y_test.p", "wb") as fh:
    pickle.dump(Y_test, fh)

In [7]:
# Restore the checkpointed model, predictions and test labels.
# NOTE: pickle.load can execute arbitrary code stored in the file; only
# load pickles that were written by this notebook.
# Each file is opened in a `with` block so the handle is closed promptly.
with open("./model.p", "rb") as fh:
    model = pickle.load(fh)
with open("./predict.p", "rb") as fh:
    predict = pickle.load(fh)
with open("./Y_test.p", "rb") as fh:
    Y_test = pickle.load(fh)

In [8]:
# Per-class precision / recall / F1 on the held-out rows.
# The recorded output reports 0.00 precision for 'identidade' (support 1)
# followed by three `_warn_prf` warnings. zero_division=0 states
# explicitly what to report for such a class instead of warning on
# every run.
print(classification_report(Y_test, predict, zero_division=0))

                         precision    recall  f1-score   support

               Conflito       1.00      1.00      1.00         4
            Quilombolas       0.78      0.88      0.82        16
             Território       0.77      0.93      0.84        29
 Território;Quilombolas       0.86      0.43      0.57        14
             identidade       0.00      0.00      0.00         1
identidade e território       0.50      0.50      0.50         2

               accuracy                           0.79        66
              macro avg       0.65      0.62      0.62        66
           weighted avg       0.79      0.79      0.77        66



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [21]:
# Persist the fitted TF-IDF vectorizer; the `with` block makes sure the
# file handle is flushed and closed.
with open("./vectorizer.p", "wb") as fh:
    pickle.dump(vectorizer, fh)