In [1]:
import numpy as np
from scipy.sparse import load_npz

In [2]:
X_train = load_npz('../vectors/tfidf_lemma/X_train_tfidf.npz')
y_train = np.load('../vectors/tfidf_lemma/y_train.npy')

In [3]:
from sklearn.model_selection import cross_val_score

In [17]:
from sklearn import naive_bayes

NB = naive_bayes.MultinomialNB()

scores = cross_val_score(NB, X_train, y_train, cv=10)

print(scores)
scores.mean()

[0.44359756 0.47256098 0.4222561  0.44207317 0.44359756 0.43597561
 0.42835366 0.44817073 0.4527439  0.46189024]


0.4451219512195122

In [18]:
from sklearn.svm import SVC

SVM = SVC()

scores = cross_val_score(SVM, X_train, y_train, cv=10)

print(scores)
scores.mean()

[0.51981707 0.51676829 0.48932927 0.49847561 0.50304878 0.50304878
 0.49390244 0.51829268 0.52896341 0.49695122]


0.5068597560975611

In [19]:
from sklearn.ensemble import RandomForestClassifier

RF = RandomForestClassifier()

RF_scores = cross_val_score(RF, X_train, y_train, cv=10)

print(RF_scores)
RF_scores.mean()

[0.49085366 0.5320122  0.49847561 0.49390244 0.50914634 0.51219512
 0.52591463 0.52286585 0.50762195 0.5304878 ]


0.5123475609756099

## Encontrando melhores hiperparametros para o RFC

In [4]:
import optuna
from sklearn.ensemble import RandomForestClassifier

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'max_depth': trial.suggest_int('max_depth', 50, 200),
    }

    model = RandomForestClassifier(**params)

    score = cross_val_score(model, X_train, y_train, scoring='accuracy', cv=5).mean()

    return score

In [8]:
study = optuna.create_study(direction='maximize')  # Estamos maximizando a acurácia
study.optimize(objective, n_trials=50)

[I 2023-12-14 19:10:33,164] A new study created in memory with name: no-name-dbc6fa88-993d-48b1-b539-04a45ad1e7e8
[I 2023-12-14 19:11:17,743] Trial 0 finished with value: 0.5010670731707317 and parameters: {'n_estimators': 64, 'max_depth': 136}. Best is trial 0 with value: 0.5010670731707317.
[I 2023-12-14 19:12:23,332] Trial 1 finished with value: 0.5057926829268292 and parameters: {'n_estimators': 88, 'max_depth': 176}. Best is trial 1 with value: 0.5057926829268292.
[I 2023-12-14 19:13:33,002] Trial 2 finished with value: 0.5091463414634146 and parameters: {'n_estimators': 82, 'max_depth': 79}. Best is trial 2 with value: 0.5091463414634146.
[I 2023-12-14 19:15:05,376] Trial 3 finished with value: 0.5091463414634146 and parameters: {'n_estimators': 138, 'max_depth': 200}. Best is trial 2 with value: 0.5091463414634146.
[I 2023-12-14 19:17:17,574] Trial 4 finished with value: 0.50625 and parameters: {'n_estimators': 175, 'max_depth': 109}. Best is trial 2 with value: 0.50914634146341

In [9]:
print("Melhores Hiperparâmetros:", study.best_params)
print("Melhor Acurácia Encontrada:", study.best_value)

Melhores Hiperparâmetros: {'n_estimators': 171, 'max_depth': 118}
Melhor Acurácia Encontrada: 0.5153963414634146


In [13]:
from sklearn.ensemble import RandomForestClassifier

RF = RandomForestClassifier(n_estimators=118, max_depth=143)

RF_scores = cross_val_score(RF, X_train, y_train, cv=10)

print(RF_scores)
RF_scores.mean()

[0.51981707 0.51371951 0.52439024 0.52896341 0.51219512 0.51981707
 0.52439024 0.52134146 0.51067073 0.5304878 ]


0.520579268292683

In [14]:
from sklearn.metrics import accuracy_score

X_test = load_npz('../vectors/tfidf_lemma/X_test_tfidf.npz')
y_test = np.load('../vectors/tfidf_lemma/y_test.npy')

RF.fit(X_train, y_train)

y_pred = RF.predict(X_test)

accuracy_score(y_test, y_pred)

0.5128048780487805