### Import data

In [3]:
import numpy as np
from scipy.sparse import load_npz

In [4]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

In [5]:
# Training split: the feature matrix is read from SciPy's sparse .npz format
# and the matching label array from a NumPy .npy file. Both paths are relative
# to the notebook's working directory.
# NOTE(review): the directory name suggests TF-IDF features over lemmatized
# text — confirm against the notebook that produced these files.
X_train = load_npz('../vectors/tfidf_lemma/X_train_tfidf.npz')
y_train = np.load('../vectors/tfidf_lemma/y_train.npy')

In [6]:
# Held-out test split, stored in the same formats and directory as the
# training split: sparse feature matrix (.npz) plus label array (.npy).
# NOTE(review): assumes these were vectorized with the vectorizer fitted on
# the training data — confirm in the notebook that wrote them.
X_test = load_npz('../vectors/tfidf_lemma/X_test_tfidf.npz')
y_test = np.load('../vectors/tfidf_lemma/y_test.npy')

### Naive Bayes

In [7]:
from sklearn import naive_bayes

# Multinomial Naive Bayes baseline, default hyperparameters.
NB = naive_bayes.MultinomialNB()

# 10-fold cross-validation on the training split only; with no explicit
# `scoring`, each fold is scored with the estimator's own `score` method.
NB_scores = cross_val_score(estimator=NB, X=X_train, y=y_train, cv=10)

# Per-fold scores first, then their average as the cell's displayed value.
print(NB_scores)
np.mean(NB_scores)

[0.44359756 0.47256098 0.4222561  0.44207317 0.44359756 0.43597561
 0.42835366 0.44817073 0.4527439  0.46189024]


0.4451219512195122

In [8]:
# Refit on the full training split (cross_val_score only fits clones, so NB
# itself is still unfitted here), then predict the held-out test split.
y_pred = NB.fit(X_train, y_train).predict(X_test)

# Test-set accuracy; positional order is (y_true, y_pred).
accuracy_score(y_test, y_pred)

0.4530487804878049

### Random Forest

In [9]:
from sklearn.ensemble import RandomForestClassifier

In [10]:
# Random forest with default hyperparameters. Tree construction is randomized
# (bootstrap samples and per-split feature subsets), so the seed is pinned;
# without it the scores below change on every Restart & Run All.
RF = RandomForestClassifier(random_state=42)

# 10-fold cross-validation on the training split only. cross_val_score fits
# clones of RF, and each clone inherits the fixed random_state.
RF_scores = cross_val_score(RF, X_train, y_train, cv=10)

# Per-fold scores first, then their average as the cell's displayed value.
print(RF_scores)
RF_scores.mean()

[0.52591463 0.5152439  0.50304878 0.5152439  0.51371951 0.51067073
 0.51981707 0.51829268 0.5304878  0.5304878 ]


0.5182926829268293

In [11]:
# Train the forest on the whole training split; cross-validation above only
# fitted temporary clones, so RF itself has not been fitted yet.
RF.fit(X=X_train, y=y_train)

# Predictions for the held-out test split (overwrites the Naive Bayes y_pred).
y_pred = RF.predict(X_test)

# Test-set accuracy; positional order is (y_true, y_pred).
accuracy_score(y_test, y_pred)

0.5140243902439025

### Support Vector Machine

In [12]:
from sklearn.model_selection import GridSearchCV
from sklearn import svm

In [13]:
# Baseline support vector classifier with a linear kernel.
# NOTE(review): `gamma` is not used by the linear kernel, but this estimator
# is also the base for the grid search further down, where gamma='auto' does
# apply to the poly/rbf/sigmoid candidates — confirm 'auto' is intended.
SVM = svm.SVC(kernel='linear', C=1.0, gamma='auto')

# 10-fold cross-validation on the training split only.
scores = cross_val_score(estimator=SVM, X=X_train, y=y_train, cv=10)

# Per-fold scores first, then their average as the cell's displayed value.
print(scores)
np.mean(scores)

[0.49085366 0.48018293 0.46493902 0.48170732 0.49085366 0.48170732
 0.45731707 0.50609756 0.51829268 0.48780488]


0.48597560975609755

In [14]:
# Search space for the SVC grid search: every kernel is paired with every
# value of the regularization strength C (4 x 7 = 28 candidates).
param_grid = dict(
    kernel=['linear', 'poly', 'rbf', 'sigmoid'],
    C=[1.0, 1.5, 2.0, 2.5, 3.0, 5.0, 10],
)

In [15]:
# Exhaustive search over param_grid, ranking candidates by mean accuracy
# across 10 cross-validation folds of the training split. `fit` returns the
# fitted search object, so construction and fitting are chained.
grid_search = GridSearchCV(
    estimator=SVM,
    param_grid=param_grid,
    scoring='accuracy',
    cv=10,
).fit(X_train, y_train)

# Report the winning hyperparameter combination.
print("Melhores hiperparâmetros:", grid_search.best_params_)

Melhores hiperparâmetros: {'C': 1.0, 'kernel': 'linear'}


In [16]:
# Score the grid search's refitted best estimator on the held-out test split;
# the metric is the one the search was configured with (scoring='accuracy').
accuracy = grid_search.score(X=X_test, y=y_test)

print("Acurácia no conjunto de teste:", accuracy)

Acurácia no conjunto de teste: 0.4951219512195122
