## Packages and Assets

In [99]:
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score

import seaborn as sns

import matplotlib.pyplot as plt
import plotly.express as px
import pickle


## Dependencies

In [100]:
test = pd.read_csv('../../assets/data/splits/test/preprocessed.csv')
y_test = test['label']

In [101]:
with open('../../assets/trad_assets/cv_set.pkl', 'rb') as fout:
    cv_vec, cv_best_models = pickle.load(fout)

with open('../../assets/trad_assets/tfidf_set.pkl', 'rb') as fout:
    tfidf_vec, tfidf_best_models = pickle.load(fout)


In [102]:
cv_best_models

[KNeighborsClassifier(metric='cosine', n_neighbors=23, weights='distance'),
 SVC(C=1),
 MultinomialNB(alpha=10),
 LogisticRegression(C=0.1, solver='sag')]

In [103]:
tfidf_best_models

[KNeighborsClassifier(metric='cosine', n_neighbors=23, weights='distance'),
 SVC(C=1),
 MultinomialNB(alpha=1),
 LogisticRegression(C=1, penalty='l1', solver='liblinear')]

## Functions

In [104]:
def viewPredictedRows(X_test, y_test, y_pred):
    df = pd.DataFrame({'y_test': y_test, 'y_pred': y_pred})
    df['correct'] = df['y_test'] == df['y_pred']
    df['correct'] = df['correct'].apply(lambda x: 'Correct' if x else 'Incorrect')
    df['title'] = X_test
    return df


In [105]:
def show_confusion_matrix(cm):
        print("Confusion Matrix")
        plt.figure(figsize=(10, 7))

        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Negative', 'Positive'],
                    yticklabels=['Negative', 'Positive'])
        plt.xlabel('Predicted')
        plt.ylabel('Truth')
        plt.title('Confusion Matrix')
        plt.show()

In [106]:
def show_graph_metrics(y_test, y_pred, modelo):
    df_metrics = pd.DataFrame(index=['Desceu', 'Constante', 'Subiu'])
    df_metrics['Acurácia'] = accuracy_score(y_true=y_test, y_pred=y_pred)
    df_metrics['Precisão'] = precision_score(y_true=y_test, y_pred=y_pred, average=None)
    df_metrics['Recall'] = recall_score(y_true=y_test, y_pred=y_pred, average=None)
    df_metrics['F1-Score'] = f1_score(y_true=y_test, y_pred=y_pred, average=None)
    fig = px.bar(df_metrics, height=500, width=750,  x=df_metrics.index, y=["Acurácia", "Precisão", "Recall", "F1-Score"],
             barmode="group", title=f"Desempenho de {modelo} em relação a precisão, recall e F1-Score", labels={'index': 'Classes', 'value': 'Porcentagem (%)', 'variable': 'Métricas'})

    fig.show()

## Predictions considering best traditional models for both vectorization

### CountVectorizer

In [107]:
X_test = cv_vec.transform(test['title']).toarray()
X_test_names = pd.DataFrame(X_test, columns=cv_vec.get_feature_names_out())
X_test_names

Unnamed: 0,abaixo,abastecimento,aberto,abertura,abicom,absorver,absurdo,aceita,acelen,acelera,...,vitoria,volatil,volatilidade,voltar,voltou,volume,vontade,votacao,votada,warren
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
439,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
440,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
441,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
442,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [108]:
print("CountVectorizer models")
for model in cv_best_models:
    print("Model: ", model.__class__.__name__)
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))
    show_graph_metrics(y_test, y_pred, model.__class__.__name__)




CountVectorizer models
Model:  KNeighborsClassifier
              precision    recall  f1-score   support

           0       0.51      0.71      0.59       214
           1       0.00      0.00      0.00        35
           2       0.51      0.36      0.42       195

    accuracy                           0.50       444
   macro avg       0.34      0.36      0.34       444
weighted avg       0.47      0.50      0.47       444



Model:  SVC
              precision    recall  f1-score   support

           0       0.50      0.72      0.59       214
           1       0.00      0.00      0.00        35
           2       0.49      0.34      0.40       195

    accuracy                           0.50       444
   macro avg       0.33      0.35      0.33       444
weighted avg       0.46      0.50      0.46       444




Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



Model:  MultinomialNB
              precision    recall  f1-score   support

           0       0.50      0.75      0.60       214
           1       0.00      0.00      0.00        35
           2       0.51      0.32      0.40       195

    accuracy                           0.50       444
   macro avg       0.34      0.36      0.33       444
weighted avg       0.47      0.50      0.46       444




Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.




Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



Model:  LogisticRegression
              precision    recall  f1-score   support

           0       0.49      0.69      0.57       214
           1       0.00      0.00      0.00        35
           2       0.48      0.35      0.41       195

    accuracy                           0.49       444
   macro avg       0.32      0.35      0.33       444
weighted avg       0.45      0.49      0.45       444



In [109]:
# df_results = viewPredictedRows(test['title'], y_test, y_pred)
# df_results
# cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
# show_confusion_matrix(cm)

### TfidfVectorizer

In [110]:
X_test = tfidf_vec.transform(test['title']).toarray()
X_test_names = pd.DataFrame(X_test, columns=tfidf_vec.get_feature_names_out())
X_test_names

Unnamed: 0,abaixo,abastecimento,aberto,abertura,abicom,absorver,absurdo,aceita,acelen,acelera,...,vitoria,volatil,volatilidade,voltar,voltou,volume,vontade,votacao,votada,warren
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
439,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
440,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
441,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
442,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [111]:
print("TFIDF models")
for model in tfidf_best_models:
    print("Model: ", model.__class__.__name__)
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))
    show_graph_metrics(y_test, y_pred, model.__class__.__name__)


TFIDF models
Model:  KNeighborsClassifier
              precision    recall  f1-score   support

           0       0.50      0.67      0.57       214
           1       0.00      0.00      0.00        35
           2       0.50      0.39      0.44       195

    accuracy                           0.50       444
   macro avg       0.33      0.35      0.34       444
weighted avg       0.46      0.50      0.47       444



TypeError: show_graph_metrics() missing 1 required positional argument: 'modelo'