In [None]:
import sys
sys.path.append('../preprocessing/')

import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from preprocess import *
from vector_optimization import VectorOptimizer
from shallow_learning import *

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import time
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import random

np.random.seed(42)
random.seed(42)

def treinando(df_name, column, target, algorithm, param_grid, sample_size=15000):
    inicio_tempo = time.time()
    df = pd.read_csv(f'../datasets/{df_name}')
    df = df[[column,target]].copy()
    display(df.head())
    
    preprocessing_functions = ['transform_to_lowercase', 'remove_special_characters', 'remove_stopwords']
    
    processed_df = preprocess_data(data=df, 
                                   preprocessing_funcs=preprocessing_functions, 
                                   column=column, target=target, verbose=1)
    
    # Verificar o tamanho do dataset e fazer a amostragem, se necessário
    is_large_dataset = len(processed_df) > sample_size
    if is_large_dataset:
        print(f"Dataset grande detectado ({len(df)} instâncias). Usando amostra de {sample_size} instâncias.")
        sampled_data = processed_df.sample(sample_size, random_state=42)
    else:
        sampled_data = processed_df
    
    otimizacao = VectorOptimizer()
    best_params, _ = otimizacao.optimize_vectorizer(data=sampled_data, text_column='prep', vectorizer_type=algorithm, parameters_grid=param_grid, verbose=1)

    print()
    
    if algorithm == 'tf-idf':
        vectorizer = TfidfVectorizer(**best_params)
        term_matrix = vectorizer.fit_transform(sampled_data[column])
        term_matrix = pd.DataFrame.sparse.from_spmatrix(term_matrix)
        term_matrix.columns = vectorizer.get_feature_names_out()
        term_matrix = pd.concat([term_matrix, sampled_data[target].reset_index(drop=True)], axis=1)
    elif algorithm == 'bow':
        vectorizer = CountVectorizer(**best_params)
        term_matrix = vectorizer.fit_transform(sampled_data[column])
        term_matrix = pd.DataFrame.sparse.from_spmatrix(term_matrix)
        term_matrix.columns = vectorizer.get_feature_names_out()
        term_matrix = pd.concat([term_matrix, sampled_data[target].reset_index(drop=True)], axis=1)
    else: 
        term_matrix = tokenize(sampled_data)
        vectorizer = Word2Vec(term_matrix['tokens'], **best_params)
        term_matrix['word2vec_features'] = term_matrix['tokens'].apply(get_word2vec_features, args=(vectorizer,))
        term_matrix = pd.DataFrame(term_matrix['word2vec_features'].tolist())
        term_matrix.columns = term_matrix.columns.astype(str)
        term_matrix.fillna(0, inplace=True)
        term_matrix = pd.concat([term_matrix, sampled_data[target].reset_index(drop=True)], axis=1)

    X_train, X_test, y_train, y_test = split(term_matrix, target=target)

    # Run training with predefined parameters and multiple metrics
    _, model, score = training_model(X_train, X_test, y_train, y_test, 
                                                  algorithms=['rf', 'svm', 'nb', 'knn', 'lr'], verbose=1)

    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    end_time = time.time()
    
    execution_time = end_time - inicio_tempo
    print(f"F1-score: {score}")
    print(f"Accuracy: {accuracy}")
    print(f"Tempo de execução: {execution_time} segundos")

## Dataset 1 - https://www.kaggle.com/code/hetulmehta/classification-of-websites?select=website_classification.csv

In [2]:
df_name = 'website_classification.csv'
column = 'cleaned_website_text'
target = 'Category'

In [None]:
algorithm = 'tf-idf'

param_grid = {
    'max_features': [100, 1000, 5000],
    'ngram_range': [(1, 1), (1, 2), (1, 3)],
    'min_df': [1, 10],
    'max_df': [0.8, 1.0],
    'norm': ['l1', 'l2']
}

treinando(df_name, column, target, algorithm, param_grid)

Unnamed: 0,cleaned_website_text,Category
0,official site good hotel accommodation big sav...,Travel
1,expedia hotel book sites like use vacation wor...,Travel
2,tripadvisor hotel book sites like previously d...,Travel
3,cheap flights search compare flights momondo f...,Travel
4,bot create free account create free account si...,Travel



Defined pipeline: ['transform_to_lowercase', 'remove_special_characters', 'remove_stopwords'] 

Preprocess --> transform_to_lowercase
Converting text to lowercase...
Done!

Preprocess --> remove_special_characters
Removing special characters from text...
Done!

Preprocess --> remove_stopwords
Getting stopword list and removing from text...


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Done!



Dataset pequeno. Iniciando com tfidf padrão.
Similaridade inicial: 0.0193

[1/72] Testando parâmetros: {'max_df': 0.8, 'max_features': 100, 'min_df': 1, 'ngram_range': (1, 1), 'norm': 'l1'}
Similarity: 0.1235 | Melhor até agora: 0.0193
Tempo decorrido: 0.50s | Tempo estimado restante: 35.73s

[2/72] Testando parâmetros: {'max_df': 0.8, 'max_features': 100, 'min_df': 1, 'ngram_range': (1, 1), 'norm': 'l2'}
Similarity: 0.1235 | Melhor até agora: 0.0193
Tempo decorrido: 1.01s | Tempo estimado restante: 35.27s

[3/72] Testando parâmetros: {'max_df': 0.8, 'max_features': 100, 'min_df': 1, 'ngram_range': (1, 2), 'norm': 'l1'}
Similarity: 0.1235 | Melhor até agora: 0.0193
Tempo decorrido: 3.05s | Tempo estimado restante: 70.24s

[4/72] Testando parâmetros: {'max_df': 0.8, 'max_features': 100, 'min_df': 1, 'ngram_range': (1, 2), 'norm': 'l2'}
Similarity: 0.1235 | Melhor até agora: 0.0193
Tempo decorrido: 5.15s | Tempo estimado restante: 87.57s

[5/72] Testando parâmetros: {'max_df': 0

BayesSearchCV: 100%|██████████| 32/32 [02:49<00:00,  5.30s/it]


Best parameters for rf: OrderedDict([('max_depth', 23), ('n_estimators', 93)])
Best f1_macro score with rf for test data: 0.7563079313345485

Running Bayesian optimization for algorithm svm


BayesSearchCV: 100%|██████████| 32/32 [33:25<00:00, 62.67s/it]


Best parameters for svm: OrderedDict([('C', 10.0), ('gamma', 0.13450070930459385)])
Best f1_macro score with svm for test data: 0.8417255346620903

Running Bayesian optimization for algorithm nb


BayesSearchCV: 100%|██████████| 32/32 [00:38<00:00,  1.20s/it]


Best parameters for nb: OrderedDict([('alpha', 0.0033095943240520564)])
Best f1_macro score with nb for test data: 0.8535787588955117

Running Bayesian optimization for algorithm knn


BayesSearchCV: 100%|██████████| 32/32 [01:00<00:00,  1.88s/it]


Best parameters for knn: OrderedDict([('leaf_size', 10), ('n_neighbors', 18)])
Best f1_macro score with knn for test data: 0.8861712526292634

Running Bayesian optimization for algorithm lr


BayesSearchCV: 100%|██████████| 32/32 [12:49<00:00, 24.04s/it]


Best parameters for lr: OrderedDict([('C', 9.938222915554014), ('max_iter', 916)])
Best f1_macro score with lr for test data: 0.8793744454139236

Best algorithm: knn with best f1_macro score: 0.8861712526292634
Best parameters: OrderedDict([('leaf_size', 10), ('n_neighbors', 18)])
Classification Report in the test data:

                                 precision    recall  f1-score   support

                          Adult       1.00      0.83      0.91         6
             Business/Corporate       0.92      0.82      0.87        40
       Computers and Technology       0.78      0.84      0.81        25
                     E-Commerce       0.89      0.94      0.92        36
                      Education       0.93      0.96      0.95        45
                           Food       0.93      0.93      0.93        30
                         Forums       1.00      0.25      0.40         4
                          Games       1.00      0.91      0.95        33
             Health

In [6]:
algorithm = 'bow'

param_grid = {
    'max_features': [100, 1000, 5000],
    'ngram_range': [(1, 1), (1, 2), (1, 3)],
    'min_df': [1, 10],
    'max_df': [0.8, 1.0],
    'binary': [True, False]
}

treinando(df_name, column, target, algorithm, param_grid)

Unnamed: 0,cleaned_website_text,Category
0,official site good hotel accommodation big sav...,Travel
1,expedia hotel book sites like use vacation wor...,Travel
2,tripadvisor hotel book sites like previously d...,Travel
3,cheap flights search compare flights momondo f...,Travel
4,bot create free account create free account si...,Travel



Defined pipeline: ['transform_to_lowercase', 'remove_special_characters', 'remove_stopwords'] 

Preprocess --> transform_to_lowercase
Converting text to lowercase...
Done!

Preprocess --> remove_special_characters
Removing special characters from text...
Done!

Preprocess --> remove_stopwords
Getting stopword list and removing from text...


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Done!



Dataset pequeno. Iniciando com bow padrão.
Similaridade inicial: 0.0469

[1/72] Testando parâmetros: {'binary': True, 'max_df': 0.8, 'max_features': 100, 'min_df': 1, 'ngram_range': (1, 1)}
Similarity: 0.3160 | Melhor até agora: 0.0469
Tempo decorrido: 0.48s | Tempo estimado restante: 34.06s

[2/72] Testando parâmetros: {'binary': True, 'max_df': 0.8, 'max_features': 100, 'min_df': 1, 'ngram_range': (1, 2)}
Similarity: 0.3175 | Melhor até agora: 0.0469
Tempo decorrido: 2.48s | Tempo estimado restante: 86.67s

[3/72] Testando parâmetros: {'binary': True, 'max_df': 0.8, 'max_features': 100, 'min_df': 1, 'ngram_range': (1, 3)}
Similarity: 0.3175 | Melhor até agora: 0.0469
Tempo decorrido: 6.64s | Tempo estimado restante: 152.75s

[4/72] Testando parâmetros: {'binary': True, 'max_df': 0.8, 'max_features': 100, 'min_df': 10, 'ngram_range': (1, 1)}
Similarity: 0.3160 | Melhor até agora: 0.0469
Tempo decorrido: 7.19s | Tempo estimado restante: 122.23s

[5/72] Testando parâmetros: {'b

BayesSearchCV: 100%|██████████| 32/32 [02:39<00:00,  4.98s/it]


Best parameters for rf: OrderedDict([('max_depth', 43), ('n_estimators', 100)])
Best f1_macro score with rf for test data: 0.731522657307667

Running Bayesian optimization for algorithm svm


BayesSearchCV: 100%|██████████| 32/32 [35:02<00:00, 65.70s/it]


Best parameters for svm: OrderedDict([('C', 10.0), ('gamma', 6.440833872950001e-05)])
Best f1_macro score with svm for test data: 0.7071851656418005

Running Bayesian optimization for algorithm nb


BayesSearchCV: 100%|██████████| 32/32 [03:10<00:00,  5.96s/it]


Best parameters for nb: OrderedDict([('alpha', 0.0900424835484412)])
Best f1_macro score with nb for test data: 0.8459111980730994

Running Bayesian optimization for algorithm knn


BayesSearchCV: 100%|██████████| 32/32 [00:51<00:00,  1.60s/it]


Best parameters for knn: OrderedDict([('leaf_size', 50), ('n_neighbors', 1)])
Best f1_macro score with knn for test data: 0.5567995409628171

Running Bayesian optimization for algorithm lr


BayesSearchCV: 100%|██████████| 32/32 [28:27<00:00, 53.37s/it]


Best parameters for lr: OrderedDict([('C', 0.013770706919756749), ('max_iter', 101)])
Best f1_macro score with lr for test data: 0.825233596669563

Best algorithm: nb with best f1_macro score: 0.8459111980730994
Best parameters: OrderedDict([('alpha', 0.0900424835484412)])
Classification Report in the test data:

                                 precision    recall  f1-score   support

                          Adult       0.86      1.00      0.92         6
             Business/Corporate       0.69      0.68      0.68        40
       Computers and Technology       0.69      0.80      0.74        25
                     E-Commerce       0.86      0.89      0.88        36
                      Education       0.97      0.76      0.85        45
                           Food       0.96      0.90      0.93        30
                         Forums       0.17      0.25      0.20         4
                          Games       0.89      0.94      0.91        33
             Health and Fit

In [3]:
algorithm = 'word2vec'

param_grid = {
    'vector_size': [100, 1000, 5000],
    'window': [3, 5, 7],
    'min_count': [5, 10],
    'sg': [0, 1],
    'hs': [0, 1],
    'workers': [1]
}

treinando(df_name, column, target, algorithm, param_grid)

Unnamed: 0,cleaned_website_text,Category
0,official site good hotel accommodation big sav...,Travel
1,expedia hotel book sites like use vacation wor...,Travel
2,tripadvisor hotel book sites like previously d...,Travel
3,cheap flights search compare flights momondo f...,Travel
4,bot create free account create free account si...,Travel



Defined pipeline: ['transform_to_lowercase', 'remove_special_characters', 'remove_stopwords'] 

Preprocess --> transform_to_lowercase
Converting text to lowercase...
Done!

Preprocess --> remove_special_characters
Removing special characters from text...
Done!

Preprocess --> remove_stopwords
Getting stopword list and removing from text...


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Done!



Dataset pequeno. Iniciando com word2vec padrão.
Similaridade inicial: 0.7649

[1/72] Testando parâmetros: {'hs': 0, 'min_count': 5, 'sg': 0, 'vector_size': 100, 'window': 3, 'workers': 1}
Similarity: 0.8264 | Melhor até agora: 0.7649
Tempo decorrido: 8.20s | Tempo estimado restante: 581.93s

[2/72] Testando parâmetros: {'hs': 0, 'min_count': 5, 'sg': 0, 'vector_size': 100, 'window': 5, 'workers': 1}
Similarity: 0.7649 | Melhor até agora: 0.7649
Tempo decorrido: 16.49s | Tempo estimado restante: 577.02s

[3/72] Testando parâmetros: {'hs': 0, 'min_count': 5, 'sg': 0, 'vector_size': 100, 'window': 7, 'workers': 1}
Similarity: 0.7124 | Melhor até agora: 0.7124
Tempo decorrido: 25.11s | Tempo estimado restante: 577.58s

[4/72] Testando parâmetros: {'hs': 0, 'min_count': 5, 'sg': 0, 'vector_size': 1000, 'window': 3, 'workers': 1}
Similarity: 0.8477 | Melhor até agora: 0.7124
Tempo decorrido: 45.61s | Tempo estimado restante: 775.44s

[5/72] Testando parâmetros: {'hs': 0, 'min_count'

BayesSearchCV: 100%|██████████| 32/32 [01:13<00:00,  2.31s/it]


Best parameters for rf: OrderedDict([('max_depth', 50), ('n_estimators', 98)])
Best f1_macro score with rf for test data: 0.7918669204081709

Running Bayesian optimization for algorithm svm


BayesSearchCV: 100%|██████████| 32/32 [00:28<00:00,  1.12it/s]


Best parameters for svm: OrderedDict([('C', 7.08702313439171), ('gamma', 0.05981829280794742)])
Best f1_macro score with svm for test data: 0.8384351117280654

Running Bayesian optimization for algorithm nb


BayesSearchCV:   0%|          | 0/32 [00:00<?, ?it/s]


An error occurred with algorithm nb: Negative values in data passed to MultinomialNB (input X)

Running Bayesian optimization for algorithm knn


BayesSearchCV: 100%|██████████| 32/32 [00:15<00:00,  2.13it/s]


Best parameters for knn: OrderedDict([('leaf_size', 31), ('n_neighbors', 4)])
Best f1_macro score with knn for test data: 0.7941870772459533

Running Bayesian optimization for algorithm lr


BayesSearchCV: 100%|██████████| 32/32 [00:42<00:00,  1.34s/it]


Best parameters for lr: OrderedDict([('C', 3.9348595118566116), ('max_iter', 100)])
Best f1_macro score with lr for test data: 0.8227340891913386

Best algorithm: svm with best f1_macro score: 0.8384351117280654
Best parameters: OrderedDict([('C', 7.08702313439171), ('gamma', 0.05981829280794742)])
Classification Report in the test data:

                                 precision    recall  f1-score   support

                          Adult       1.00      0.83      0.91         6
             Business/Corporate       0.74      0.70      0.72        40
       Computers and Technology       0.57      0.80      0.67        25
                     E-Commerce       0.86      0.83      0.85        36
                      Education       0.91      0.87      0.89        45
                           Food       0.93      0.93      0.93        30
                         Forums       0.25      0.25      0.25         4
                          Games       0.91      0.91      0.91        33
 

## Teste 2 - https://www.kaggle.com/datasets/alfathterry/bbc-full-text-document-classification

In [4]:
df_name = 'document_classification.csv'
column = 'data'
target = 'labels_target'

In [None]:
algorithm = 'tf-idf'

param_grid = {
    'max_features': [100, 1000, 5000],
    'ngram_range': [(1, 1), (1, 2), (1, 3)],
    'min_df': [1, 10],
    'max_df': [0.8, 1.0],
    'norm': ['l1', 'l2']
}

treinando(df_name, column, target, algorithm, param_grid)

Unnamed: 0,data,labels_target
0,Musicians to tackle US red tape Musicians gro...,entertainment
1,"U2s desire to be number one U2, who have won ...",entertainment
2,Rocker Doherty in on-stage fight Rock singer ...,entertainment
3,Snicket tops US box office chart The film ada...,entertainment
4,"Oceans Twelve raids box office Oceans Twelve,...",entertainment



Defined pipeline: ['transform_to_lowercase', 'remove_special_characters', 'remove_stopwords'] 

Preprocess --> transform_to_lowercase
Converting text to lowercase...
Done!

Preprocess --> remove_special_characters
Removing special characters from text...
Done!

Preprocess --> remove_stopwords
Getting stopword list and removing from text...


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Done!



Dataset pequeno. Iniciando com tfidf padrão.
Similaridade inicial: 0.0242

[1/72] Testando parâmetros: {'max_df': 0.8, 'max_features': 100, 'min_df': 1, 'ngram_range': (1, 1), 'norm': 'l1'}
Similarity: 0.1550 | Melhor até agora: 0.0242
Tempo decorrido: 0.31s | Tempo estimado restante: 21.94s

[2/72] Testando parâmetros: {'max_df': 0.8, 'max_features': 100, 'min_df': 1, 'ngram_range': (1, 1), 'norm': 'l2'}
Similarity: 0.1550 | Melhor até agora: 0.0242
Tempo decorrido: 0.62s | Tempo estimado restante: 21.80s

[3/72] Testando parâmetros: {'max_df': 0.8, 'max_features': 100, 'min_df': 1, 'ngram_range': (1, 2), 'norm': 'l1'}
Similarity: 0.1550 | Melhor até agora: 0.0242
Tempo decorrido: 1.80s | Tempo estimado restante: 41.36s

[4/72] Testando parâmetros: {'max_df': 0.8, 'max_features': 100, 'min_df': 1, 'ngram_range': (1, 2), 'norm': 'l2'}
Similarity: 0.1550 | Melhor até agora: 0.0242
Tempo decorrido: 2.98s | Tempo estimado restante: 50.61s

[5/72] Testando parâmetros: {'max_df': 0

BayesSearchCV: 100%|██████████| 32/32 [02:15<00:00,  4.24s/it]


Best parameters for rf: OrderedDict([('max_depth', 31), ('n_estimators', 100)])
Best f1_macro score with rf for test data: 0.9682679164505725

Running Bayesian optimization for algorithm svm


BayesSearchCV: 100%|██████████| 32/32 [44:54<00:00, 84.22s/it]


Best parameters for svm: OrderedDict([('C', 10.0), ('gamma', 0.17540820282219333)])
Best f1_macro score with svm for test data: 0.9790609121783952

Running Bayesian optimization for algorithm nb


BayesSearchCV: 100%|██████████| 32/32 [00:37<00:00,  1.18s/it]


Best parameters for nb: OrderedDict([('alpha', 0.01905626529237229)])
Best f1_macro score with nb for test data: 0.9807618999254484

Running Bayesian optimization for algorithm knn


BayesSearchCV: 100%|██████████| 32/32 [00:55<00:00,  1.72s/it]


Best parameters for knn: OrderedDict([('leaf_size', 50), ('n_neighbors', 8)])
Best f1_macro score with knn for test data: 0.9043424354434458

Running Bayesian optimization for algorithm lr


BayesSearchCV: 100%|██████████| 32/32 [04:49<00:00,  9.05s/it]

Best parameters for lr: OrderedDict([('C', 9.97938321582329), ('max_iter', 997)])
Best f1_macro score with lr for test data: 0.9790877587969368

Best algorithm: nb with best f1_macro score: 0.9807618999254484
Best parameters: OrderedDict([('alpha', 0.01905626529237229)])
Classification Report in the test data:

               precision    recall  f1-score   support

     business       0.99      0.96      0.97       179
entertainment       0.99      0.98      0.99       135
     politics       0.96      0.99      0.98       126
        sport       1.00      0.99      1.00       160
         tech       0.96      0.99      0.97       135

     accuracy                           0.98       735
    macro avg       0.98      0.98      0.98       735
 weighted avg       0.98      0.98      0.98       735

F1-score: 0.9807618999254484
Accuracy: 0.9809523809523809
Tempo de execução: 3351.9274389743805 segundos





In [14]:
algorithm = 'bow'

param_grid = {
    'max_features': [100, 1000, 5000],
    'ngram_range': [(1, 1), (1, 2), (1, 3)],
    'min_df': [1, 10],
    'max_df': [0.8, 1.0],
    'binary': [True, False]
}

treinando(df_name, column, target, algorithm, param_grid)

Unnamed: 0,data,labels_target
0,Musicians to tackle US red tape Musicians gro...,entertainment
1,"U2s desire to be number one U2, who have won ...",entertainment
2,Rocker Doherty in on-stage fight Rock singer ...,entertainment
3,Snicket tops US box office chart The film ada...,entertainment
4,"Oceans Twelve raids box office Oceans Twelve,...",entertainment



Defined pipeline: ['transform_to_lowercase', 'remove_special_characters', 'remove_stopwords'] 

Preprocess --> transform_to_lowercase
Converting text to lowercase...
Done!

Preprocess --> remove_special_characters
Removing special characters from text...
Done!

Preprocess --> remove_stopwords
Getting stopword list and removing from text...


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Done!



Dataset pequeno. Iniciando com bow padrão.
Similaridade inicial: 0.0757

[1/72] Testando parâmetros: {'binary': True, 'max_df': 0.8, 'max_features': 100, 'min_df': 1, 'ngram_range': (1, 1)}
Similarity: 0.2504 | Melhor até agora: 0.0757
Tempo decorrido: 0.36s | Tempo estimado restante: 25.89s

[2/72] Testando parâmetros: {'binary': True, 'max_df': 0.8, 'max_features': 100, 'min_df': 1, 'ngram_range': (1, 2)}
Similarity: 0.2504 | Melhor até agora: 0.0757
Tempo decorrido: 1.76s | Tempo estimado restante: 61.64s

[3/72] Testando parâmetros: {'binary': True, 'max_df': 0.8, 'max_features': 100, 'min_df': 1, 'ngram_range': (1, 3)}
Similarity: 0.2504 | Melhor até agora: 0.0757
Tempo decorrido: 4.40s | Tempo estimado restante: 101.27s

[4/72] Testando parâmetros: {'binary': True, 'max_df': 0.8, 'max_features': 100, 'min_df': 10, 'ngram_range': (1, 1)}
Similarity: 0.2504 | Melhor até agora: 0.0757
Tempo decorrido: 4.76s | Tempo estimado restante: 80.92s

[5/72] Testando parâmetros: {'bi

BayesSearchCV: 100%|██████████| 32/32 [00:46<00:00,  1.44s/it]


Best parameters for rf: OrderedDict([('max_depth', 31), ('n_estimators', 71)])
Best f1_macro score with rf for test data: 0.9583847928432169

Running Bayesian optimization for algorithm svm


BayesSearchCV: 100%|██████████| 32/32 [04:42<00:00,  8.84s/it]


Best parameters for svm: OrderedDict([('C', 9.545244698039093), ('gamma', 0.00048537403065140167)])
Best f1_macro score with svm for test data: 0.963510280685678

Running Bayesian optimization for algorithm nb


BayesSearchCV: 100%|██████████| 32/32 [00:19<00:00,  1.66it/s]


Best parameters for nb: OrderedDict([('alpha', 0.4861518228524981)])
Best f1_macro score with nb for test data: 0.9740537298832466

Running Bayesian optimization for algorithm knn


BayesSearchCV: 100%|██████████| 32/32 [00:22<00:00,  1.41it/s]


Best parameters for knn: OrderedDict([('leaf_size', 10), ('n_neighbors', 1)])
Best f1_macro score with knn for test data: 0.747424855241155

Running Bayesian optimization for algorithm lr


BayesSearchCV: 100%|██████████| 32/32 [01:59<00:00,  3.75s/it]

Best parameters for lr: OrderedDict([('C', 0.8650311458300728), ('max_iter', 100)])
Best f1_macro score with lr for test data: 0.9725185802732851

Best algorithm: nb with best f1_macro score: 0.9740537298832466
Best parameters: OrderedDict([('alpha', 0.4861518228524981)])
Classification Report in the test data:

               precision    recall  f1-score   support

     business       0.97      0.97      0.97       179
entertainment       0.96      0.98      0.97       135
     politics       0.98      0.98      0.98       126
        sport       1.00      0.99      0.99       160
         tech       0.96      0.96      0.96       135

     accuracy                           0.97       735
    macro avg       0.97      0.97      0.97       735
 weighted avg       0.97      0.97      0.97       735

F1-score: 0.9740537298832466
Accuracy: 0.9741496598639455
Tempo de execução: 599.2062802314758 segundos





In [5]:
algorithm = 'word2vec'

param_grid = {
    'vector_size': [100, 1000, 5000],
    'window': [3, 5, 7],
    'min_count': [5, 10],
    'sg': [0, 1],
    'hs': [0, 1],
    'workers': [1]
}

treinando(df_name, column, target, algorithm, param_grid)

Unnamed: 0,data,labels_target
0,Musicians to tackle US red tape Musicians gro...,entertainment
1,"U2s desire to be number one U2, who have won ...",entertainment
2,Rocker Doherty in on-stage fight Rock singer ...,entertainment
3,Snicket tops US box office chart The film ada...,entertainment
4,"Oceans Twelve raids box office Oceans Twelve,...",entertainment



Defined pipeline: ['transform_to_lowercase', 'remove_special_characters', 'remove_stopwords'] 

Preprocess --> transform_to_lowercase
Converting text to lowercase...
Done!

Preprocess --> remove_special_characters
Removing special characters from text...
Done!

Preprocess --> remove_stopwords
Getting stopword list and removing from text...


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Done!



Dataset pequeno. Iniciando com word2vec padrão.
Similaridade inicial: 0.9539

[1/72] Testando parâmetros: {'hs': 0, 'min_count': 5, 'sg': 0, 'vector_size': 100, 'window': 3, 'workers': 1}
Similarity: 0.9657 | Melhor até agora: 0.9539
Tempo decorrido: 3.94s | Tempo estimado restante: 279.68s

[2/72] Testando parâmetros: {'hs': 0, 'min_count': 5, 'sg': 0, 'vector_size': 100, 'window': 5, 'workers': 1}
Similarity: 0.9539 | Melhor até agora: 0.9539
Tempo decorrido: 7.97s | Tempo estimado restante: 279.05s

[3/72] Testando parâmetros: {'hs': 0, 'min_count': 5, 'sg': 0, 'vector_size': 100, 'window': 7, 'workers': 1}
Similarity: 0.9419 | Melhor até agora: 0.9419
Tempo decorrido: 12.06s | Tempo estimado restante: 277.41s

[4/72] Testando parâmetros: {'hs': 0, 'min_count': 5, 'sg': 0, 'vector_size': 1000, 'window': 3, 'workers': 1}
Similarity: 0.9744 | Melhor até agora: 0.9419
Tempo decorrido: 22.16s | Tempo estimado restante: 376.77s

[5/72] Testando parâmetros: {'hs': 0, 'min_count':

BayesSearchCV: 100%|██████████| 32/32 [00:35<00:00,  1.12s/it]


Best parameters for rf: OrderedDict([('max_depth', 31), ('n_estimators', 80)])
Best f1_macro score with rf for test data: 0.950190562280431

Running Bayesian optimization for algorithm svm


BayesSearchCV: 100%|██████████| 32/32 [00:21<00:00,  1.52it/s]


Best parameters for svm: OrderedDict([('C', 1.541885581288399), ('gamma', 1.0006762778749292)])
Best f1_macro score with svm for test data: 0.957084020348109

Running Bayesian optimization for algorithm nb


BayesSearchCV:   0%|          | 0/32 [00:00<?, ?it/s]


An error occurred with algorithm nb: Negative values in data passed to MultinomialNB (input X)

Running Bayesian optimization for algorithm knn


BayesSearchCV: 100%|██████████| 32/32 [00:13<00:00,  2.38it/s]


Best parameters for knn: OrderedDict([('leaf_size', 50), ('n_neighbors', 5)])
Best f1_macro score with knn for test data: 0.9477696421456627

Running Bayesian optimization for algorithm lr


BayesSearchCV: 100%|██████████| 32/32 [00:16<00:00,  1.94it/s]

Best parameters for lr: OrderedDict([('C', 1.282942652357813), ('max_iter', 987)])
Best f1_macro score with lr for test data: 0.9518806156383489

Best algorithm: svm with best f1_macro score: 0.957084020348109
Best parameters: OrderedDict([('C', 1.541885581288399), ('gamma', 1.0006762778749292)])
Classification Report in the test data:

               precision    recall  f1-score   support

     business       0.93      0.98      0.95       179
entertainment       0.95      0.94      0.95       135
     politics       0.95      0.94      0.95       126
        sport       0.99      0.98      0.99       160
         tech       0.97      0.93      0.95       135

     accuracy                           0.96       735
    macro avg       0.96      0.96      0.96       735
 weighted avg       0.96      0.96      0.96       735

F1-score: 0.957084020348109
Accuracy: 0.9578231292517007
Tempo de execução: 5174.079828977585 segundos





## Teste 3 - https://www.kaggle.com/code/abhishek0032/mastering-email-spam-detection-eda-insights/input?select=train.csv

In [2]:
df_name = 'spam_or_ham.csv'
column = 'sms'
target = 'label'

In [None]:
algorithm = 'tf-idf'

param_grid = {
    'max_features': [100, 1000, 5000],
    'ngram_range': [(1, 1), (1, 2), (1, 3)],
    'min_df': [1, 10],
    'max_df': [0.8, 1.0],
    'norm': ['l1', 'l2']
}

treinando(df_name, column, target, algorithm, param_grid)

Unnamed: 0,sms,label
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...\n,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0



Defined pipeline: ['transform_to_lowercase', 'remove_special_characters', 'remove_stopwords'] 

Preprocess --> transform_to_lowercase
Converting text to lowercase...
Done!

Preprocess --> remove_special_characters
Removing special characters from text...
Done!

Preprocess --> remove_stopwords
Getting stopword list and removing from text...
Done!



Dataset pequeno. Iniciando com tfidf padrão.


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Similaridade inicial: 0.0067

[1/72] Testando parâmetros: {'max_df': 0.8, 'max_features': 100, 'min_df': 1, 'ngram_range': (1, 1), 'norm': 'l1'}
Similarity: 0.0221 | Melhor até agora: 0.0067
Tempo decorrido: 0.19s | Tempo estimado restante: 13.79s

[2/72] Testando parâmetros: {'max_df': 0.8, 'max_features': 100, 'min_df': 1, 'ngram_range': (1, 1), 'norm': 'l2'}
Similarity: 0.0221 | Melhor até agora: 0.0067
Tempo decorrido: 0.39s | Tempo estimado restante: 13.70s

[3/72] Testando parâmetros: {'max_df': 0.8, 'max_features': 100, 'min_df': 1, 'ngram_range': (1, 2), 'norm': 'l1'}
Similarity: 0.0221 | Melhor até agora: 0.0067
Tempo decorrido: 0.65s | Tempo estimado restante: 14.90s

[4/72] Testando parâmetros: {'max_df': 0.8, 'max_features': 100, 'min_df': 1, 'ngram_range': (1, 2), 'norm': 'l2'}
Similarity: 0.0221 | Melhor até agora: 0.0067
Tempo decorrido: 0.90s | Tempo estimado restante: 15.35s

[5/72] Testando parâmetros: {'max_df': 0.8, 'max_features': 100, 'min_df': 1, 'ngram_range': (

BayesSearchCV: 100%|██████████| 32/32 [03:22<00:00,  6.34s/it]


Best parameters for rf: OrderedDict([('max_depth', 50), ('n_estimators', 100)])
Best f1_macro score with rf for test data: 0.9454532780554936

Running Bayesian optimization for algorithm svm


BayesSearchCV: 100%|██████████| 32/32 [27:31<00:00, 51.60s/it]


Best parameters for svm: OrderedDict([('C', 10.0), ('gamma', 0.14284098283042695)])
Best f1_macro score with svm for test data: 0.9621177820638888

Running Bayesian optimization for algorithm nb


BayesSearchCV: 100%|██████████| 32/32 [00:31<00:00,  1.02it/s]


Best parameters for nb: OrderedDict([('alpha', 0.08857287454272593)])
Best f1_macro score with nb for test data: 0.9662637669363143

Running Bayesian optimization for algorithm knn


BayesSearchCV: 100%|██████████| 32/32 [00:51<00:00,  1.61s/it]


Best parameters for knn: OrderedDict([('leaf_size', 10), ('n_neighbors', 7)])
Best f1_macro score with knn for test data: 0.9333968869859959

Running Bayesian optimization for algorithm lr


BayesSearchCV: 100%|██████████| 32/32 [01:01<00:00,  1.92s/it]

Best parameters for lr: OrderedDict([('C', 9.400086609900878), ('max_iter', 100)])
Best f1_macro score with lr for test data: 0.9556029114956843

Best algorithm: nb with best f1_macro score: 0.9662637669363143
Best parameters: OrderedDict([('alpha', 0.08857287454272593)])
Classification Report in the test data:

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1592
           1       0.93      0.95      0.94       246

    accuracy                           0.98      1838
   macro avg       0.96      0.97      0.97      1838
weighted avg       0.98      0.98      0.98      1838

F1-score: 0.9662637669363143
Accuracy: 0.984221980413493
Tempo de execução: 2045.933366060257 segundos





In [31]:
algorithm = 'bow'

param_grid = {
    'max_features': [100, 1000, 5000],
    'ngram_range': [(1, 1), (1, 2), (1, 3)],
    'min_df': [1, 10],
    'max_df': [0.8, 1.0],
    'binary': [True, False]
}

treinando(df_name, column, target, algorithm, param_grid)

Unnamed: 0,sms,label
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...\n,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0



Defined pipeline: ['transform_to_lowercase', 'remove_special_characters', 'remove_stopwords'] 

Preprocess --> transform_to_lowercase
Converting text to lowercase...
Done!

Preprocess --> remove_special_characters
Removing special characters from text...
Done!

Preprocess --> remove_stopwords
Getting stopword list and removing from text...
Done!



Dataset pequeno. Iniciando com bow padrão.


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Similaridade inicial: 0.0109

[1/72] Testando parâmetros: {'binary': True, 'max_df': 0.8, 'max_features': 100, 'min_df': 1, 'ngram_range': (1, 1)}
Similarity: 0.0243 | Melhor até agora: 0.0109
Tempo decorrido: 0.21s | Tempo estimado restante: 14.81s

[2/72] Testando parâmetros: {'binary': True, 'max_df': 0.8, 'max_features': 100, 'min_df': 1, 'ngram_range': (1, 2)}
Similarity: 0.0243 | Melhor até agora: 0.0109
Tempo decorrido: 0.47s | Tempo estimado restante: 16.50s

[3/72] Testando parâmetros: {'binary': True, 'max_df': 0.8, 'max_features': 100, 'min_df': 1, 'ngram_range': (1, 3)}
Similarity: 0.0243 | Melhor até agora: 0.0109
Tempo decorrido: 0.78s | Tempo estimado restante: 18.05s

[4/72] Testando parâmetros: {'binary': True, 'max_df': 0.8, 'max_features': 100, 'min_df': 10, 'ngram_range': (1, 1)}
Similarity: 0.0243 | Melhor até agora: 0.0109
Tempo decorrido: 0.99s | Tempo estimado restante: 16.82s

[5/72] Testando parâmetros: {'binary': True, 'max_df': 0.8, 'max_features': 100, 'min

BayesSearchCV: 100%|██████████| 32/32 [02:27<00:00,  4.60s/it]


Best parameters for rf: OrderedDict([('max_depth', 50), ('n_estimators', 100)])
Best f1_macro score with rf for test data: 0.9429162212208655

Running Bayesian optimization for algorithm svm


BayesSearchCV: 100%|██████████| 32/32 [13:10<00:00, 24.70s/it]


Best parameters for svm: OrderedDict([('C', 7.7401354958902315), ('gamma', 0.013986866269335829)])
Best f1_macro score with svm for test data: 0.964432207862923

Running Bayesian optimization for algorithm nb


BayesSearchCV: 100%|██████████| 32/32 [00:26<00:00,  1.23it/s]


Best parameters for nb: OrderedDict([('alpha', 0.7218586689127879)])
Best f1_macro score with nb for test data: 0.9669200074048172

Running Bayesian optimization for algorithm knn


BayesSearchCV: 100%|██████████| 32/32 [00:40<00:00,  1.28s/it]


Best parameters for knn: OrderedDict([('leaf_size', 50), ('n_neighbors', 1)])
Best f1_macro score with knn for test data: 0.8688461889756587

Running Bayesian optimization for algorithm lr


BayesSearchCV: 100%|██████████| 32/32 [00:40<00:00,  1.26s/it]

Best parameters for lr: OrderedDict([('C', 10.0), ('max_iter', 1000)])
Best f1_macro score with lr for test data: 0.9659694449734442

Best algorithm: nb with best f1_macro score: 0.9669200074048172
Best parameters: OrderedDict([('alpha', 0.7218586689127879)])
Classification Report in the test data:

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1592
           1       0.95      0.93      0.94       246

    accuracy                           0.98      1838
   macro avg       0.97      0.96      0.97      1838
weighted avg       0.98      0.98      0.98      1838

F1-score: 0.9669200074048172
Accuracy: 0.984766050054407
Tempo de execução: 1087.3191373348236 segundos





In [3]:
algorithm = 'word2vec'

param_grid = {
    'vector_size': [100, 1000, 5000],
    'window': [3, 5, 7],
    'min_count': [5, 10],
    'sg': [0, 1],
    'hs': [0, 1],
    'workers': [1]
}

treinando(df_name, column, target, algorithm, param_grid)

Unnamed: 0,sms,label
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...\n,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0



Defined pipeline: ['transform_to_lowercase', 'remove_special_characters', 'remove_stopwords'] 

Preprocess --> transform_to_lowercase
Converting text to lowercase...
Done!

Preprocess --> remove_special_characters
Removing special characters from text...
Done!

Preprocess --> remove_stopwords
Getting stopword list and removing from text...
Done!



Dataset pequeno. Iniciando com word2vec padrão.


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Similaridade inicial: 0.9773

[1/72] Testando parâmetros: {'hs': 0, 'min_count': 5, 'sg': 0, 'vector_size': 100, 'window': 3, 'workers': 1}
Similarity: 0.9773 | Melhor até agora: 0.9773
Tempo decorrido: 0.78s | Tempo estimado restante: 55.56s

[2/72] Testando parâmetros: {'hs': 0, 'min_count': 5, 'sg': 0, 'vector_size': 100, 'window': 5, 'workers': 1}
Similarity: 0.9773 | Melhor até agora: 0.9773
Tempo decorrido: 1.56s | Tempo estimado restante: 54.58s

[3/72] Testando parâmetros: {'hs': 0, 'min_count': 5, 'sg': 0, 'vector_size': 100, 'window': 7, 'workers': 1}
Similarity: 0.9771 | Melhor até agora: 0.9771
Tempo decorrido: 2.33s | Tempo estimado restante: 53.70s

[4/72] Testando parâmetros: {'hs': 0, 'min_count': 5, 'sg': 0, 'vector_size': 1000, 'window': 3, 'workers': 1}
Similarity: 0.9781 | Melhor até agora: 0.9771
Tempo decorrido: 4.23s | Tempo estimado restante: 71.95s

[5/72] Testando parâmetros: {'hs': 0, 'min_count': 5, 'sg': 0, 'vector_size': 1000, 'window': 5, 'workers': 1}
Si

BayesSearchCV: 100%|██████████| 32/32 [01:08<00:00,  2.13s/it]


Best parameters for rf: OrderedDict([('max_depth', 14), ('n_estimators', 100)])
Best f1_macro score with rf for test data: 0.9581914821021025

Running Bayesian optimization for algorithm svm


BayesSearchCV: 100%|██████████| 32/32 [00:22<00:00,  1.40it/s]


Best parameters for svm: OrderedDict([('C', 2.3927818988403478), ('gamma', 1.3196334628059272)])
Best f1_macro score with svm for test data: 0.9638141871401853

Running Bayesian optimization for algorithm nb


BayesSearchCV:   0%|          | 0/32 [00:00<?, ?it/s]


An error occurred with algorithm nb: Negative values in data passed to MultinomialNB (input X)

Running Bayesian optimization for algorithm knn


BayesSearchCV: 100%|██████████| 32/32 [00:14<00:00,  2.28it/s]


Best parameters for knn: OrderedDict([('leaf_size', 50), ('n_neighbors', 2)])
Best f1_macro score with knn for test data: 0.9569572198842631

Running Bayesian optimization for algorithm lr


BayesSearchCV: 100%|██████████| 32/32 [00:15<00:00,  2.05it/s]

Best parameters for lr: OrderedDict([('C', 10.0), ('max_iter', 1000)])
Best f1_macro score with lr for test data: 0.9368722113460164

Best algorithm: svm with best f1_macro score: 0.9638141871401853
Best parameters: OrderedDict([('C', 2.3927818988403478), ('gamma', 1.3196334628059272)])
Classification Report in the test data:

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1592
           1       0.93      0.94      0.94       246

    accuracy                           0.98      1838
   macro avg       0.96      0.97      0.96      1838
weighted avg       0.98      0.98      0.98      1838

F1-score: 0.9638141871401853
Accuracy: 0.9831338411316648
Tempo de execução: 519.4998347759247 segundos





## Dataset 4 - https://www.kaggle.com/datasets/farwarizvi/emotions-6000?select=train_sam_shuffle.csv¶

In [5]:
df_name = 'emotions-6000.csv'
column = 'sentence'
target = 'label'

In [None]:
algorithm = 'tf-idf'

param_grid = {
    'max_features': [100, 1000, 5000],
    'ngram_range': [(1, 1), (1, 2), (1, 3)],
    'min_df': [1, 10],
    'max_df': [0.8, 1.0],
    'norm': ['l1', 'l2']
}

treinando(df_name, column, target, algorithm, param_grid)

Unnamed: 0,sentence,label
0,Getting lost in a captivating book,joy
1,No matter how hard I try I can t seem to shake...,sadness
2,"His anger was like a ticking time bomb, and he...",anger
3,Their patience and understanding during challe...,love
4,I am scared that my anxiety will never improve,fear



Defined pipeline: ['transform_to_lowercase', 'remove_special_characters', 'remove_stopwords'] 

Preprocess --> transform_to_lowercase
Converting text to lowercase...
Done!

Preprocess --> remove_special_characters
Removing special characters from text...
Done!

Preprocess --> remove_stopwords
Getting stopword list and removing from text...
Done!



Dataset pequeno. Iniciando com tfidf padrão.


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Similaridade inicial: 0.0079

[1/72] Testando parâmetros: {'max_df': 0.8, 'max_features': 100, 'min_df': 1, 'ngram_range': (1, 1), 'norm': 'l1'}
Similarity: 0.0227 | Melhor até agora: 0.0079
Tempo decorrido: 0.13s | Tempo estimado restante: 9.06s

[2/72] Testando parâmetros: {'max_df': 0.8, 'max_features': 100, 'min_df': 1, 'ngram_range': (1, 1), 'norm': 'l2'}
Similarity: 0.0227 | Melhor até agora: 0.0079
Tempo decorrido: 0.26s | Tempo estimado restante: 9.05s

[3/72] Testando parâmetros: {'max_df': 0.8, 'max_features': 100, 'min_df': 1, 'ngram_range': (1, 2), 'norm': 'l1'}
Similarity: 0.0223 | Melhor até agora: 0.0079
Tempo decorrido: 0.41s | Tempo estimado restante: 9.44s

[4/72] Testando parâmetros: {'max_df': 0.8, 'max_features': 100, 'min_df': 1, 'ngram_range': (1, 2), 'norm': 'l2'}
Similarity: 0.0223 | Melhor até agora: 0.0079
Tempo decorrido: 0.56s | Tempo estimado restante: 9.54s

[5/72] Testando parâmetros: {'max_df': 0.8, 'max_features': 100, 'min_df': 1, 'ngram_range': (1, 3

BayesSearchCV: 100%|██████████| 32/32 [01:20<00:00,  2.50s/it]


Best parameters for rf: OrderedDict([('max_depth', 50), ('n_estimators', 68)])
Best f1_macro score with rf for test data: 0.9133979380164755

Running Bayesian optimization for algorithm svm


BayesSearchCV: 100%|██████████| 32/32 [22:46<00:00, 42.69s/it]


Best parameters for svm: OrderedDict([('C', 10.0), ('gamma', 1.935229816159643)])
Best f1_macro score with svm for test data: 0.9666428154242013

Running Bayesian optimization for algorithm nb


BayesSearchCV: 100%|██████████| 32/32 [00:23<00:00,  1.38it/s]


Best parameters for nb: OrderedDict([('alpha', 0.07264412922683165)])
Best f1_macro score with nb for test data: 0.9541706349933361

Running Bayesian optimization for algorithm knn


BayesSearchCV: 100%|██████████| 32/32 [00:25<00:00,  1.27it/s]


Best parameters for knn: OrderedDict([('leaf_size', 50), ('n_neighbors', 1)])
Best f1_macro score with knn for test data: 0.8866396842481604

Running Bayesian optimization for algorithm lr


BayesSearchCV: 100%|██████████| 32/32 [02:26<00:00,  4.58s/it]


Best parameters for lr: OrderedDict([('C', 9.98269302435298), ('max_iter', 116)])
Best f1_macro score with lr for test data: 0.9529440934995317

Best algorithm: svm with best f1_macro score: 0.9666428154242013
Best parameters: OrderedDict([('C', 10.0), ('gamma', 1.935229816159643)])
Classification Report in the test data:

              precision    recall  f1-score   support

       anger       0.99      0.99      0.99       252
        fear       0.96      0.98      0.97       221
         joy       0.95      0.92      0.94       244
        love       0.99      0.98      0.98       231
     neutral       0.94      0.97      0.96       218
     sadness       0.97      0.96      0.96       221

    accuracy                           0.97      1387
   macro avg       0.97      0.97      0.97      1387
weighted avg       0.97      0.97      0.97      1387

F1-score: 0.9666428154242013
Accuracy: 0.9668348954578226
Tempo de execução: 1686.8002259731293 segundos


In [33]:
algorithm = 'bow'

param_grid = {
    'max_features': [100, 1000, 5000],
    'ngram_range': [(1, 1), (1, 2), (1, 3)],
    'min_df': [1, 10],
    'max_df': [0.8, 1.0],
    'binary': [True, False]
}

treinando(df_name, column, target, algorithm, param_grid)

Unnamed: 0,sentence,label
0,Getting lost in a captivating book,joy
1,No matter how hard I try I can t seem to shake...,sadness
2,"His anger was like a ticking time bomb, and he...",anger
3,Their patience and understanding during challe...,love
4,I am scared that my anxiety will never improve,fear



Defined pipeline: ['transform_to_lowercase', 'remove_special_characters', 'remove_stopwords'] 

Preprocess --> transform_to_lowercase
Converting text to lowercase...
Done!

Preprocess --> remove_special_characters
Removing special characters from text...
Done!

Preprocess --> remove_stopwords
Getting stopword list and removing from text...
Done!



Dataset pequeno. Iniciando com bow padrão.


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Similaridade inicial: 0.0129

[1/72] Testando parâmetros: {'binary': True, 'max_df': 0.8, 'max_features': 100, 'min_df': 1, 'ngram_range': (1, 1)}
Similarity: 0.0254 | Melhor até agora: 0.0129
Tempo decorrido: 0.12s | Tempo estimado restante: 8.75s

[2/72] Testando parâmetros: {'binary': True, 'max_df': 0.8, 'max_features': 100, 'min_df': 1, 'ngram_range': (1, 2)}
Similarity: 0.0249 | Melhor até agora: 0.0129
Tempo decorrido: 0.28s | Tempo estimado restante: 9.69s

[3/72] Testando parâmetros: {'binary': True, 'max_df': 0.8, 'max_features': 100, 'min_df': 1, 'ngram_range': (1, 3)}
Similarity: 0.0249 | Melhor até agora: 0.0129
Tempo decorrido: 0.45s | Tempo estimado restante: 10.27s

[4/72] Testando parâmetros: {'binary': True, 'max_df': 0.8, 'max_features': 100, 'min_df': 10, 'ngram_range': (1, 1)}
Similarity: 0.0253 | Melhor até agora: 0.0129
Tempo decorrido: 0.58s | Tempo estimado restante: 9.88s

[5/72] Testando parâmetros: {'binary': True, 'max_df': 0.8, 'max_features': 100, 'min_df

BayesSearchCV: 100%|██████████| 32/32 [01:26<00:00,  2.70s/it]


Best parameters for rf: OrderedDict([('max_depth', 50), ('n_estimators', 100)])
Best f1_macro score with rf for test data: 0.9192205362603576

Running Bayesian optimization for algorithm svm


BayesSearchCV: 100%|██████████| 32/32 [22:24<00:00, 42.00s/it]


Best parameters for svm: OrderedDict([('C', 10.0), ('gamma', 0.014887698536437939)])
Best f1_macro score with svm for test data: 0.9611087959558434

Running Bayesian optimization for algorithm nb


BayesSearchCV: 100%|██████████| 32/32 [00:27<00:00,  1.16it/s]


Best parameters for nb: OrderedDict([('alpha', 0.08524914549666639)])
Best f1_macro score with nb for test data: 0.9431205873691769

Running Bayesian optimization for algorithm knn


BayesSearchCV: 100%|██████████| 32/32 [00:28<00:00,  1.13it/s]


Best parameters for knn: OrderedDict([('leaf_size', 10), ('n_neighbors', 1)])
Best f1_macro score with knn for test data: 0.8403484455444535

Running Bayesian optimization for algorithm lr


BayesSearchCV: 100%|██████████| 32/32 [01:25<00:00,  2.68s/it]


Best parameters for lr: OrderedDict([('C', 0.9284036319422518), ('max_iter', 100)])
Best f1_macro score with lr for test data: 0.9612785255443875

Best algorithm: lr with best f1_macro score: 0.9612785255443875
Best parameters: OrderedDict([('C', 0.9284036319422518), ('max_iter', 100)])
Classification Report in the test data:

              precision    recall  f1-score   support

       anger       0.99      0.97      0.98       252
        fear       0.97      0.96      0.96       221
         joy       0.91      0.95      0.93       244
        love       0.97      0.97      0.97       231
     neutral       0.96      0.97      0.97       218
     sadness       0.97      0.95      0.96       221

    accuracy                           0.96      1387
   macro avg       0.96      0.96      0.96      1387
weighted avg       0.96      0.96      0.96      1387

F1-score: 0.9612785255443875
Accuracy: 0.9610670511896179
Tempo de execução: 1599.3241803646088 segundos


In [6]:
algorithm = 'word2vec'

param_grid = {
    'vector_size': [100, 1000, 5000],
    'window': [3, 5, 7],
    'min_count': [5, 10],
    'sg': [0, 1],
    'hs': [0, 1],
    'workers': [1]
}

treinando(df_name, column, target, algorithm, param_grid)

Unnamed: 0,sentence,label
0,Getting lost in a captivating book,joy
1,No matter how hard I try I can t seem to shake...,sadness
2,"His anger was like a ticking time bomb, and he...",anger
3,Their patience and understanding during challe...,love
4,I am scared that my anxiety will never improve,fear



Defined pipeline: ['transform_to_lowercase', 'remove_special_characters', 'remove_stopwords'] 

Preprocess --> transform_to_lowercase
Converting text to lowercase...
Done!

Preprocess --> remove_special_characters
Removing special characters from text...
Done!

Preprocess --> remove_stopwords
Getting stopword list and removing from text...
Done!



Dataset pequeno. Iniciando com word2vec padrão.


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Similaridade inicial: 0.9900

[1/72] Testando parâmetros: {'hs': 0, 'min_count': 5, 'sg': 0, 'vector_size': 100, 'window': 3, 'workers': 1}
Similarity: 0.9882 | Melhor até agora: 0.9882
Tempo decorrido: 0.44s | Tempo estimado restante: 31.48s

[2/72] Testando parâmetros: {'hs': 0, 'min_count': 5, 'sg': 0, 'vector_size': 100, 'window': 5, 'workers': 1}
Similarity: 0.9900 | Melhor até agora: 0.9882
Tempo decorrido: 0.91s | Tempo estimado restante: 31.97s

[3/72] Testando parâmetros: {'hs': 0, 'min_count': 5, 'sg': 0, 'vector_size': 100, 'window': 7, 'workers': 1}
Similarity: 0.9905 | Melhor até agora: 0.9882
Tempo decorrido: 1.36s | Tempo estimado restante: 31.20s

[4/72] Testando parâmetros: {'hs': 0, 'min_count': 5, 'sg': 0, 'vector_size': 1000, 'window': 3, 'workers': 1}
Similarity: 0.9893 | Melhor até agora: 0.9882
Tempo decorrido: 2.46s | Tempo estimado restante: 41.84s

[5/72] Testando parâmetros: {'hs': 0, 'min_count': 5, 'sg': 0, 'vector_size': 1000, 'window': 5, 'workers': 1}
Si

BayesSearchCV: 100%|██████████| 32/32 [01:17<00:00,  2.42s/it]


Best parameters for rf: OrderedDict([('max_depth', 37), ('n_estimators', 95)])
Best f1_macro score with rf for test data: 0.8657559945603808

Running Bayesian optimization for algorithm svm


BayesSearchCV: 100%|██████████| 32/32 [00:36<00:00,  1.14s/it]


Best parameters for svm: OrderedDict([('C', 10.0), ('gamma', 0.8016787217068938)])
Best f1_macro score with svm for test data: 0.8946692671812196

Running Bayesian optimization for algorithm nb


BayesSearchCV:   0%|          | 0/32 [00:00<?, ?it/s]


An error occurred with algorithm nb: Negative values in data passed to MultinomialNB (input X)

Running Bayesian optimization for algorithm knn


BayesSearchCV: 100%|██████████| 32/32 [00:14<00:00,  2.20it/s]


Best parameters for knn: OrderedDict([('leaf_size', 50), ('n_neighbors', 7)])
Best f1_macro score with knn for test data: 0.8488161187899492

Running Bayesian optimization for algorithm lr


BayesSearchCV: 100%|██████████| 32/32 [00:19<00:00,  1.67it/s]


Best parameters for lr: OrderedDict([('C', 9.904938095682732), ('max_iter', 970)])
Best f1_macro score with lr for test data: 0.8476568116429686

Best algorithm: svm with best f1_macro score: 0.8946692671812196
Best parameters: OrderedDict([('C', 10.0), ('gamma', 0.8016787217068938)])
Classification Report in the test data:

              precision    recall  f1-score   support

       anger       0.93      0.96      0.95       252
        fear       0.92      0.93      0.93       221
         joy       0.82      0.78      0.80       244
        love       0.88      0.90      0.89       231
     neutral       0.89      0.90      0.89       218
     sadness       0.93      0.90      0.91       221

    accuracy                           0.89      1387
   macro avg       0.89      0.90      0.89      1387
weighted avg       0.89      0.89      0.89      1387

F1-score: 0.8946692671812196
Accuracy: 0.8947368421052632
Tempo de execução: 343.3820559978485 segundos


## Dataset 5 - https://www.kaggle.com/datasets/mohidabdulrehman/vs-sentiment-analysis

In [7]:
df_name = 'sentiment_reviews_dataset.csv'
column = 'sentence'
target = 'label'

In [None]:
algorithm = 'tf-idf'

param_grid = {
    'max_features': [100, 1000, 5000],
    'ngram_range': [(1, 1), (1, 2), (1, 3)],
    'min_df': [1, 10],
    'max_df': [0.8, 1.0],
    'norm': ['l1', 'l2']
}

treinando(df_name, column, target, algorithm, param_grid)

Unnamed: 0,sentence,label
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1



Defined pipeline: ['transform_to_lowercase', 'remove_special_characters', 'remove_stopwords'] 

Preprocess --> transform_to_lowercase
Converting text to lowercase...
Done!

Preprocess --> remove_special_characters
Removing special characters from text...
Done!

Preprocess --> remove_stopwords
Getting stopword list and removing from text...
Done!



Dataset pequeno. Iniciando com tfidf padrão.
Similaridade inicial: 0.0107

[1/72] Testando parâmetros: {'max_df': 0.8, 'max_features': 100, 'min_df': 1, 'ngram_range': (1, 1), 'norm': 'l1'}
Similarity: 0.0261 | Melhor até agora: 0.0107
Tempo decorrido: 0.02s | Tempo estimado restante: 1.07s

[2/72] Testando parâmetros: {'max_df': 0.8, 'max_features': 100, 'min_df': 1, 'ngram_range': (1, 1), 'norm': 'l2'}
Similarity: 0.0261 | Melhor até agora: 0.0107
Tempo decorrido: 0.03s | Tempo estimado restante: 1.09s

[3/72] Testando parâmetros: {'max_df': 0.8, 'max_features': 100, 'min_df': 1, 'ngram_range': (1, 2), 'norm': 'l1'}
Similarity: 0.0260 | M

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Similarity: 0.0264 | Melhor até agora: 0.0107
Tempo decorrido: 0.16s | Tempo estimado restante: 1.50s

[8/72] Testando parâmetros: {'max_df': 0.8, 'max_features': 100, 'min_df': 10, 'ngram_range': (1, 1), 'norm': 'l2'}
Similarity: 0.0264 | Melhor até agora: 0.0107
Tempo decorrido: 0.18s | Tempo estimado restante: 1.43s

[9/72] Testando parâmetros: {'max_df': 0.8, 'max_features': 100, 'min_df': 10, 'ngram_range': (1, 2), 'norm': 'l1'}
Similarity: 0.0263 | Melhor até agora: 0.0107
Tempo decorrido: 0.20s | Tempo estimado restante: 1.43s

[10/72] Testando parâmetros: {'max_df': 0.8, 'max_features': 100, 'min_df': 10, 'ngram_range': (1, 2), 'norm': 'l2'}
Similarity: 0.0263 | Melhor até agora: 0.0107
Tempo decorrido: 0.23s | Tempo estimado restante: 1.42s

[11/72] Testando parâmetros: {'max_df': 0.8, 'max_features': 100, 'min_df': 10, 'ngram_range': (1, 3), 'norm': 'l1'}
Similarity: 0.0263 | Melhor até agora: 0.0107
Tempo decorrido: 0.26s | Tempo estimado restante: 1.45s

[12/72] Testando pa

BayesSearchCV: 100%|██████████| 32/32 [00:32<00:00,  1.00s/it]


Best parameters for rf: OrderedDict([('max_depth', 48), ('n_estimators', 45)])
Best f1_macro score with rf for test data: 0.7267809832204887

Running Bayesian optimization for algorithm svm


BayesSearchCV: 100%|██████████| 32/32 [01:17<00:00,  2.42s/it]


Best parameters for svm: OrderedDict([('C', 7.93503679725912), ('gamma', 1.1239646614321863)])
Best f1_macro score with svm for test data: 0.7892817812491324

Running Bayesian optimization for algorithm nb


BayesSearchCV: 100%|██████████| 32/32 [00:15<00:00,  2.05it/s]


Best parameters for nb: OrderedDict([('alpha', 0.020871588778809444)])
Best f1_macro score with nb for test data: 0.7302807394550515

Running Bayesian optimization for algorithm knn


BayesSearchCV: 100%|██████████| 32/32 [00:17<00:00,  1.86it/s]


Best parameters for knn: OrderedDict([('leaf_size', 50), ('n_neighbors', 1)])
Best f1_macro score with knn for test data: 0.6353246753246753

Running Bayesian optimization for algorithm lr


BayesSearchCV: 100%|██████████| 32/32 [00:20<00:00,  1.54it/s]


Best parameters for lr: OrderedDict([('C', 9.936082059895252), ('max_iter', 178)])
Best f1_macro score with lr for test data: 0.7574955908289241

Best algorithm: svm with best f1_macro score: 0.7892817812491324
Best parameters: OrderedDict([('C', 7.93503679725912), ('gamma', 1.1239646614321863)])
Classification Report in the test data:

              precision    recall  f1-score   support

           0       0.80      0.82      0.81       177
           1       0.78      0.76      0.77       153

    accuracy                           0.79       330
   macro avg       0.79      0.79      0.79       330
weighted avg       0.79      0.79      0.79       330

F1-score: 0.7892817812491324
Accuracy: 0.7909090909090909
Tempo de execução: 169.37541723251343 segundos


In [35]:
algorithm = 'bow'

param_grid = {
    'max_features': [100, 1000, 5000],
    'ngram_range': [(1, 1), (1, 2), (1, 3)],
    'min_df': [1, 10],
    'max_df': [0.8, 1.0],
    'binary': [True, False]
}

treinando(df_name, column, target, algorithm, param_grid)

Unnamed: 0,sentence,label
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1



Defined pipeline: ['transform_to_lowercase', 'remove_special_characters', 'remove_stopwords'] 

Preprocess --> transform_to_lowercase
Converting text to lowercase...
Done!

Preprocess --> remove_special_characters
Removing special characters from text...
Done!

Preprocess --> remove_stopwords
Getting stopword list and removing from text...
Done!



Dataset pequeno. Iniciando com bow padrão.
Similaridade inicial: 0.0175

[1/72] Testando parâmetros: {'binary': True, 'max_df': 0.8, 'max_features': 100, 'min_df': 1, 'ngram_range': (1, 1)}
Similarity: 0.0311 | Melhor até agora: 0.0175
Tempo decorrido: 0.01s | Tempo estimado restante: 0.92s

[2/72] Testando parâmetros: {'binary': True, 'max_df': 0.8, 'max_features': 100, 'min_df': 1, 'ngram_range': (1, 2)}
Similarity: 0.0310 | Melhor até agora: 0.0175
Tempo decorrido: 0.04s | Tempo estimado restante: 1.28s

[3/72] Testando parâmetros: {'binary': True, 'max_df': 0.8, 'max_features': 100, 'min_df': 1, 'ngram_range': (1, 3)}
Similarity: 0.0312

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Similarity: 0.0178 | Melhor até agora: 0.0175
Tempo decorrido: 0.19s | Tempo estimado restante: 1.49s

[9/72] Testando parâmetros: {'binary': True, 'max_df': 0.8, 'max_features': 1000, 'min_df': 1, 'ngram_range': (1, 3)}
Similarity: 0.0178 | Melhor até agora: 0.0175
Tempo decorrido: 0.23s | Tempo estimado restante: 1.58s

[10/72] Testando parâmetros: {'binary': True, 'max_df': 0.8, 'max_features': 1000, 'min_df': 10, 'ngram_range': (1, 1)}
Similarity: 0.0315 | Melhor até agora: 0.0175
Tempo decorrido: 0.24s | Tempo estimado restante: 1.48s

[11/72] Testando parâmetros: {'binary': True, 'max_df': 0.8, 'max_features': 1000, 'min_df': 10, 'ngram_range': (1, 2)}
Similarity: 0.0314 | Melhor até agora: 0.0175
Tempo decorrido: 0.26s | Tempo estimado restante: 1.45s

[12/72] Testando parâmetros: {'binary': True, 'max_df': 0.8, 'max_features': 1000, 'min_df': 10, 'ngram_range': (1, 3)}
Similarity: 0.0314 | Melhor até agora: 0.0175
Tempo decorrido: 0.29s | Tempo estimado restante: 1.46s

[13/72]

BayesSearchCV: 100%|██████████| 32/32 [00:25<00:00,  1.25it/s]


Best parameters for rf: OrderedDict([('max_depth', 48), ('n_estimators', 73)])
Best f1_macro score with rf for test data: 0.7146379158080659

Running Bayesian optimization for algorithm svm


BayesSearchCV: 100%|██████████| 32/32 [01:04<00:00,  2.00s/it]


Best parameters for svm: OrderedDict([('C', 6.510073338549344), ('gamma', 0.06365147922804362)])
Best f1_macro score with svm for test data: 0.7826752372206918

Running Bayesian optimization for algorithm nb


BayesSearchCV: 100%|██████████| 32/32 [00:13<00:00,  2.30it/s]


Best parameters for nb: OrderedDict([('alpha', 0.3219999539141257)])
Best f1_macro score with nb for test data: 0.7572547721505021

Running Bayesian optimization for algorithm knn


BayesSearchCV: 100%|██████████| 32/32 [00:19<00:00,  1.63it/s]


Best parameters for knn: OrderedDict([('leaf_size', 32), ('n_neighbors', 8)])
Best f1_macro score with knn for test data: 0.541405907535045

Running Bayesian optimization for algorithm lr


BayesSearchCV: 100%|██████████| 32/32 [00:18<00:00,  1.73it/s]


Best parameters for lr: OrderedDict([('C', 2.004915013287878), ('max_iter', 1000)])
Best f1_macro score with lr for test data: 0.7778884186651177

Best algorithm: svm with best f1_macro score: 0.7826752372206918
Best parameters: OrderedDict([('C', 6.510073338549344), ('gamma', 0.06365147922804362)])
Classification Report in the test data:

              precision    recall  f1-score   support

           0       0.78      0.82      0.80       177
           1       0.78      0.74      0.76       153

    accuracy                           0.78       330
   macro avg       0.78      0.78      0.78       330
weighted avg       0.78      0.78      0.78       330

F1-score: 0.7826752372206918
Accuracy: 0.7848484848484848
Tempo de execução: 147.4212348461151 segundos


In [8]:
algorithm = 'word2vec'

param_grid = {
    'vector_size': [100, 1000, 5000],
    'window': [3, 5, 7],
    'min_count': [5, 10],
    'sg': [0, 1],
    'hs': [0, 1],
    'workers': [1]
}

treinando(df_name, column, target, algorithm, param_grid)

Unnamed: 0,sentence,label
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1



Defined pipeline: ['transform_to_lowercase', 'remove_special_characters', 'remove_stopwords'] 

Preprocess --> transform_to_lowercase
Converting text to lowercase...
Done!

Preprocess --> remove_special_characters
Removing special characters from text...
Done!

Preprocess --> remove_stopwords
Getting stopword list and removing from text...
Done!



Dataset pequeno. Iniciando com word2vec padrão.
Similaridade inicial: 0.2260

[1/72] Testando parâmetros: {'hs': 0, 'min_count': 5, 'sg': 0, 'vector_size': 100, 'window': 3, 'workers': 1}


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Similarity: 0.1919 | Melhor até agora: 0.1919
Tempo decorrido: 0.11s | Tempo estimado restante: 8.07s

[2/72] Testando parâmetros: {'hs': 0, 'min_count': 5, 'sg': 0, 'vector_size': 100, 'window': 5, 'workers': 1}
Similarity: 0.2260 | Melhor até agora: 0.1919
Tempo decorrido: 0.23s | Tempo estimado restante: 7.95s

[3/72] Testando parâmetros: {'hs': 0, 'min_count': 5, 'sg': 0, 'vector_size': 100, 'window': 7, 'workers': 1}
Similarity: 0.2452 | Melhor até agora: 0.1919
Tempo decorrido: 0.34s | Tempo estimado restante: 7.89s

[4/72] Testando parâmetros: {'hs': 0, 'min_count': 5, 'sg': 0, 'vector_size': 1000, 'window': 3, 'workers': 1}
Similarity: 0.1591 | Melhor até agora: 0.1591
Tempo decorrido: 0.58s | Tempo estimado restante: 9.93s

[5/72] Testando parâmetros: {'hs': 0, 'min_count': 5, 'sg': 0, 'vector_size': 1000, 'window': 5, 'workers': 1}
Similarity: 0.1904 | Melhor até agora: 0.1591
Tempo decorrido: 0.84s | Tempo estimado restante: 11.21s

[6/72] Testando parâmetros: {'hs': 0, 'min

BayesSearchCV: 100%|██████████| 32/32 [00:36<00:00,  1.14s/it]


Best parameters for rf: OrderedDict([('max_depth', 49), ('n_estimators', 90)])
Best f1_macro score with rf for test data: 0.7194795004250311

Running Bayesian optimization for algorithm svm


BayesSearchCV: 100%|██████████| 32/32 [00:15<00:00,  2.02it/s]


Best parameters for svm: OrderedDict([('C', 0.0007425534359037013), ('gamma', 0.1241881627783703)])
Best f1_macro score with svm for test data: 0.3167701863354037

Running Bayesian optimization for algorithm nb


BayesSearchCV:   0%|          | 0/32 [00:00<?, ?it/s]


An error occurred with algorithm nb: Negative values in data passed to MultinomialNB (input X)

Running Bayesian optimization for algorithm knn


BayesSearchCV: 100%|██████████| 32/32 [00:14<00:00,  2.24it/s]


Best parameters for knn: OrderedDict([('leaf_size', 10), ('n_neighbors', 1)])
Best f1_macro score with knn for test data: 0.676177536231884

Running Bayesian optimization for algorithm lr


BayesSearchCV: 100%|██████████| 32/32 [00:10<00:00,  3.08it/s]

Best parameters for lr: OrderedDict([('C', 0.0007425534359037013), ('max_iter', 755)])
Best f1_macro score with lr for test data: 0.3167701863354037

Best algorithm: rf with best f1_macro score: 0.7194795004250311
Best parameters: OrderedDict([('max_depth', 49), ('n_estimators', 90)])
Classification Report in the test data:

              precision    recall  f1-score   support

           0       0.72      0.80      0.76       177
           1       0.73      0.64      0.68       153

    accuracy                           0.72       330
   macro avg       0.73      0.72      0.72       330
weighted avg       0.72      0.72      0.72       330

F1-score: 0.7194795004250311
Accuracy: 0.7242424242424242
Tempo de execução: 106.770437002182 segundos



