# Escolha do modelo final

## Importações

In [25]:
import pandas as pd
import sklearn
from hydra import initialize, compose
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import warnings
warnings.filterwarnings('ignore')

## Inicialização

In [26]:
# Compose the Hydra configuration named 'main' from the ../config/ directory.
with initialize(version_base=None, config_path="../config/"):
    cfg = compose(config_name='main')

# The configured dataset path is resolved one directory above the notebook.
processed_path = f"../{cfg.data.processed}"
df = pd.read_csv(processed_path)

# Accumulator that each experiment section extends with its own rows.
results = pd.DataFrame()

def evaluate_model(model, X_test, y_test, normalizacao):
    """Score a fitted search object on the held-out split.

    Parameters
    ----------
    model : fitted search object exposing ``predict`` and ``best_params_``.
    X_test, y_test : held-out features and labels.
    normalizacao : label stored under the 'Normalization' key.

    Returns
    -------
    dict
        Accuracy, weighted precision/recall/F1, and the entries of
        ``model.best_params_`` merged into the same mapping.
    """
    predictions = model.predict(X_test)

    report = {
        'Normalization': normalizacao,
        'Test Accuracy': accuracy_score(y_test, predictions),
    }

    # These three metrics share the same call shape (weighted averaging).
    weighted_metrics = (
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('F1-score', f1_score),
    )
    for label, metric in weighted_metrics:
        report[label] = metric(y_test, predictions, average='weighted')

    # Selected hyper-parameters go last so they appear after the metric keys.
    report.update(model.best_params_)
    return report

def evaluate_results(grids, X_test, y_test, normalizacao, dataframe):
    """Evaluate fitted grid searches and append their metrics to a table.

    Parameters
    ----------
    grids : sequence of fitted search objects, in the order
        LR, SVM, NB, KNN, Decision Tree, Random Forest.
    X_test, y_test : held-out features and labels passed to ``evaluate_model``.
    normalizacao : label recorded in the 'Normalization' column of each row.
    dataframe : existing results table; it is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the rows of ``dataframe`` followed by one row per
        entry of ``grids``, indexed by model name.
    """
    # Row labels are positional: they mirror the order in which `grids` is built.
    model_names = (
        'Logistic Regression',
        'Support Vector Machines',
        'Multinomial Naive Bayes',
        'KNeighbors Classifier',
        'Decision Tree',
        'Random Forest',
    )

    # Nothing to evaluate: hand back an independent copy of the input table.
    if not grids:
        return dataframe.copy()

    rows = []
    labels = []
    for position, model in enumerate(grids):
        rows.append(evaluate_model(model, X_test, y_test, normalizacao))
        # Fall back to a generic label instead of failing on an index/row
        # length mismatch when more searches than known names are supplied.
        if position < len(model_names):
            labels.append(model_names[position])
        else:
            labels.append(f'Model {position}')

    return pd.concat([dataframe, pd.DataFrame(rows, index=labels)])

## Cleaned text

### Binary, Word Count and n-grams

In [168]:
# Bag-of-words experiment on the cleaned text column.
X = df['cleaned_text']
y = df['encoded_sentiment']

# Hold out 20% of the rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# One pipeline per classifier, each with its own CountVectorizer step.
pipeline_lr = Pipeline(steps=[
    ('count_vec', CountVectorizer()),
    ('LR', LogisticRegression(max_iter=1000)),
])
pipeline_svm = Pipeline(steps=[
    ('count_vec', CountVectorizer()),
    ('SVC', LinearSVC()),
])
pipeline_nb = Pipeline(steps=[
    ('count_vec', CountVectorizer()),
    ('MNB', MultinomialNB()),
])
pipeline_knn = Pipeline(steps=[
    ('count_vec', CountVectorizer()),
    ('KNN', KNeighborsClassifier()),
])
pipeline_dt = Pipeline(steps=[
    ('count_vec', CountVectorizer()),
    ('DT', DecisionTreeClassifier()),
])
pipeline_rf = Pipeline(steps=[
    ('count_vec', CountVectorizer()),
    ('RF', RandomForestClassifier()),
])

param_range = [0.01, 0.05, 0.25, 0.5, 1]

# Vectorizer settings searched for the classifiers below.
count_vec_options = {
    'count_vec__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'count_vec__binary': [True, False],
    'count_vec__stop_words': [None, "english"],
}

lr_param_grid = [{**count_vec_options, 'LR__C': param_range}]
svm_param_grid = [{**count_vec_options, 'SVC__C': [0.001, 0.005, 0.01, 0.05, 0.1]}]
# NOTE(review): unlike the other grids, Naive Bayes only searches "english"
# stop words here - confirm this is intentional.
nb_param_grid = [{**count_vec_options, 'count_vec__stop_words': ["english"]}]
knn_param_grid = [dict(count_vec_options)]
dt_param_grid = [dict(count_vec_options)]
rf_param_grid = [dict(count_vec_options)]

# Every search is scored on accuracy with 3-fold cross-validation.
search_settings = {'scoring': 'accuracy', 'cv': 3}

lr_grid_search = GridSearchCV(estimator=pipeline_lr, param_grid=lr_param_grid, **search_settings)
svm_grid_search = GridSearchCV(estimator=pipeline_svm, param_grid=svm_param_grid, **search_settings)
nb_grid_search = GridSearchCV(estimator=pipeline_nb, param_grid=nb_param_grid, **search_settings)
knn_grid_search = GridSearchCV(estimator=pipeline_knn, param_grid=knn_param_grid, **search_settings)
dt_grid_search = GridSearchCV(estimator=pipeline_dt, param_grid=dt_param_grid, **search_settings)
rf_grid_search = GridSearchCV(estimator=pipeline_rf, param_grid=rf_param_grid, **search_settings)

# Order matters: evaluate_results labels rows by position in this list.
grids = [
    lr_grid_search,
    svm_grid_search,
    nb_grid_search,
    knn_grid_search,
    dt_grid_search,
    rf_grid_search,
]

In [197]:
# Run every grid search on the training split.
for grid_search in grids:
    grid_search.fit(X_train, y_train)

In [205]:
# Append the held-out metrics of the bag-of-words searches on cleaned text.
results = evaluate_results(
    grids=grids,
    X_test=X_test,
    y_test=y_test,
    normalizacao='Cleaned Text',
    dataframe=results,
)

### TF-IDF

In [207]:
# TF-IDF experiment on the cleaned text column.
# Reads from `df`, the frame loaded at the top of the notebook; no `df1` is
# defined anywhere in this notebook, so referencing it fails on a fresh run.
y = df.encoded_sentiment

X = df.cleaned_text

# Same 80/20 split and seed as the other experiments.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# One pipeline per classifier, each with its own TfidfVectorizer step.
pipeline_lr = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('LR', LogisticRegression(max_iter=1000)),
])
pipeline_svm = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('SVC', LinearSVC()),
])
pipeline_nb = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('MNB', MultinomialNB()),
])
pipeline_knn = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('KNN', KNeighborsClassifier()),
])
pipeline_dt = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('DT', DecisionTreeClassifier()),
])
pipeline_rf = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('RF', RandomForestClassifier()),
])

param_range = [0.01, 0.05, 0.25, 0.5, 1]

# Vectorizer settings searched for every classifier.
tfidf_options = {
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'tfidf__stop_words': [None, 'english'],
}

lr_param_grid = [{'LR__C': param_range, **tfidf_options}]
svm_param_grid = [{'SVC__C': [0.001, 0.005, 0.01, 0.05, 0.1], **tfidf_options}]
nb_param_grid = [dict(tfidf_options)]
knn_param_grid = [dict(tfidf_options)]
dt_param_grid = [dict(tfidf_options)]
rf_param_grid = [dict(tfidf_options)]

# Every search is scored on accuracy with 3-fold cross-validation.
search_settings = {'scoring': 'accuracy', 'cv': 3}

lr_grid_search = GridSearchCV(estimator=pipeline_lr, param_grid=lr_param_grid, **search_settings)
svm_grid_search = GridSearchCV(estimator=pipeline_svm, param_grid=svm_param_grid, **search_settings)
nb_grid_search = GridSearchCV(estimator=pipeline_nb, param_grid=nb_param_grid, **search_settings)
knn_grid_search = GridSearchCV(estimator=pipeline_knn, param_grid=knn_param_grid, **search_settings)
dt_grid_search = GridSearchCV(estimator=pipeline_dt, param_grid=dt_param_grid, **search_settings)
rf_grid_search = GridSearchCV(estimator=pipeline_rf, param_grid=rf_param_grid, **search_settings)

# Order matters: evaluate_results labels rows by position in this list.
grids = [
    lr_grid_search,
    svm_grid_search,
    nb_grid_search,
    knn_grid_search,
    dt_grid_search,
    rf_grid_search,
]

In [208]:
# Run every TF-IDF grid search on the training split.
for grid_search in grids:
    grid_search.fit(X_train, y_train)

In [209]:
# Append the held-out metrics of the TF-IDF searches on cleaned text.
# NOTE(review): the label matches the bag-of-words rows; the two runs are
# told apart only by which parameter columns (count_vec__* vs tfidf__*) are set.
results = evaluate_results(
    grids=grids,
    X_test=X_test,
    y_test=y_test,
    normalizacao='Cleaned Text',
    dataframe=results,
)

## Stemmed text

### Binary, Word Count and n-grams

In [214]:
# Bag-of-words experiment on the stemmed text column.
X = df['stematized_text']
y = df['encoded_sentiment']

# Hold out 20% of the rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# One pipeline per classifier, each with its own CountVectorizer step.
pipeline_lr = Pipeline(steps=[
    ('count_vec', CountVectorizer()),
    ('LR', LogisticRegression(max_iter=1000)),
])
pipeline_svm = Pipeline(steps=[
    ('count_vec', CountVectorizer()),
    ('SVC', LinearSVC()),
])
pipeline_nb = Pipeline(steps=[
    ('count_vec', CountVectorizer()),
    ('MNB', MultinomialNB()),
])
pipeline_knn = Pipeline(steps=[
    ('count_vec', CountVectorizer()),
    ('KNN', KNeighborsClassifier()),
])
pipeline_dt = Pipeline(steps=[
    ('count_vec', CountVectorizer()),
    ('DT', DecisionTreeClassifier()),
])
pipeline_rf = Pipeline(steps=[
    ('count_vec', CountVectorizer()),
    ('RF', RandomForestClassifier()),
])

param_range = [0.01, 0.05, 0.25, 0.5, 1]

# Vectorizer settings searched for the classifiers below.
count_vec_options = {
    'count_vec__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'count_vec__binary': [True, False],
    'count_vec__stop_words': [None, "english"],
}

lr_param_grid = [{**count_vec_options, 'LR__C': param_range}]
svm_param_grid = [{**count_vec_options, 'SVC__C': [0.001, 0.005, 0.01, 0.05, 0.1]}]
# NOTE(review): unlike the other grids, Naive Bayes only searches "english"
# stop words here - confirm this is intentional.
nb_param_grid = [{**count_vec_options, 'count_vec__stop_words': ["english"]}]
knn_param_grid = [dict(count_vec_options)]
dt_param_grid = [dict(count_vec_options)]
rf_param_grid = [dict(count_vec_options)]

# Every search is scored on accuracy with 3-fold cross-validation.
search_settings = {'scoring': 'accuracy', 'cv': 3}

lr_grid_search = GridSearchCV(estimator=pipeline_lr, param_grid=lr_param_grid, **search_settings)
svm_grid_search = GridSearchCV(estimator=pipeline_svm, param_grid=svm_param_grid, **search_settings)
nb_grid_search = GridSearchCV(estimator=pipeline_nb, param_grid=nb_param_grid, **search_settings)
knn_grid_search = GridSearchCV(estimator=pipeline_knn, param_grid=knn_param_grid, **search_settings)
dt_grid_search = GridSearchCV(estimator=pipeline_dt, param_grid=dt_param_grid, **search_settings)
rf_grid_search = GridSearchCV(estimator=pipeline_rf, param_grid=rf_param_grid, **search_settings)

# Order matters: evaluate_results labels rows by position in this list.
grids = [
    lr_grid_search,
    svm_grid_search,
    nb_grid_search,
    knn_grid_search,
    dt_grid_search,
    rf_grid_search,
]

In [215]:
# Run every grid search on the stemmed-text training split.
for grid_search in grids:
    grid_search.fit(X_train, y_train)

In [216]:
# Append the held-out metrics of the bag-of-words searches on stemmed text.
results = evaluate_results(
    grids=grids,
    X_test=X_test,
    y_test=y_test,
    normalizacao='Stematized Text',
    dataframe=results,
)

### TF-IDF

In [218]:
# TF-IDF experiment on the stemmed text column.
X = df['stematized_text']
y = df['encoded_sentiment']

# Hold out 20% of the rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# One pipeline per classifier, each with its own TfidfVectorizer step.
pipeline_lr = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('LR', LogisticRegression(max_iter=1000)),
])
pipeline_svm = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('SVC', LinearSVC()),
])
pipeline_nb = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('MNB', MultinomialNB()),
])
pipeline_knn = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('KNN', KNeighborsClassifier()),
])
pipeline_dt = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('DT', DecisionTreeClassifier()),
])
pipeline_rf = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('RF', RandomForestClassifier()),
])

param_range = [0.01, 0.05, 0.25, 0.5, 1]

# Vectorizer settings searched for every classifier.
tfidf_options = {
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'tfidf__stop_words': [None, 'english'],
}

lr_param_grid = [{'LR__C': param_range, **tfidf_options}]
svm_param_grid = [{'SVC__C': [0.001, 0.005, 0.01, 0.05, 0.1], **tfidf_options}]
nb_param_grid = [dict(tfidf_options)]
knn_param_grid = [dict(tfidf_options)]
dt_param_grid = [dict(tfidf_options)]
rf_param_grid = [dict(tfidf_options)]

# Every search is scored on accuracy with 3-fold cross-validation.
search_settings = {'scoring': 'accuracy', 'cv': 3}

lr_grid_search = GridSearchCV(estimator=pipeline_lr, param_grid=lr_param_grid, **search_settings)
svm_grid_search = GridSearchCV(estimator=pipeline_svm, param_grid=svm_param_grid, **search_settings)
nb_grid_search = GridSearchCV(estimator=pipeline_nb, param_grid=nb_param_grid, **search_settings)
knn_grid_search = GridSearchCV(estimator=pipeline_knn, param_grid=knn_param_grid, **search_settings)
dt_grid_search = GridSearchCV(estimator=pipeline_dt, param_grid=dt_param_grid, **search_settings)
rf_grid_search = GridSearchCV(estimator=pipeline_rf, param_grid=rf_param_grid, **search_settings)

# Order matters: evaluate_results labels rows by position in this list.
grids = [
    lr_grid_search,
    svm_grid_search,
    nb_grid_search,
    knn_grid_search,
    dt_grid_search,
    rf_grid_search,
]

In [219]:
# Run every TF-IDF grid search on the stemmed-text training split.
for grid_search in grids:
    grid_search.fit(X_train, y_train)

In [220]:
# Append the held-out metrics of the TF-IDF searches on stemmed text.
results = evaluate_results(
    grids=grids,
    X_test=X_test,
    y_test=y_test,
    normalizacao='Stematized Text',
    dataframe=results,
)

## Lemmatized text

### Binary, Word Count and n-grams

In [223]:
# Bag-of-words experiment on the lemmatized text column.
X = df['lemmatized_text']
y = df['encoded_sentiment']

# Hold out 20% of the rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# One pipeline per classifier, each with its own CountVectorizer step.
pipeline_lr = Pipeline(steps=[
    ('count_vec', CountVectorizer()),
    ('LR', LogisticRegression(max_iter=1000)),
])
pipeline_svm = Pipeline(steps=[
    ('count_vec', CountVectorizer()),
    ('SVC', LinearSVC()),
])
pipeline_nb = Pipeline(steps=[
    ('count_vec', CountVectorizer()),
    ('MNB', MultinomialNB()),
])
pipeline_knn = Pipeline(steps=[
    ('count_vec', CountVectorizer()),
    ('KNN', KNeighborsClassifier()),
])
pipeline_dt = Pipeline(steps=[
    ('count_vec', CountVectorizer()),
    ('DT', DecisionTreeClassifier()),
])
pipeline_rf = Pipeline(steps=[
    ('count_vec', CountVectorizer()),
    ('RF', RandomForestClassifier()),
])

param_range = [0.01, 0.05, 0.25, 0.5, 1]

# Vectorizer settings searched for the classifiers below.
count_vec_options = {
    'count_vec__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'count_vec__binary': [True, False],
    'count_vec__stop_words': [None, "english"],
}

lr_param_grid = [{**count_vec_options, 'LR__C': param_range}]
svm_param_grid = [{**count_vec_options, 'SVC__C': [0.001, 0.005, 0.01, 0.05, 0.1]}]
# NOTE(review): unlike the other grids, Naive Bayes only searches "english"
# stop words here - confirm this is intentional.
nb_param_grid = [{**count_vec_options, 'count_vec__stop_words': ["english"]}]
knn_param_grid = [dict(count_vec_options)]
dt_param_grid = [dict(count_vec_options)]
rf_param_grid = [dict(count_vec_options)]

# Every search is scored on accuracy with 3-fold cross-validation.
search_settings = {'scoring': 'accuracy', 'cv': 3}

lr_grid_search = GridSearchCV(estimator=pipeline_lr, param_grid=lr_param_grid, **search_settings)
svm_grid_search = GridSearchCV(estimator=pipeline_svm, param_grid=svm_param_grid, **search_settings)
nb_grid_search = GridSearchCV(estimator=pipeline_nb, param_grid=nb_param_grid, **search_settings)
knn_grid_search = GridSearchCV(estimator=pipeline_knn, param_grid=knn_param_grid, **search_settings)
dt_grid_search = GridSearchCV(estimator=pipeline_dt, param_grid=dt_param_grid, **search_settings)
rf_grid_search = GridSearchCV(estimator=pipeline_rf, param_grid=rf_param_grid, **search_settings)

# Order matters: evaluate_results labels rows by position in this list.
grids = [
    lr_grid_search,
    svm_grid_search,
    nb_grid_search,
    knn_grid_search,
    dt_grid_search,
    rf_grid_search,
]

In [224]:
# Run every grid search on the lemmatized-text training split.
for grid_search in grids:
    grid_search.fit(X_train, y_train)

In [225]:
# Append the held-out metrics of the bag-of-words searches on lemmatized text.
results = evaluate_results(
    grids=grids,
    X_test=X_test,
    y_test=y_test,
    normalizacao='Lemmatized Text',
    dataframe=results,
)

### TF-IDF

In [227]:
# TF-IDF experiment on the lemmatized text column.
X = df['lemmatized_text']
y = df['encoded_sentiment']

# Hold out 20% of the rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# One pipeline per classifier, each with its own TfidfVectorizer step.
pipeline_lr = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('LR', LogisticRegression(max_iter=1000)),
])
pipeline_svm = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('SVC', LinearSVC()),
])
pipeline_nb = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('MNB', MultinomialNB()),
])
pipeline_knn = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('KNN', KNeighborsClassifier()),
])
pipeline_dt = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('DT', DecisionTreeClassifier()),
])
pipeline_rf = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('RF', RandomForestClassifier()),
])

param_range = [0.01, 0.05, 0.25, 0.5, 1]

# Vectorizer settings searched for every classifier.
tfidf_options = {
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'tfidf__stop_words': [None, 'english'],
}

lr_param_grid = [{'LR__C': param_range, **tfidf_options}]
svm_param_grid = [{'SVC__C': [0.001, 0.005, 0.01, 0.05, 0.1], **tfidf_options}]
nb_param_grid = [dict(tfidf_options)]
knn_param_grid = [dict(tfidf_options)]
dt_param_grid = [dict(tfidf_options)]
rf_param_grid = [dict(tfidf_options)]

# Every search is scored on accuracy with 3-fold cross-validation.
search_settings = {'scoring': 'accuracy', 'cv': 3}

lr_grid_search = GridSearchCV(estimator=pipeline_lr, param_grid=lr_param_grid, **search_settings)
svm_grid_search = GridSearchCV(estimator=pipeline_svm, param_grid=svm_param_grid, **search_settings)
nb_grid_search = GridSearchCV(estimator=pipeline_nb, param_grid=nb_param_grid, **search_settings)
knn_grid_search = GridSearchCV(estimator=pipeline_knn, param_grid=knn_param_grid, **search_settings)
dt_grid_search = GridSearchCV(estimator=pipeline_dt, param_grid=dt_param_grid, **search_settings)
rf_grid_search = GridSearchCV(estimator=pipeline_rf, param_grid=rf_param_grid, **search_settings)

# Order matters: evaluate_results labels rows by position in this list.
grids = [
    lr_grid_search,
    svm_grid_search,
    nb_grid_search,
    knn_grid_search,
    dt_grid_search,
    rf_grid_search,
]

In [228]:
# Run every TF-IDF grid search on the lemmatized-text training split.
for grid_search in grids:
    grid_search.fit(X_train, y_train)

In [229]:
# Append the held-out metrics of the TF-IDF searches on lemmatized text.
results = evaluate_results(
    grids=grids,
    X_test=X_test,
    y_test=y_test,
    normalizacao='Lemmatized Text',
    dataframe=results,
)

## Resultados

In [230]:
# Display the accumulated comparison table (one row per fitted grid search).
results

Unnamed: 0,Normalization,Test Accuracy,Precision,Recall,F1-score,LR__C,count_vec__binary,count_vec__ngram_range,count_vec__stop_words,SVC__C,tfidf__ngram_range,tfidf__stop_words
Logistic Regression,Cleaned Text,0.826888,0.827765,0.826888,0.813534,1.0,True,"(1, 1)",,,,
Support Vector Machines,Cleaned Text,0.825046,0.822884,0.825046,0.813627,,True,"(1, 1)",,0.1,,
Multinomial Naive Bayes,Cleaned Text,0.804788,0.798024,0.804788,0.798526,,True,"(1, 1)",english,,,
KNeighbors Classifier,Cleaned Text,0.723757,0.700185,0.723757,0.668487,,True,"(1, 1)",,,,
Decision Tree,Cleaned Text,0.799263,0.791885,0.799263,0.787444,,False,"(1, 2)",,,,
Random Forest,Cleaned Text,0.810313,0.823593,0.810313,0.7861,,True,"(1, 1)",,,,
Logistic Regression,Cleaned Text,0.771639,0.792641,0.771639,0.727352,1.0,,,,,"(1, 1)",english
Support Vector Machines,Cleaned Text,0.769797,0.790767,0.769797,0.72445,,,,,0.1,"(1, 1)",english
Multinomial Naive Bayes,Cleaned Text,0.74954,0.796832,0.74954,0.681934,,,,,,"(1, 1)",english
KNeighbors Classifier,Cleaned Text,0.758748,0.74682,0.758748,0.748783,,,,,,"(1, 1)",


In [233]:
# Five best rows by weighted F1-score on the held-out split.
results.sort_values(by='F1-score', ascending=False).head(5)

Unnamed: 0,Normalization,Test Accuracy,Precision,Recall,F1-score,LR__C,count_vec__binary,count_vec__ngram_range,count_vec__stop_words,SVC__C,tfidf__ngram_range,tfidf__stop_words
Decision Tree,Stematized Text,0.841621,0.837789,0.841621,0.838074,,True,"(1, 1)",,,,
Logistic Regression,Stematized Text,0.843462,0.845525,0.843462,0.832728,1.0,True,"(1, 1)",,,,
Support Vector Machines,Lemmatized Text,0.843462,0.846482,0.843462,0.8322,,True,"(1, 1)",,0.1,,
Support Vector Machines,Stematized Text,0.841621,0.842081,0.841621,0.83154,,True,"(1, 1)",,0.1,,
Logistic Regression,Lemmatized Text,0.839779,0.843037,0.839779,0.8277,1.0,True,"(1, 1)",,,,


O **Decision Tree** foi selecionado como o modelo final por apresentar o maior F1-score (0,838) no conjunto de teste. Ele será aplicado ao texto após o processo de stemização (*stemming*). A representação binária, utilizando apenas unigramas e mantendo as stopwords (sem remoção), será adotada para a construção desse modelo.

In [29]:
from joblib import dump, load

# Refit the selected configuration on the full dataset (no hold-out here):
# binary unigram counts of the stemmed text feeding a decision tree.
X = df['stematized_text']
y = df['encoded_sentiment']

count_vec = CountVectorizer(binary=True)
X_vec = count_vec.fit_transform(X)

# NOTE(review): no random_state is set, so the fitted tree may differ
# between runs - confirm whether reproducibility is required.
dt = DecisionTreeClassifier()
dt.fit(X_vec, y)

# Persist the classifier and its vectorizer separately; both artifacts are
# needed together to score new text.
dump(dt, f'../{cfg.models.dt}')
dump(count_vec, f'../{cfg.models.cv}')

['../models/cv.joblib']