In [1]:
import sys, os
import pandas as pd
import numpy as np

from nltk.tokenize import wordpunct_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

import nltk

from sklearn.svm import SVC
from sklearn.model_selection import cross_validate

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

import plotly.express as px
import plotly.graph_objects as go

from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import plot_precision_recall_curve
from sklearn.metrics import average_precision_score

In [2]:
# Notebook display limits: show up to 210 columns and 100 rows when a
# DataFrame is rendered, so wide result tables are not truncated.
pd.options.display.max_columns = 210
pd.options.display.max_rows = 100

In [3]:
# Directory (relative to the notebook's working directory) that holds the
# pickled result tables loaded further down.
result_path = './Resultados/'

In [6]:
# Labels written into the 'transform' column of each loaded result table,
# indexed by position (transform_name[0] .. transform_name[5]).
transform_name = [
    'Bow',
    'Bow_stopwords',
    'Bow_stopwords_stemming',
    'trunc_Bow',
    'trunc_Bow_stopwords',
    'trunc_Bow_stopwords_stemming',
]

In [4]:
def format_params(x):
    """Render SVM hyper-parameters as a compact ``name=value`` string.

    Parameters
    ----------
    x : mapping (e.g. dict or pandas Series)
        Parameters keyed by their pipeline names (``'SVM__kernel'``,
        ``'SVM__C'``, ``'SVM__gamma'``, ``'SVM__coef0'``, ``'SVM__degree'``).
        Any subset of those keys may be present; other keys are ignored.

    Returns
    -------
    str
        Comma-separated ``name=value`` pairs with the ``'SVM__'`` prefix
        stripped, always in the fixed order kernel, C, gamma, coef0, degree.
        An empty mapping yields ``''``.
    """
    prefix = 'SVM__'
    p_array = ['SVM__kernel','SVM__C','SVM__gamma','SVM__coef0','SVM__degree']

    # Select by key membership rather than by len(x): counting entries only
    # works when the keys of x are exactly the first len(x) names of p_array,
    # and fails with KeyError/IndexError otherwise.
    return ','.join(
        name[len(prefix):] + '=' + str(x[name])
        for name in p_array
        if name in x
    )

In [77]:
# Load every pickled result table from result_path, tag each one with a
# 'transform' label (inserted as column 7) and stack them into `results`.
# Presumably these are GridSearchCV cv_results_ tables - confirm against the
# notebooks that produced the pickles.
# NOTE: pd.read_pickle can execute arbitrary code; only load trusted files.

# (pickle file, transform label) for the grid_<k> tables.
# NOTE(review): grid_6 is tagged with transform_name[0], the same label as
# grid_0 - confirm this is intended.
_labelled_grid_files = [
    ('grid_0_results.pkl', transform_name[0]),
    ('grid_1_results.pkl', transform_name[1]),
    ('grid_2_results.pkl', transform_name[2]),
    ('grid_3_results.pkl', transform_name[3]),
    ('grid_4_results.pkl', transform_name[4]),
    ('grid_5_results.pkl', transform_name[5]),
    ('grid_6_results.pkl', transform_name[0]),
]

_labelled_grids = []
for _file_name, _label in _labelled_grid_files:
    _frame = pd.read_pickle(os.path.join(result_path, _file_name))
    _frame.insert(7, 'transform', _label)
    _labelled_grids.append(_frame)

grid_0, grid_1, grid_2, grid_3, grid_4, grid_5, grid_6 = _labelled_grids

# The two remaining tables use un-prefixed parameter columns; rename them to
# the 'param_SVM__*' names used by the grid_<k> tables so the columns line up
# when concatenated.
_linear_rbf_columns = {
    'param_C': 'param_SVM__C',
    'param_kernel': 'param_SVM__kernel',
    'param_gamma': 'param_SVM__gamma',
}
_poly_columns = dict(
    _linear_rbf_columns,
    param_coef0='param_SVM__coef0',
    param_degree='param_SVM__degree',
)

grid_7 = pd.read_pickle(os.path.join(result_path, 'linear_rbf_results.pkl')).rename(columns=_linear_rbf_columns)
grid_7.insert(7, 'transform', 'NonLinguisticFeature')

grid_8 = pd.read_pickle(os.path.join(result_path, 'poly_results.pkl')).rename(columns=_poly_columns)
grid_8.insert(7, 'transform', 'NonLinguisticFeature')

# Each table keeps its own row index, so index labels repeat in `results`.
results = pd.concat(_labelled_grids + [grid_7, grid_8])

In [78]:
# Total number of rows across all concatenated result tables.
results.shape[0]

1758

Resultados filtrados pela feature e pelo kernel utilizado

In [79]:
# Best result for each (feature transform, kernel) pair.
_group_cols = ['transform', 'param_SVM__kernel']
_report_cols = ['transform','param_SVM__kernel', 'param_SVM__C','param_SVM__gamma','param_SVM__coef0','param_SVM__degree','mean_test_f1','mean_test_precision','mean_test_recall','mean_test_accuracy']

# Keep the rows whose F1 rank equals the lowest rank inside their group.
# The aggregation is named by string ('min'): passing the builtin `min` to
# groupby.transform is deprecated in recent pandas.
idx = results.groupby(_group_cols)['rank_test_f1'].transform('min') == results['rank_test_f1']
best_results = results[idx][_report_cols].sort_values('mean_test_f1',ascending=False)

# Tie-break 1: within each group keep the smallest C.
idx = best_results.groupby(_group_cols)['param_SVM__C'].transform('min') == best_results['param_SVM__C']
best_results = best_results[idx].sort_values(['transform','mean_test_f1'],ascending=False)

# Tie-break 2: within each group keep the lowest polynomial degree.
# This replaces a hard-coded `drop(index=8)`: `results` carries repeated
# index labels, so dropping by label removes every row labelled 8 and stops
# working as soon as the underlying tables change.
# Rows without a degree (NaN) are always kept. Grouping keys and the final
# mask are plain arrays so the repeated index labels never need aligning.
_degree = pd.to_numeric(best_results['param_SVM__degree'], errors='coerce')
_group_keys = [best_results[col].to_numpy() for col in _group_cols]
_min_degree = _degree.groupby(_group_keys).transform('min')
best_results = best_results[(_degree.isna() | (_degree == _min_degree)).to_numpy()]

# Display
best_results

Unnamed: 0,transform,param_SVM__kernel,param_SVM__C,param_SVM__gamma,param_SVM__coef0,param_SVM__degree,mean_test_f1,mean_test_precision,mean_test_recall,mean_test_accuracy
18,trunc_Bow_stopwords_stemming,rbf,200.0,1.0,,,0.950016,0.956272,0.943925,0.950174
74,trunc_Bow_stopwords_stemming,poly,2.0,1.0,0.0,4.0,0.947926,0.947923,0.948079,0.947743
2,trunc_Bow_stopwords_stemming,linear,20.0,,,,0.940131,0.948263,0.932156,0.940451
14,trunc_Bow_stopwords,rbf,20.0,1.0,,,0.94803,0.949127,0.94704,0.947917
66,trunc_Bow_stopwords,poly,2.0,1.0,0.0,2.0,0.945843,0.942709,0.949117,0.945486
0,trunc_Bow_stopwords,linear,0.2,,,,0.942426,0.944123,0.94081,0.942361
14,trunc_Bow,rbf,20.0,1.0,,,0.951736,0.958262,0.94531,0.95191
74,trunc_Bow,poly,2.0,1.0,0.0,4.0,0.950855,0.950916,0.950848,0.950694
1,trunc_Bow,linear,2.0,,,,0.946465,0.953674,0.939425,0.946701
102,NonLinguisticFeature,poly,20000.0,0.01,100.0,2.0,0.936585,0.933071,0.940161,0.936111


In [71]:
# Show the row(s) of best_results that reach the highest mean test F1.
_top_f1 = best_results['mean_test_f1'].max()
print('melhor modelo:')
best_results[best_results['mean_test_f1'] == _top_f1]

melhor modelo:


Unnamed: 0,transform,param_SVM__kernel,param_SVM__C,param_SVM__gamma,param_SVM__coef0,param_SVM__degree,mean_test_f1,mean_test_precision,mean_test_recall,mean_test_accuracy
48,Bow_stopwords,poly,0.2,0.01,10,3,0.964879,0.959944,0.969886,0.964583
