In [13]:
import sys, os
import pandas as pd
import numpy as np

from nltk.tokenize import wordpunct_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

import nltk

from sklearn.svm import SVC
from sklearn.model_selection import cross_validate

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

import plotly.express as px
import plotly.graph_objects as go

In [14]:
pd.set_option('display.max_columns',210)
pd.set_option('display.max_rows',100)

In [15]:
data_path = './Dados/'
result_path = './Resultados/'

In [16]:
df_corpus = pd.read_pickle(os.path.join(data_path,'preprocessed_corpus.pkl'))

In [17]:
corpus = list(df_corpus['sample'])
label = list(df_corpus['label'])

## Dependências

In [18]:
# Pacote de stop words em português
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('portuguese')

# Pacote de stemmers em portguês
nltk.download('rslp')
stemmer = nltk.stem.RSLPStemmer()

# Stemmer
analyzer = CountVectorizer().build_analyzer()

def stemmed_words(doc):
    return (stemmer.stem(w) for w in analyzer(doc))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ELOGROUP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package rslp to
[nltk_data]     C:\Users\ELOGROUP\AppData\Roaming\nltk_data...
[nltk_data]   Package rslp is already up-to-date!


## Separação dos dados

Os dados serão separados segundo os seguintes critérios:
* 20% conjunto de teste (1440 amostras)
* 80% conjunto de treinamento (5760 amostras)
    * 1/3 conjunto de validação (1920 amostras)
    * 2/3 conjunto de treino de algoritmo (3840 amostras)

Obs: a subdivisão do conjunto de treinamento é feita dentro do GridSearch

In [19]:
X_train, X_test, y_train, y_test = train_test_split(corpus, label, test_size=0.2, random_state=1)

## Bag of Words

In [20]:
Tfid = []

Tfid.append(TfidfVectorizer(lowercase  = False,
                            analyzer   = 'word',
                            stop_words = None,
                            max_features = None))

Tfid.append(TfidfVectorizer(lowercase  = False,
                            analyzer   = 'word',
                            stop_words = stopwords,
                            max_features = None))

Tfid.append(TfidfVectorizer(lowercase  = False,
                            analyzer   = stemmed_words,
                            stop_words = stopwords,
                            max_features = None))

Tfid.append(TfidfVectorizer(lowercase  = False,
                            analyzer   = 'word',
                            stop_words = None,
                            max_features = 200))

Tfid.append(TfidfVectorizer(lowercase  = False,
                            analyzer   = 'word',
                            stop_words = stopwords,
                            max_features = 200))

Tfid.append(TfidfVectorizer(lowercase  = False,
                            analyzer   = stemmed_words,
                            stop_words = stopwords,
                            max_features = 200))

transform_name = ['Bow','Bow_stopwords','Bow_stopwords_stemming','trunc_Bow','trunc_Bow_stopwords','trunc_Bow_stopwords_stemming']

## SVM

In [21]:
# Set the parameters by cross-validation
param_grid1 = [{'SVM__kernel': ['linear'],
                'SVM__C'     : [2e-1, 2e0, 2e1, 2e2, 2e3, 2e4]},

               {'SVM__kernel': ['rbf'], 
                'SVM__gamma' : [1e0, 1e-1, 1e-2, 1e-3], 
                'SVM__C'     : [2e-1, 2e0, 2e1, 2e2, 2e3, 2e4]},

               {'SVM__kernel': ['poly'], 
                'SVM__gamma' : [1e0, 1e-1, 1e-2, 1e-3], 
                'SVM__C'     : [2e-1, 2e0, 2e1, 2e2, 2e3, 2e4],
                'SVM__degree': [2, 3, 4],
                'SVM__coef0' : [0, 1e1, 1e2]}]

# Set the parameters by cross-validation
param_grid2 = [{'SVM__kernel': ['linear'],
                'SVM__C'     : [2e-1, 2e0, 2e1, 2e2, 2e3, 2e4]},

               {'SVM__kernel': ['rbf'], 
                'SVM__gamma' : [1e0, 1e-1, 1e-2, 1e-3], 
                'SVM__C'     : [2e-1, 2e0, 2e1, 2e2, 2e3, 2e4]},

               {'SVM__kernel': ['poly'], 
                'SVM__gamma' : [1e0, 1e-1, 1e-2, 1e-3], 
                'SVM__C'     : [2e-1, 2e0, 2e1, 2e2, 2e3, 2e4],
                'SVM__degree': [2, 3, 4],
                'SVM__coef0' : [0, 1e1, 1e2]}]


## Métricas

In [22]:
scoring = ['accuracy','f1','precision','recall']

## Treinamento

Treinamento utilizando o bag of words completo

In [11]:
for i in range(0,3):
    
    print('\n***************************\n')
    print(f'Treinando modelo {i}...\n')
    
    steps = [(transform_name[i], Tfid[i]), ('SVM', SVC())]
    pipeline = Pipeline(steps)

    grid = GridSearchCV(pipeline, param_grid=param_grid1, cv=3, scoring=scoring, refit='f1', verbose=2)
    grid.fit(X_train, y_train)

    print(f'\nModelo {i} treinado!\n')

    print('Melhor estimador:\n')
    print(grid.best_estimator_)

    print('Salvando resultados... ', end='')

    grid_results = pd.DataFrame.from_dict(grid.cv_results_)
    grid_results.to_pickle(f'./Resultados/grid_{i}_results.pkl')

    print('OK \n')

 SVM__coef0=10.0, SVM__degree=2, SVM__gamma=0.01, SVM__kernel=poly; total time= 2.7min
[CV] END SVM__C=2000.0, SVM__coef0=10.0, SVM__degree=2, SVM__gamma=0.01, SVM__kernel=poly; total time= 2.8min
[CV] END SVM__C=2000.0, SVM__coef0=10.0, SVM__degree=2, SVM__gamma=0.01, SVM__kernel=poly; total time= 2.8min
[CV] END SVM__C=2000.0, SVM__coef0=10.0, SVM__degree=2, SVM__gamma=0.001, SVM__kernel=poly; total time= 2.8min
[CV] END SVM__C=2000.0, SVM__coef0=10.0, SVM__degree=2, SVM__gamma=0.001, SVM__kernel=poly; total time= 2.8min
[CV] END SVM__C=2000.0, SVM__coef0=10.0, SVM__degree=2, SVM__gamma=0.001, SVM__kernel=poly; total time= 2.9min
[CV] END SVM__C=2000.0, SVM__coef0=10.0, SVM__degree=3, SVM__gamma=1.0, SVM__kernel=poly; total time=23.2min
[CV] END SVM__C=2000.0, SVM__coef0=10.0, SVM__degree=3, SVM__gamma=1.0, SVM__kernel=poly; total time= 2.6min
[CV] END SVM__C=2000.0, SVM__coef0=10.0, SVM__degree=3, SVM__gamma=1.0, SVM__kernel=poly; total time= 2.2min
[CV] END SVM__C=2000.0, SVM__coef

Treinamento utilizando o bag of words truncado

In [23]:
for i in range(3,6):
    
    print('\n***************************\n')
    print(f'Treinando modelo {i}...\n')
    
    steps = [(transform_name[i], Tfid[i]), ('SVM', SVC())]
    pipeline = Pipeline(steps)

    grid = GridSearchCV(pipeline, param_grid=param_grid2, cv=3, scoring=scoring, refit='f1', verbose=2)
    grid.fit(X_train, y_train)

    print(f'\nModelo {i} treinado!\n')

    print('Melhor estimador:\n')
    print(grid.best_estimator_)

    print('Salvando resultados... ', end='')

    grid_results = pd.DataFrame.from_dict(grid.cv_results_)
    grid_results.to_pickle(f'./Resultados/grid_{i}_results.pkl')

    print('OK \n')

ernel=poly; total time= 1.9min
[CV] END SVM__C=2000.0, SVM__coef0=10.0, SVM__degree=2, SVM__gamma=0.01, SVM__kernel=poly; total time= 2.4min
[CV] END SVM__C=2000.0, SVM__coef0=10.0, SVM__degree=2, SVM__gamma=0.01, SVM__kernel=poly; total time= 1.8min
[CV] END SVM__C=2000.0, SVM__coef0=10.0, SVM__degree=2, SVM__gamma=0.01, SVM__kernel=poly; total time= 1.9min
[CV] END SVM__C=2000.0, SVM__coef0=10.0, SVM__degree=2, SVM__gamma=0.001, SVM__kernel=poly; total time= 2.0min
[CV] END SVM__C=2000.0, SVM__coef0=10.0, SVM__degree=2, SVM__gamma=0.001, SVM__kernel=poly; total time= 1.8min
[CV] END SVM__C=2000.0, SVM__coef0=10.0, SVM__degree=2, SVM__gamma=0.001, SVM__kernel=poly; total time= 1.8min
[CV] END SVM__C=2000.0, SVM__coef0=10.0, SVM__degree=3, SVM__gamma=1.0, SVM__kernel=poly; total time= 1.7min
[CV] END SVM__C=2000.0, SVM__coef0=10.0, SVM__degree=3, SVM__gamma=1.0, SVM__kernel=poly; total time= 1.7min
[CV] END SVM__C=2000.0, SVM__coef0=10.0, SVM__degree=3, SVM__gamma=1.0, SVM__kernel=poly

## Resultados

In [114]:
def format_params(x):
    
    p_array = ['SVM__kernel','SVM__C','SVM__gamma','SVM__coef0','SVM__degree']

    s = []
    for i in range(0,len(x)):
        s.append(p_array[i][5:] + '=' + str(x[p_array[i]]))
    
    return ','.join(s)

In [159]:
grid_0 = pd.read_pickle(os.path.join(result_path,'grid_0_results.pkl'))
grid_0.insert(7,'transform',transform_name[0])

grid_1 = pd.read_pickle(os.path.join(result_path,'grid_1_results.pkl'))
grid_1.insert(7,'transform',transform_name[1])

grid_2 = pd.read_pickle(os.path.join(result_path,'grid_2_results.pkl'))
grid_2.insert(7,'transform',transform_name[2])

grid_3 = pd.read_pickle(os.path.join(result_path,'grid_3_results.pkl'))
grid_3.insert(7,'transform',transform_name[3])

grid_4 = pd.read_pickle(os.path.join(result_path,'grid_4_results.pkl'))
grid_4.insert(7,'transform',transform_name[4])

grid_5 = pd.read_pickle(os.path.join(result_path,'grid_5_results.pkl'))
grid_5.insert(7,'transform',transform_name[5])

grid_6 = pd.read_pickle(os.path.join(result_path,'grid_6_results.pkl'))
grid_6.insert(7,'transform',transform_name[0])

results = pd.concat([grid_0,grid_1,grid_2,grid_3,grid_4,grid_5,grid_6])

In [160]:
len(results)

1620

In [161]:
results['params_str'] = results['params'].apply(lambda x: format_params(x))

In [164]:
idx = results.groupby('transform')['rank_test_f1'].transform(min) == results['rank_test_f1']
results[idx][['transform','param_SVM__kernel', 'param_SVM__C','param_SVM__gamma','param_SVM__coef0','param_SVM__degree','mean_test_f1','mean_test_precision','mean_test_recall','mean_test_accuracy']].sort_values('mean_test_f1',ascending=False)

Unnamed: 0,transform,param_SVM__kernel,param_SVM__C,param_SVM__gamma,param_SVM__coef0,param_SVM__degree,mean_test_f1,mean_test_precision,mean_test_recall,mean_test_accuracy
48,Bow_stopwords,poly,0.2,0.01,10.0,3.0,0.964879,0.959944,0.969886,0.964583
85,Bow_stopwords,poly,2.0,0.001,10.0,3.0,0.964879,0.959944,0.969886,0.964583
128,Bow,poly,20000.0,1.0,10.0,5.0,0.963298,0.963634,0.962963,0.963194
104,Bow,poly,2000.0,1.0,10.0,5.0,0.963298,0.963634,0.962963,0.963194
80,Bow,poly,200.0,1.0,10.0,5.0,0.963298,0.963634,0.962963,0.963194
56,Bow,poly,20.0,1.0,10.0,5.0,0.963298,0.963634,0.962963,0.963194
32,Bow,poly,2.0,1.0,10.0,5.0,0.963298,0.963634,0.962963,0.963194
8,Bow,poly,0.2,1.0,10.0,5.0,0.963298,0.963634,0.962963,0.963194
86,Bow,poly,2.0,1.0,10.0,4.0,0.962386,0.963891,0.960886,0.962326
50,Bow,poly,0.2,1.0,10.0,4.0,0.962386,0.963891,0.960886,0.962326


## Treinamento - 2 rodada

In [154]:
param_grid3 = {'SVM__kernel': ['poly'], 
               'SVM__gamma' : [1e0, 1e-1, 1e-2, 1e-3], 
               'SVM__C'     : [2e-1, 2e0, 2e1, 2e2, 2e3, 2e4],
               'SVM__degree': [5, 6],
               'SVM__coef0' : [0, 1e1, 1e2]}

Tfid3 = Tfid[0] #BoW
transform_name3 = transform_name[0] #BoW

In [158]:
i = 6
    
print('\n***************************\n')
print(f'Treinando modelo {i}...\n')

steps = [(transform_name3, Tfid3), ('SVM', SVC())]
pipeline = Pipeline(steps)

grid = GridSearchCV(pipeline, param_grid=param_grid3, cv=3, scoring=scoring, refit='f1', verbose=2)
grid.fit(X_train, y_train)

print(f'\nModelo {i} treinado!\n')

print('Melhor estimador:\n')
print(grid.best_estimator_)

print('Salvando resultados... ', end='')

grid_results = pd.DataFrame.from_dict(grid.cv_results_)
grid_results.to_pickle(f'./Resultados/grid_{i}_results.pkl')

print('OK \n')

_gamma=0.001, SVM__kernel=poly; total time=  18.0s
[CV] END SVM__C=200.0, SVM__coef0=10.0, SVM__degree=6, SVM__gamma=1.0, SVM__kernel=poly; total time=  20.0s
[CV] END SVM__C=200.0, SVM__coef0=10.0, SVM__degree=6, SVM__gamma=1.0, SVM__kernel=poly; total time=  19.6s
[CV] END SVM__C=200.0, SVM__coef0=10.0, SVM__degree=6, SVM__gamma=1.0, SVM__kernel=poly; total time=  19.7s
[CV] END SVM__C=200.0, SVM__coef0=10.0, SVM__degree=6, SVM__gamma=0.1, SVM__kernel=poly; total time=  18.4s
[CV] END SVM__C=200.0, SVM__coef0=10.0, SVM__degree=6, SVM__gamma=0.1, SVM__kernel=poly; total time=  18.0s
[CV] END SVM__C=200.0, SVM__coef0=10.0, SVM__degree=6, SVM__gamma=0.1, SVM__kernel=poly; total time=  18.2s
[CV] END SVM__C=200.0, SVM__coef0=10.0, SVM__degree=6, SVM__gamma=0.01, SVM__kernel=poly; total time=  18.2s
[CV] END SVM__C=200.0, SVM__coef0=10.0, SVM__degree=6, SVM__gamma=0.01, SVM__kernel=poly; total time=  17.9s
[CV] END SVM__C=200.0, SVM__coef0=10.0, SVM__degree=6, SVM__gamma=0.01, SVM__kernel