In [49]:
import sys, os
import pandas as pd
import numpy as np

from nltk.tokenize import wordpunct_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

import nltk

from sklearn.svm import SVC
from sklearn.model_selection import cross_validate

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

import plotly.express as px
import plotly.graph_objects as go

In [20]:
# Widen the pandas display limits so the wide grid-search result frames
# are shown without truncated columns/rows.
display_options = {
    'display.max_columns': 210,
    'display.max_rows': 100,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

In [16]:
# Input directory (read by the corpus-loading cell) and output directory
# (where the grid-search result pickles are read back from).
data_path, result_path = './Dados/', './Resultados/'

In [2]:
# Load the preprocessed corpus.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
corpus_file = os.path.join(data_path, 'preprocessed_corpus.pkl')
df_corpus = pd.read_pickle(corpus_file)

In [3]:
# Plain Python lists of the documents and of their labels, in row order.
corpus = df_corpus['sample'].tolist()
label = df_corpus['label'].tolist()

## Dependências

In [4]:
# Portuguese stop-word list
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('portuguese')

# Portuguese stemmer (RSLP)
nltk.download('rslp')
stemmer = nltk.stem.RSLPStemmer()

# Default scikit-learn word analyzer (preprocessing + tokenisation),
# reused as the tokeniser of the stemming analyzer below.
analyzer = CountVectorizer().build_analyzer()

# Set form of the stop-word list for fast membership tests.
_stopword_set = frozenset(stopwords)

def stemmed_words(doc):
    """Tokenise ``doc``, drop Portuguese stop words and stem what is left.

    Intended to be passed as the ``analyzer`` of a vectorizer. scikit-learn
    only applies its ``stop_words`` option when ``analyzer == 'word'``; with
    a callable analyzer that option is ignored, so the stop words have to be
    removed here. Filtering happens before stemming because the stop-word
    list holds unstemmed forms.

    Parameters
    ----------
    doc : str
        Raw document text.

    Returns
    -------
    generator of str
        Stemmed tokens, in document order, with stop words removed.
    """
    # `analyzer` uses CountVectorizer's defaults, which lowercase the text;
    # presumably the NLTK list is lowercase as well -- TODO confirm.
    return (stemmer.stem(w) for w in analyzer(doc) if w not in _stopword_set)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ELOGROUP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package rslp to
[nltk_data]     C:\Users\ELOGROUP\AppData\Roaming\nltk_data...
[nltk_data]   Package rslp is already up-to-date!


## Separação dos dados

Os dados serão separados segundo os seguintes critérios:
* 20% conjunto de teste (1440 amostras)
* 80% conjunto de treinamento (5760 amostras)
    * 1/3 conjunto de validação (1920 amostras)
    * 2/3 conjunto de treino de algoritmo (3840 amostras)

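Obs: a subdivisão do conjunto de treinamento é feita dentro do GridSearchCV, por validação cruzada com 3 folds (`cv=3`): em cada fold, 1/3 dos dados é usado para validação e 2/3 para treino do algoritmo.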

In [5]:
# Hold out 20% of the samples as the test set; the fixed seed makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    corpus,
    label,
    test_size=0.2,
    random_state=1,
)

## Bag of Words (ponderado por TF-IDF)

In [78]:
# One row per text representation: (name, analyzer, stop words, max features).
# The first three keep the whole vocabulary, the last three cap it at 200 terms.
# Note: scikit-learn only applies `stop_words` when analyzer == 'word'; with a
# callable analyzer any stop-word removal has to happen inside the callable.
_tfidf_variants = [
    ('Bow',                          'word',        None,      None),
    ('Bow_stopwords',                'word',        stopwords, None),
    ('Bow_stopwords_stemming',       stemmed_words, stopwords, None),
    ('trunc_Bow',                    'word',        None,      200),
    ('trunc_Bow_stopwords',          'word',        stopwords, 200),
    ('trunc_Bow_stopwords_stemming', stemmed_words, stopwords, 200),
]

# Vectorizers, index-aligned with `transform_name`.
Tfid = [
    TfidfVectorizer(lowercase=False,
                    analyzer=variant_analyzer,
                    stop_words=variant_stop_words,
                    max_features=variant_max_features)
    for _, variant_analyzer, variant_stop_words, variant_max_features in _tfidf_variants
]

# Names used as pipeline step names and as labels in the result tables.
transform_name = [variant[0] for variant in _tfidf_variants]

## SVM

In [79]:
# Hyper-parameter grid explored by the cross-validated search.
# Keys are prefixed with the name of the pipeline step they configure ('SVM').
# NOTE(review): the exponents (-3, -1, ..., 15) resemble the usual power-of-two
# SVM grid, but `2e15` means 2 x 10**15, not 2**15 -- confirm these magnitudes
# are intended.
_svm_c_values = [float(f'2e{exponent}') for exponent in range(-3, 16, 2)]
_svm_gamma_values = [1e-1, 1e-2, 1e-3, 1e-4]

param_grid = [
    # Linear kernel: only C is tuned.
    {
        'SVM__kernel': ['linear'],
        'SVM__C': list(_svm_c_values),
    },
    # RBF kernel: C and gamma are tuned jointly.
    {
        'SVM__kernel': ['rbf'],
        'SVM__gamma': list(_svm_gamma_values),
        'SVM__C': list(_svm_c_values),
    },
]

## Métricas

In [80]:
# Metrics passed as `scoring` to the grid search.
scoring = [
    'accuracy',
    'f1',
    'precision',
    'recall',
]

## Treinamento

Treinamento utilizando o bag of words completo

In [14]:
# Grid-search an SVM on each full-vocabulary representation (entries 0-2 of
# `Tfid`) and store the cross-validation table of every search.

# Create the output directory up front so a long fit is not lost to a
# missing folder at save time.
os.makedirs(result_path, exist_ok=True)

for i in range(3):

    print('\n***************************\n')
    print(f'Treinando modelo {i}...\n')

    # The vectorizer lives inside the pipeline, so it is re-fitted on the
    # training part of every fold.
    steps = [(transform_name[i], Tfid[i]), ('SVM', SVC())]
    pipeline = Pipeline(steps)

    # 3-fold CV over `param_grid`; the best candidate by F1 is refitted.
    grid = GridSearchCV(pipeline, param_grid=param_grid, cv=3, scoring=scoring, refit='f1', verbose=2)
    grid.fit(X_train, y_train)

    print(f'\nModelo {i} treinado!\n')

    print('Melhor estimador:\n')
    print(grid.best_estimator_)

    print('Salvando resultados... ', end='')

    # Save under `result_path`, the same location the results are read back from.
    grid_results = pd.DataFrame.from_dict(grid.cv_results_)
    grid_results.to_pickle(os.path.join(result_path, f'grid_{i}_results.pkl'))

    print('OK \n')

..SVM__C=200000.0, SVM__gamma=0.1, SVM__kernel=rbf; total time=  20.7s
[CV] END ...SVM__C=200000.0, SVM__gamma=0.1, SVM__kernel=rbf; total time=  20.8s
[CV] END ..SVM__C=200000.0, SVM__gamma=0.01, SVM__kernel=rbf; total time=  20.2s
[CV] END ..SVM__C=200000.0, SVM__gamma=0.01, SVM__kernel=rbf; total time=  19.7s
[CV] END ..SVM__C=200000.0, SVM__gamma=0.01, SVM__kernel=rbf; total time=  19.6s
[CV] END .SVM__C=200000.0, SVM__gamma=0.001, SVM__kernel=rbf; total time=  20.4s
[CV] END .SVM__C=200000.0, SVM__gamma=0.001, SVM__kernel=rbf; total time=  22.1s
[CV] END .SVM__C=200000.0, SVM__gamma=0.001, SVM__kernel=rbf; total time=  22.8s
[CV] END SVM__C=200000.0, SVM__gamma=0.0001, SVM__kernel=rbf; total time=  20.2s
[CV] END SVM__C=200000.0, SVM__gamma=0.0001, SVM__kernel=rbf; total time=  20.0s
[CV] END SVM__C=200000.0, SVM__gamma=0.0001, SVM__kernel=rbf; total time=  19.9s
[CV] END .SVM__C=20000000.0, SVM__gamma=0.1, SVM__kernel=rbf; total time=  21.6s
[CV] END .SVM__C=20000000.0, SVM__gamm

Treinamento utilizando o bag of words truncado

In [81]:
# Grid-search an SVM on each truncated representation (entries 3-5 of `Tfid`,
# built with max_features=200) and store the cross-validation table of every
# search.

# Create the output directory up front so a long fit is not lost to a
# missing folder at save time.
os.makedirs(result_path, exist_ok=True)

for i in range(3, 6):

    print('\n***************************\n')
    print(f'Treinando modelo {i}...\n')

    # The vectorizer lives inside the pipeline, so it is re-fitted on the
    # training part of every fold.
    steps = [(transform_name[i], Tfid[i]), ('SVM', SVC())]
    pipeline = Pipeline(steps)

    # 3-fold CV over `param_grid`; the best candidate by F1 is refitted.
    grid = GridSearchCV(pipeline, param_grid=param_grid, cv=3, scoring=scoring, refit='f1', verbose=2)
    grid.fit(X_train, y_train)

    print(f'\nModelo {i} treinado!\n')

    print('Melhor estimador:\n')
    print(grid.best_estimator_)

    print('Salvando resultados... ', end='')

    # Save under `result_path`, the same location the results are read back from.
    grid_results = pd.DataFrame.from_dict(grid.cv_results_)
    grid_results.to_pickle(os.path.join(result_path, f'grid_{i}_results.pkl'))

    print('OK \n')


***************************

Treinando modelo 3...

Fitting 3 folds for each of 50 candidates, totalling 150 fits
[CV] END ...................SVM__C=0.002, SVM__kernel=linear; total time=  52.0s
[CV] END ...................SVM__C=0.002, SVM__kernel=linear; total time=  43.9s
[CV] END ...................SVM__C=0.002, SVM__kernel=linear; total time=  44.0s
[CV] END .....................SVM__C=0.2, SVM__kernel=linear; total time=  24.8s
[CV] END .....................SVM__C=0.2, SVM__kernel=linear; total time=  24.7s
[CV] END .....................SVM__C=0.2, SVM__kernel=linear; total time=  24.2s
[CV] END ....................SVM__C=20.0, SVM__kernel=linear; total time=  15.8s
[CV] END ....................SVM__C=20.0, SVM__kernel=linear; total time=  15.5s
[CV] END ....................SVM__C=20.0, SVM__kernel=linear; total time=  15.8s
[CV] END ..................SVM__C=2000.0, SVM__kernel=linear; total time=  52.0s
[CV] END ..................SVM__C=2000.0, SVM__kernel=linear; total time= 1

## Resultados

In [72]:
def format_params(x):
    """Turn a grid-search parameter set into a short readable label.

    Each ``'<step>__<param>': <value>`` entry becomes ``'<param> = <value>'``
    and the entries are joined with ``' , '``. The label is taken from the key
    itself rather than from the entry's position, so a set without ``gamma``
    (linear kernel) is labelled ``C = ... , kernel = ...`` instead of having
    its kernel reported as ``gamma``.

    Parameters
    ----------
    x : str or dict
        The parameter dict, or its string form such as
        ``"{'SVM__C': 0.002, 'SVM__kernel': 'linear'}"``.

    Returns
    -------
    str
        For the example above: ``"C = 0.002 , kernel = 'linear'"``.
        An empty parameter set gives ``''``.

    Raises
    ------
    IndexError
        If the text contains no ``'{'``.

    Notes
    -----
    Entries are split on commas, so values that themselves contain a comma
    (lists, tuples) are not supported.
    """
    # Accept the dict itself as well as its string form.
    text = str(x) if isinstance(x, dict) else x

    # Keep only what sits between the first '{' and the next '}'.
    inner = text.split('{')[1].split('}')[0]

    parts = []
    for entry in inner.split(','):
        key, sep, value = entry.partition(':')
        if not sep:
            # Empty parameter set or stray fragment without a key/value pair.
            continue
        # "'SVM__gamma'" -> "gamma": drop the quotes and the pipeline-step prefix.
        param_name = key.strip().strip('\'"').split('__')[-1]
        parts.append(f'{param_name} = {value.strip()}')

    return ' , '.join(parts)

In [37]:
def _load_grid_results(index):
    """Read the saved CV table of search `index` and tag it with its transform name."""
    # NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
    table = pd.read_pickle(os.path.join(result_path, f'grid_{index}_results.pkl'))
    # Inserted at column position 7 so the tag sits among the leading columns.
    table.insert(7, 'transform', transform_name[index])
    return table


# One frame per text representation, loaded in index order.
grid_0, grid_1, grid_2, grid_3, grid_4, grid_5 = (
    _load_grid_results(index) for index in range(6)
)

# Stack everything into one table; the original per-search row indices are kept.
results = pd.concat([grid_0, grid_1, grid_2, grid_3, grid_4, grid_5])