In [1]:
import sys, os
import pandas as pd
import numpy as np

from nltk.tokenize import wordpunct_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

import nltk

from sklearn.svm import SVC
from sklearn.model_selection import cross_validate

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [2]:
# Load the corpus DataFrame from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
df_corpus = pd.read_pickle('./Dados/preprocessed_corpus.pkl')

In [3]:
# Extract the text samples and their labels from the DataFrame as plain lists.
corpus, label = (list(df_corpus[column]) for column in ('sample', 'label'))

## Dependências

In [4]:
# Portuguese stop-word list
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('portuguese')

# Portuguese stemmer (RSLP)
nltk.download('rslp')
stemmer = nltk.stem.RSLPStemmer()

# Word-level analyzer used by the stemming variant.
# scikit-learn ignores a vectorizer's `stop_words` argument whenever `analyzer`
# is a callable, so stop words have to be filtered here, before stemming.
analyzer = CountVectorizer(stop_words=stopwords).build_analyzer()

def stemmed_words(doc):
    """Tokenize `doc`, drop Portuguese stop words and stem the remaining tokens.

    Intended to be passed as the `analyzer` callable of a scikit-learn
    text vectorizer.

    Parameters
    ----------
    doc : str
        Raw document text.

    Returns
    -------
    generator of str
        Stemmed tokens, in document order.
    """
    return (stemmer.stem(w) for w in analyzer(doc))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ELOGROUP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package rslp to
[nltk_data]     C:\Users\ELOGROUP\AppData\Roaming\nltk_data...
[nltk_data]   Package rslp is already up-to-date!


## Separação dos dados

Os dados serão separados segundo os seguintes critérios:
* 20% conjunto de teste (1440 amostras)
* 80% conjunto de treinamento (5760 amostras)
    * 1/3 conjunto de validação (1920 amostras)
    * 2/3 conjunto de treino de algoritmo (3840 amostras)

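Obs.: a subdivisão do conjunto de treinamento é feita dentro do `GridSearchCV`, por validação cruzada com 3 folds (`cv=3`): em cada rodada, 1/3 dos dados de treinamento é usado para validação e os 2/3 restantes para treino do algoritmo.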

In [5]:
# Hold out 20% of the samples as the test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    corpus,
    label,
    test_size=0.2,
    random_state=1,
)

## Bag of Words

In [6]:
# TF-IDF vectorizer variants, listed in the same order as `transform_name`.
Tfid = [
    # Plain bag of words: no stop-word filtering.
    TfidfVectorizer(lowercase=False, analyzer='word', stop_words=None),

    # Bag of words with the stop-word list applied.
    TfidfVectorizer(lowercase=False, analyzer='word', stop_words=stopwords),

    # Stemming variant. With a callable analyzer scikit-learn does not apply
    # `lowercase` or `stop_words` itself, so tokenization and any filtering
    # are whatever `stemmed_words` does.
    TfidfVectorizer(lowercase=False, analyzer=stemmed_words, stop_words=stopwords),
]

# Step names used for the vectorizer stage of each pipeline.
transform_name = ['Bow', 'Bow_stopwords', 'Bow_stopwords_stemming']

## SVM

In [7]:
# Hyper-parameter grid explored by cross-validation.
#
# C follows the exponentially growing sequence 2^-3, 2^-1, ..., 2^15 recommended
# in the LIBSVM practical guide. The previous literals (2e-3 ... 2e15) were
# 2 * 10^k rather than 2^k, which pushed C up to 2e15 and makes SVC training
# practically non-convergent.
_svm_c_values = [2.0 ** exponent for exponent in range(-3, 16, 2)]

param_grid = [{'SVM__kernel': ['linear'],
               'SVM__C': list(_svm_c_values)},

              {'SVM__kernel': ['rbf'],
               'SVM__gamma': [1e-1, 1e-2, 1e-3, 1e-4],
               'SVM__C': list(_svm_c_values)}]

## Métricas

In [8]:
# Metrics computed for every candidate during the grid search.
scoring = [
    'accuracy',
    'f1',
    'precision',
    'recall',
]

## Treinamento

In [13]:
# Create the output folder before training so that a finished (and expensive)
# grid search cannot fail at the very end just because the folder is missing.
os.makedirs('./Resultados', exist_ok=True)

# One grid search per vectorizer variant; names and vectorizers are paired by position.
for i, (step_name, vectorizer) in enumerate(zip(transform_name, Tfid)):

    print('\n***************************\n')
    print(f'Treinando modelo {i}...\n')

    # Pipeline: text vectorization followed by the SVM classifier.
    # The 'SVM' step name must match the 'SVM__' prefix used in param_grid.
    steps = [(step_name, vectorizer), ('SVM', SVC())]
    pipeline = Pipeline(steps)

    # 3-fold cross-validated search; the best candidate by F1 is refit on all of X_train.
    grid = GridSearchCV(pipeline, param_grid=param_grid, cv=3, scoring=scoring, refit='f1', verbose=2)
    grid.fit(X_train, y_train)

    print(f'\nModelo {i} treinado!\n')

    print('Melhor estimador:\n')
    print(grid.best_estimator_)

    print('Salvando resultados... ', end='')

    # Persist the full cross-validation table for later analysis.
    grid_results = pd.DataFrame.from_dict(grid.cv_results_)
    grid_results.to_pickle(f'./Resultados/grid_{i}_results.pkl')

    print('OK \n')


***************************

Treinando modelo 0... Fitting 3 folds for each of 50 candidates, totalling 150 fits


KeyboardInterrupt: 