In [20]:
import sys, os
import pandas as pd
import numpy as np

from nltk.tokenize import wordpunct_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

import nltk

from sklearn.svm import SVC
from sklearn.model_selection import cross_validate

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [2]:
# Load the preprocessed corpus; the frame displayed below has the columns
# `sample`, `label` and `token`.
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
df_corpus = pd.read_pickle('./Dados/preprocessed_corpus.pkl')
# df_corpus = pd.read_pickle('./Dados/raw_corpus.pkl')

In [3]:
df_corpus

Unnamed: 0,sample,label,token
0,kátia abreu diz que vai colocar sua expulsão e...,1,"[kátia, abreu, diz, que, vai, colocar, sua, ex..."
1,"dr. ray peita bolsonaro, chama-o de conservad...",1,"[dr, ., ray, peita, bolsonaro, ,, chama, -, o,..."
2,reinaldo azevedo desmascarado pela polícia fed...,1,"[reinaldo, azevedo, desmascarado, pela, políci..."
3,relatório assustador do bndes mostra dinheiro ...,1,"[relatório, assustador, do, bndes, mostra, din..."
4,"radialista americano fala sobre o pt: ""eles ve...",1,"[radialista, americano, fala, sobre, o, pt, :,..."
...,...,...,...
7195,"para jornal britânico, ação contra lula na lav...",0,"[para, jornal, britânico, ,, ação, contra, lul..."
7196,temer diz que acionou pf e cade para investiga...,0,"[temer, diz, que, acionou, pf, e, cade, para, ..."
7197,os obstáculos políticos de temer em 0. especia...,0,"[os, obstáculos, políticos, de, temer, em, 0, ..."
7198,"sexta-feira, 0 de setembro de 0. boa noite! aq...",0,"[sexta, -, feira, ,, 0, de, setembro, de, 0, ...."


In [4]:
# Pull the texts and their class labels out of the frame as plain Python lists.
corpus = df_corpus['sample'].tolist()
label = df_corpus['label'].tolist()

# Separação do conjunto de Dados

In [10]:
len(corpus)

7200

In [11]:
# Hold out 20% of the samples as the test set; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(corpus, label, test_size=0.2, random_state=1)

In [12]:
# Carve the validation set out of the remaining 80% of the corpus.
# NOTE: this rebinds X_train/y_train, so re-running this cell on its own splits the
# already-reduced training set a second time.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1) # 0.25 x 0.8 = 0.2

In [13]:
# Report the size of each split and its share of the full corpus, in percent.
print(f'treino: {len(X_train)} ({len(X_train)*100/len(corpus)}%)')
print(f'validacao: {len(X_val)} ({len(X_val)*100/len(corpus)}%)')
print(f'teste: {len(X_test)} ({len(X_test)*100/len(corpus)}%)')

treino: 4320 (60.0%)
validacao: 1440 (20.0%)
teste: 1440 (20.0%)


# Bag of Words

In [23]:
# TF-IDF features: keep the text's casing as-is and cap the vocabulary at 1000 terms.
# Every other option stays at its default (word-level analysis, no custom
# tokenizer/preprocessor, no stop-word list).
Tfid_1 = TfidfVectorizer(lowercase=False, max_features=1000)

# Verbose SVC instance; the pipeline cell that follows builds its own SVC() instead.
svm_1 = SVC(verbose=True)

In [24]:
# Two-stage pipeline: TF-IDF features feed an SVC with default settings.
# The step names ('BoW', 'SVM') are the prefixes used by the grid-search parameter keys.
steps = [
    ('BoW', Tfid_1),
    ('SVM', SVC()),
]
pipeline_1 = Pipeline(steps=steps)

In [25]:
# Exponentially spaced C candidates: 2**-5, 2**-3, ..., 2**15 (11 values).
C_values = [2**exponent for exponent in range(-5, 16, 2)]

# Keys carry the 'SVM__' prefix so the grid search routes them to the pipeline's SVM step.
parameteres = {'SVM__C': C_values, 'SVM__gamma': [0.1, 0.01]}

In [47]:
C_values

[0.03125, 0.125, 0.5, 2, 8, 32, 128, 512, 2048, 8192, 32768]

Usamos o grid search para escolher os melhores parâmetros para o SVM utilizando o BoW puro.
`cv = 5` indica o uso de validação cruzada k-fold com k=5. Ou seja, a cada rodada, 4/5 do dataset de treino (que corresponde a 60% do dataset original) são usados para treino e 1/5 é usado para validação.

In [26]:
# 5-fold cross-validated search over the C/gamma grid. No `scoring` is given, so the
# pipeline's own score method is used (see the scorer_ output further down).
grid_1 = GridSearchCV(pipeline_1, param_grid=parameteres, cv=5)

In [27]:
# Run the search on the training split only; X_val and X_test are not passed in.
grid_1.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('BoW',
                                        TfidfVectorizer(lowercase=False,
                                                        max_features=1000)),
                                       ('SVM', SVC())]),
             param_grid={'SVM__C': [0.03125, 0.125, 0.5, 2, 8, 32, 128, 512,
                                    2048, 8192, 32768],
                         'SVM__gamma': [0.1, 0.01]})

O melhor estimador para BoW foi a SVM com os parâmetros abaixo.

In [37]:
grid_1.best_estimator_

Pipeline(steps=[('BoW', TfidfVectorizer(lowercase=False, max_features=1000)),
                ('SVM', SVC(C=8, gamma=0.1))])

In [42]:
# Score of the fitted search object on the held-out validation split.
print ('score = {}'.format(grid_1.score(X_val,y_val)))

score = 0.9583333333333334


In [38]:
grid_1.best_score_

0.955787037037037

In [39]:
grid_1.best_params_

{'SVM__C': 8, 'SVM__gamma': 0.1}

In [32]:
grid_1.cv_results_

{'mean_fit_time': array([18.10670729, 18.04227338, 14.33316116, 18.06691427, 10.12422137,
        17.03641458,  7.44241834, 12.47033129,  6.16920238,  8.81699157,
         5.50579805,  6.89995313,  5.40232816,  5.56106124,  6.12070284,
         5.91296844,  5.27182593,  5.01787324,  5.15957389,  5.02461562,
         5.68239231,  5.7431952 ]),
 'std_fit_time': array([0.15403115, 0.03986476, 0.2283183 , 0.04222677, 0.08434027,
        0.0721518 , 0.06705282, 0.16216345, 0.27058167, 0.07478873,
        0.14105033, 0.53643653, 0.05134122, 0.04243356, 0.31610787,
        0.54368361, 0.07348944, 0.09490478, 0.07495743, 0.10869974,
        0.41636294, 0.56366898]),
 'mean_score_time': array([4.25275607, 4.26442356, 3.39300137, 4.22298822, 2.25857763,
        3.95798564, 1.62519226, 2.84746099, 1.29759793, 1.93660083,
        1.12380109, 1.4179285 , 1.10071745, 1.17468233, 1.36000648,
        1.11279941, 1.06847482, 0.97880764, 1.0472177 , 0.97459178,
        1.28001108, 1.07321501]),
 'std_sc

In [57]:
# Tabulate the cross-validation log (one row per parameter combination) and persist it.
grid_1_results = pd.DataFrame(grid_1.cv_results_)
grid_1_results.to_pickle('./Dados/grid_1_results.pkl')

In [36]:
grid_1.scorer_

<function sklearn.metrics._scorer._passthrough_scorer(estimator, *args, **kwargs)>

In [46]:
# Repeat the search with 3 folds while tracking four metrics; the final model is
# refit using the parameters with the best F1.
scoring = ['accuracy', 'f1', 'precision', 'recall']
grid_1a = GridSearchCV(
    pipeline_1,
    param_grid=parameteres,
    cv=3,
    scoring=scoring,
    refit='f1',
)
grid_1a.fit(X_train, y_train)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('BoW',
                                        TfidfVectorizer(lowercase=False,
                                                        max_features=1000)),
                                       ('SVM', SVC())]),
             param_grid={'SVM__C': [0.03125, 0.125, 0.5, 2, 8, 32, 128, 512,
                                    2048, 8192, 32768],
                         'SVM__gamma': [0.1, 0.01]},
             refit='f1', scoring=['accuracy', 'f1', 'precision', 'recall'])

In [58]:
# Tabulate the multi-metric cross-validation log and persist it.
grid_1a_results = pd.DataFrame(grid_1a.cv_results_)
grid_1a_results.to_pickle('./Dados/grid_1a_results.pkl')

In [59]:
grid_1a_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_SVM__C,param_SVM__gamma,params,split0_test_accuracy,split1_test_accuracy,split2_test_accuracy,...,split2_test_precision,mean_test_precision,std_test_precision,rank_test_precision,split0_test_recall,split1_test_recall,split2_test_recall,mean_test_recall,std_test_recall,rank_test_recall
0,26.156106,0.422332,12.927477,0.558941,0.03125,0.1,"{'SVM__C': 0.03125, 'SVM__gamma': 0.1}",0.720833,0.719444,0.720139,...,0.987539,0.983545,0.003749,1,0.445531,0.445531,0.442737,0.4446,0.001317,20
1,26.390735,0.673792,11.907019,0.215078,0.03125,0.01,"{'SVM__C': 0.03125, 'SVM__gamma': 0.01}",0.502778,0.502778,0.502778,...,0.0,0.0,0.0,21,0.0,0.0,0.0,0.0,0.0,21
2,21.691156,0.47946,9.808496,0.070186,0.125,0.1,"{'SVM__C': 0.125, 'SVM__gamma': 0.1}",0.913194,0.898611,0.9125,...,0.916667,0.906819,0.01239,19,0.910615,0.909218,0.906425,0.908752,0.001742,18
3,25.723549,0.046107,11.939271,0.169456,0.125,0.01,"{'SVM__C': 0.125, 'SVM__gamma': 0.01}",0.502778,0.502778,0.502778,...,0.0,0.0,0.0,21,0.0,0.0,0.0,0.0,0.0,21
4,15.741728,0.189923,7.084963,0.196899,0.5,0.1,"{'SVM__C': 0.5, 'SVM__gamma': 0.1}",0.94375,0.929861,0.940972,...,0.947518,0.940981,0.008572,17,0.939944,0.930168,0.932961,0.934358,0.004112,13
5,24.594292,0.268399,11.851484,0.308614,0.5,0.01,"{'SVM__C': 0.5, 'SVM__gamma': 0.01}",0.883333,0.875,0.884722,...,0.914157,0.903034,0.01735,20,0.842179,0.868715,0.847765,0.852886,0.011423,19
6,11.43848,0.405897,5.047578,0.20329,2.0,0.1,"{'SVM__C': 2, 'SVM__gamma': 0.1}",0.95625,0.947222,0.957639,...,0.95423,0.951789,0.006249,12,0.953911,0.951117,0.960894,0.955307,0.004112,2
7,18.608888,0.337119,8.373296,0.208979,2.0,0.01,"{'SVM__C': 2, 'SVM__gamma': 0.01}",0.931944,0.915278,0.923611,...,0.929178,0.926423,0.009507,18,0.925978,0.916201,0.916201,0.91946,0.004609,17
8,9.158823,0.433606,3.926196,0.294065,8.0,0.1,"{'SVM__C': 8, 'SVM__gamma': 0.1}",0.960417,0.951389,0.960417,...,0.962132,0.960164,0.003739,2,0.956704,0.946927,0.958101,0.953911,0.004971,3
9,13.102699,0.055137,5.887463,0.142158,8.0,0.01,"{'SVM__C': 8, 'SVM__gamma': 0.01}",0.947222,0.938194,0.948611,...,0.952113,0.946685,0.006704,16,0.942737,0.938547,0.944134,0.941806,0.002374,5


# Outros

In [22]:
# Fit the TF-IDF vocabulary on the training texts only, so the rows line up with
# y_train (used by cross_validate further down) and no held-out text influences
# the features.
X_train_BoW = Tfid_1.fit_transform(X_train)


In [26]:
# Convert the vectorizer output to a dense array.
# NOTE(review): this rebinds the same name, so re-running the cell calls .toarray() on the
# dense result - presumably fails on a NumPy array; re-run the fit_transform cell first.
X_train_BoW = X_train_BoW.toarray()

In [27]:
X_train_BoW

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [28]:
X_train_BoW.shape

(4320, 66858)

In [31]:
# Linear-kernel SVC for the standalone cross-validation that follows.
model = SVC(kernel='linear')
# model.fit(X_BoW,y_train)        # training the model

SVC(kernel='linear')

In [33]:
# Cross-validate the linear SVC on the TF-IDF matrix and print the per-fold test scores.
cv_results = cross_validate(model, X_train_BoW, y_train)    # share of correct predictions of the model
print(cv_results['test_score']) 

In [None]:
# `df` is not defined anywhere in this notebook; show the corpus frame instead.
df_corpus.head()

# Bag of Words com Stopwords

In [15]:
# Download NLTK's stop-word corpus and take the Portuguese list.
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('portuguese')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ELOGROUP\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [16]:
# Raw term counts over word tokens with the Portuguese stop words removed.
# analyzer/tokenizer/preprocessor/max_features were being set to their default values.
vectorizer = CountVectorizer(stop_words=stopwords)

In [17]:
# Learn the vocabulary on the whole corpus and materialise the counts as a dense array.
# NOTE(review): the dense matrix can be very large; confirm it is really needed.
X_BoW_stopwords = vectorizer.fit_transform(corpus).toarray()

In [None]:
# (n_documents, vocabulary_size) of the stop-word-filtered count matrix.
X_BoW_stopwords.shape

In [None]:
# np.savetxt("./Dados/X_BoW_stopwords.csv", X_BoW_stopwords, delimiter=",")

# Bag of Words com Stopwords e Stemming

In [19]:
# Download the RSLP stemmer data and instantiate NLTK's RSLP stemmer.
nltk.download('rslp')
stemmer = nltk.stem.RSLPStemmer()

[nltk_data] Downloading package rslp to
[nltk_data]     C:\Users\ELOGROUP\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping stemmers\rslp.zip.


In [21]:
# Default CountVectorizer analyzer, reused to split a document into tokens.
analyzer = CountVectorizer().build_analyzer()


def stemmed_words(doc):
    """Return a lazy iterator over the stemmed tokens of `doc`.

    The document is tokenised with the module-level `analyzer` and each token is
    passed through the module-level `stemmer`.
    """
    return map(stemmer.stem, analyzer(doc))

In [22]:
# Disabled experiment, kept as a bare string literal: a CountVectorizer that uses
# `stemmed_words` as its analyzer.
# NOTE(review): scikit-learn only applies `stop_words` when analyzer == 'word', so the
# stop-word list in this snippet would not take effect with a callable analyzer.
'''
vectorizer = CountVectorizer(analyzer=stemmed_words,
                             tokenizer = None,
                             preprocessor = None,
                             stop_words = stopwords,
                             max_features = None)
'''

In [2]:
# NOTE(review): this section is titled "stop words + stemming", but the vectorizer built
# here is a default TfidfVectorizer (no stop-word list, no stemming analyzer) - confirm
# that this is intended.
vectorizer = TfidfVectorizer()

In [23]:
# Vectorise the whole corpus and materialise the result as a dense array.
# NOTE(review): the dense matrix can be very large; confirm it is really needed.
X_BoW_stem = vectorizer.fit_transform(corpus).toarray()

In [None]:
# (n_documents, vocabulary_size) of the matrix built in the previous cell.
X_BoW_stem.shape

In [None]:
# np.savetxt("./Dados/X_BoW_stem.csv", X_BoW_stem, delimiter=",")

# Versão Atual

## Dependencias

In [63]:
# Portuguese stop-word list
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('portuguese')

# Portuguese stemmer (RSLP)
nltk.download('rslp')
stemmer = nltk.stem.RSLPStemmer()

# Tokenizer applied before stemming. Stop words are removed here because scikit-learn
# ignores a vectorizer's `stop_words` argument whenever `analyzer` is a callable, which
# is how `stemmed_words` is plugged into the vectorizers.
analyzer = CountVectorizer(stop_words=stopwords).build_analyzer()

def stemmed_words(doc):
    """Yield the stemmed, stop-word-free tokens of `doc`.

    Parameters
    ----------
    doc : str
        Raw document text.

    Returns
    -------
    generator of str
        Tokens produced by `analyzer` (stop words already dropped), each passed
        through `stemmer.stem`.
    """
    return (stemmer.stem(w) for w in analyzer(doc))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ELOGROUP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package rslp to
[nltk_data]     C:\Users\ELOGROUP\AppData\Roaming\nltk_data...
[nltk_data]   Package rslp is already up-to-date!


## Separação dos dados

Os dados serão separados segundo os seguintes critérios:
* 20% conjunto de teste (1440 amostras)
* 80% conjunto de treinamento (5760 amostras)
    * 1/3 conjunto de validação (1920 amostras)
    * 2/3 conjunto de treino de algoritmo (3840 amostras)

Obs: a subdivisão do conjunto de treinamento é feita dentro do GridSearch

In [61]:
# 80/20 train/test split; the train/validation subdivision happens inside GridSearchCV.
X_train, X_test, y_train, y_test = train_test_split(corpus, label, test_size=0.2, random_state=1)

## Bag of Words

In [67]:
# Three TF-IDF variants, in the same order as `transform_name`:
#   0 - plain bag of words
#   1 - bag of words with the Portuguese stop-word list
#   2 - tokens produced by `stemmed_words`
# NOTE(review): scikit-learn only applies `stop_words` when analyzer == 'word'; with the
# callable analyzer of variant 2, any stop-word filtering has to happen inside
# `stemmed_words` itself.
Tfid = [
    TfidfVectorizer(lowercase=False, analyzer='word', stop_words=None),
    TfidfVectorizer(lowercase=False, analyzer='word', stop_words=stopwords),
    TfidfVectorizer(lowercase=False, analyzer=stemmed_words, stop_words=stopwords),
]

transform_name = ['Bow', 'Bow_stopwords', 'Bow_stopwords_stemming']

In [76]:
# Set the parameters by cross-validation
param_grid = [{'SVM__kernel': ['linear'],
               'SVM__C': [2e-3, 2e-1, 2e1, 2e3, 2e5, 2e7, 2e9, 2e11, 2e13, 2e15]},

              {'SVM__kernel': ['rbf'], 
               'SVM__gamma': [1e-1, 1e-2, 1e-3, 1e-4], 
               'SVM__C': [2e-3, 2e-1, 2e1, 2e3, 2e5, 2e7, 2e9, 2e11, 2e13, 2e15]}]

In [77]:
# Metrics recorded for every grid point; the search loop below refits on 'f1'.
scoring = ['accuracy','f1','precision','recall']

In [75]:
# Grid-search each vectorizer variant with the same SVM grid and persist the
# cross-validation table of every run.
results_dir = './Resultados'
# to_pickle does not create missing directories; make sure the target exists up front
# so a long search is not lost at the save step.
os.makedirs(results_dir, exist_ok=True)

for i, (name, tfidf) in enumerate(zip(transform_name, Tfid)):
    # The SVM step is named 'SVM' so the 'SVM__*' keys of param_grid reach it.
    steps = [(name, tfidf), ('SVM', SVC())]
    pipeline = Pipeline(steps)

    # 3-fold search tracking all metrics in `scoring`; the final model is refit on F1.
    grid = GridSearchCV(pipeline, param_grid=param_grid, cv=3, scoring=scoring, refit='f1')
    grid.fit(X_train, y_train)

    grid_results = pd.DataFrame.from_dict(grid.cv_results_)
    grid_results.to_pickle(f'{results_dir}/grid_{i}_results.pkl')

ValueError: Invalid parameter C for estimator Pipeline(steps=[('Bow', TfidfVectorizer(lowercase=False)), ('SVM', SVC())]). Check the list of available parameters with `estimator.get_params().keys()`.