In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV

In [2]:
#Carregando os dados
train = pd.read_csv('./data/train_small.csv')
test = pd.read_csv('./data/test_small.csv')


In [3]:
#Vizualizando os dados
train.head()

Unnamed: 0,themes,process_id,file_name,document_type,pages,body
0,[232],AI_856934,AI_856934_1926210_1060_17072013.pdf,outros,1,"{""tribunal justiça estado bahia poder judiciár..."
1,[232],AI_856934,AI_856934_1926211_34_17072013.pdf,outros,1,"{""excelentíssimo senhor doutor juiz direito ju..."
2,[232],AI_856934,AI_856934_1926211_34_17072013.pdf,outros,2,"{""razões recurso inominado recorrente atlantic..."
3,[232],AI_856934,AI_856934_1926211_34_17072013.pdf,outros,3,"{""empresa recorrente tornou credora dos débito..."
4,[232],AI_856934,AI_856934_1926211_34_17072013.pdf,outros,4,"{""entretanto verdade parte apelante tornou tit..."


In [4]:
#definindo tamanho do dataset que será utilizado
porcentagem_de_treino = 0.3

tamanho_de_treino = int(0.3 * len(train))
tamanho_de_teste = int(0.2 * len(test))

#Pega os dados de treino
treino_dados = train[0:tamanho_de_treino]
teste_dados = test[0:tamanho_de_teste]


In [5]:
#Particionando os dados
x_train = treino_dados['body']
y_train = treino_dados['document_type']

x_test = teste_dados['body']
y_test = teste_dados['document_type']

scoring = {
    'f1_micro',
    'f1_macro',
    'f1_weighted'
}

## PreProcessing - GPAM


In [6]:
from pre_processing import Pipeline

for text in x_train:
    Pipeline().apply(text)

for text in x_test:
    Pipeline().apply(text)
    



# TfidfVectorizer + MultinomialNB

In [7]:
from sklearn.pipeline import make_pipeline, Pipeline

pipeline = Pipeline([
    ('vect', TfidfVectorizer()),
    ('clf', MultinomialNB())
])

parameters = {
    'vect__min_df': [1, 2, 3],
    'vect__smooth_idf': [True],
    'vect__ngram_range': [(1, 1), (1, 2), (1, 3)]
}

grid_search = GridSearchCV(estimator=pipeline, param_grid=parameters, n_jobs=-1, verbose=1, scoring=scoring, refit='f1_micro')
grid_search.fit(x_train, y_train)

print("Best parameters:")
print(grid_search.best_params_)

print("Best scorers: ")
print(grid_search.best_score_)

tfidf_naive = grid_search.best_estimator_

Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best parameters:
{'vect__min_df': 3, 'vect__ngram_range': (1, 3), 'vect__smooth_idf': True}
Best scorers: 
0.9344130459064001


In [8]:
#Previsão
print('F1-Score (micro) Test: ', f1_score(y_test, tfidf_naive.predict(x_test), average='micro'))

F1-Score (micro) Test:  0.9188694059146821


# TfidfVectorizer + SDGClassifier

In [9]:
pipeline = Pipeline([
    ('vect', TfidfVectorizer()),
    ('clf',  SGDClassifier())
])

parameters = {
    'vect__min_df': [1, 2, 3],
    'vect__smooth_idf': [True],
    'vect__ngram_range': [(1, 1), (1, 2), (1, 3)]
}


grid_search = GridSearchCV(estimator=pipeline, param_grid=parameters, n_jobs=-1, verbose=10, scoring=scoring, refit='f1_micro')
grid_search.fit(x_train, y_train)
print("Best parameters:")
print(grid_search.best_params_)
print("Best scorers: ")
print(grid_search.best_score_)
tfidf_sdg = grid_search.best_estimator_


Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV 4/5; 1/9] START vect__min_df=1, vect__ngram_range=(1, 1), vect__smooth_idf=True
[CV 2/5; 1/9] START vect__min_df=1, vect__ngram_range=(1, 1), vect__smooth_idf=True
[CV 3/5; 1/9] START vect__min_df=1, vect__ngram_range=(1, 1), vect__smooth_idf=True
[CV 1/5; 1/9] START vect__min_df=1, vect__ngram_range=(1, 1), vect__smooth_idf=True
[CV 3/5; 1/9] END vect__min_df=1, vect__ngram_range=(1, 1), vect__smooth_idf=True; f1_macro: (test=0.283) f1_micro: (test=0.951) f1_weighted: (test=0.933) total time=  21.7s
[CV 4/5; 1/9] END vect__min_df=1, vect__ngram_range=(1, 1), vect__smooth_idf=True; f1_macro: (test=0.311) f1_micro: (test=0.953) f1_weighted: (test=0.940) total time=  22.0s
[CV 1/5; 1/9] END vect__min_df=1, vect__ngram_range=(1, 1), vect__smooth_idf=True; f1_macro: (test=0.174) f1_micro: (test=0.934) f1_weighted: (test=0.904) total time=  21.7s
[CV 5/5; 1/9] START vect__min_df=1, vect__ngram_range=(1, 1), vect__smooth_idf=Tru

In [10]:
print('F1-Score (micro) Test: ', f1_score(y_test, tfidf_sdg.predict(x_test), average='micro')) # 0.93 - antes 0.90

F1-Score (micro) Test:  0.9269824653232138


# CountVectorizer + SDGClassifier

In [11]:
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', SGDClassifier())
])

parameters = {
    'vect__min_df': [1, 2, 3],
    'vect__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'vect__max_df': [0.5, 0.75, 1.0]
}

grid_search = GridSearchCV(estimator=pipeline, param_grid=parameters, n_jobs=-1, verbose=10, scoring=scoring, refit='f1_micro')
grid_search.fit(x_train, y_train)

print("Best parameters:")
print(grid_search.best_params_)

print("Best scorers: ")
print(grid_search.best_score_)

countv_sdg = grid_search.best_estimator_

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV 1/5; 1/27] START vect__max_df=0.5, vect__min_df=1, vect__ngram_range=(1, 1).
[CV 2/5; 1/27] START vect__max_df=0.5, vect__min_df=1, vect__ngram_range=(1, 1).
[CV 3/5; 1/27] START vect__max_df=0.5, vect__min_df=1, vect__ngram_range=(1, 1).
[CV 4/5; 1/27] START vect__max_df=0.5, vect__min_df=1, vect__ngram_range=(1, 1).
[CV 3/5; 1/27] END vect__max_df=0.5, vect__min_df=1, vect__ngram_range=(1, 1); f1_macro: (test=0.383) f1_micro: (test=0.949) f1_weighted: (test=0.944) total time=  26.5s
[CV 1/5; 1/27] END vect__max_df=0.5, vect__min_df=1, vect__ngram_range=(1, 1); f1_macro: (test=0.499) f1_micro: (test=0.923) f1_weighted: (test=0.920) total time=  27.1s
[CV 5/5; 1/27] START vect__max_df=0.5, vect__min_df=1, vect__ngram_range=(1, 1).
[CV 1/5; 2/27] START vect__max_df=0.5, vect__min_df=1, vect__ngram_range=(1, 2).
[CV 4/5; 1/27] END vect__max_df=0.5, vect__min_df=1, vect__ngram_range=(1, 1); f1_macro: (test=0.673) f1_micro: 

In [None]:
print('F1-Score (micro) Test: ', f1_score(y_test, countv_sdg.predict(x_test), average='micro'))

F1-Score (micro) Test:  0.9396493064642764
