In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV

In [2]:
# Load the training and test splits from the local data directory.
train, test = (
    pd.read_csv('./data/train_small.csv'),
    pd.read_csv('./data/test_small.csv'),
)


In [3]:
# Preview the first five rows of the training frame.
train.head(n=5)

Unnamed: 0,themes,process_id,file_name,document_type,pages,body
0,[232],AI_856934,AI_856934_1926210_1060_17072013.pdf,outros,1,"{""tribunal justiça estado bahia poder judiciár..."
1,[232],AI_856934,AI_856934_1926211_34_17072013.pdf,outros,1,"{""excelentíssimo senhor doutor juiz direito ju..."
2,[232],AI_856934,AI_856934_1926211_34_17072013.pdf,outros,2,"{""razões recurso inominado recorrente atlantic..."
3,[232],AI_856934,AI_856934_1926211_34_17072013.pdf,outros,3,"{""empresa recorrente tornou credora dos débito..."
4,[232],AI_856934,AI_856934_1926211_34_17072013.pdf,outros,4,"{""entretanto verdade parte apelante tornou tit..."


In [4]:
# Split each frame into the input text (`body`) and the label (`document_type`).
x_train = train['body']
y_train = train['document_type']

x_test = test['body']
y_test = test['document_type']

# NOTE(review): the preview of `train` shows several rows sharing the same
# `file_name`/`process_id` (one row per page). The grid searches below use the
# default CV splitter, which does not group rows, so pages of one document may
# end up in both the fit and validation folds — confirm whether a grouped
# splitter keyed on `process_id` is needed.

# Metrics computed for every grid-search candidate.
# This must be a list (or tuple/dict), not a set: a set is not among the
# documented types for GridSearchCV's `scoring` argument, and it has no
# defined iteration order.
scoring = [
    'f1_micro',
    'f1_macro',
    'f1_weighted',
]

# TfidfVectorizer + MultinomialNB

In [None]:
# TF-IDF features feeding a Multinomial Naive Bayes classifier.
pipeline = Pipeline(steps=[
    ('vect', TfidfVectorizer()),
    ('clf', MultinomialNB()),
])

# Vectorizer settings explored by the search; `smooth_idf` is pinned to True.
parameters = {
    'vect__min_df': [1, 2, 3],
    'vect__smooth_idf': [True],
    'vect__ngram_range': [(1, 1), (1, 2), (1, 3)],
}

# Evaluate every metric in `scoring`; the final model is refit on the
# candidate with the best micro-averaged F1.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    n_jobs=-1,
    verbose=1,
    scoring=scoring,
    refit='f1_micro',
)
grid_search.fit(x_train, y_train)

print("Best parameters:", grid_search.best_params_, sep="\n")
print("Best scorers: ", grid_search.best_score_, sep="\n")

# Keep the refit pipeline for evaluation on the held-out test split.
tfidf_naive = grid_search.best_estimator_

Fitting 5 folds for each of 9 candidates, totalling 45 fits


In [None]:
# Predict on the test split and report the micro-averaged F1.
print(
    'F1-Score (micro) Test: ',
    f1_score(y_true=y_test, y_pred=tfidf_naive.predict(x_test), average='micro'),
)

# TfidfVectorizer + SGDClassifier

In [None]:
# TF-IDF features feeding a linear classifier trained with SGD.
pipeline = Pipeline([
    ('vect', TfidfVectorizer()),
    # Fixed seed: SGDClassifier shuffles the training data between epochs, so
    # without `random_state` the CV and test scores change from run to run.
    ('clf', SGDClassifier(random_state=42))
])

# Vectorizer settings explored by the search; `smooth_idf` is pinned to True.
parameters = {
    'vect__min_df': [1, 2, 3],
    'vect__smooth_idf': [True],
    'vect__ngram_range': [(1, 1), (1, 2), (1, 3)]
}

# Evaluate every metric in `scoring`; the final model is refit on the
# candidate with the best micro-averaged F1.
grid_search = GridSearchCV(estimator=pipeline, param_grid=parameters, n_jobs=-1, verbose=10, scoring=scoring, refit='f1_micro')
grid_search.fit(x_train, y_train)

print("Best parameters:")
print(grid_search.best_params_)

print("Best scorers: ")
print(grid_search.best_score_)

# Keep the refit pipeline for evaluation on the held-out test split.
tfidf_sdg = grid_search.best_estimator_


In [None]:
# Micro-averaged F1 of the tuned TF-IDF + SGD pipeline on the test split.
# Recorded result: 0.93 (previously 0.90).
print(
    'F1-Score (micro) Test: ',
    f1_score(y_true=y_test, y_pred=tfidf_sdg.predict(x_test), average='micro'),
)

# CountVectorizer + SGDClassifier

In [None]:
# Raw token counts feeding a linear classifier trained with SGD.
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    # Fixed seed: SGDClassifier shuffles the training data between epochs, so
    # without `random_state` the CV and test scores change from run to run.
    ('clf', SGDClassifier(random_state=42))
])

# Vectorizer settings explored by the search: minimum/maximum document
# frequency cut-offs and the n-gram range.
parameters = {
    'vect__min_df': [1, 2, 3],
    'vect__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'vect__max_df': [0.5, 0.75, 1.0]
}

# Evaluate every metric in `scoring`; the final model is refit on the
# candidate with the best micro-averaged F1.
grid_search = GridSearchCV(estimator=pipeline, param_grid=parameters, n_jobs=-1, verbose=10, scoring=scoring, refit='f1_micro')
grid_search.fit(x_train, y_train)

print("Best parameters:")
print(grid_search.best_params_)

print("Best scorers: ")
print(grid_search.best_score_)

# Keep the refit pipeline for evaluation on the held-out test split.
countv_sdg = grid_search.best_estimator_

In [None]:
# Micro-averaged F1 of the tuned CountVectorizer + SGD pipeline on the test split.
print(
    'F1-Score (micro) Test: ',
    f1_score(y_true=y_test, y_pred=countv_sdg.predict(x_test), average='micro'),
)