In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import f1_score
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from tabulate import tabulate
import re

In [2]:
file_train = pd.read_csv('data_files/train_small.csv')
file_test = pd.read_csv('data_files/test_small.csv')

print(file_train[:5])

  themes process_id                            file_name document_type  pages  \
0  [232]  AI_856934  AI_856934_1926210_1060_17072013.pdf        outros      1   
1  [232]  AI_856934    AI_856934_1926211_34_17072013.pdf        outros      1   
2  [232]  AI_856934    AI_856934_1926211_34_17072013.pdf        outros      2   
3  [232]  AI_856934    AI_856934_1926211_34_17072013.pdf        outros      3   
4  [232]  AI_856934    AI_856934_1926211_34_17072013.pdf        outros      4   

                                                body  
0  {"tribunal justiça estado bahia poder judiciár...  
1  {"excelentíssimo senhor doutor juiz direito ju...  
2  {"razões recurso inominado recorrente atlantic...  
3  {"empresa recorrente tornou credora dos débito...  
4  {"entretanto verdade parte apelante tornou tit...  


In [3]:
x_train = file_train['body']
y_train = file_train['document_type']

x_test = file_test['body']
y_test = file_test['document_type']

print(x_train[:5])
print(y_train[:5])

0    {"tribunal justiça estado bahia poder judiciár...
1    {"excelentíssimo senhor doutor juiz direito ju...
2    {"razões recurso inominado recorrente atlantic...
3    {"empresa recorrente tornou credora dos débito...
4    {"entretanto verdade parte apelante tornou tit...
Name: body, dtype: object
0    outros
1    outros
2    outros
3    outros
4    outros
Name: document_type, dtype: object


In [4]:
porter_stemmer = PorterStemmer()

def stemming_tokenizer(str_input):
  words = re.sub(r"[^A-Za-z0-9\-]", " ", str_input).lower().split()
  words = [porter_stemmer.stem(word) for word in words]
  return words

In [11]:
pipeline = Pipeline([
  ('vect', CountVectorizer()),
  ('tfidf', TfidfTransformer()),
  ('clf', SGDClassifier()),
])

parameters = {
  'vect__max_df': (0.5, 0.75),
  'vect__ngram_range': ((1, 2), (1, 3)), # unigrams or bigrams or trigrams
  'vect__tokenizer': ([stemming_tokenizer]),
  'tfidf__norm': ('l1', 'l2'),
  'clf__alpha': (0.01, 0.001),
}

scoring = {
  'accuracy',
  'f1_micro',
  'f1_macro',
  'f1_weighted'
}

grid_search = GridSearchCV(estimator=pipeline, param_grid=parameters, n_jobs=None, verbose=3, return_train_score=False, scoring=scoring, refit='f1_micro')

grid_search.fit(x_train, y_train)

print('Best parameters: {}'.format(grid_search.best_params_))
print('Best score: {}'.format(grid_search.best_score_))

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV 1/5] END clf__alpha=0.01, tfidf__norm=l1, vect__max_df=0.5, vect__ngram_range=(1, 2), vect__tokenizer=<function stemming_tokenizer at 0x7fc0e07d7a60>; accuracy: (test=0.899) f1_macro: (test=0.158) f1_micro: (test=0.899) f1_weighted: (test=0.851) total time= 9.3min
[CV 2/5] END clf__alpha=0.01, tfidf__norm=l1, vect__max_df=0.5, vect__ngram_range=(1, 2), vect__tokenizer=<function stemming_tokenizer at 0x7fc0e07d7a60>; accuracy: (test=0.899) f1_macro: (test=0.158) f1_micro: (test=0.899) f1_weighted: (test=0.851) total time=12.5min
[CV 3/5] END clf__alpha=0.01, tfidf__norm=l1, vect__max_df=0.5, vect__ngram_range=(1, 2), vect__tokenizer=<function stemming_tokenizer at 0x7fc0e07d7a60>; accuracy: (test=0.899) f1_macro: (test=0.158) f1_micro: (test=0.899) f1_weighted: (test=0.851) total time=12.8min
[CV 4/5] END clf__alpha=0.01, tfidf__norm=l1, vect__max_df=0.5, vect__ngram_range=(1, 2), vect__tokenizer=<function stemming_tokeniz

In [6]:
best_estimator = grid_search.best_estimator_

print('F1 Score (micro): {}'.format(f1_score(y_test, best_estimator.predict(x_test), average='micro')))

F1 Score (micro): 0.8940811925549065


In [7]:
header = ['Classifiers', 'vect__max_df', 'vect__ngram_range', 'vect__tokenizer', 'tfidf__norm', 'clf__alpha', 'Best Score', 'F1 Score - micro']

metrics = []

metrics.append(['CountVectorizer + SGDClassifier', '0.5', '(1, 2)', 'stemming_tokenizer', 'l1', '0.01', '0.898919', '0.894081'])

print(tabulate(metrics, headers=header, tablefmt="fancy_grid"))

╒═════════════════════════════════╤════════════════╤═════════════════════╤════════════════════╤═══════════════╤══════════════╤══════════════╤════════════════════╕
│ Classifiers                     │   vect__max_df │ vect__ngram_range   │ vect__tokenizer    │ tfidf__norm   │   clf__alpha │   Best Score │   F1 Score - micro │
╞═════════════════════════════════╪════════════════╪═════════════════════╪════════════════════╪═══════════════╪══════════════╪══════════════╪════════════════════╡
│ CountVectorizer + SGDClassifier │            0.5 │ (1, 2)              │ stemming_tokenizer │ l1            │         0.01 │     0.898919 │           0.894081 │
╘═════════════════════════════════╧════════════════╧═════════════════════╧════════════════════╧═══════════════╧══════════════╧══════════════╧════════════════════╛
