In [23]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import f1_score
from tabulate import tabulate

In [3]:
file_train = pd.read_csv('data_files/train_small.csv')
file_test = pd.read_csv('data_files/test_small.csv')

print(file_train[:5])

  themes process_id                            file_name document_type  pages  \
0  [232]  AI_856934  AI_856934_1926210_1060_17072013.pdf        outros      1   
1  [232]  AI_856934    AI_856934_1926211_34_17072013.pdf        outros      1   
2  [232]  AI_856934    AI_856934_1926211_34_17072013.pdf        outros      2   
3  [232]  AI_856934    AI_856934_1926211_34_17072013.pdf        outros      3   
4  [232]  AI_856934    AI_856934_1926211_34_17072013.pdf        outros      4   

                                                body  
0  {"tribunal justiça estado bahia poder judiciár...  
1  {"excelentíssimo senhor doutor juiz direito ju...  
2  {"razões recurso inominado recorrente atlantic...  
3  {"empresa recorrente tornou credora dos débito...  
4  {"entretanto verdade parte apelante tornou tit...  


In [4]:
x_train = file_train['body']
y_train = file_train['document_type']

x_test = file_test['body']
y_test = file_test['document_type']

print(x_train[:5])
print(y_train[:5])

0    {"tribunal justiça estado bahia poder judiciár...
1    {"excelentíssimo senhor doutor juiz direito ju...
2    {"razões recurso inominado recorrente atlantic...
3    {"empresa recorrente tornou credora dos débito...
4    {"entretanto verdade parte apelante tornou tit...
Name: body, dtype: object
0    outros
1    outros
2    outros
3    outros
4    outros
Name: document_type, dtype: object


### TfidfVectorizer + MultinomialNB

In [8]:
pipeline = Pipeline([
  ('vect', TfidfVectorizer()),
  ('clf', MultinomialNB()),
])

parameters = {
  'vect__max_df': (0.5, 0.75, 1.0),
  'vect__ngram_range': ((1, 1), (1, 2)), # unigrams or bigrams
  'clf__alpha': (1, 0.1, 0.01),
}

scoring = {
  'accuracy',
  'f1_micro',
  'f1_macro',
  'f1_weighted'
}

grid_search = GridSearchCV(estimator=pipeline, param_grid=parameters, n_jobs=1, verbose=1, return_train_score=False, scoring=scoring, refit='f1_micro')

grid_search.fit(x_train, y_train)

print('Best parameters: {}'.format(grid_search.best_params_))
print('Best score: {}'.format(grid_search.best_score_))

Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best parameters: {'clf__alpha': 1, 'vect__max_df': 0.5, 'vect__ngram_range': (1, 2)}
Best score: 0.901157353784587


In [10]:
best_estimator = grid_search.best_estimator_

print('F1 Score (micro): {}'.format(f1_score(y_test, best_estimator.predict(x_test), average='micro')))

F1 Score (micro): 0.9034399011787366


### TfidfVectorizer + SGDClassifier

In [11]:
pipeline = Pipeline([
  ('vect', TfidfVectorizer()),
  ('clf', SGDClassifier()),
])

parameters = {
  'vect__max_df': (0.5, 0.75, 1.0),
  'vect__ngram_range': ((1, 1), (1, 2)), # unigrams or bigrams
  'clf__alpha': (1, 0.1, 0.01),
}

scoring = {
  'accuracy',
  'f1_micro',
  'f1_macro',
  'f1_weighted'
}

grid_search = GridSearchCV(estimator=pipeline, param_grid=parameters, n_jobs=1, verbose=1, return_train_score=False, scoring=scoring, refit='f1_micro')

grid_search.fit(x_train, y_train)

print('Best parameters: {}'.format(grid_search.best_params_))
print('Best score: {}'.format(grid_search.best_score_))

Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best parameters: {'clf__alpha': 1, 'vect__max_df': 0.5, 'vect__ngram_range': (1, 1)}
Best score: 0.8989190241242083


In [12]:
best_estimator = grid_search.best_estimator_

print('F1 Score (micro): {}'.format(f1_score(y_test, best_estimator.predict(x_test), average='micro')))

F1 Score (micro): 0.8940811925549065


### CountVectorizer + MultinomialNB

In [15]:
pipeline = Pipeline([
  ('vect', CountVectorizer()),
  ('clf', MultinomialNB()),
])

parameters = {
  'vect__max_df': (0.5, 0.75, 1.0),
  'vect__ngram_range': ((1, 1), (1, 2)), # unigrams or bigrams
  'clf__alpha': (1, 0.1, 0.01),
}

scoring = {
  'accuracy',
  'f1_micro',
  'f1_macro',
  'f1_weighted'
}

grid_search = GridSearchCV(estimator=pipeline, param_grid=parameters, n_jobs=1, verbose=1, return_train_score=False, scoring=scoring, refit='f1_micro')

grid_search.fit(x_train, y_train)

print('Best parameters: {}'.format(grid_search.best_params_))
print('Best score: {}'.format(grid_search.best_score_))

Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best parameters: {'clf__alpha': 1, 'vect__max_df': 0.75, 'vect__ngram_range': (1, 2)}
Best score: 0.866181260134713


In [16]:
best_estimator = grid_search.best_estimator_

print('F1 Score (micro): {}'.format(f1_score(y_test, best_estimator.predict(x_test), average='micro')))

F1 Score (micro): 0.8997550405125306


### CountVectorizer + SGDClassifier

In [17]:
pipeline = Pipeline([
  ('vect', CountVectorizer()),
  ('clf', SGDClassifier()),
])

parameters = {
  'vect__max_df': (0.5, 0.75, 1.0),
  'vect__ngram_range': ((1, 1), (1, 2)), # unigrams or bigrams
  'clf__alpha': (1, 0.1, 0.01),
}

scoring = {
  'accuracy',
  'f1_micro',
  'f1_macro',
  'f1_weighted'
}

grid_search = GridSearchCV(estimator=pipeline, param_grid=parameters, n_jobs=1, verbose=1, return_train_score=False, scoring=scoring, refit='f1_micro')

grid_search.fit(x_train, y_train)

print('Best parameters: {}'.format(grid_search.best_params_))
print('Best score: {}'.format(grid_search.best_score_))

Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best parameters: {'clf__alpha': 0.01, 'vect__max_df': 0.75, 'vect__ngram_range': (1, 2)}
Best score: 0.9198613774324833


In [18]:
best_estimator = grid_search.best_estimator_

print('F1 Score (micro): {}'.format(f1_score(y_test, best_estimator.predict(x_test), average='micro')))

F1 Score (micro): 0.939461507861734


## Results

In [24]:
header = ['Classifiers', 'vect__max_df', 'vect__ngram_range', 'clf__alpha', 'Best Score', 'F1 Score - micro']

metrics = []

metrics.append(['TfidfVectorizer + MultinomialNB', '0.5', '(1, 2)', '1.0', '0.901157', '0.903439'])
metrics.append(['TfidfVectorizer + SGDClassifier', '0.5', '(1, 1)', '1.0', '0.898919', '0.894081'])
metrics.append(['CountVectorizer + MultinomialNB', '0.75', '(1, 2)', '1.0', '0.866181', '0.899755'])
metrics.append(['CountVectorizer + SGDClassifier', '0.75', '(1, 2)', '0.01', '0.919861', '0.939461'])

print(tabulate(metrics, headers=header, tablefmt="fancy_grid"))

╒═════════════════════════════════╤════════════════╤═════════════════════╤══════════════╤══════════════╤════════════════════╕
│ Classifiers                     │   vect__max_df │ vect__ngram_range   │   clf__alpha │   Best Score │   F1 Score - micro │
╞═════════════════════════════════╪════════════════╪═════════════════════╪══════════════╪══════════════╪════════════════════╡
│ TfidfVectorizer + MultinomialNB │           0.5  │ (1, 2)              │         1    │     0.901157 │           0.903439 │
├─────────────────────────────────┼────────────────┼─────────────────────┼──────────────┼──────────────┼────────────────────┤
│ TfidfVectorizer + SGDClassifier │           0.5  │ (1, 1)              │         1    │     0.898919 │           0.894081 │
├─────────────────────────────────┼────────────────┼─────────────────────┼──────────────┼──────────────┼────────────────────┤
│ CountVectorizer + MultinomialNB │           0.75 │ (1, 2)              │         1    │     0.866181 │           0.8