In [4]:
import os
import glob
import pandas as pd
from collections import Counter
import numpy as np
import time

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV

import scipy.stats

In [9]:
combined_csv = pd.read_csv('combined_csv.csv')
combined_csv.head()

Unnamed: 0,Token,Tag,Sentence #
0,Ementa,INICIO_ARQ,Sentence 1
1,,O,Sentence 1
2,e,O,Sentence 1
3,,O,Sentence 1
4,Acórdão,O,Sentence 1


In [10]:
new_classes = np.unique(combined_csv.Tag.values).tolist()
new_classes.remove('O')
new_classes

['B_Doutrina',
 'B_Doutrinador',
 'B_Pessoa',
 'B_Precedente',
 'B_Ref. Legislativa',
 'FIM_ARQ',
 'INICIO_ARQ',
 'I_Doutrina',
 'I_Doutrinador',
 'I_Pessoa',
 'I_Precedente',
 'I_Ref. Legislativa']

In [11]:
combined_csv['Token'] = combined_csv['Token'].astype('str')

In [12]:
# Separa as frases para criar contexto na aprendizagem

class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, t) for w, t in zip(s['Token'].values.tolist(),
                                                     s['Tag'].values.tolist())]
        self.grouped = self.data.groupby('Sentence #').apply(agg_func)
        self.sentences = [s for s in self.grouped]
        
    def get_next(self):
        try: 
            s = self.grouped['Sentence {}'.format(self.n_sent)]
            self.n_sent += 1
            return s 
        except:
            return None

In [13]:
def word2features(sent, i):
    word = sent[i][0]
#     postag = sent[i][1]
    
    features = {
        'bias': 1.0, 
        'word.lower()': word.lower(), 
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
    }
    if i > 0:
        word1 = sent[i-1][0]
#         postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
        })
    else:
        features['BOS'] = True
    if i < len(sent)-1:
        word1 = sent[i+1][0]
#         postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
        })
    else:
        features['EOS'] = True

    return features

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, label in sent]

def sent2tokens(sent):
    return [token for token, label in sent]

In [14]:
getter = SentenceGetter(combined_csv)
sentences = getter.sentences

In [15]:
X = [sent2features(s) for s in sentences]
y = [sent2labels(s) for s in sentences]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [18]:
sent = getter.get_next()
print(sent)

[('RELATOR', 'O'), (' ', 'O'), (':', 'O'), (' ', 'O'), ('MIN', 'B_Pessoa'), ('.', 'I_Pessoa'), (' ', 'I_Pessoa'), ('ROBERTO', 'I_Pessoa'), (' ', 'I_Pessoa'), ('BARROSORECLTE', 'O'), ('.', 'O'), ('(', 'O'), ('S', 'O'), (')', 'O'), (' ', 'O'), (':', 'O'), ('ABRIL', 'O'), (' ', 'O'), ('COMUNICAÇÕES', 'O'), (' ', 'O'), ('S', 'O'), ('/', 'O'), ('A', 'O'), (' ', 'O'), ('ADV', 'O'), ('.', 'O'), ('(', 'O'), ('A', 'O'), ('/', 'O'), ('S', 'O'), (')', 'O'), (' ', 'O'), (':', 'O'), ('ALEXANDRE', 'B_Pessoa'), (' ', 'I_Pessoa'), ('FIDALGO', 'I_Pessoa'), (' ', 'I_Pessoa'), ('E', 'O'), (' ', 'O'), ('OUTRO', 'O'), ('(', 'O'), ('A', 'O'), ('/', 'O'), ('S', 'O'), (')', 'O'), ('RECLDO', 'O'), ('.', 'O'), ('(', 'O'), ('A', 'O'), ('/', 'O'), ('S', 'O'), (')', 'O'), (' ', 'O'), (':', 'O'), ('JUÍZA', 'O'), (' ', 'O'), ('DE', 'O'), (' ', 'O'), ('DIREITO', 'O'), (' ', 'O'), ('DA', 'O'), (' ', 'O'), ('7ª', 'O'), (' ', 'O'), ('VARA', 'O'), (' ', 'O'), ('CÍVEL', 'O'), (' ', 'O'), ('DO', 'O'), (' ', 'O'), ('FORO', 

In [18]:
# np.savetxt('xtrain.txt', X_train, fmt='%s')
# np.savetxt('ytrain.txt', y_train, fmt='%s')

# CRF (Conditional Random Fields)

In [19]:
%%time 

crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs', #Gradient Descent
    c1=0.1, # coefficient for L1 penalty
    c2=0.1, # coefficient for L2 penalty
    max_iterations=100,
    all_possible_transitions=True) # whether to include transitions that are possible, but not observed

# crf = sklearn_crfsuite.CRF( # Performa 'pior'
#     algorithm='l2sgd', # Stochastic Gradient Descent with L2 regularization term
#     c2=0.1,
#     max_iterations=100,
#     all_possible_transitions=True) # whether to include transitions that are possible, but not observed

crf.fit(X_train, y_train)

y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=new_classes)

CPU times: user 1min 56s, sys: 283 ms, total: 1min 56s
Wall time: 1min 53s


0.8272271452962323

In [20]:
print(metrics.flat_classification_report(y_test, y_pred, labels = new_classes, digits=3))

                    precision    recall  f1-score   support

        B_Doutrina      0.714     0.588     0.645        85
     B_Doutrinador      0.653     0.395     0.492        81
          B_Pessoa      0.713     0.657     0.684       757
      B_Precedente      0.868     0.746     0.802       841
B_Ref. Legislativa      0.822     0.703     0.758       791
           FIM_ARQ      1.000     0.755     0.860        53
        INICIO_ARQ      1.000     0.755     0.860        53
        I_Doutrina      0.837     0.714     0.771      3726
     I_Doutrinador      0.797     0.813     0.805      3472
          I_Pessoa      0.850     0.771     0.809      4402
      I_Precedente      0.881     0.830     0.854     12454
I_Ref. Legislativa      0.882     0.822     0.851     10692

         micro avg      0.860     0.798     0.828     37407
         macro avg      0.835     0.712     0.766     37407
      weighted avg      0.860     0.798     0.827     37407



In [21]:
# Cria arquivo com todas as informações geradas pelo crf.

crf.tagger_.dump(filename="crf_tagger.txt")

In [22]:
# # Encontra melhor parâmetro

# crf = sklearn_crfsuite.CRF(
#     algorithm='lbfgs',
#     max_iterations=100,
#     all_possible_transitions=True
# )
# params_space = {
#     'c1': scipy.stats.expon(scale=0.5),
#     'c2': scipy.stats.expon(scale=0.05),
# }

# # use the same metric for evaluation
# f1_scorer = make_scorer(metrics.flat_f1_score,
#                         average='weighted', labels=new_classes)

# # search
# rs = RandomizedSearchCV(crf, params_space,
#                         cv=3,
#                         verbose=1,
#                         n_jobs=-1,
#                         n_iter=50,
#                         scoring=f1_scorer)
# rs.fit(X_train, y_train)

In [23]:
# print('best params:', rs.best_params_)
# print('best CV score:', rs.best_score_)
# print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

In [24]:
# crf = rs.best_estimator_
# y_pred = crf.predict(X_test)
# print(metrics.flat_classification_report(y_test, y_pred, labels=new_classes))

# Inspeciona dados que o modelo previu.

In [26]:
# Cria dataframe com o que o modelo predisse para análise.

result = []
for i in range(len(y_test)):
    df = pd.DataFrame(zip([X_test[i][j]['word.lower()'] for j in range(len(X_test[i]))], y_test[i], y_pred[i]))
#     df = pd.DataFrame(zip(X_test[i], y_test[i], y_pred[i]))
    result.append(df)
    
result = pd.concat(result)
result.reset_index(inplace=True, drop=True)

In [27]:
# combined_csv = pd.concat(frames)
result.to_csv("resultado.csv", index=False, encoding='utf-8')

In [28]:
result.shape

(325149, 3)

In [29]:
result.columns = ['X_test','y_test', 'y_pred']
result.head()

Unnamed: 0,X_test,y_test,y_pred
0,publique,O,O
1,-,O,O
2,se,O,O
3,.,O,O
4,,O,O


In [30]:
result[result['y_test'] != result['y_pred']]

Unnamed: 0,X_test,y_test,y_pred
875,\n,O,I_Precedente
876,\n,O,I_Precedente
1632,artigo,B_Ref. Legislativa,O
1633,,I_Ref. Legislativa,O
1634,24,I_Ref. Legislativa,O
1635,",",I_Ref. Legislativa,O
1648,,B_Ref. Legislativa,O
1649,incisos,I_Ref. Legislativa,O
1650,,I_Ref. Legislativa,O
1651,ix;,I_Ref. Legislativa,O


In [31]:
print('Proporção de erros:',result[result.y_test != result.y_pred].shape[0] / result.shape[0])

Proporção de erros: 0.03700457328793876


In [57]:
df_res = result
df_combined = combined_csv

In [58]:
df_res.head()

Unnamed: 0,X_test,y_test,y_pred
0,publique,O,O
1,-,O,O
2,se,O,O
3,.,O,O
4,,O,O


In [6]:
df_res.shape

(325149, 3)

In [59]:
df_erros = df_res[df_res['y_test'] != df_res['y_pred']]
df_erros.head()

Unnamed: 0,X_test,y_test,y_pred
875,\n,O,I_Precedente
876,\n,O,I_Precedente
1632,artigo,B_Ref. Legislativa,O
1633,,I_Ref. Legislativa,O
1634,24,I_Ref. Legislativa,O


In [72]:
df_erros[(df_erros['X_test']==' ') & (df_erros['y_test'].str.startswith('B_'))]
indices = df_erros[(df_erros['X_test']==' ') & (df_erros['y_test'].str.startswith('B_'))].index.values

In [86]:
n = 10
pos = indices[1]

print('Retorna %d linhas antes e depois do ocorrido na linha %d para tag que começa com B_'%(n,pos)),df_res.iloc[pos-n:pos+n]

Retorna 10 linhas antes e depois do ocorrido na linha 19073 para tag que começa com B_


(None,         X_test        y_test        y_pred
 19063        .  I_Precedente  I_Precedente
 19064           I_Precedente  I_Precedente
 19065      min  I_Precedente  I_Precedente
 19066        .  I_Precedente  I_Precedente
 19067           I_Precedente  I_Precedente
 19068  moreira  I_Precedente  I_Precedente
 19069           I_Precedente  I_Precedente
 19070    alves  I_Precedente  I_Precedente
 19071           I_Precedente  I_Precedente
 19072        –             O             O
 19073           B_Precedente             O
 19074      pet  I_Precedente  B_Precedente
 19075           I_Precedente  I_Precedente
 19076      129  I_Precedente  I_Precedente
 19077        /  I_Precedente  I_Precedente
 19078       pr  I_Precedente  I_Precedente
 19079        ,  I_Precedente  I_Precedente
 19080           I_Precedente  I_Precedente
 19081      rel  I_Precedente  I_Precedente
 19082        .  I_Precedente  I_Precedente)

In [87]:
df_res

Unnamed: 0,X_test,y_test,y_pred
0,publique,O,O
1,-,O,O
2,se,O,O
3,.,O,O
4,,O,O
5,brasília,O,O
6,",",O,O
7,,O,O
8,26,O,O
9,,O,O


In [82]:
for i in indices:
    df.iloc[pos-n:pos+n]

106