# Conditional Random Fields

CRFs fall into the sequence modeling family. Whereas a discrete classifier predicts a label for a single sample without considering "neighboring" samples, a CRF can take context into account; e.g., the linear chain CRF (which is popular in natural language processing) predicts sequences of labels for sequences of input samples. 


CRFs are a type of discriminative, undirected probabilistic graphical model. They encode known relationships between observations in order to construct consistent interpretations, and they are often used for labeling or parsing sequential data, for example in natural language processing, biological sequence analysis, and computer vision.

In [16]:
import os
import time
import glob
import numpy as np
import pandas as pd
import pandas_profiling
from collections import Counter

import sklearn_crfsuite
from sklearn_crfsuite import metrics
from sklearn_crfsuite import scorers
from sklearn.metrics import make_scorer
from sklearn.linear_model import Perceptron
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier

import scipy.stats

%autosave 60
%matplotlib inline

Autosaving every 60 seconds


In [2]:
# Run this notebook after the preprocessing notebook (which presumably produces
# 'preprocessados1.csv' -- confirm).
# keep_default_na=False stops pandas from parsing literal tokens such as "NA",
# "null" or "nan" as missing values; with the default settings those tokens are
# read as NaN and end up as the string 'nan' once the Token column is cast to str.
combined_csv = pd.read_csv('preprocessados1.csv', keep_default_na=False)
combined_csv.head()

Unnamed: 0,Token,Tag,Sentence #
0,EMENTA,INICIO_ARQ,Sentence 1
1,:,O,Sentence 1
2,,O,Sentence 1
3,CONSTITUCIONAL,O,Sentence 1
4,.,O,Sentence 1


In [3]:
# (rows, columns) of the loaded token table.
combined_csv.shape

(15090200, 3)

In [17]:
# Build the pandas-profiling report for the whole table.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt; profiling
# the full frame is presumably too slow -- consider profiling a sample instead.
combined_csv.profile_report()

KeyboardInterrupt: 

In [4]:
# Labels handed to the CRF metrics: every distinct tag except 'O'.
# list.remove raises ValueError if the data contains no 'O' tag at all.
new_classes = np.unique(combined_csv['Tag'].values).tolist()
new_classes.remove('O')
new_classes

['B_Doutrina',
 'B_Pessoa',
 'B_Precedente',
 'B_Ref. Legislativa',
 'FIM_ARQ',
 'INICIO_ARQ',
 'I_Doutrina',
 'I_Pessoa',
 'I_Precedente',
 'I_Ref. Legislativa']

In [5]:
# Force every token to str so the string methods used in feature extraction
# (lower, isupper, slicing, ...) work on all rows.
# Note: astype('str') turns missing values (NaN) into the literal string 'nan'.
combined_csv['Token'] = combined_csv['Token'].astype('str')

In [6]:
# Group the tokens by sentence so the model can learn from each token's context.

class SentenceGetter(object):
    """Group a token-level DataFrame into sentences of ``(token, tag)`` pairs.

    ``data`` must provide the columns ``'Token'``, ``'Tag'`` and ``'Sentence #'``.

    Attributes:
        n_sent: number used to build the label (``'Sentence <n_sent>'``) of the
            sentence the next ``get_next()`` call returns; starts at 1.
        data: the DataFrame passed to the constructor.
        empty: always ``False``; never read here, kept for backward compatibility.
        grouped: Series indexed by ``'Sentence #'`` whose values are lists of
            ``(token, tag)`` tuples.
        sentences: the values of ``grouped`` as a plain list. ``groupby`` sorts
            its keys, so the order follows the sorted ``'Sentence #'`` labels
            (string labels sort lexicographically, e.g. "Sentence 10" comes
            before "Sentence 2").
    """

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False

        def agg_func(s):
            # One sentence -> list of (token, tag) tuples in row order.
            return list(zip(s['Token'].values.tolist(), s['Tag'].values.tolist()))

        # Select the two needed columns explicitly so apply() does not depend on
        # whether pandas passes the grouping column to the function.
        self.grouped = self.data.groupby('Sentence #')[['Token', 'Tag']].apply(agg_func)
        self.sentences = list(self.grouped)

    def get_next(self):
        """Return the sentence labelled ``'Sentence <n_sent>'`` and advance ``n_sent``.

        Returns ``None`` (without advancing) when no sentence has that label.
        Only the missing-label ``KeyError`` is handled; any other error
        propagates instead of being silently turned into ``None``.
        """
        try:
            s = self.grouped['Sentence {}'.format(self.n_sent)]
        except KeyError:
            return None
        self.n_sent += 1
        return s

In [7]:
def word2features(sent, i):
    """Build the feature dict for the token at position ``i`` of ``sent``.

    ``sent`` is a sequence of tuples whose first element is the token text.
    The dict holds a constant bias, features of the token itself (lowercased
    form, last 3 and last 2 characters, upper/title/digit flags), and the
    lowercased form plus title/upper flags of the previous and next tokens.
    ``'BOS'`` is set when there is no previous token and ``'EOS'`` when there
    is no next token.
    """
    current = sent[i][0]

    features = {'bias': 1.0}
    features['word.lower()'] = current.lower()
    features['word[-3:]'] = current[-3:]
    features['word[-2:]'] = current[-2:]
    features['word.isupper()'] = current.isupper()
    features['word.istitle()'] = current.istitle()
    features['word.isdigit()'] = current.isdigit()

    # Left neighbour, or the beginning-of-sentence marker.
    if i <= 0:
        features['BOS'] = True
    else:
        previous = sent[i - 1][0]
        features['-1:word.lower()'] = previous.lower()
        features['-1:word.istitle()'] = previous.istitle()
        features['-1:word.isupper()'] = previous.isupper()

    # Right neighbour, or the end-of-sentence marker.
    if i >= len(sent) - 1:
        features['EOS'] = True
    else:
        following = sent[i + 1][0]
        features['+1:word.lower()'] = following.lower()
        features['+1:word.istitle()'] = following.istitle()
        features['+1:word.isupper()'] = following.isupper()

    return features

def sent2features(sent):
    """Return one feature dict per token of ``sent``, in token order."""
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2features(sent, position))
    return per_token

def sent2labels(sent):
    """Return the second element (the tag) of every ``(token, tag)`` pair in ``sent``."""
    tags = []
    for _token, tag in sent:
        tags.append(tag)
    return tags

def sent2tokens(sent):
    """Return the first element (the token) of every ``(token, tag)`` pair in ``sent``."""
    tokens = []
    for word, _tag in sent:
        tokens.append(word)
    return tokens

In [13]:
# Group the token table into sentences; each sentence is a list of (token, tag) pairs.
getter = SentenceGetter(combined_csv)
sentences = getter.sentences

In [12]:
# Per-sentence feature dicts (X) and tag sequences (y), followed by a
# reproducible 70/30 train/test split made at sentence level.
X = list(map(sent2features, sentences))
y = list(map(sent2labels, sentences))
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

NameError: name 'sentences' is not defined

In [10]:
# Fetch the next sentence from the getter (None when there is no such sentence).
sent = getter.get_next()
# print(sent)

[('~~e»to', 'INICIO_ARQ'), (' ', 'O'), ('c;JJ;~', 'O'), ('.', 'O'), ('~wa', 'O'), ('/', 'O'), (' ', 'O'), ('pfi~', 'O'), ('/', 'O'), ("<'d", 'O'), (' ', 'O')]


In [40]:
# Report how many sentences ended up in the training and in the test set.
print('Conjunto de treino: %d, conjunto de teste: %d'%(len(X_train),len(X_test)))

Conjunto de treino: 5411, conjunto de teste: 2319


In [46]:
# Number of rows (one per token) in the table.
len(combined_csv)

695972

In [48]:
# (rows, columns) of the token table.
combined_csv.shape

(695972, 3)

In [50]:
# Number of distinct values in the 'Sentence #' column.
len(combined_csv['Sentence #'].unique())

7730

# CRF (Conditional Random Fields)

In [11]:
%%time 

# Train a CRF on the training sentences, predict the test sentences and
# display the weighted F1 restricted to the labels in new_classes.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs', # gradient descent using the L-BFGS method
    c1=0.1, # coefficient for L1 penalty
    c2=0.1, # coefficient for L2 penalty
    max_iterations=100,
    all_possible_transitions=True) # whether to include transitions that are possible, but not observed

# Alternative configuration, kept for reference (noted by the author as performing worse):
# crf = sklearn_crfsuite.CRF(
#     algorithm='l2sgd', # Stochastic Gradient Descent with L2 regularization term
#     c2=0.1,
#     max_iterations=100,
#     all_possible_transitions=True) # whether to include transitions that are possible, but not observed

crf.fit(X_train, y_train)

# The last expression is the cell's displayed result.
y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=new_classes)

CPU times: user 1min 5s, sys: 217 ms, total: 1min 5s
Wall time: 1min 3s


0.8963811215929433

In [12]:
# Per-label precision/recall/F1 on the test set, restricted to the labels in new_classes.
print(metrics.flat_classification_report(y_test, y_pred, labels = new_classes, digits=3))

                    precision    recall  f1-score   support

        B_Doutrina      0.729     0.653     0.689        95
          B_Pessoa      0.831     0.755     0.791       620
      B_Precedente      0.816     0.751     0.782       555
B_Ref. Legislativa      0.900     0.835     0.866       667
           FIM_ARQ      1.000     0.769     0.870        26
        INICIO_ARQ      1.000     0.800     0.889        25
        I_Doutrina      0.923     0.915     0.919      4975
          I_Pessoa      0.902     0.872     0.887      4090
      I_Precedente      0.930     0.868     0.898      9447
I_Ref. Legislativa      0.928     0.883     0.905      9484

         micro avg      0.919     0.875     0.897     29984
         macro avg      0.896     0.810     0.850     29984
      weighted avg      0.919     0.875     0.896     29984



In [13]:
# Write a file with all the information produced by the trained CRF.

crf.tagger_.dump(filename="crf_tagger.txt")

In [22]:
# # Encontra melhor parâmetro

# crf = sklearn_crfsuite.CRF(
#     algorithm='lbfgs',
#     max_iterations=100,
#     all_possible_transitions=True
# )
# params_space = {
#     'c1': scipy.stats.expon(scale=0.5),
#     'c2': scipy.stats.expon(scale=0.05),
# }

# # use the same metric for evaluation
# f1_scorer = make_scorer(metrics.flat_f1_score,
#                         average='weighted', labels=new_classes)

# # search
# rs = RandomizedSearchCV(crf, params_space,
#                         cv=3,
#                         verbose=1,
#                         n_jobs=-1,
#                         n_iter=50,
#                         scoring=f1_scorer)
# rs.fit(X_train, y_train)

In [23]:
# print('best params:', rs.best_params_)
# print('best CV score:', rs.best_score_)
# print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

In [24]:
# crf = rs.best_estimator_
# y_pred = crf.predict(X_test)
# print(metrics.flat_classification_report(y_test, y_pred, labels=new_classes))

# Inspeciona dados que o modelo previu.

In [14]:
# Build a token-level table of what the model predicted, for error analysis:
# one row per test token with its lowercased text, true tag and predicted tag.
# The columns are named here, at construction, so that every later use of
# `result` (including the CSV export) sees a meaningful header instead of 0, 1, 2.

rows = [
    (token_features['word.lower()'], true_tag, predicted_tag)
    for sent_features, sent_true, sent_pred in zip(X_test, y_test, y_pred)
    for token_features, true_tag, predicted_tag in zip(sent_features, sent_true, sent_pred)
]
# A single DataFrame construction gives a fresh 0..n-1 index, so no concat or
# in-place reset_index is needed.
result = pd.DataFrame(rows, columns=['X_test', 'y_test', 'y_pred'])

In [15]:
# Save the prediction table to resultado.csv (UTF-8, without the index column).
result.to_csv("resultado.csv", index=False, encoding='utf-8')

In [16]:
# (rows, columns) of the prediction table.
result.shape

(210816, 3)

In [17]:
# Name the columns: token text (lowercased), true tag, predicted tag.
result.columns = ['X_test','y_test', 'y_pred']
result.head()

Unnamed: 0,X_test,y_test,y_pred
0,5,O,O
1,.,O,O
2,,O,O
3,o,O,O
4,,O,O


In [20]:
# First 20 tokens whose predicted tag differs from the true tag.
result[result['y_test'] != result['y_pred']].head(20)

Unnamed: 0,X_test,y_test,y_pred
24,subprocurador,B_Pessoa,O
25,-,I_Pessoa,O
26,geral,I_Pessoa,O
27,,I_Pessoa,O
28,da,I_Pessoa,O
29,,I_Pessoa,O
30,república,I_Pessoa,O
31,,I_Pessoa,O
32,luciano,I_Pessoa,B_Pessoa
357,artigos,B_Ref. Legislativa,O


In [21]:
# Share of tokens whose predicted tag differs from the true tag. The denominator is
# every row of `result`, so rows of all tags (including 'O') are counted.
print('Proporção de erros:',result[result.y_test != result.y_pred].shape[0] / result.shape[0])

Proporção de erros: 0.024775159380692167
