# Prioridades:
+ Parâmetros utilizados no classificador.

+ Analisar o formato dos dados que tem maior acerto e menor acerto tambem.
  
+ Visualização: Separar o conjunto de test em 2 ou 3 arquivos e visualizar o que o modelo classificou e o que os anotadores classificaram (separar por id do anotador).

+ Incluir POS tagging.

+ Tranformar o dado $x_i$ em um $x'_i$ que incorpora os 2 últimos e próximos tokens.

# Detalhes na predição

+ B_ em espaço em branco

+ Remover linhas com '\n' seguidos

+ Para criar um contexto no erro imprimir 10 palavras antes e depois de dois erros.

+ REGEX:
    + Garantir letra e números onde tamanho for maior que 1.
    + Passar múltiplos símbolos para outra linha. Exemplo:  §3º -->  § \n 3 \n º
    + Remover pontuação de centenas dos números. Exemplo: 12.200 --> 12200

# Instruções para os anotadores:

+ Atentar à marcação de tags que envolve espaço 

+ Atentar para não incluir espaço no início da Tag

# Organização do diretório: 

+ Manter toda a análise em somente um diretório

+ Formato do diretório com os datasets: /resources/dataset/

+ Notebooks:
    + '01 - ProcessamentodoDataset.ipynb'
        + Gerar 'treino.csv' e teste.csv' para processamento
    + '02 - [CRF].ipynb' - Criando o modelo
        + Gerar modelo (xxx.model)
        + Métricas
    + '03 - Metricas.ipynb'

In [1]:
import os
import glob
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from collections import Counter
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV
import time
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from collections import Counter

os.chdir("mock")

In [2]:
extension = 'csv'
all_filenames = [i for i in glob.glob('*/**/***/****.{}'.format(extension))]

In [3]:
all_filenames[0]

'161704902/[PRATICA_ETAPA_1]/Documentos/20180510_Rcl_22328_314302526.ner.csv'

# Cria uma tag de inicio e fim de arquivo em cada arquivo antes de apendar todos os arquivos

In [4]:
frames = []
for all_files in all_filenames:
    df = pd.read_csv(all_files,delimiter=';', na_values='NaN')
    df['Tag'].iloc[0] , df['Tag'].iloc[-1] = ['INICIO_ARQ', 'FIM_ARQ']
    frames.append(df)
    
combined_csv = pd.concat(frames)
combined_csv.to_csv( "combined_csv.csv", index=False, encoding='utf-8')

In [5]:
combined_csv.head() , combined_csv.tail()
print("Número de linhas dos arquivos concatenados:", len(combined_csv['Tag']))

Número de linhas dos arquivos concatenados: 967910


In [6]:
# combined_csv.reset_index(inplace=True, drop=True)
combined_csv = combined_csv.reset_index(drop=True)

In [7]:
combined_csv[-10:]

Unnamed: 0,Token,Tag
967900,Chefe,O
967901,,O
967902,do,O
967903,,O
967904,Plenário,O
967905,\n,O
967906,id,O
967907,:,O
967908,,O
967909,20141203_ADI_4350_285683668,FIM_ARQ


# Encontra parágrafo duplo no arquivo. Uma opção de separar por sentenças.

In [8]:
a_df = combined_csv #Simplifica o nome do arquivo 
starts = a_df[a_df['Token']=='\n'].index & a_df[a_df['Token'].shift(-1)=='\n'].index #Identifica os paragrafos duplos
print(u'Padrões(sentenças) encontrados:', len(starts))

Padrões(sentenças) encontrados: 9708


In [9]:
# combined_csv.iloc[:starts[0]+2] # Primeira sentença
# combined_csv.iloc[starts[-1]+2:] # Última sentença

In [10]:
%%time

combined_csv['Sentence #'] = 'Sentence'

combined_csv['Sentence #'][:starts[0]+2] = 'Sentence %d'%(1) # Primeira sentença
combined_csv['Sentence #'][starts[-1]+2:] = 'Sentence %d'%(len(starts)+1) # Última sentença

for i in range(1,len(starts)):
    combined_csv['Sentence #'][starts[i-1]+2:starts[i]+2] = 'Sentence %d'%(i+1) 

combined_csv.head(), combined_csv.tail()

CPU times: user 5min 30s, sys: 82.4 ms, total: 5min 30s
Wall time: 5min 30s


(     Token         Tag  Sentence #
 0   Ementa  INICIO_ARQ  Sentence 1
 1                    O  Sentence 1
 2        e           O  Sentence 1
 3                    O  Sentence 1
 4  Acórdão           O  Sentence 1,
                               Token      Tag     Sentence #
 967905                           \n        O  Sentence 9709
 967906                           id        O  Sentence 9709
 967907                            :        O  Sentence 9709
 967908                                     O  Sentence 9709
 967909  20141203_ADI_4350_285683668  FIM_ARQ  Sentence 9709)

In [11]:
len(combined_csv['Sentence #'].unique())

9709

In [12]:
combined_csv.head()

Unnamed: 0,Token,Tag,Sentence #
0,Ementa,INICIO_ARQ,Sentence 1
1,,O,Sentence 1
2,e,O,Sentence 1
3,,O,Sentence 1
4,Acórdão,O,Sentence 1


In [13]:
# combined_csv.groupby('Tag').size().reset_index(name='counts')

# Modelo

In [14]:
new_classes = np.unique(combined_csv.Tag.values).tolist()
new_classes.remove('O')
new_classes

['B_Doutrina',
 'B_Doutrinador',
 'B_Pessoa',
 'B_Precedente',
 'B_Ref. Legislativa',
 'FIM_ARQ',
 'INICIO_ARQ',
 'I_Doutrina',
 'I_Doutrinador',
 'I_Pessoa',
 'I_Precedente',
 'I_Ref. Legislativa']

In [15]:
combined_csv.head()

Unnamed: 0,Token,Tag,Sentence #
0,Ementa,INICIO_ARQ,Sentence 1
1,,O,Sentence 1
2,e,O,Sentence 1
3,,O,Sentence 1
4,Acórdão,O,Sentence 1


In [16]:
combined_csv['Token'] = combined_csv['Token'].astype('str')

In [17]:
# Separa as frases para criar contexto na aprendizagem

class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, t) for w, t in zip(s['Token'].values.tolist(),
                                                     s['Tag'].values.tolist())]
        self.grouped = self.data.groupby('Sentence #').apply(agg_func)
        self.sentences = [s for s in self.grouped]
        
    def get_next(self):
        try: 
            s = self.grouped['Sentence {}'.format(self.n_sent)]
            self.n_sent += 1
            return s 
        except:
            return None
    
getter = SentenceGetter(combined_csv)
sentences = getter.sentences

In [18]:
def word2features(sent, i):
    word = sent[i][0]
#     postag = sent[i][1]
    
    features = {
        'bias': 1.0, 
        'word.lower()': word.lower(), 
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
    }
    if i > 0:
        word1 = sent[i-1][0]
#         postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
        })
    else:
        features['BOS'] = True
    if i < len(sent)-1:
        word1 = sent[i+1][0]
#         postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
        })
    else:
        features['EOS'] = True

    return features

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, label in sent]

def sent2tokens(sent):
    return [token for token, label in sent]

In [20]:
X = [sent2features(s) for s in sentences]
y = [sent2labels(s) for s in sentences]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

In [21]:
getter = SentenceGetter(combined_csv)

sent = getter.get_next()
print(sent)

[('Ementa', 'INICIO_ARQ'), (' ', 'O'), ('e', 'O'), (' ', 'O'), ('Acórdão', 'O'), (' ', 'O'), ('06/03/2018', 'O'), (' ', 'O'), ('PRIMEIRA', 'O'), (' ', 'O'), ('TURMA', 'O'), (' ', 'O'), ('\n', 'O'), ('\n', 'O')]


In [22]:
%%time 

crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs', #Gradient Descent
    c1=0.1, # coefficient for L1 penalty
    c2=0.1, # coefficient for L2 penalty
    max_iterations=100,
    all_possible_transitions=True) # whether to include transitions that are possible, but not observed

# crf = sklearn_crfsuite.CRF( # Performa 'pior'
#     algorithm='l2sgd', # Stochastic Gradient Descent with L2 regularization term
#     c2=0.1,
#     max_iterations=100,
#     all_possible_transitions=True) # whether to include transitions that are possible, but not observed

crf.fit(X_train, y_train)

y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=new_classes)

CPU times: user 1min 41s, sys: 248 ms, total: 1min 42s
Wall time: 1min 38s


0.8184949279966993

In [23]:
print(metrics.flat_classification_report(y_test, y_pred, labels = new_classes, digits=3))

                    precision    recall  f1-score   support

        B_Doutrina      0.772     0.616     0.685        99
     B_Doutrinador      0.696     0.448     0.545        87
          B_Pessoa      0.698     0.636     0.665       807
      B_Precedente      0.848     0.750     0.796       913
B_Ref. Legislativa      0.817     0.715     0.763       892
           FIM_ARQ      1.000     0.712     0.832        59
        INICIO_ARQ      1.000     0.724     0.840        58
        I_Doutrina      0.758     0.812     0.784      4225
     I_Doutrinador      0.771     0.647     0.703      3565
          I_Pessoa      0.839     0.763     0.799      4701
      I_Precedente      0.881     0.836     0.858     13490
I_Ref. Legislativa      0.869     0.827     0.847     12040

         micro avg      0.844     0.796     0.819     40936
         macro avg      0.829     0.707     0.760     40936
      weighted avg      0.844     0.796     0.818     40936



In [24]:
# Cria arquivo com todas as informações geradas pelo crf.

crf.tagger_.dump(filename="crf_tagger.txt")

The history saving thread hit an unexpected error (OperationalError('disk I/O error')).History will not be written to the database.


# Olhando para a predição de cada X

In [25]:
X_test[0][0]['word.lower()'], y_pred[0][0], y_test[0][0]

('publique', 'O', 'O')

In [26]:
# Cria dataframe com o que o modelo predisse para análise.

result = []
for i in range(len(y_test)):
    df = pd.DataFrame(zip([X_test[i][j]['word.lower()'] for j in range(len(X_test[i]))], y_test[i], y_pred[i]))
#     df = pd.DataFrame(zip(X_test[i], y_test[i], y_pred[i]))
    result.append(df)
    
result = pd.concat(result)
result.reset_index(inplace=True, drop=True)

In [28]:
# combined_csv = pd.concat(frames)
result.to_csv("resultado.csv", index=False, encoding='utf-8')

In [30]:
result.shape

(325149, 3)

In [27]:
result.columns = ['X_test','y_test', 'y_pred']
result.head()

Unnamed: 0,X_test,y_test,y_pred
0,publique,O,O
1,-,O,O
2,se,O,O
3,.,O,O
4,,O,O


In [111]:
result[result['y_test'] != result['y_pred']]

Unnamed: 0,X_test,y_test,y_pred
875,\n,O,I_Precedente
876,\n,O,I_Precedente
1632,artigo,B_Ref. Legislativa,O
1633,,I_Ref. Legislativa,O
1634,24,I_Ref. Legislativa,O
1635,",",I_Ref. Legislativa,O
1648,,B_Ref. Legislativa,O
1649,incisos,I_Ref. Legislativa,O
1650,,I_Ref. Legislativa,O
1651,ix;,I_Ref. Legislativa,O


In [94]:
print('Proporção de erros:',result[result.y_test != result.y_pred].shape[0] / result.shape[0])

Proporção de erros: 0.03700457328793876


# DUMP

### Erro apresentado devido a incapacidade de reconhecer a expressão regular dentro do arquivo '~/anaconda3/lib/python3.7/site-packages/pycrfsuite/_dumpparser.py'. 

Links para recorrer à ajuda: 
+ [https://github.com/TeamHG-Memex/eli5/issues/242]
+ [https://github.com/scrapinghub/python-crfsuite/issues/14]

In [70]:
# Let’s check what classifier learned

def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

print("Top likely transitions:")
print_transitions(Counter(crf.transition_features_).most_common(5))

print("\nTop unlikely transitions:")
print_transitions(Counter(crf.transition_features_).most_common()[-5:])

Top likely transitions:


AttributeError: 'NoneType' object has no attribute 'group'

In [None]:
# np.unique(y_test)
y_test[0] + y_test[1]

In [None]:
y_novo = []
for i in range(len(y_test)):
    y_novo += y_test[i]
y_novo

In [None]:
np.unique(y_novo)

In [None]:
np.sctypes

In [None]:
sum([len(y_test[i]) for i in range(len(y_test))])
# len(y_test[1])

In [None]:
i = 1
combined_csv.iloc[starts[i-1]+2:starts[i]+2]

Parameters

•algorithm(str, optional (default='lbfgs'))  –  Training  algorithm.   Al-lowed values:–'lbfgs'- Gradient descent using the L-BFGS method–'l2sgd'- Stochastic Gradient Descent with L2 regularization term–'ap'- Averaged Perceptron–'pa'- Passive Aggressive (PA)–'arow'- Adaptive Regularization Of Weight Vector (AROW)

•min_freq(float, optional (default=0)) – Cut-off threshold for occurrencefrequency of a feature.  CRFsuite will ignore features whose frequencies of occurrences inthe training data are no greater thanmin_freq. The default is no cut-off.

•all_possible_states(bool, optional (default=False))    –    Specifywhether CRFsuite generates state features that do not even occur in the training data (i.e.,negative state features).  When True, CRFsuite generates state features that associate all ofpossible combinations between attributes and labels.Suppose that the numbers of attributes and labels are A and L respectively, this function willgenerate (A * L) features. Enabling this function may improve the labeling accuracy becausethe CRF model can learn the condition where an item is not predicted to its reference label.However, this function may also increase the number of features and slow down the trainingprocess drastically. This function is disabled by default.

•all_possible_transitions(bool, optional (default=False)) – Spec-ify whether CRFsuite generates transition features that do not even occur in the training data(i.e., negative transition features).  When True, CRFsuite generates transition features thatassociate all of possible label pairs. Suppose that the number of labels in the training data isL, this function will generate (L * L) transition features. This function is disabled by default.

•c1(float, optional (default=0)) – The coefficient for L1 regularization.  If anon-zero value is specified, CRFsuite switches to the Orthant-Wise Limited-memory Quasi-Newton (OWL-QN) method. The default value is zero (no L1 regularization).Supported training algorithms: lbfgs

•c2(float, optional (default=1.0)) – The coefficient for L2 regularization.Supported training algorithms: l2sgd, lbfgs

•max_iterations(int, optional (default=None)) – The maximum numberof iterations for optimization algorithms. Default value depends on training algorithm:–lbfgs - unlimited;–l2sgd - 1000;1.3.  API Reference13
sklearn-crfsuite Documentation, Release 0.3–ap - 100;–pa - 100;–arow - 100.

•num_memories(int, optional (default=6)) – The number of limited memo-ries for approximating the inverse hessian matrix.Supported training algorithms: lbfgs•epsilon(float, optional (default=1e-5)) – The epsilon parameter that de-termines the condition of convergence.Supported training algorithms: ap, arow, lbfgs, pa

•period(int, optional (default=10)) – The duration of iterations to test thestopping criterion.Supported training algorithms: l2sgd, lbfgs

•delta(float, optional (default=1e-5)) – The threshold for the stopping cri-terion; an iteration stops when the improvement of the log likelihood over the lastperioditerations is no greater than this threshold.Supported training algorithms: l2sgd, lbfgs

•linesearch(str, optional (default='MoreThuente')) – The line searchalgorithm used in L-BFGS updates. Allowed values:–'MoreThuente'- More and Thuente’s method;–'Backtracking'- backtracking method with regular Wolfe condition;–'StrongBacktracking'- backtracking method with strong Wolfe condition.Supported training algorithms: lbfgs•max_linesearch(int, optional (default=20)) – The maximum number oftrials for the line search algorithm.Supported training algorithms: lbfgs

•calibration_eta(float, optional (default=0.1)) – The initial value oflearning rate (eta) used for calibration.Supported training algorithms: l2sgd

•calibration_rate(float, optional (default=2.0))  –  The  rate  of  in-crease/decrease of learning rate for calibration.Supported training algorithms: l2sgd

•calibration_samples(int, optional (default=1000)) – The number ofinstances used for calibration. The calibration routine randomly chooses instances no largerthancalibration_samples.Supported training algorithms: l2sgd

•calibration_candidates(int, optional (default=10))  –  The  numberof  candidates  of  learning  rate.   The  calibration  routine  terminates  after  findingcalibra-tion_samplescandidates of learning rates that can increase log-likelihood.Supported training algorithms: l2sgd14Chapter 1.  Contents
sklearn-crfsuite Documentation, Release 0.3

•calibration_max_trials(int, optional (default=20)) – The maximumnumber of trials of learning rates for calibration.  The calibration routine terminates aftertryingcalibration_max_trialscandidate values of learning rates.Supported training algorithms: l2sgd

•pa_type(int, optional (default=1))  –  The  strategy  for  updating  featureweights. Allowed values:–0 - PA without slack variables;–1 - PA type I;–2 - PA type II.Supported training algorithms: pa

•c(float, optional (default=1)) – Aggressiveness parameter (used only for PA-I and PA-II). This parameter controls the influence of the slack term on the objective func-tion.Supported training algorithms: pa

•error_sensitive(bool, optional (default=True)) – If this parameter isTrue,  the optimization routine includes into the objective function the square root of thenumber of incorrect labels predicted by the model.Supported training algorithms: pa

•averaging(bool, optional (default=True)) – If this parameter is True, theoptimization routine computes the average of feature weights at all updates in the trainingprocess (similarly to Averaged Perceptron).Supported training algorithms: pa

•variance(float, optional (default=1)) – The initial variance of every fea-ture weight.  The algorithm initialize a vector of feature weights as a multivariate Gaussiandistribution with mean 0 and variancevariance.Supported training algorithms: arow

•gamma(float, optional (default=1)) – The tradeoff between loss function andchanges of feature weights.Supported training algorithms: arow

•verbose(bool, optional (default=False)) – Enable trainer verbose mode.

•model_filename(str, optional (default=None))  –  A  path  to  an  existingCRFSuite model. This parameter allows to load and use existing crfsuite models.By  default,  model  files  are  created  automatically  and  saved  in  temporary  locations;  thepreferred way to save/load CRF models is to use pickle (or its alternatives like joblib)

In [None]:
X = combined_csv.drop('Tag', axis=1) # Define o conjunto X
v = DictVectorizer(sparse=True) # Função que transforma listas de features em vetores
X = v.fit_transform(X.to_dict('records')) #Aplica a função de vetorização no conjunto 
                                          #X que foi colocado no formato 'records' (informa o que preenche cada coluna 
                                          # da linha i)
y = combined_csv.Tag.values # Define o conjunto y

classes = np.unique(y) # Define quais serão as classes baseado nos valores únicos da coluna y
classes = classes.tolist() # Tranforma as classes de array para lista

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state=0) # Divide o conjunto em treino
                                                                                            #e teste
X_train.shape, y_train.shape # Formato dos dados