In [None]:
import pycrfsuite
from IPython.core.display import display, HTML
from m16_mlutils.datatools.evaluation import eval_summary
from sklearn.model_selection import train_test_split

from dataset import load_training_data
from extractor import get_features, get_tokens, get_labels
from features import SentenceProcessor, row_to_tokenfeatures

# Inject CSS that stretches the notebook's `.container` element to the full
# window width.
# NOTE(review): recent IPython releases expose display/HTML from
# `IPython.display`; confirm the import path above against the installed version.
full_width_css = "<style>.container { width:100% !important; }</style>"
display(HTML(full_width_css))


## Load, transform, and split the dataset

In [None]:
# Build the processor that turns new sentences into TokenFeatures, then run it
# on one sample sentence; the trailing expression is rendered as the cell output.
SPANISH_TAGGER_PATH = 'tagger/spanish.tagger'
POSTAGGER_JAR_PATH = 'tagger/stanford-postagger.jar'

processor = SentenceProcessor(SPANISH_TAGGER_PATH, POSTAGGER_JAR_PATH)

# NOTE(review): the second argument (30) is presumably the sentence id to stamp
# on the produced tokens — confirm against SentenceProcessor.process.
processor.process('¡CUN a Madrid $200!', 30)

## Transform previous dataset to TokenFeatures

In [None]:
# preserve
# Fixed seed for the train/test split so that "Restart & Run All" always yields
# the same partition (and therefore comparable metrics between runs).
SPLIT_SEED = 42

training_set = load_training_data()
# Fold the 'f' label into 'n' before the rows are converted to TokenFeatures.
training_set['real_label'] = training_set['real_label'].replace('f', 'n')

# Group consecutive rows that share the same index value into one document
# (a list of TokenFeatures); a change in the index value starts a new document.
documents = []
current_doc = []
prev = -1
for i, word in training_set.iterrows():
    if i != prev:
        # Index changed: close the document collected so far (if any).
        if current_doc:
            documents.append(current_doc)
        current_doc = []
    current_doc.append(row_to_tokenfeatures(word))
    prev = i

# The loop only closes a document when the next one starts, so flush the last one.
if current_doc:
    documents.append(current_doc)

print(documents[0])

# Split at document level so all tokens of a sentence land in the same partition.
train_docs, test_docs = train_test_split(documents, random_state=SPLIT_SEED)
print()
print(f'Training docs: {len(train_docs)}')
print(f'Testing docs: {len(test_docs)}')

[TokenFeatures(sentence_id=0, offer_length=44, token='¡', position=0, POS='faa', left_POS='<p>', right_POS='np00000', token_length=1, uppercase=False, tokens_in_sentence=11, is_numeric=False, is_punctuation=False, label='n'), TokenFeatures(sentence_id=0, offer_length=44, token='CUN', position=1, POS='np00000', left_POS='faa', right_POS='sp000', token_length=3, uppercase=True, tokens_in_sentence=11, is_numeric=False, is_punctuation=False, label='o'), TokenFeatures(sentence_id=0, offer_length=44, token='a', position=5, POS='sp000', left_POS='np00000', right_POS='np00000', token_length=1, uppercase=False, tokens_in_sentence=11, is_numeric=False, is_punctuation=False, label='s'), TokenFeatures(sentence_id=0, offer_length=44, token='Ámsterdam', position=7, POS='np00000', left_POS='sp000', right_POS='zm', token_length=9, uppercase=False, tokens_in_sentence=11, is_numeric=False, is_punctuation=False, label='d'), TokenFeatures(sentence_id=0, offer_length=44, token='$', position=17, POS='zm', l

## Extractor functions

In [None]:
%%time
y_train = [get_labels(s) for s in train_docs]
X_train = [get_features(s) for s in train_docs]

y_test = [get_labels(s) for s in test_docs]
X_test = [get_features(s) for s in test_docs]

In [None]:
# Check sizes: every sentence must have exactly one label per feature entry,
# in the training split as well as in the test split.
for split_name, X_split, y_split in (('train', X_train, y_train),
                                     ('test', X_test, y_test)):
    for features, labels in zip(X_split, y_split):
        assert len(features) == len(labels), (
            f'{split_name}: {len(features)} feature entries vs {len(labels)} labels')

In [None]:
%%time
trainer = pycrfsuite.Trainer(verbose=False)

for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

In [None]:
# Hyper-parameters handed to the CRF trainer.
crf_params = {
    # L1 regularisation coefficient
    'c1': 1.0,
    # L2 regularisation coefficient
    'c2': 1e-3,
    # cap on training iterations (stop earlier)
    'max_iterations': 50,
    # also generate transition features for label pairs that are possible
    # but never observed in the training data
    'feature.possible_transitions': True,
}
trainer.set_params(crf_params)

In [None]:
%%time
trainer.train('model.crfsuite')

In [None]:
# Open the trained model file in a tagger; every prediction in the rest of the
# notebook goes through `crf_tagger`.
trained_model_file = 'model.crfsuite'
crf_tagger = pycrfsuite.Tagger()
crf_tagger.open(trained_model_file)

## Testing the tagger

### Test set

In [None]:
# preserve
# Tag every held-out document and print the ones whose predicted label
# sequence differs from the gold labels.
# Iterates over `test_docs` itself: indexing `documents` with a range sized by
# `test_docs` would evaluate the first documents of the full dataset instead of
# the test split.
incorrect = 0
for example_sent in test_docs:
    predicted = crf_tagger.tag(get_features(example_sent))
    correct = get_labels(example_sent)
    if predicted != correct:
        incorrect += 1
        tokens = get_tokens(example_sent)
        lengths = [len(t) for t in tokens]
        print("%4d" % example_sent[0].sentence_id, ' '.join(tokens))

        # Each label is followed by as many spaces as its token is long, which
        # lines the labels up under the space-joined tokens printed above.
        print('P:   ', end='')
        for label, width in zip(predicted, lengths):
            print(label + (" " * width), end='')
        print()
        print('C:   ', end='')
        for label, width in zip(correct, lengths):
            print(label + (" " * width), end='')
        print('\n\n')

print(f'Incorrectly predicted: {incorrect} out of {len(test_docs)}')

  40 ¡ CDMX a Europa en Semana Santa $ 14,984 ! ( París + Ibiza + Venecia )
P:   n o    s d      d  d      d     n p      n n n     n n     n n       n 
C:   n o    s d      n  n      n     n p      n n n     n n     n n       n 


Incorrectly predicted: 1 out of 60


### Full dataset

In [None]:
# preserve
def _label_row(labels, widths):
    """Return the labels joined into one line, each padded with as many spaces
    as the token at the same position is long."""
    return ''.join(label + (" " * widths[pos]) for pos, label in enumerate(labels))


# Tag every document of the full dataset (training and test alike) and print
# the ones where the predicted labels (P) differ from the gold labels (C).
incorrect = 0
for example_sent in documents:
    features = get_features(example_sent)
    predicted = crf_tagger.tag(features)
    correct = get_labels(example_sent)
    if predicted == correct:
        continue

    incorrect += 1
    tokens = get_tokens(example_sent)
    lengths = [len(t) for t in tokens]
    print("%4d" % example_sent[0].sentence_id, ' '.join(tokens))
    print('  P: ' + _label_row(predicted, lengths))
    print('  C: ' + _label_row(correct, lengths) + '\n\n')

print(f'Incorrectly predicted: {incorrect} out of {len(documents)}')

  40 ¡ CDMX a Europa en Semana Santa $ 14,984 ! ( París + Ibiza + Venecia )
  P: n o    s d      d  d      d     n p      n n n     n n     n n       n 
  C: n o    s d      n  n      n     n p      n n n     n n     n n       n 


  60 ¡ CDMX a Noruega $ 10,061 ! ( Y agrega 9 noches de hotel por $ 7,890 !
  P: n o    s d       n p      n n n n      n n      n  n     n   n p     n 
  C: n o    s d       n p      n n n n      n n      n  n     n   n n     n 


 171 ¡ CUN a King ’ s Landing ( Croacia ) + Ámsterdam $ 12,549 ! Sin escala EE.UU
  P: n o   s d    n n n       n n       n n n         n p      n n   n      n     
  C: n o   s d    d d d       d d       d d d         n p      n n   n      n     


 202 ¡ CUN a Washington D . C . $ 3,196 ! Directos ( Y por $ 3,027 adicionales agrega 3 noches de hotel )
  P: n o   s d          d d d n n p     n n        n n n   n n     n           n      n n      n  n     n 
  C: n o   s d          d d d d n p     n n        n n n   n n     n     

### Manual examples

In [None]:
# preserve
# Hand-written sentences that are not part of the dataset: run them through the
# sentence processor, tag them, and print the predicted label under each token.
manual_examples = ['¡CDMX a Bogotá $3,467! Directos (Agrega 2 noches de hotel por $419)',
                   '¡CUN a Israel $14,574! Sin escala EE.UU (y desde CDMX $15,146)',
                   '¡CDMX, GDL, VER, MTY, CUN, Silao y TIJ a Lima, Perú – $6,529! ',
                   '¡CDMX a Noruega $11,863! Temporada de Auroras',
                   '¡Tijuana a China + Corea $17,522! Sin escala EE.UU (Sem. Santa)']
documents_as_token_features = [processor.process(example, i) for i, example in enumerate(manual_examples)]
documents_as_tagger_features = [get_features(doc) for doc in documents_as_token_features]

for token_features, tagger_features in zip(documents_as_token_features,
                                           documents_as_tagger_features):
    predicted = crf_tagger.tag(tagger_features)
    tokens = get_tokens(token_features)
    lengths = [len(t) for t in tokens]

    # Print the id of the example being shown. The header used to read
    # `example_sent`, a variable left over from an earlier cell, so every
    # example was labelled with an unrelated document's id.
    # NOTE(review): assumes processor.process returns an indexable sequence of
    # TokenFeatures exposing `sentence_id`, like the dataset documents — confirm.
    print("%4d" % token_features[0].sentence_id, ' '.join(tokens))

    # Each label is followed by as many spaces as its token is long, which
    # lines the labels up under the space-joined tokens printed above.
    print('  P: ', end='')
    for label, width in zip(predicted, lengths):
        print(label + (" " * width), end='')
    print('\n\n')

   0 ¡ CDMX a Bogotá $ 3,467 ! Directos ( Agrega 2 noches de hotel por $ 419 )
  P: n o    s d      n p     n n        n n      n n      n  n     n   n n   n 


   0 ¡ CUN a Israel $ 14,574 ! Sin escala EE.UU ( y desde CDMX $ 15,146 )
  P: n o   s d      n p      n n   n      n     n n n     n    n n      n 


   0 ¡ CDMX , GDL , VER , MTY , CUN , Silao y TIJ a Lima , Perú – $ 6,529 !
  P: n o    o o   o o   o o   o o   o o     o o   s d    d d    n n p     n 


   0 ¡ CDMX a Noruega $ 11,863 ! Temporada de Auroras
  P: n o    s d       n p      n n         n  n       


   0 ¡ Tijuana a China + Corea $ 17,522 ! Sin escala EE.UU ( Sem . Santa )
  P: n o       s d     d d     n p      n n   n      n     n n   n n     n 




### Traditional ML metrics

In [None]:
# Flatten the per-sentence gold labels and the per-sentence predictions into
# two parallel token-level lists, then score them with eval_summary
# (imported with the other dependencies at the top of the notebook).
y_test_flat = [label for sentence_labels in y_test for label in sentence_labels]
y_pred_flat = [label for doc in X_test for label in crf_tagger.tag(doc)]

# The two lists are compared position by position, so they must be equally long.
assert len(y_test_flat) == len(y_pred_flat), (
    f'{len(y_test_flat)} gold labels vs {len(y_pred_flat)} predicted labels')

metrics, summary, cm = eval_summary(y_test_flat, y_pred_flat)

In [None]:
# preserve
# Show the aggregate metrics, a blank separator line, then the per-label summary.
print(metrics)
print()
print(summary)

accuracy     0.988267
precision    0.982729
recall       0.985057
f1           0.983786
dtype: float64

              precision    recall  f1-score   support

           d       0.97      0.93      0.95        91
           n       0.99      0.99      0.99       797
           o       0.97      1.00      0.99       105
           p       1.00      1.00      1.00        57
           s       0.98      1.00      0.99        58

   micro avg       0.99      0.99      0.99      1108
   macro avg       0.98      0.99      0.98      1108
weighted avg       0.99      0.99      0.99      1108

