In [None]:
from sklearn.model_selection import train_test_split
from dataset import load_training_data
from features import index_emoji_tokenize, is_numeric, is_punctuation, row_to_tokenfeatures, TokenFeatures
import pycrfsuite
from collections import namedtuple
import json
import string
from pipelines import SentenceChunker

# `display` and `HTML` belong to the public IPython.display module; importing
# them from IPython.core.display is deprecated.
from IPython.display import display, HTML

# Stretch the notebook container to the full width of the browser window.
display(HTML("<style>.container { width:100% !important; }</style>"))

## Load, transform, and split the dataset

In [None]:
# POS tagger used when transforming new sentences to TokenFeatures
from nltk.tag.stanford import StanfordPOSTagger

tagger = StanfordPOSTagger(
    model_filename='tagger/spanish.tagger',
    path_to_jar='tagger/stanford-postagger.jar',
)


def process_sentence(sentence, sentence_id=None):
    """Turn a raw sentence into a list of ``TokenFeatures``, one per token.

    The sentence is tokenized with ``index_emoji_tokenize`` and the tokens are
    POS-tagged with the module-level ``tagger``. Each resulting record carries
    the sentence id, the sentence length, the token and its position, its POS
    tag together with the tags of its left and right neighbours (``'<p>'`` and
    ``'</p>'`` stand in at the sentence edges), the token length, an
    all-uppercase flag, the number of tokens in the sentence, the numeric and
    punctuation flags, and a ``None`` label.

    :param sentence: the text to process
    :param sentence_id: identifier copied into every returned record
    :return: list of ``TokenFeatures``
    """
    indexed_tokens = list(index_emoji_tokenize(sentence, True))
    words = [pair[0] for pair in indexed_tokens]
    offsets = [pair[1] for pair in indexed_tokens]
    pos_tags = [pair[1] for pair in tagger.tag(words)]

    total = len(words)
    # Padding the tag list lets the left/right neighbour columns be taken as
    # plain slices of the same list.
    padded_tags = ['<p>'] + pos_tags + ['</p>']
    left_tags = padded_tags[:total]
    right_tags = padded_tags[2:]

    columns = (
        [sentence_id] * total,
        [len(sentence)] * total,
        words,
        offsets,
        pos_tags,
        left_tags,
        right_tags,
        [len(word) for word in words],
        # A token counts as uppercase only if every single character is
        [all([char.isupper() for char in word]) for word in words],
        [total] * total,
        [is_numeric(word) for word in words],
        [is_punctuation(word) for word in words],
        [None] * total,
    )
    return [TokenFeatures(*row) for row in zip(*columns)]

# Try the transformation on a single sample sentence
process_sentence(sentence='¡CUN a Madrid $200!', sentence_id=30)

## Transform previous dataset to TokenFeatures

In [None]:
# preserve
training_set = load_training_data()
# Fold the 'f' label into 'n' so both end up as the same class
training_set['real_label'] = training_set['real_label'].replace('f', 'n')

# Consecutive rows that share the same index value form one document.
# NOTE(review): this assumes the frame index identifies the sentence and that
# the rows of a sentence are contiguous — confirm against load_training_data.
documents = []
current_doc = []
prev = -1
for i, word in training_set.iterrows():
    if i != prev:
        # A new index value starts a new document; flush the finished one
        if current_doc:
            documents.append(current_doc)
        current_doc = []
    current_doc.append(row_to_tokenfeatures(word))
    prev = i

# Flush the last document, which no index change follows
if current_doc:
    documents.append(current_doc)

print(documents[0])

# Fixed seed: without it the split (and every metric computed below) changes
# on each run, so the saved outputs could not be reproduced.
train_docs, test_docs = train_test_split(documents, random_state=42)
print()
print(f'Training docs: {len(train_docs)}')
print(f'Testing docs: {len(test_docs)}')

[TokenFeatures(sentence_id=0, offer_length=44, token='¡', position=0, POS='faa', left_POS='<p>', right_POS='np00000', token_length=1, uppercase=False, tokens_in_sentence=11, is_numeric=False, is_punctuation=False, label='n'), TokenFeatures(sentence_id=0, offer_length=44, token='CUN', position=1, POS='np00000', left_POS='faa', right_POS='sp000', token_length=3, uppercase=True, tokens_in_sentence=11, is_numeric=False, is_punctuation=False, label='o'), TokenFeatures(sentence_id=0, offer_length=44, token='a', position=5, POS='sp000', left_POS='np00000', right_POS='np00000', token_length=1, uppercase=False, tokens_in_sentence=11, is_numeric=False, is_punctuation=False, label='s'), TokenFeatures(sentence_id=0, offer_length=44, token='Ámsterdam', position=7, POS='np00000', left_POS='sp000', right_POS='zm', token_length=9, uppercase=False, tokens_in_sentence=11, is_numeric=False, is_punctuation=False, label='d'), TokenFeatures(sentence_id=0, offer_length=44, token='$', position=17, POS='zm', l

## Extractor functions

In [None]:
def extract_features(doc):
    """Return the ``word2features`` output for every position of ``doc``, in order."""
    per_token_features = []
    for position in range(len(doc)):
        per_token_features.append(word2features(doc, position))
    return per_token_features

def extract_labels(doc):
    """Return the ``label`` attribute of every token in ``doc``, in order."""
    return [tkn.label for tkn in doc]

def extract_tokens(doc):
    """Return the ``token`` attribute of every token in ``doc``, in order."""
    return [tkn.token for tkn in doc]
    

def word2features(doc, i):
    """Build the list of CRF feature strings for the token at index ``i``.

    The result always starts with the token's own features, followed by the
    features of the previous token (or the marker ``'BOS'`` when there is
    none) and then the features of the next token (or ``'EOS'``).

    :param doc: sequence of tokens exposing ``token``, ``POS``, ``is_numeric``
        and ``is_punctuation``
    :param i: index of the token inside ``doc``
    :return: list of feature strings
    """
    def neighbour_features(neighbour, offset):
        # Same feature set for either neighbour; `offset` ('-1' or '+1')
        # tells the two sides apart in the feature names.
        return [
            offset + ':word.lower=' + neighbour.token.lower(),
            offset + ':word.istitle=%s' % neighbour.token.istitle(),
            offset + ':word.isdigit=%s' % neighbour.is_numeric,
            offset + ':word.ispunct=%s' % neighbour.is_punctuation,
            offset + ':postag=' + neighbour.POS,
        ]

    current = doc[i]

    # Features of the token itself. Suffix, upper-case, punctuation and
    # location features are deliberately not emitted for the current token.
    features = [
        'bias',
        'word.lower=' + current.token.lower(),
        'word.istitle=%s' % current.token.istitle(),
        'word.isdigit=%s' % current.is_numeric,
        'postag=' + current.POS,
    ]

    if i > 0:
        features.extend(neighbour_features(doc[i - 1], '-1'))
    else:
        # No previous token: mark the beginning of the document
        features.append('BOS')

    if i < len(doc) - 1:
        features.extend(neighbour_features(doc[i + 1], '+1'))
    else:
        # No next token: mark the end of the document
        features.append('EOS')

    return features

In [None]:
%%time
y_train = [extract_labels(s) for s in train_docs]
X_train = [extract_features(s) for s in train_docs]

y_test = [extract_labels(s) for s in test_docs]
X_test = [extract_features(s) for s in test_docs]

In [None]:
# Check sizes: in both splits there must be one label sequence per feature
# sequence, and inside each sequence one label per token. The sequence counts
# are compared explicitly because zip() would silently drop any surplus.
for split_name, X_split, y_split in (('train', X_train, y_train), ('test', X_test, y_test)):
    assert len(X_split) == len(y_split), \
        f'{split_name}: {len(X_split)} feature sequences vs {len(y_split)} label sequences'
    for features, labels in zip(X_split, y_split):
        assert len(features) == len(labels), \
            f'{split_name}: {len(features)} feature lists vs {len(labels)} labels in one sequence'

In [None]:
%%time
trainer = pycrfsuite.Trainer(verbose=False)

for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

In [None]:
# Training hyper-parameters, collected in one place before being applied
crf_params = {
    # coefficient for L1 penalty
    'c1': 1.0,
    # coefficient for L2 penalty
    'c2': 1e-3,
    # stop earlier
    'max_iterations': 50,
    # include transitions that are possible, but not observed
    'feature.possible_transitions': True,
}
trainer.set_params(crf_params)

In [None]:
%%time
# Train on the sequences appended to the trainer; 'model.crfsuite' is the file
# the tagger opens afterwards.
trainer.train('model.crfsuite')

In [None]:
# Open the model file produced by trainer.train() in a fresh tagger
crf_tagger = pycrfsuite.Tagger()
crf_tagger.open('model.crfsuite')

## Testing the tagger

### Test set

In [None]:
# preserve
incorrect = 0
for i in range(len(test_docs)):
    example_sent = documents[i]
    predicted = crf_tagger.tag(extract_features(example_sent))
    correct = extract_labels(example_sent)
    if predicted != correct:
        incorrect += 1
        tokens = extract_tokens(example_sent)
        lengths = [len(t) for t in tokens]
        print("%4d" %  example_sent[0].sentence_id, ' '.join(tokens))
        
        print('P:   ', end='')
        for i, token in enumerate(predicted):
            print(token + ( " " * lengths[i]), end='')
        print()
        print('C:   ', end='')
        for i, token in enumerate(correct):
            print(token + ( " " * lengths[i]), end='')
        print('\n\n')
        
print(f'Incorrectly predicted: {incorrect} out of {len(test_docs)}')

  40 ¡ CDMX a Europa en Semana Santa $ 14,984 ! ( París + Ibiza + Venecia )
P:   n o    s d      d  d      d     n p      n n n     n n     n n       n 
C:   n o    s d      n  n      n     n p      n n n     n n     n n       n 


Incorrectly predicted: 1 out of 60


### Full dataset

In [None]:
# preserve
incorrect = 0
for i in range(len(documents)):
    example_sent = documents[i]
    features = extract_features(example_sent)
    predicted = crf_tagger.tag(features)
    correct = extract_labels(example_sent)
    if predicted != correct:
        incorrect += 1
        tokens = extract_tokens(example_sent)
        lengths = [len(t) for t in tokens]
        print("%4d" %  example_sent[0].sentence_id, ' '.join(tokens))
        
        print('  P: ', end='')
        for i, token in enumerate(predicted):
            print(token + ( " " * lengths[i]), end='')
        print()
        print('  C: ', end='')
        for i, token in enumerate(correct):
            print(token + ( " " * lengths[i]), end='')
        print('\n\n')
        
print(f'Incorrectly predicted: {incorrect} out of {len(documents)}')

  40 ¡ CDMX a Europa en Semana Santa $ 14,984 ! ( París + Ibiza + Venecia )
  P: n o    s d      d  d      d     n p      n n n     n n     n n       n 
  C: n o    s d      n  n      n     n p      n n n     n n     n n       n 


  60 ¡ CDMX a Noruega $ 10,061 ! ( Y agrega 9 noches de hotel por $ 7,890 !
  P: n o    s d       n p      n n n n      n n      n  n     n   n p     n 
  C: n o    s d       n p      n n n n      n n      n  n     n   n n     n 


 171 ¡ CUN a King ’ s Landing ( Croacia ) + Ámsterdam $ 12,549 ! Sin escala EE.UU
  P: n o   s d    n n n       n n       n n n         n p      n n   n      n     
  C: n o   s d    d d d       d d       d d d         n p      n n   n      n     


 202 ¡ CUN a Washington D . C . $ 3,196 ! Directos ( Y por $ 3,027 adicionales agrega 3 noches de hotel )
  P: n o   s d          d d d n n p     n n        n n n   n n     n           n      n n      n  n     n 
  C: n o   s d          d d d d n p     n n        n n n   n n     n     

### Manual examples

In [None]:
# preserve
manual_examples = ['¡CDMX a Bogotá $3,467! Directos (Agrega 2 noches de hotel por $419)',
                   '¡CUN a Israel $14,574! Sin escala EE.UU (y desde CDMX $15,146)',
                   '¡CDMX, GDL, VER, MTY, CUN, Silao y TIJ a Lima, Perú – $6,529! ',
                   '¡CDMX a Noruega $11,863! Temporada de Auroras',
                   '¡Tijuana a China + Corea $17,522! Sin escala EE.UU (Sem. Santa)']
documents_as_token_features = [process_sentence(example, i) for i, example in enumerate(manual_examples)]
documents_as_tagger_features = [extract_features(doc) for doc in documents_as_token_features]

for i in range(len(documents_as_token_features)):
    example_sent = documents_as_token_features[i]
    features = extract_features(example_sent)
    predicted = crf_tagger.tag(features)
    correct = extract_labels(example_sent)
    tokens = extract_tokens(example_sent)
    lengths = [len(t) for t in tokens]
    
    print("%4d" %  example_sent[0].sentence_id, ' '.join(tokens))

    print('  P: ', end='')
    for i, token in enumerate(predicted):
        print(token + ( " " * lengths[i]), end='')
    print('\n\n')

   0 ¡ CDMX a Bogotá $ 3,467 ! Directos ( Agrega 2 noches de hotel por $ 419 )
  P: n o    s d      n p     n n        n n      n n      n  n     n   n n   n 


   1 ¡ CUN a Israel $ 14,574 ! Sin escala EE.UU ( y desde CDMX $ 15,146 )
  P: n o   s d      n p      n n   n      n     n n n     n    n n      n 


   2 ¡ CDMX , GDL , VER , MTY , CUN , Silao y TIJ a Lima , Perú – $ 6,529 !
  P: n o    o o   o o   o o   o o   o o     o o   s d    d d    n n p     n 


   3 ¡ CDMX a Noruega $ 11,863 ! Temporada de Auroras
  P: n o    s d       n p      n n         n  n       


   4 ¡ Tijuana a China + Corea $ 17,522 ! Sin escala EE.UU ( Sem . Santa )
  P: n o       s d     d d     n p      n n   n      n     n n   n n     n 




### Traditional ML metrics

In [None]:
from m16_mlutils.datatools.evaluation import eval_summary

# Flatten the per-document sequences into token-level lists so expected and
# predicted labels can be compared one to one.
y_test_flat = [label for doc_labels in y_test for label in doc_labels]
y_pred_flat = [label for doc_features in X_test for label in crf_tagger.tag(doc_features)]

print(len(y_pred_flat), len(y_test_flat))
metrics, summary, cm = eval_summary(y_test_flat, y_pred_flat)

In [None]:
# preserve
print(metrics, end='\n\n')
print(summary)

accuracy     0.994037
precision    0.992306
recall       0.988180
f1           0.990022
dtype: float64

              precision    recall  f1-score   support

           d       1.00      0.94      0.97        88
           n       0.99      1.00      1.00       877
           o       1.00      1.00      1.00        91
           p       0.97      1.00      0.98        59
           s       1.00      1.00      1.00        59

   micro avg       0.99      0.99      0.99      1174
   macro avg       0.99      0.99      0.99      1174
weighted avg       0.99      0.99      0.99      1174

