In [None]:
import json

import pycrfsuite

from extractor import get_features
from features import SentenceProcessor


# Preparing to predict

## Loading the sentence chunker

In [None]:
# Resources handed to SentenceProcessor; both paths are relative to the
# directory the notebook is run from.
TAGGER_MODEL_PATH = 'tagger/spanish.tagger'
TAGGER_JAR_PATH = 'tagger/stanford-postagger.jar'

sentence_processor = SentenceProcessor(TAGGER_MODEL_PATH, TAGGER_JAR_PATH)

## Loading the estimator

In [None]:
# Trained CRF model file, relative to the directory the notebook is run from.
CRF_MODEL_PATH = 'models/vuelax.crf'

crf_tagger = pycrfsuite.Tagger()
# Bind open()'s return value to a throwaway name so the cell does not echo it.
_ = crf_tagger.open(CRF_MODEL_PATH)

## A function to label a sentence and extract its origin, destination and price

In [None]:
def label_data(sentence):
    """Tag a sentence with the CRF model and collect the labelled fields.

    The sentence is run through ``sentence_processor``, the resulting token
    features are converted with ``get_features`` and tagged by ``crf_tagger``.
    Tokens are then grouped by their predicted label: ``'o'`` (origin),
    ``'d'`` (destination) and ``'p'`` (price). Any other label is ignored.

    Args:
        sentence: The raw text to label.

    Returns:
        dict: A dictionary with three keys:
            ``'origin'``: list of the tokens labelled ``'o'``, in order.
            ``'destination'``: the tokens labelled ``'d'`` joined by spaces.
            ``'price'``: the last ``'p'`` token that parses as a number, as a
            float, or ``-1`` when no such token exists.
    """
    token_features = sentence_processor.process(sentence)
    labels = crf_tagger.tag(get_features(token_features))
    tokens = [t.token for t in token_features]

    origin_tokens = []
    destination_tokens = []
    price = -1
    for token, label in zip(tokens, labels):
        if label == 'o':
            origin_tokens.append(token)
        elif label == 'd':
            destination_tokens.append(token)
        elif label == 'p':
            # Commas are thousands separators; a currency sign may still be
            # attached depending on how the sentence was tokenized.
            cleaned = token.replace(',', '').lstrip('$')
            try:
                price = float(cleaned)
            except ValueError:
                # The model labelled a non-numeric token as a price. Keep the
                # previous value instead of aborting the whole prediction.
                continue

    return {
        'origin': origin_tokens,
        'destination': ' '.join(destination_tokens),
        'price': price
    }

# Some predictions

In [None]:
# Sample promotion titles that are fed to label_data one by one below.
# NOTE(review): the last entry writes its first price as '$2.364' (dot instead
# of comma). label_data only removes commas, so if that token is tagged as the
# price it would parse as 2.364 -- confirm whether this input is intentional.
promotions = [
    '¡CDMX a Puerto Rico $4,220!',
    '¡CDMX, MTY y GDL a Belice – $1,841! (Agrega 8 noches de hotel por $2,524)',
    '¡CUN a Miami $2,902!',
    '¡CDMX a La Paz, Bolivia – $8,240! 🇧🇴 (Por $2,402 agrega 6 noches de hotel con desayunos)',
    '¡CDMX a Estocolmo, Suecia – $11,528! 🇸🇪 (Por $3,975 agrega 6 noches en hotel-barco con desayunos',
    '¡NYC a Dublín $4,615! Directos',
    '¡CDMX a París + Madrid $13,252! Directos',
    '¡TIJ a China + Rusia $12,242! Directos (Y agrega hotel 14 noches por $4,847)',
    '¡GDL a Denver $2.364! Y desde CDMX $3,054',
]

# Label every sample promotion and print the input followed by the extracted
# fields, separated by a blank line.
for promotion in promotions:
    result = label_data(promotion)
    print(promotion)
    # ensure_ascii=False keeps accented characters (e.g. 'Dublín', 'París')
    # readable instead of printing them as \uXXXX escapes.
    print(json.dumps(result, indent=4, ensure_ascii=False))
    print()