# Creating a tagger for flight offer phrases
### Like this one: "¡CDMX a Bogotá 🇨🇴 $4,659!"

In [None]:
# libraries
import pandas as pd
import csv
import os
# Home directory used to locate the Stanford NLP files.
# expanduser('~') always returns a string, whereas os.getenv('HOME') returns
# None when the variable is unset, which would make `HOME + '/...'` fail.
HOME = os.path.expanduser('~')

In [None]:
# Read the offers (first CSV column is the index) and print the 'label'
# column for every fifth row up to index label 100, without truncating text.
vuelos = pd.read_csv('data/vuelos.csv', index_col=0)
with pd.option_context('display.max_colwidth', 800):
    print(vuelos.loc[:100:5, ['label']])

Most of the offers follow a simple pattern: *Origin - Destination - Price - Extras*. While extracting this may seem easy for a regular expression, it is not (see this notebook for reference).

The idea is to create a tagger that will be able to extract this information. However, the first task is to identify the information that we want to extract. Following the pattern described above:

 - **DST**: Destination 
 - **ORI**: Origin 
 - **PRC**: Price 
 - **OTH**: Other (extras)
 
| Text 	| ORI 	| DST 	| PRC 	| OTH 	|
|------	|-----	|-----	|-----	|-----	|
| ¡CUN a Holanda \$8,885! Sin escala EE.UU | CUN | Holanda | 8,885 | Sin escala EE.UU |   
| ¡CDMX a Noruega <span>$</span>10,061! (Y agrega 9 noches de hotel por \$7,890!) | CDMX | Noruega | 10,061 | Y agrega 9 noches de hotel por \$7,890!| 
| ¡Todo México a Pisa, Toscana Italia \$12,915! Sin escala EE.UU (Y por \$3,975 agrega 13 noches hotel) | México | Pisa, Toscana Italia | 12,915 | Sin escala EE.UU (Y por \$3,975 agrega 13 noches hotel) |

## Tokenize and POS-tag the dataset 
We need to generate a *csv* file that we can tag (manually 😨) that consists of:
```
token1    POS tag    Label
token2    POS tag    Label
token3    POS tag    Label
```

Where `Label` will be one of DST, ORI, PRC, OTH and NA and will be manually assigned (again: 😨)

In [None]:
from nltk.tag.stanford import StanfordPOSTagger

# Build the Spanish POS tagger from the model file and the jar that live
# under HOME/stanford_nlp.
spanish_postagger = StanfordPOSTagger(
    HOME + '/stanford_nlp/models/spanish-distsim.tagger',
    HOME + '/stanford_nlp/stanford-postagger.jar',
)

# Smoke test: tag a whitespace-split tongue twister.
print(
    spanish_postagger.tag(
        'Pepe Pecas pica papas con un pico, con un pico pica papas Pepe Pecas.'.split()
    )
)

In [None]:
from nltk.tokenize import TweetTokenizer
# Shared tokenizer instance used by the tokenizing helper in this notebook.
tknz = TweetTokenizer()

# Abbreviation -> list of replacement tokens.
# NOTE(review): not referenced by any of the visible cells.
transforms = dict(LA=['Los', 'Angeles'])

def index_emoji_tokenize(string, return_flags=False, tokenizer=None):
    """Tokenize ``string`` and yield ``(token, position)`` pairs.

    Each position is the index in ``string`` at which the token was found,
    searching forward from the end of the previously located token so that
    repeated substrings map to successive occurrences. If a token cannot be
    found (``str.find`` returns -1) it is yielded with position -1 and the
    search cursor is left where it was.

    Single-character tokens whose code point is >= 127462 (the code point of
    🇦) are treated as halves of a flag. Note that this threshold also matches
    any other single-code-point symbol above it. Such tokens are skipped
    unless ``return_flags`` is true; in that case two consecutive halves are
    joined and yielded as one token at the position of the first half. A
    trailing, unpaired half is never yielded.

    Parameters
    ----------
    string : str
        Text to tokenize.
    return_flags : bool, default False
        Whether to yield joined flag tokens instead of dropping them.
    tokenizer : object with a ``tokenize(str)`` method, optional
        Tokenizer to use; defaults to the notebook-level ``tknz``.

    Yields
    ------
    tuple[str, int]
        The token and its position in ``string``.
    """
    source = tknz if tokenizer is None else tokenizer
    pending = None  # (character, position) of a flag half awaiting its pair
    cursor = 0      # where the next search starts
    for t in source.tokenize(string):
        pos = string.find(t, cursor)
        if pos != -1:
            # Move past the whole token so the next search cannot match
            # inside it or before it.
            cursor = pos + len(t)
        if len(t) == 1 and ord(t) >= 127462:  # this is the code for 🇦
            if not return_flags:
                continue
            if pending is None:
                pending = (t, pos)
            else:
                first_half, start = pending
                yield first_half + t, start
                pending = None
        else:
            yield t, pos
        

# Try the tokenizer on one offer, keeping the flag emoji.
label = vuelos['label'].iloc[75]
print(label, end='\n\n')
tokens = [*index_emoji_tokenize(label, return_flags=True)]
print(tokens)

In [None]:
# Keep only the token text (first element of each pair) and POS-tag it.
simply_tokens = [token for token, _ in tokens]
print(spanish_postagger.tag(simply_tokens))

In [None]:
def process_label(label, debug=False):
    """Turn one offer string into parallel per-token feature lists.

    The label is tokenized with ``index_emoji_tokenize`` (flags kept) and the
    tokens are tagged with ``spanish_postagger``.

    Parameters
    ----------
    label : str
        Offer text to process.
    debug : bool, default False
        When true, print the length of each intermediate list.

    Returns
    -------
    tuple
        Eight lists, in order: tokens, token positions, POS tags, the tag
        list shifted right by one with '<p>' in front, the tag list shifted
        left by one with '</p>' appended, token lengths, per-token "every
        character is uppercase" flags, and the token count repeated once per
        token.
    """
    def report(name, values):
        # Print the size of an intermediate list, only in debug mode.
        if debug:
            print(name, len(values))

    pairs = list(index_emoji_tokenize(label, True))
    report('Tokens', pairs)

    only_tokens = [pair[0] for pair in pairs]
    report('Only tokens', only_tokens)

    positions = [pair[1] for pair in pairs]
    report('Positions', positions)

    tagged = spanish_postagger.tag(only_tokens)
    report('Tagged', tagged)

    tags = [item[1] for item in tagged]
    report('Tags', tags)

    lengths = list(map(len, only_tokens))
    report('Lengths', lengths)

    count = len(only_tokens)
    n_tokens = [count] * count
    report('N tokens', n_tokens)

    # Sentinel-padded tags give each token its neighbours' tags.
    padded = ['<p>', *tags, '</p>']
    uppercase = [all(ch.isupper() for ch in token) for token in only_tokens]

    return (
        only_tokens,
        positions,
        tags,
        padded[:count],
        padded[2:],
        lengths,
        uppercase,
        n_tokens,
    )

In [None]:
# This takes quite a while
from tqdm import tqdm

# newline='' is what the csv module asks for on files handed to csv.writer
# (otherwise extra blank rows appear on platforms that translate line endings),
# and an explicit UTF-8 encoding keeps accented characters and emoji writable
# whatever the locale default is.
with open('data/i__training_data.csv', 'w', newline='', encoding='utf-8') as w:
    writer = csv.writer(w)
    # Iterate lazily instead of materialising every row first; total= lets
    # tqdm still show a complete progress bar.
    for i, vuelo in tqdm(vuelos.iterrows(), total=len(vuelos)):
        offer = vuelo['label']
        result = process_label(offer)
        # One CSV row per token, prefixed with the row index and offer length.
        for row in zip(*result):
            writer.writerow((i, len(offer)) + row)

In [None]:
# The exported file has no header row: the first two columns are the prefix
# written by the export loop, the rest follow process_label's return order.
training_data = pd.read_csv(
    'data/i__training_data.csv',
    header=None,
    names=[
        'sentence_id',
        'offer_len',
        'token',
        'loc',
        'pos',
        'pos_left',
        'pos_right',
        'token_len',
        'all_upper',
        'n_tokens',
    ],
)
print(f'Length {training_data.shape[0]}')
training_data.head(12)

## Individual test

In [None]:
# Run the full pipeline on a single offer, printing the intermediate sizes.
values = process_label(vuelos['label'].iloc[3], debug=True)

In [None]:
# Show the offer text, then its POS tags (third element of the result tuple).
print(vuelos['label'].iloc[3])
values[2]