# Creating a tagger for flight offer phrases
### Like this one: "¡CDMX a Bogotá 🇨🇴 $4,659!"

In [1]:
# libraries
import pandas as pd
import csv

In [2]:
# Load dataset:
vuelos = pd.read_csv('data/vuelos.csv', index_col=0)
with pd.option_context('max_colwidth', 800):
    print(vuelos.loc[:100:5][['label']])

                                                                                                   label
0                                                           ¡CUN a Ámsterdam $8,960! Sin escala en EE.UU
5                              ¡GDL a Los Ángeles $3,055! Directos (Agrega 3 noches de hotel por $3,350)
10                                      ¡CUN a Puerto Rico $3,296! (Agrega 3 noches de hotel por $2,778)
15                    ¡LA a Seúl, regresa desde Tokio 🇯🇵 $8,607! (Por $3,147 agrega 11 noches de hostal)
20                                           ¡CDMX a Chile $8,938! (Agrega 9 noches de hotel por $5,933)
25                                                               ¡CUN a Holanda $8,885! Sin escala EE.UU
30                                              ¡Todo México a París, regresa desde Amsterdam – $11,770!
35                  ¡CDMX a Vietnam $10,244! Sin escala en EE.UU (Agrega 15 noches de hostal por $2,082)
40                                     ¡CDMX a Europa e

Most of the offers follow a simple pattern: *Destination - Origin - Price - Extras*, while extracting this may seem easy for a regular expression, it is not (see this notebook for reference). 

The idea is to create a tagger that will be able to extract this information, however, one first tag is to identify the information that we want to extract. Following the pattern described above: 

 - **DST**: Destination 
 - **ORI**: Origin 
 - **PRC**: Price 
 - **EXT**: Extras
 
| Text 	| DST 	| ORI 	| PRC 	| OTH 	|
|------	|-----	|-----	|-----	|-----	|
| ¡CUN a Holanda \$8,885! Sin escala EE.UU | CUN | Holanda | 8,885 | Sin escala EE.UU |   
| ¡CDMX a Noruega <span>$</span>10,061! (Y agrega 9 noches de hotel por \$7,890!) | CDMX | Noruega | 10,061 | Y agrega 9 noches de hotel por \$7,890!| 
| ¡Todo México a Pisa, Toscana Italia \$12,915! Sin escala EE.UU (Y por \$3,975 agrega 13 noches hotel) | México | Pisa, Toscana Italia | 12,915 | Sin escala EE.UU (Y por \$3,975 agrega 13 noches hotel) |

## Tokenize and POS-tag the dataset 
We need to generate a *csv* file that we can tag (manually 😨) that consists of:
```
token1    POS tag    Label
token2    POS tag    Label
token3    POS tag    Label
```

Where `Label` will be one of DST, ORI, PRC, OTH and NA and will be manually assigned (again: 😨)

In [3]:
from nltk.tag.stanford import StanfordPOSTagger

spanish_postagger = StanfordPOSTagger('/Users/antonioferegrino/stanford_nlp/'
                                      'stanford-postagger-full-2018-02-27/models/spanish-distsim.tagger', 
                                      '/Users/antonioferegrino/stanford_nlp/'
                                      'stanford-postagger-full-2018-02-27/stanford-postagger.jar')

print(spanish_postagger.tag('Pepe Pecas pica papas con un pico, con un pico pica papas Pepe Pecas.'.split()))

[('Pepe', 'np00000'), ('Pecas', 'np00000'), ('pica', 'aq0000'), ('papas', 'nc0p000'), ('con', 'sp000'), ('un', 'di0000'), ('pico,', 'nc0s000'), ('con', 'sp000'), ('un', 'di0000'), ('pico', 'nc0s000'), ('pica', 'aq0000'), ('papas', 'nc0p000'), ('Pepe', 'np00000'), ('Pecas.', 'np00000')]


In [4]:
from nltk.tokenize import TweetTokenizer
tknz = TweetTokenizer()

transforms = {
    'LA': ['Los', 'Angeles']
}

def index_emoji_tokenize(string, return_flags=False):
    i = 0
    flag = ''
    ix = 0
    for t in tknz.tokenize(string):
        ix = string.find(t, ix)
        if len(t) == 1 and ord(t) >= 127462: # this is the code for 🇦
            if not return_flags: continue
            if flag:
                yield flag + t, ix - 1
                flag = ''
            else:
                flag = t
        else:
            yield t, ix
        ix=+1
        

label = vuelos.iloc[75]['label']
print(label)
print()
tokens = list(index_emoji_tokenize(label, return_flags=True))
print(tokens)

¡LA a Bangkok 🇹🇭$8,442! (Por $2,170 agrega 6 noches de Hotel)

[('¡', 0), ('LA', 1), ('a', 4), ('Bangkok', 6), ('🇹🇭', 14), ('$', 16), ('8,442', 17), ('!', 22), ('(', 24), ('Por', 25), ('$', 16), ('2,170', 30), ('agrega', 36), ('6', 43), ('noches', 45), ('de', 52), ('Hotel', 55), (')', 60)]


In [5]:
simply_tokens = [ l[0] for l in tokens ]
print(spanish_postagger.tag(simply_tokens))

[('¡', 'faa'), ('LA', 'pp000000'), ('a', 'sp000'), ('Bangkok', 'np00000'), ('🇹🇭', 'dn0000'), ('$', 'zm'), ('8,442', 'dn0000'), ('!', 'fat'), ('(', 'np00000'), ('Por', 'sp000'), ('$', 'nc0p000'), ('2,170', 'dn0000'), ('agrega', 'vmip000'), ('6', 'dn0000'), ('noches', 'nc0p000'), ('de', 'sp000'), ('Hotel', 'np00000'), (')', 'word')]


In [6]:
def process_label(label, debug=False):
    tokens = list(index_emoji_tokenize(label, True))
    if debug:
        print('Tokens', len(tokens))
    only_tokens = [l[0] for l in tokens]
    if debug:
        print('Only tokens', len(only_tokens))
    positions = [l[1] for l in tokens]
    if debug:
        print('Positions', len(positions))
    tagged = spanish_postagger.tag(only_tokens)
    if debug:
        print('Tagged', len(tagged))
    tags =  [l[1] for l in tagged]
    if debug:
        print('Tags', len(tags))
    lengths =  [len(l) for l in only_tokens]
    if debug:
        print('Lengths', len(lengths))
    n_tokens =  [len(only_tokens) for l in only_tokens]
    if debug:
        print('N tokens', len(n_tokens))
    uppercase = [all([l.isupper() for l in token]) for token in only_tokens]
    return only_tokens, positions, tags, lengths, uppercase, n_tokens

In [7]:
# This takes quite a while
from tqdm import tqdm

with open('data/i__training_data.csv', 'w') as w:
    writer = csv.writer(w)
    for i, vuelo in tqdm(list(vuelos.iterrows())):
        result = process_label(vuelo['label'])
        for row in zip(*result):
            writer.writerow(( i, len(vuelo['label']) ) + row)

100%|██████████| 2619/2619 [1:21:32<00:00,  1.81s/it]


In [7]:
training_data = pd.read_csv('data/i__training_data.csv', header=None,
                            names=['sentence_id', 'offer_len', 'token', 'pos', 'token_len', 'all_upper'])
print(f'Length {len(training_data)}')
training_data.head()

Length 40969


Unnamed: 0,Unnamed: 1,Unnamed: 2,sentence_id,offer_len,token,pos,token_len,all_upper
0,44,¡,0,faa,1,False,11,n
0,44,CUN,1,np00000,3,True,11,o
0,44,a,5,sp000,1,False,11,s
0,44,Ámsterdam,7,np00000,9,False,11,d
0,44,$,17,zm,1,False,11,n


## Individual test

In [9]:
values = process_label(vuelos.iloc[3]['label'], debug=True)

Tokens 22
Only tokens 22
Positions 22
Tagged 22
Tags 22
Lengths 22
N tokens 22
