# Sequence Tagging

## Download Data

Retrieve the path to the `cs329` project:

In [138]:
from pathlib import Path

path = Path.cwd()

# Walk up from the working directory until the directory named 'cs329' is found.
# The filesystem root is its own parent, so without the guard below the loop
# would spin forever when the notebook is not launched from inside the project.
while path.name != 'cs329':
    if path.parent == path:
        raise FileNotFoundError(
            "no 'cs329' directory found above {}".format(Path.cwd()))
    path = path.parent

print(path)
print(type(path))

/Users/jdchoi/workspace/cs329
<class 'pathlib.PosixPath'>


Create the `dat/pos` directory under the `cs329` project:

In [139]:
# Append 'dat/pos' only once, so that re-running this cell does not keep
# nesting the directory (.../dat/pos/dat/pos).
if path.parts[-2:] != ('dat', 'pos'):
    path = path / 'dat' / 'pos'

path.mkdir(parents=True, exist_ok=True)
print(path)

/Users/jdchoi/workspace/cs329/dat/pos


Download the [training set](https://raw.githubusercontent.com/emory-courses/cs329/master/dat/pos/wsj-pos.trn.gold.tsv) and the [development set](https://raw.githubusercontent.com/emory-courses/cs329/master/dat/pos/wsj-pos.dev.gold.tsv) for part-of-speech tagging:

In [140]:
import requests

def download(remote_addr: str, local_addr: str, timeout: float = 30.0):
    """Fetch ``remote_addr`` over HTTP and save the response body to ``local_addr``.

    :param remote_addr: the URL to download.
    :param local_addr: the destination file path; it is passed straight to ``open``,
                       so a ``pathlib.Path`` works as well as a string.
    :param timeout: seconds to wait for the server before giving up.
    :raises requests.HTTPError: if the server answers with a 4xx/5xx status.
    """
    r = requests.get(remote_addr, timeout=timeout)
    # Fail loudly instead of silently saving an error page as if it were data.
    r.raise_for_status()

    with open(local_addr, 'wb') as fout:
        fout.write(r.content)

In [141]:
import os

url = 'https://raw.githubusercontent.com/emory-courses/cs329/master/dat/pos/wsj-pos.{}.gold.tsv'

# Fetch the 'trn' and 'dev' files in turn, saving each one under its remote file name.
for split in ('trn', 'dev'):
    remote = url.format(split)
    download(remote, path / Path(remote).name)

## Read Data

Retrieve the training data:

In [142]:
def read_pos(filename: str):
    """Read a whitespace-separated file of (word, tag) lines into a list of sentences.

    Every non-blank line contributes one ``(first_column, second_column)`` tuple to
    the current sentence; a blank line closes the current sentence.

    :param filename: path of the file to read (anything ``open`` accepts).
    :return: a list of sentences, each a list of ``(word, tag)`` tuples.
    """
    data, sentence = [], []

    # The context manager guarantees that the file handle is closed.
    with open(filename, encoding='utf-8') as fin:
        for line in fin:
            l = line.split()
            if l:
                sentence.append((l[0], l[1]))
            elif sentence:
                # Consecutive blank lines must not produce empty sentences.
                data.append(sentence)
                sentence = []

    # Keep the last sentence when the file does not end with a blank line.
    if sentence:
        data.append(sentence)

    return data

In [143]:
trn_file = path / 'wsj-pos.trn.gold.tsv'
trn_data = read_pos(trn_file)

print(len(trn_data))  # size of the list returned by read_pos
print(trn_data[0])    # its first element

38219
[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')]


### Exercise

How many tokens are in `trn_data`?

In [146]:
# Solution 1: add up the sentence lengths in a single expression.
n = sum(map(len, trn_data))
print(n)

# Solution 2: accumulate the same total with an explicit loop.
n = 0
for sentence in trn_data:
    n += len(sentence)

print(n)

912344
912344


## Label Map

### Exercise

Given `trn_data`, create a dictionary whose keys are POS tags and whose values are unique IDs:

In [147]:
# Every distinct POS tag that occurs in the training data.
tagset = {tag for sent in trn_data for _, tag in sent}
print(tagset)

# Number the tags 0..N-1 in sorted order.
label_map = dict(zip(sorted(tagset), range(len(tagset))))
print(label_map)

{'EX', 'RP', '#', 'NN', ',', 'JJR', 'PDT', '-LRB-', 'VBP', 'LS', 'SYM', 'NNPS', 'VBN', 'WDT', 'PRP$', 'JJ', ':', 'VBG', 'WP', 'VBZ', 'RB', 'DT', '-RRB-', '``', 'RBR', 'IN', 'VBD', 'POS', '$', 'UH', 'FW', 'PRP', "''", 'TO', 'RBS', 'JJS', 'CD', '.', 'VB', 'NNP', 'MD', 'WRB', 'CC', 'WP$', 'NNS'}
{'#': 0, '$': 1, "''": 2, ',': 3, '-LRB-': 4, '-RRB-': 5, '.': 6, ':': 7, 'CC': 8, 'CD': 9, 'DT': 10, 'EX': 11, 'FW': 12, 'IN': 13, 'JJ': 14, 'JJR': 15, 'JJS': 16, 'LS': 17, 'MD': 18, 'NN': 19, 'NNP': 20, 'NNPS': 21, 'NNS': 22, 'PDT': 23, 'POS': 24, 'PRP': 25, 'PRP$': 26, 'RB': 27, 'RBR': 28, 'RBS': 29, 'RP': 30, 'SYM': 31, 'TO': 32, 'UH': 33, 'VB': 34, 'VBD': 35, 'VBG': 36, 'VBN': 37, 'VBP': 38, 'VBZ': 39, 'WDT': 40, 'WP': 41, 'WP$': 42, 'WRB': 43, '``': 44}


## Feature Map

### Exercise

Given `trn_data`, create a dictionary whose keys are features and whose values are unique IDs.
The following features need to be extracted for each token $w_i$:

* $w_i$: the word form of the current token
* $w_{i-1}$: the word form of the previous token
* $w_{i+1}$: the word form of the next token
* $p_{i-1}$: the (predicted) POS tag of the previous token

In [101]:
feature_map = {}

for sentence in trn_data:
    last = len(sentence) - 1

    for i, curr in enumerate(sentence):
        # Neighbouring tokens, or None at the sentence boundaries.
        prev = sentence[i - 1] if i > 0 else None
        succ = sentence[i + 1] if i < last else None

        # Feature strings: 'f0' current word, 'f1' previous word,
        # 'f2' next word, 'f3' previous POS tag.
        names = ['f0' + curr[0]]
        if prev:
            names.append('f1' + prev[0])
        if succ:
            names.append('f2' + succ[0])
        if prev:
            names.append('f3' + prev[1])

        # IDs start at 1; a feature keeps the ID it received when first seen.
        for name in names:
            if name not in feature_map:
                feature_map[name] = len(feature_map) + 1

In [104]:
sentence = trn_data[0]
print(sentence)

# IDs of 'Vinken' as the current (f0), previous (f1), and next (f2) word.
for prefix in ('f0', 'f1', 'f2'):
    print(feature_map[prefix + 'Vinken'])

[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')]
3
8
2


## Create Instances

Create an instance per token, where each instance is a tuple of the feature vector and the label ID for the corresponding token:

In [127]:
from typing import List, Tuple
import numpy as np

def extract_instances(data: List[List[Tuple[str, str]]], label_map, feature_map, training=False):
    """Turn every token in ``data`` into a ``(feature_ids, label_id)`` pair.

    :param data: a list of sentences, each a list of ``(word, tag)`` tuples.
    :param label_map: dictionary from tag to label ID; extended in place when ``training`` is true.
    :param feature_map: dictionary from feature string to feature ID; extended in place
                        when ``training`` is true.
    :param training: if true, unseen tags and features receive new IDs; otherwise tokens
                     with an unknown tag are skipped and unknown features are dropped.
    :return: a list of ``(x, y)`` tuples where ``x`` is a sorted numpy array of feature IDs
             that always contains 0, and ``y`` is the label ID.
    """
    instances = []

    for sentence in data:
        last = len(sentence) - 1

        for idx, token in enumerate(sentence):
            # label ID: assigned on demand in training, looked up otherwise
            if training:
                label = label_map.setdefault(token[1], len(label_map))
            else:
                label = label_map.get(token[1], -1)
            if label < 0:
                continue

            # neighbouring tokens, or None at the sentence boundaries
            prev_tok = sentence[idx - 1] if idx > 0 else None
            next_tok = sentence[idx + 1] if idx < last else None

            # feature strings: current word, previous word, next word, previous tag
            names = ['f0' + token[0]]
            if prev_tok:
                names.append('f1' + prev_tok[0])
            if next_tok:
                names.append('f2' + next_tok[0])
            if prev_tok:
                names.append('f3' + prev_tok[1])

            # feature IDs: new features get the next ID (starting at 1) only in training
            if training:
                ids = []
                for name in names:
                    ids.append(feature_map.setdefault(name, len(feature_map) + 1))
            else:
                ids = [feature_map[name] for name in names if name in feature_map]

            # ID 0 is added to every instance
            ids.append(0)
            ids.sort()
            instances.append((np.array(ids), label))

    return instances

In [122]:
# Start from empty maps; extract_instances fills both while it walks the training data.
label_map, feature_map = {}, {}

trn_inst = extract_instances(trn_data, label_map, feature_map, training=True)

In [128]:
sentence = trn_data[0]

# Show each token of the first sentence next to the instance at the same index.
for i, token in enumerate(sentence):
    print(token, trn_inst[i])

('Pierre', 'NNP') (array([0, 1, 2]), 0)
('Vinken', 'NNP') (array([0, 3, 4, 5, 6]), 0)
(',', ',') (array([0, 6, 7, 8, 9]), 1)
('61', 'CD') (array([ 0, 10, 11, 12, 13]), 2)
('years', 'NNS') (array([ 0, 14, 15, 16, 17]), 3)
('old', 'JJ') (array([ 0,  5, 18, 19, 20]), 4)
(',', ',') (array([ 0,  7, 21, 22, 23]), 1)
('will', 'MD') (array([ 0, 11, 13, 24, 25]), 5)
('join', 'VB') (array([ 0, 26, 27, 28, 29]), 6)
('the', 'DT') (array([ 0, 30, 31, 32, 33]), 7)
('board', 'NN') (array([ 0, 34, 35, 36, 37]), 8)
('as', 'IN') (array([ 0, 38, 39, 40, 41]), 9)
('a', 'DT') (array([ 0, 42, 43, 44, 45]), 7)
('nonexecutive', 'JJ') (array([ 0, 37, 46, 47, 48]), 4)
('director', 'NN') (array([ 0, 23, 49, 50, 51]), 8)
('Nov.', 'NNP') (array([ 0, 41, 52, 53, 54]), 0)
('29', 'CD') (array([ 0,  6, 55, 56, 57]), 2)
('.', '.') (array([ 0, 17, 58, 59]), 10)
