In [312]:
import re
from collections import Counter

import nltk
import numpy as np
from nltk.tokenize import word_tokenize
# Download NLTK's 'punkt' tokenizer package into the local nltk_data directory;
# word_tokenize (imported above) relies on it.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/dieumynguyen/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

### <b>Training Data:</b>

POS-tagged data from Berkeley Restaurant corpus. ~15,000 sentences in corpus. 

Assume: 1) The POS tagset is closed. 2) New (unseen) words will occur in the test set. 

File format: Sentences are arranged as 1 word per line with blank line separating the sentences. Columns are tab separated. 1st col is word position, 2nd col is word, and 3rd col is POS tag. 

In [4]:
# Path to the POS-tagged training file (one "position<TAB>word<TAB>tag" line
# per token, blank line between sentences).
training_fname = 'berp-POS-training.txt'

In [92]:
# Load the whole training file into memory, one list element per raw line
# (trailing newlines are kept; blank lines mark sentence boundaries).
with open(training_fname) as training_file:
    training_data = list(training_file)

# Peek at the first three sentences, including the blank separator lines.
training_data[:21]

['1\ti\tPRP\n',
 "2\t'd\tMD\n",
 '3\tlike\tVB\n',
 '4\tto\tTO\n',
 '5\tgo\tVB\n',
 '6\tto\tIN\n',
 '7\ta\tDT\n',
 '8\tfancy\tJJ\n',
 '9\trestaurant\tNN\n',
 '10\t.\t.\n',
 '\n',
 '1\ti\tPRP\n',
 "2\t'd\tMD\n",
 '3\tlike\tVB\n',
 '4\tfrench\tJJ\n',
 '5\tfood\tNN\n',
 '6\t.\t.\n',
 '\n',
 '1\tnext\tJJ\n',
 '2\tthursday\tNN\n',
 '3\t.\t.\n']

### <b>Evaluation:</b>

A basic script is provided that calculates overall accuracy against a gold-standard evaluation set. 

``` python eval-pos.py  gold-file system-file ```

Produce a confusion matrix for a more useful evaluation tool. 

### <b>Task: Build a probabilistic tagger</b>

#### <b>1) Baseline system:</b> 
Implement a "most frequent tag" system. Given counts from the training data, the tagger should simply assign to each input word the tag that was most frequently assigned to it in the training data. 

In [142]:
# Build (word, POS) pairs from the training lines.
# Each non-blank line is "position<TAB>word<TAB>tag"; blank lines separate
# sentences and carry no token, so they are skipped. Testing the stripped
# line (rather than comparing to '\n') also skips whitespace-only or '\r\n'
# separators, which would otherwise fail on the column lookups below.
tups_list = []
for line in training_data:
    stripped = line.strip()
    if not stripped:
        continue
    split_line = stripped.split('\t')
    tups_list.append((split_line[1], split_line[2]))

# Count each unique (word, POS) pair in a single pass. The previous
# list.count()-per-unique-pair approach rescanned the whole list for every
# pair, which is quadratic on a ~15k-sentence corpus.
# dict(...) keeps count_set a plain dict, as before.
count_set = dict(Counter(tups_list))

# Flatten into rows of [word, POS, count]. Rows now follow first-occurrence
# order in the training data, so the result is the same on every run.
word_pos_count = [[word, pos, count] for (word, pos), count in count_set.items()]

In [306]:
input_word = '\'s'

# Collect every [word, POS, count] row recorded for the input word.
matching_l = [entry for entry in word_pos_count if entry[0] == input_word]

print('{} \n'.format(matching_l))

if not matching_l:
    # Word never seen in training: fall back to the placeholder tag 'UNK'.
    most_frequent_tag = 'UNK'
else:
    # Pick the tag with the highest count. The earlier hand-written loop never
    # updated its running maximum, so its answer depended on the order of the
    # rows (e.g. [PRP 426, POS 1113, VBZ 478] produced 'VBZ'). max() is
    # order-independent; on an exact tie it keeps the first row encountered.
    most_frequent_tag = max(matching_l, key=lambda entry: entry[2])[1]

print(most_frequent_tag)

[["'s", 'POS', 1113], ["'s", 'PRP', 426], ["'s", 'VBZ', 478]] 

POS


In [335]:
# Sample sentence with clitics ('d, 's) and a hyphenated word, used to compare
# tokenization strategies in the following cells.
input_sentence = "i'd zachary's a-la-carte."

In [336]:
# Naive split on every non-word character; the capture group keeps the
# separators in the result. Raw string: '\W' in a normal string literal is an
# invalid escape sequence and triggers a warning on recent Python versions.
re.split(r'(\W)', input_sentence)

['i',
 "'",
 'd',
 ' ',
 'zachary',
 "'",
 's',
 ' ',
 'a',
 '-',
 'la',
 '-',
 'carte',
 '.',
 '']

In [337]:
# Tokenize with NLTK first (keeps clitics such as "'d" and "'s" intact).
split_sentence = word_tokenize(input_sentence)
print(split_sentence)

# Further break up hyphenated tokens and build a flat token list as we go.
flat_tokens = []
for token_i, token in enumerate(split_sentence):
    if '-' in token:
        # The capture group keeps the separators as their own tokens. Drop the
        # empty strings re.split emits when a token starts or ends with a
        # non-word character (e.g. 'a-' -> ['a', '-', '']).
        split_token = [piece for piece in re.split(r'(\W)', token) if piece]
        print(split_token)
        # Keep updating split_sentence in place, as before, in case a later
        # cell reads the nested form.
        split_sentence[token_i] = split_token
        flat_tokens.extend(split_token)
    else:
        flat_tokens.append(token)

# Plain list of str, flattened without numpy: np.hstack on a ragged mix of
# strings and lists returns numpy string scalars rather than Python strings.
flat_tokens

['i', "'d", 'zachary', "'s", 'a-la-carte', '.']
['a', '-', 'la', '-', 'carte']


['i', "'d", 'zachary', "'s", 'a', '-', 'la', '-', 'carte', '.']