In [1]:
import string
from collections import Counter, defaultdict

In [7]:
# Directory that holds the corpus files, given relative to the current working directory.
DATA = '../../../../data'

In [8]:
# Read the whole training corpus into memory, one list element per raw line
# (trailing newlines are kept by readlines()).
# The encoding is given explicitly so decoding does not depend on the
# platform's locale default. NOTE(review): assumes the file is ASCII/UTF-8.
with open(f'{DATA}/WSJ_02-21.pos', 'r', encoding='utf-8') as f:
    lines = f.readlines()

In [9]:
# Preview the first five raw corpus lines under a Word/Tag header.
# Every element of `lines` still ends with its own newline, so print() adds a
# second one and the output shows a blank line after each entry.
print("\t\tWord", "\tTag\n")
for i in range(5):
    print(f'line number {i + 1}: {lines[i]}')

		Word 	Tag

line number 1: In	IN

line number 2: an	DT

line number 3: Oct.	NNP

line number 4: 19	CD

line number 5: review	NN



In [10]:
# Show the repr of the first raw line to expose its separator and line ending.
lines[0]

'In\tIN\n'

In [11]:
# Keep only the text before the first tab of every line. A line without a tab
# (e.g. a blank '\n' line) is kept whole, so '\n' shows up as a "word" here.
words = [raw_line.split('\t', 1)[0] for raw_line in lines]

In [12]:
# Count how many times each word occurs in the corpus.
# Counter is a dict subclass that returns 0 for missing keys, so lookups and
# `freq[word] += 1` keep working exactly as they did with defaultdict(int).
freq = Counter(words)

In [14]:
# Vocabulary: every word seen more than once, in sorted order. The '\n' entry
# produced by blank corpus lines is excluded explicitly.
vocab = sorted(
    token
    for token, count in freq.items()
    if count > 1 and token != '\n'
)

In [15]:
# Print five consecutive vocabulary entries (indices 4000-4004) as a spot check.
for i in range(4000, 4005):
    print(vocab[i])

Early
Earnings
Earth
Earthquake
East


In [16]:
def assign_unk(word):
    '''
    Assign a placeholder token to an unknown (out-of-vocabulary) word.

    The checks run in a fixed priority order and the first match wins:
      1. contains a digit                  -> "--unk_digit--"
      2. contains an ASCII punctuation char -> "--unk_punct--"
      3. contains an uppercase character   -> "--unk_upper--"
      4. ends with a noun suffix           -> "--unk_noun--"
      5. ends with an adverb suffix        -> "--unk_adv--"
      6. ends with a verb suffix           -> "--unk_verb--"
      7. ends with an adjective suffix     -> "--unk_adj--"
    Anything else, including the empty string, yields "--unk--".

    Args:
        word (str): the word to classify.

    Returns:
        str: one of the "--unk...--" tokens listed above.
    '''
    noun_suffix = (
        "action", "age", "ance", "cy", "dom", "ee", "ence", "er", "hood",
        "ion", "ism", "ist", "ity", "ling", "ment", "ness", "or", "ry",
        "scape", "ship", "ty")
    verb_suffix = ("ate", "ify", "ise", "ize")
    adj_suffix = ("able", "ese", "ful", "i", "ian", "ible", "ic", "ish",
                  "ive", "less", "ly", "ous")
    adv_suffix = ("ward", "wards", "wise")

    if any(char.isdigit() for char in word):
        return "--unk_digit--"
    if any(char in string.punctuation for char in word):
        return "--unk_punct--"
    if any(char.isupper() for char in word):
        return "--unk_upper--"

    # str.endswith accepts a tuple and returns True if any suffix matches.
    if word.endswith(noun_suffix):
        return "--unk_noun--"
    # Adverbs are tested before verbs: every word ending in "wise" also ends
    # in the verb suffix "ise", so with verbs first the "wise" rule could
    # never fire. No other suffix pair across categories overlaps this way.
    if word.endswith(adv_suffix):
        return "--unk_adv--"
    if word.endswith(verb_suffix):
        return "--unk_verb--"
    if word.endswith(adj_suffix):
        return "--unk_adj--"
    return "--unk--"

In [17]:
def get_word_tag(line, vocab):
    '''
    Turn one raw corpus line into a (word, tag) pair.

    A line with no non-whitespace content yields the placeholder pair
    ("--n--", "--s--"). Otherwise the line must split on whitespace into
    exactly two fields (word, tag); a word that is not in `vocab` is replaced
    by the token returned from assign_unk(word).

    Args:
        line (str): raw line, e.g. 'In\\tIN\\n'.
        vocab: container of known words, tested with `in`.

    Returns:
        tuple: (word, tag).

    Raises:
        ValueError: if a non-blank line does not split into exactly two fields.
    '''
    fields = line.split()
    if fields:
        word, tag = fields
        known = word in vocab
        return (word if known else assign_unk(word)), tag
    return "--n--", "--s--"

In [18]:
# A line with no non-whitespace content yields the placeholder pair ('--n--', '--s--').
get_word_tag('\n', vocab)

('--n--', '--s--')

In [19]:
# A word that is in vocab comes back unchanged, together with its tag.
get_word_tag('In\tIN\n', vocab)

('In', 'IN')

In [20]:
# A word that is not in vocab and matches no rule in assign_unk falls back to '--unk--'.
get_word_tag('tardigrade\tNN\n', vocab)

('--unk--', 'NN')

In [21]:
# A word that is not in vocab and ends in the verb suffix "ize" becomes '--unk_verb--'.
get_word_tag('scrutinize\tVB\n', vocab)

('--unk_verb--', 'VB')