# Parts of Speech: Creating Vocab and Handling Unknown Words
- Create vocab from tagged dataset
- Read text files
- Work with defaultdict
- Work with string data

In [1]:
import string
from collections import defaultdict

In [3]:
# Load every line of the tagged corpus file into the `lines` list
with open("WSJ_02-21.pos") as pos_file:
    lines = list(pos_file)

In [4]:
# Show a column header, then the first five raw lines of the file
print("\t\tWord", "\tTag\n")
for line_number in range(1, 6):
    print(f'line number {line_number}: {lines[line_number - 1]}')

		Word 	Tag

line number 1: In	IN

line number 2: an	DT

line number 3: Oct.	NNP

line number 4: 19	CD

line number 5: review	NN



If you want to understand the meaning of these tags you can take a look [here](https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html).

### Creating a Vocabulary
- Get only the words from dataset
- Use a defaultdict to count the number of times each word appears
- Filter the dict to only include words that appeared at least 2 times
- Create a list out of the filtered dict
- Sort the list

In [5]:
# Keep only the text before the first tab of every line (the word column)
words = [entry.partition('\t')[0] for entry in lines]

In [6]:
# Tally how many times each word occurs in the dataset;
# defaultdict(int) starts every unseen key at 0
freq = defaultdict(int)
for token in words:
    freq[token] = freq[token] + 1

In [7]:
# Build the vocabulary: keys counted at least twice, excluding the '\n' key
vocab = [
    token
    for token, count in freq.items()
    if count >= 2 and token != '\n'
]

In [9]:
# Sort the vocabulary in place
vocab.sort()

# Show five consecutive entries from a fixed position in the sorted list
for position in range(4000, 4005):
    print(vocab[position])

Early
Earnings
Earth
Earthquake
East


### Processing New Text Sources
- A new text source will contain words that do not appear in the vocabulary
- Classify each such unknown word with one of the following tokens:
- --unk_digit-- Unknown word with a digit
- --unk_punct-- Unknown word with a punctuation character
- --unk_upper-- Unknown word with an upper-case character
- --unk_noun-- Noun
- --unk_verb-- Verb
- --unk_adj-- Adjective
- --unk_adv-- Adverb
- --unk-- Unknown word that does not fall into any of the categories above

In [10]:
def assign_unk(word):
    """
    Assign a placeholder token to a word that is not in the vocabulary.

    The checks run in order and the first match wins: digit, punctuation,
    upper case, then noun / verb / adjective / adverb suffixes.

    Args:
        word (str): the unknown word to classify.

    Returns:
        str: one of "--unk_digit--", "--unk_punct--", "--unk_upper--",
        "--unk_noun--", "--unk_verb--", "--unk_adj--", "--unk_adv--",
        or "--unk--" when none of the rules applies.
    """
    # Punctuation characters, try printing them out in a new cell
    punct = set(string.punctuation)

    # Suffixes, stored as tuples so str.endswith can test a whole group at once
    noun_suffix = ("action", "age", "ance", "cy", "dom", "ee", "ence", "er", "hood", "ion", "ism", "ist", "ity", "ling", "ment", "ness", "or", "ry", "scape", "ship", "ty")
    verb_suffix = ("ate", "ify", "ise", "ize")
    adj_suffix = ("able", "ese", "ful", "i", "ian", "ible", "ic", "ish", "ive", "less", "ly", "ous")
    adv_suffix = ("ward", "wards", "wise")

    # Any digit anywhere in the word
    if any(char.isdigit() for char in word):
        return "--unk_digit--"

    # Any punctuation character anywhere in the word
    if any(char in punct for char in word):
        return "--unk_punct--"

    # Any upper-case character anywhere in the word
    if any(char.isupper() for char in word):
        return "--unk_upper--"

    # Word ends with one of the noun suffixes
    if word.endswith(noun_suffix):
        return "--unk_noun--"

    # Word ends with one of the verb suffixes
    if word.endswith(verb_suffix):
        return "--unk_verb--"

    # Word ends with one of the adjective suffixes
    if word.endswith(adj_suffix):
        return "--unk_adj--"

    # Word ends with one of the adverb suffixes
    if word.endswith(adv_suffix):
        return "--unk_adv--"

    # If none of the previous criteria is met, return plain unknown
    return "--unk--"


In [11]:
### Tag the Word
def get_word_tag(line, vocab):
    """
    Turn one raw dataset line into a (word, tag) pair.

    Args:
        line (str): a line holding a word and its tag separated by
            whitespace, or a line with no fields at all.
        vocab: container of known words, tested with `in`.

    Returns:
        tuple: ("--n--", "--s--") when the line has no fields; otherwise
        (word, tag), where a word missing from `vocab` is replaced by the
        token returned from assign_unk.

    Raises:
        ValueError: if the line has fields but not exactly two of them.
    """
    fields = line.split()

    # Nothing but whitespace on the line: return the placeholder pair
    if not fields:
        return "--n--", "--s--"

    # Exactly two fields are expected: the word and its tag
    word, tag = fields

    # Replace out-of-vocabulary words with their unknown-word token
    if word not in vocab:
        word = assign_unk(word)

    return word, tag

In [12]:
# A line with no fields yields the placeholder word/tag pair
get_word_tag('\n', vocab)

('--n--', '--s--')

In [13]:
# Per the recorded output, 'In' is found in vocab, so the word is returned unchanged with its tag
get_word_tag('In\tIN\n', vocab)

('In', 'IN')

In [14]:
# Per the recorded output, 'tardigrade' is not in vocab and matches no specific rule in assign_unk
get_word_tag('tardigrade\tNN\n', vocab)

('--unk--', 'NN')

In [15]:
# Per the recorded output, 'scrutinize' is not in vocab; it ends with the verb suffix "ize"
get_word_tag('scrutinize\tVB\n', vocab)

('--unk_verb--', 'VB')