In [1]:
import numpy as np
import pandas as pd
from collections import Counter


### Note: for unseen words, baseline should output most frequent tag.

### <b>Training Data:</b>

POS-tagged data from Berkeley Restaurant corpus. ~15,000 sentences in corpus. 

Assume: 1) POS tagset is closed. 2) New words will occur in testset. 

File format: Sentences are arranged as 1 word per line with blank line separating the sentences. Columns are tab separated. 1st col is word position, 2nd col is word, and 3rd col is POS tag. 

In [44]:
training_fname = 'training_set_shuffled.txt'

In [46]:
with open(training_fname, 'r') as file:
    training_data = file.readlines()

### <b>Evaluation:</b>

Basic script is provided and calculates overall accuracy compared to a gold standard eval set. 

``` python eval-pos.py  gold-file system-file ```

Produce a confusion matrix for more useful tool. 

### <b>Task: Build a probabilistic tagger</b>

#### <b>1) Baseline system:</b> 
Implement a "most frequent tag" system. Given counts from training data, the tagger should simply assign to each input word the tag that it was most frequently assigned to in the training data. 

In [47]:
def get_word_pos_count(training_data):
    '''
    Given a training set, create a list of lists as lookup table.
    Each inside list is a word, its POS, and the frequency of this pairing.
    '''
    
    # Create list of tuple: (word, POS)
    tups_list = []
    for line in training_data:
        if line != '\n':
            split_line = line.strip().split('\t')
            word_pos = split_line[1], split_line[2]
            tups_list.append(word_pos)

    # Get count of each unique tuple
    count_set = dict((x, tups_list.count(x)) for x in set(tups_list))

    # Create list of [word, POS, count]
    word_pos_count = []
    for k in count_set.keys():
        k_list = list(k)
        k_list.append(count_set[k])
        word_pos_count.append(k_list)
    
    return word_pos_count


In [50]:
def get_most_frequent_tag(word_pos_count, input_word):
    '''
    Given word_pos_count from training data & an input word, 
    get the most frequent POS by 
    calling the function get_word_pos_count() to get the lookup table
    and match input word to most frequent POS.
    
    Return most frequent tag for that word.
    '''
    
    # Get word_pos_count lookup table
    # word_pos_count = get_word_pos_count(training_data)
    
    # Get max word_pos_count's tag
    counts = []
    for i, word_pos in enumerate(word_pos_count):
        counts.append(word_pos_count[i][2])
    most_freq = word_pos_count[np.argmax(counts)][1]
    
    # Matching input word to possible word-tag-count lists
    matching_l = []
    for l in word_pos_count:    
        if l[0] == input_word:
            matching_l.append(l)

    # print('{} \n'.format(matching_l))

    # Find most frequent POS tag
    most_frequent_tag = ''
    # Dealing with unseen words (matching_l: []), assign POS 'UNK'
    if len(matching_l) == 0:
        most_frequent_tag = most_freq
    else:
        # Find max count and tag that POS to word
        max_count = matching_l[0][2] # Temporary max
        for match in matching_l:
            if match[2] > max_count:
                most_frequent_tag = match[1]
            elif match[2] == max_count:
                most_frequent_tag = match[1]
                
    return most_frequent_tag

In [51]:
# Test function
input_word = 'food'
get_most_frequent_tag(word_pos_count, input_word)

'NN'

In [None]:
# Process the short test file
with open('test_set_shuffled.txt', 'r') as test_file:
    test_data = test_file.readlines()

baseline_output = []
for line in test_data:
    if line != "\n":
        line = line.split()
        word = line[1]
        tag = get_most_frequent_tag(word_pos_count, word)
        baseline_output.append(tag)

In [None]:
baseline_output

In [457]:
new_file = ''
for line in test_data:
    if line != '\n':
        split_line = line.strip().split('\t')
        word = split_line[1]
        tag = get_most_frequent_tag(word_pos_count, word)
        # print(word, tag)
        new_file += '{}\t{}\t{}\n'.format(split_line[0], word, tag)
    elif line == '\n':
        new_file += '\n'

In [456]:
# Write new_file with POs tags to a file
with open('baseline_test.txt', 'w') as baseline:
    pass # Empty content before writing
    baseline.write(new_file)

#### <b>2) Viterbi algorithm:</b>  

Implement Viterbi with a bigram-based approach (only need previous to infer current). 
1. Extract required counts from training data to generate required probability estimates for model.
2. Deal with unknown words in some sensible way: UNK
3. Do some form of smoothing for the bigram tag model: Add 1
4. Implement Viterbi decoder.
5. Evaluate performance on unseen data.

<u>Step 1. Create state transition probability matrix</u>

In [782]:
short_train = training_data[0:7]
short_train[0]

'1\ti\tPRP\n'

In [772]:
# Create ordered list of POS & observed tokens
pos_array = []
token_array = []

# Get list of POS that are at position 1 in sentences
init_array = []

num_sentences = 0

for line in short_train:
    if line != '\n':
        split_line = line.strip().split('\t')
        pos = split_line[2]
        token = split_line[1]
        pos_array.append(pos)
        token_array.append(token)
        
        if split_line[0] == '1':
            init_array.append(split_line[2])
            num_sentences += 1
            
init_array, num_sentences, pos_array

(['PRP'], 1, ['PRP', 'MD', 'VB', 'TO', 'VB', 'IN', 'DT'])

In [771]:
transition_matrix = pd.crosstab(pd.Series(pos_array[:], name='given:'),
                                pd.Series(pos_array[:], name='followed:'), 
                                normalize=0)

transition_matrix

followed:,DT,IN,MD,PRP,TO,VB
given:,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
DT,1.0,0.0,0.0,0.0,0.0,0.0
IN,0.0,1.0,0.0,0.0,0.0,0.0
MD,0.0,0.0,1.0,0.0,0.0,0.0
PRP,0.0,0.0,0.0,1.0,0.0,0.0
TO,0.0,0.0,0.0,0.0,1.0,0.0
VB,0.0,0.0,0.0,0.0,0.0,1.0


In [775]:
def transition_matrix(transitions):
    n = len(set(transitions)) #number of states

    M = [[0]*n for _ in range(n)]

    for (i,j) in zip(transitions,transitions[1:]):
        M[i][j] += 1

    #now convert to probabilities:
    for row in M:
        s = sum(row)
        if s > 0:
            row[:] = [f/s for f in row]
    return M

m = transition_matrix(pos_array)
for row in m: print(' '.join('{0:.2f}'.format(x) for x in row))


TypeError: list indices must be integers or slices, not str

In [755]:
# transition_matrix_np = transition_matrix.reset_index().values
# transition_matrix_np

<u>Step 2. Create emission / observation likelihood matrix</u>

In [757]:
emission_matrix = pd.crosstab(pd.Series(pos_array[:], name="tag:"),
                              pd.Series(token_array, name="word:"), 
                              normalize=0)
print(emission_matrix)
emission_matrix.at['VB','go']

word:   'd    a   go    i  like   to
tag:                                
DT     0.0  1.0  0.0  0.0   0.0  0.0
IN     0.0  0.0  0.0  0.0   0.0  1.0
MD     1.0  0.0  0.0  0.0   0.0  0.0
PRP    0.0  0.0  0.0  1.0   0.0  0.0
TO     0.0  0.0  0.0  0.0   0.0  1.0
VB     0.0  0.0  0.5  0.0   0.5  0.0


0.5

In [745]:
emission_matrix_np = emission_matrix.reset_index().values
# emission_matrix_np

### \***** smooth with Laplace add-1 smoothing for emission probs.

<u>Step 3. Create initial probability vector: prob of a POS follwing start of sentence</u>

In [738]:
init_vector = {}
for pos in list(set(pos_array)):
    init_vector[pos] = init_array.count(pos) / num_sentences
init_vector

{'MD': 0.0, 'PRP': 1.0, 'TO': 0.0, 'VB': 0.0}

<u>Step 3. Create table where columns are observations (all sentences in order) and rows are possible hidden states. 
    
Step 4. Sweep through table and find max prob and path. </u>

In [725]:
input_seq = ['i', "'d", 'like', 'food'] # emissions/seq - aka x
Q =  list(set(pos_array)) # set of states
'''
Given a sequence of emissions, return the most probably path and 
its joint probability.
'''

nrow, ncol = len(Q), len(x)+1
mat = np.zeros(shape=(nrow, ncol), dtype=float) # prob table
matTb = np.zeros(shape=(nrow, ncol), dtype=int) # backtrace

# Fill in 1st column of mat: P(POS|start) * P(word|POS)
start_col = []
for i, pos in enumerate(Q):
    print(pos)
    mat[i, 0] = init_vector[pos] * emission_matrix.at[pos, input_seq[0]]
mat

DT
NN
PRP
.
TO
MD
IN
VB
JJ


array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.66666667, 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ]])

In [715]:
# Fill in the rest of mat table
for j, token in enumerate(input_seq):
#     print(j, token)    
    one_token_all_states = []
    for i, pos in enumerate(Q):
        # Probability of first word "i" at each pos = mat[i, 0] * P(word|POS)
        # Starts filling in mat at j=1 now
        
        one_token_all_states.append(start_col[i] * emission_matrix.at[pos, token])
        
        # mat[i, j+1] = mat[i, j] * emission_matrix.at[pos, token]
#         for i2, pos2 in enumerate(Q):
            
#     * emission_matrix.at[pos, x[i]]

#     emission_matrix[i, x[0]] * init_vector[i]
    print(one_token_all_states)

[0.0, 0.0, 0.6666666666666666, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1111111111111111]
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


In [453]:
# Try on a test file
with open('testfile.txt', 'w') as the_file:
    pass # Empty content before writing
    the_file.write("1\ti\n")
    the_file.write("2\t'd\n")
    the_file.write("3\tlike\n")
    the_file.write("4\tzachary\n")
    the_file.write("5\t's\n")
    the_file.write("6\ta\n")
    the_file.write("7\t-\n")
    the_file.write("8\tla\n")
    the_file.write("9\t-\n")
    the_file.write("10\tcarte\n")
    the_file.write("11\t.\n")
    the_file.write('\n')
    the_file.write("1\ti\n")
    the_file.write("2\tunseen\n")

In [361]:
# # Normalize needed?
# input_sentence = "i'd i've zachary's a-la-carte at 11 am."

# split_sentence = word_tokenize(input_sentence)
# print(split_sentence)

# for token_i, token in enumerate(split_sentence):
#     if ('-' in token):
#         split_token = re.split('(\W)', token)
#         print(split_token)
#         split_sentence[token_i] = split_token
        
# list(np.hstack(split_sentence))