In [5]:
import numpy as np
from collections import defaultdict
import itertools
from itertools import product

<b> Step 1. Calculate transition and emission probabilities </b>

In [None]:
def build_words_tags_sets(data):
    """Collect the tag inventory, the word inventory and the running word list.

    Args:
        data: iterable of raw corpus lines. Every line other than a bare
            newline is stripped and split on tabs; field 1 is taken as the
            word and field 2 as the tag.

    Returns:
        (set_of_tags, set_of_words, total_list_of_words) where the tag set is
        pre-seeded with the boundary markers "<s>" and "</s>", the word set is
        pre-seeded with the "UNK" class, and the list holds every word in
        corpus order (duplicates kept) so that occurrences can be counted.
    """
    tag_inventory = {"<s>", "</s>"}
    word_inventory = {"UNK"}
    words_in_order = []

    for raw_line in data:
        # A bare newline separates sentences and carries no word/tag pair.
        if raw_line == '\n':
            continue
        fields = raw_line.strip().split('\t')
        tag_inventory.add(fields[2])
        word_inventory.add(fields[1])
        words_in_order.append(fields[1])

    return tag_inventory, word_inventory, words_in_order

In [None]:
def build_lists_and_probabilities(train_data, set_of_tags, set_of_words, unk_count):
    """Count the training corpus and derive smoothed HMM probabilities.

    Args:
        train_data: iterable of raw corpus lines; every line other than a bare
            newline is stripped and split on tabs, field 1 being the word and
            field 2 the tag.
        set_of_tags: tag inventory; used to enumerate every tag/tag and
            word/tag combination so that unseen pairs get an explicit 0 count.
        set_of_words: word inventory (same purpose as ``set_of_tags``).
        unk_count: number of occurrences to assign to the "UNK" word class.

    Returns:
        (list_of_tags, list_of_words, word_count, transition_probs,
        emission_probs) where the two lists are the sorted inventories,
        ``word_count`` maps word -> count, ``transition_probs`` is keyed by
        (previous_tag, tag) and ``emission_probs`` by (word, tag).
    """
    word_tag_count = defaultdict(int)  # C(w, t), keyed by (word, tag)
    word_count = defaultdict(int)      # C(w)
    tag_count = defaultdict(int)       # C(t)
    tag_tag_count = defaultdict(int)   # C(t_{i-1}, t_i)

    tag_sequence = ["<s>"]  # tags of the sentence currently being read
    tag_sequence_count = 0  # number of completed sentences

    for line in train_data:
        if line == '\n':
            continue
        word_tag = line.strip().split('\t')
        word, tag = word_tag[1], word_tag[2]

        word_tag_count[(word, tag)] += 1
        word_count[word] += 1
        # Each occurrence of a tag counts exactly once.
        tag_count[tag] += 1

        tag_sequence.append(tag)
        # A "." token closes the sentence: count its tag bigrams, then reset.
        if word == ".":
            tag_sequence.append("</s>")
            for prev_tag, next_tag in zip(tag_sequence, tag_sequence[1:]):
                tag_tag_count[(prev_tag, next_tag)] += 1
            tag_sequence = ["<s>"]
            tag_sequence_count += 1

    # Every completed sentence contributes one start and one end marker.
    tag_count["<s>"] = tag_sequence_count
    tag_count["</s>"] = tag_sequence_count

    # The UNK class gets the count supplied by the caller (the parameter,
    # not a notebook-level global).
    word_count["UNK"] = unk_count

    # Give every possible (word, tag) and (tag, tag) pair an explicit entry so
    # that smoothing assigns probability mass to unseen combinations as well.
    for word_tag_combo in product(set_of_words, set_of_tags):
        word_tag_count.setdefault(word_tag_combo, 0)
    for tag_tag_combo in product(set_of_tags, set_of_tags):
        tag_tag_count.setdefault(tag_tag_combo, 0)

    # Laplace-smoothed estimates; calculate_transition / calculate_emission are
    # the unsmoothed alternatives.
    transition_probs = laplace_smooth_transition(tag_tag_count, tag_count, word_count)
    emission_probs = laplace_smooth_emission(word_tag_count, tag_count, word_count)

    list_of_tags = sorted(set_of_tags)
    list_of_words = sorted(set_of_words)

    return list_of_tags, list_of_words, word_count, transition_probs, emission_probs


In [None]:
def calculate_transition(tag_tag_count, tag_count):
    """Unsmoothed maximum-likelihood transition probabilities.

    P(t_i | t_{i-1}) = C(t_{i-1}, t_i) / C(t_{i-1})

    Args:
        tag_tag_count: mapping (previous_tag, tag) -> bigram count.
        tag_count: mapping tag -> count.

    Returns:
        dict keyed like ``tag_tag_count`` holding the probability of each
        bigram given its first tag.
    """
    return {
        bigram: bigram_total / tag_count[bigram[0]]
        for bigram, bigram_total in tag_tag_count.items()
    }

In [None]:
def calculate_emission(word_tag_count, tag_count):
    """Unsmoothed maximum-likelihood emission probabilities.

    P(w | t) = C(w, t) / C(t)

    Args:
        word_tag_count: mapping (word, tag) -> pair count.
        tag_count: mapping tag -> count.

    Returns:
        dict keyed like ``word_tag_count``; pairs with a zero count map to 0.
    """
    return {
        pair: pair_total / tag_count[pair[1]]
        for pair, pair_total in word_tag_count.items()
    }

In [None]:
def laplace_smooth_transition(tag_tag_count, tag_count, word_count):
    """Add-one (Laplace) smoothed transition probabilities.

    P(t_i | t_{i-1}) = (C(t_{i-1}, t_i) + 1) / (C(t_{i-1}) + T)

    where T is the number of distinct tags that can follow a tag, i.e. the
    number of distinct second elements among the keys of ``tag_tag_count``.
    The outcome space of a transition is the tag set, so T (not the word
    vocabulary size) must be added to the denominator for each conditional
    distribution to sum to 1.

    Args:
        tag_tag_count: mapping (previous_tag, tag) -> bigram count.
        tag_count: mapping tag -> count; a missing tag is treated as count 0.
        word_count: unused; kept so existing callers need not change.

    Returns:
        dict keyed like ``tag_tag_count`` with the smoothed probabilities.
    """
    num_next_tags = len({next_tag for _, next_tag in tag_tag_count})

    transition_probs = {}
    for tag_tag, count_tag1tag2 in tag_tag_count.items():
        count_tag1 = tag_count.get(tag_tag[0], 0)
        transition_probs[tag_tag] = (count_tag1tag2 + 1) / (count_tag1 + num_next_tags)
    return transition_probs

In [None]:
def laplace_smooth_emission(word_tag_count, tag_count, word_count):
    """Add-one (Laplace) smoothed emission probabilities.

    P(w | t) = (C(w, t) + 1) / (C(t) + V), with V = len(word_count).

    Args:
        word_tag_count: mapping (word, tag) -> pair count.
        tag_count: mapping tag -> count.
        word_count: mapping word -> count; only its size is used.

    Returns:
        dict keyed like ``word_tag_count`` with the smoothed probabilities.
    """
    num_word_types = len(word_count)
    return {
        pair: (pair_total + 1) / (tag_count[pair[1]] + num_word_types)
        for pair, pair_total in word_tag_count.items()
    }

<b> Step 2. Viterbi algorithm </b>

In [None]:
def viterbi_pos_tagger(list_of_tags, input_sentence, transition_probs, emission_probs):
    """Fill the Viterbi trellis for one sentence.

    Args:
        list_of_tags: ordered tag inventory; row i of both matrices belongs to
            ``list_of_tags[i]``.
        input_sentence: non-empty sequence of tokens (column j belongs to
            ``input_sentence[j]``).
        transition_probs: mapping (previous_tag, tag) -> probability; must
            contain ('<s>', tag) for every tag.
        emission_probs: mapping (word, tag) -> probability.

    Returns:
        (p, back, best_path_pointer, best_path_prob) where ``p[i, j]`` is the
        probability of the best tag path ending in tag i at word j,
        ``back[i, j]`` is the row index of the previous tag on that path
        (column 0 is left at 0), ``best_path_pointer`` is the row index of the
        best tag for the last word and ``best_path_prob`` its probability.
    """
    num_tags = len(list_of_tags)
    num_words = len(input_sentence)

    # Trellis of path probabilities (tags x words) and its back-pointers.
    p = np.zeros(shape=(num_tags, num_words))
    # The builtin int is used because the np.int alias no longer exists in
    # current NumPy releases.
    back = np.zeros(shape=(num_tags, num_words), dtype=int)

    # Initialisation: P(tag | <s>) * P(word_0 | tag).
    first_word = input_sentence[0]
    for tag_i, tag in enumerate(list_of_tags):
        p[tag_i, 0] = transition_probs[('<s>', tag)] * emission_probs[(first_word, tag)]

    # Recursion: extend the best path into every tag for each later word.
    for word_i in range(1, num_words):
        word = input_sentence[word_i]
        for tag_i, tag in enumerate(list_of_tags):
            emission = emission_probs[word, tag]
            # One candidate per previous tag, computed once and shared by the
            # max (trellis value) and the argmax (back-pointer).
            candidates = [
                p[prev_i, word_i - 1] * transition_probs[prev_tag, tag] * emission
                for prev_i, prev_tag in enumerate(list_of_tags)
            ]
            best_prev = int(np.argmax(candidates))
            back[tag_i, word_i] = best_prev
            p[tag_i, word_i] = candidates[best_prev]

    # Termination: best tag for the final word.
    final_column = p[:, num_words - 1]
    best_path_prob = np.max(final_column)
    best_path_pointer = np.argmax(final_column)

    return p, back, best_path_pointer, best_path_prob

In [None]:
def backtrace(back, best_path_pointer, input_sentence, tags=None):
    """Recover the best tag sequence by following the Viterbi back-pointers.

    Args:
        back: 2-D integer array (tags x words); ``back[i, j]`` is the row index
            of the tag that precedes tag i at word j on the best path.
        best_path_pointer: row index of the best tag for the last word.
        input_sentence: the decoded sentence (not used by the computation).
        tags: ordered tag inventory used to translate row indices into tag
            names. When omitted, the notebook-level ``list_of_tags`` is used.

    Returns:
        List of tag names, one per column of ``back``, in sentence order.
    """
    if tags is None:
        tags = list_of_tags  # notebook-level inventory

    num_words = back.shape[1]
    if num_words == 0:
        return []

    # Walk from the last word to the first: the pointer stored for the current
    # (tag, word) cell names the tag chosen for the previous word.
    path_idx = [int(best_path_pointer)]
    for word_i in range(num_words - 1, 0, -1):
        path_idx.append(int(back[path_idx[-1], word_i]))
    path_idx.reverse()

    return [tags[tag_idx] for tag_idx in path_idx]

# MIKE

In [6]:
# Read the raw corpus as a list of lines (line terminators kept).
with open('berp-POS-training.txt', 'r') as train_file:
    train_data = list(train_file)

In [27]:
# Split every non-blank line on tabs and stack the resulting fields into one string array.
XXX = np.array([line.strip().split("\t") for line in train_data if line.strip()])

In [28]:
XXX

array([['1', 'i', 'PRP'],
       ['2', "'d", 'MD'],
       ['3', 'like', 'VB'],
       ...,
       ['13', 'and', 'CC'],
       ['14', 'chips', 'NNS'],
       ['15', '.', '.']], dtype='<U17')

In [9]:
# Columns of the parsed corpus, in order: index of the token within its sentence, token, tag.
token_sentence_idxs, tokens, tags = (XXX[:, column] for column in range(3))

In [13]:
# Group the (token, tag) pairs into sentences. A "." token closes the current
# sentence; pairs that follow the last "." are left in `sentence` and are not
# added to `sentences`.
sentences = []
sentence = []
for token, tag in zip(tokens, tags):
    sentence.append(np.array([token, tag]))
    if token == '.':
        sentences.append(np.array(sentence))
        sentence = []

In [14]:
# Sentences differ in length, so they are stored in a 1-D object array with one
# (n_tokens, 2) array per slot. The slots are filled one by one because
# np.array() on a ragged list raises ValueError in current NumPy releases.
sentences_ragged = np.empty(len(sentences), dtype=object)
for sentence_idx, sentence_pairs in enumerate(sentences):
    sentences_ragged[sentence_idx] = sentence_pairs
sentences = sentences_ragged

In [15]:
# Total number of sentences available for the train/test split.
num_total_sentences = len(sentences)
num_total_sentences

15866

In [16]:
# Seed NumPy's global RNG so that a fresh "Restart & Run All" always yields the
# same sentence order and therefore the same train/test split.
SHUFFLE_SEED = 42
np.random.seed(SHUFFLE_SEED)
np.random.shuffle(sentences)  # shuffles in place

In [17]:
# Fraction of the sentences assigned to the training split.
train_test_split = 0.8

In [18]:
# Size of the training split; int() truncates the fractional part.
num_train_sentences = int(num_total_sentences * train_test_split)
num_train_sentences

12692

In [19]:
# Every sentence that is not in the training split goes to the test split.
num_test_sentences = num_total_sentences - num_train_sentences
num_test_sentences

3174

In [20]:
(num_test_sentences + num_train_sentences) == num_total_sentences

True

In [21]:
# The first num_train_sentences entries of `sentences` form the training split,
# the remaining entries the test split.
train_sentences = sentences[:num_train_sentences]
test_sentences = sentences[num_train_sentences:]

In [22]:
train_sentences.shape[0] == num_train_sentences

True

In [23]:
test_sentences.shape[0] == num_test_sentences

True

In [24]:
train_sentences.shape

(12692,)

In [25]:
test_sentences.shape

(3174,)

In [29]:
sentences

array([array([['do', 'VBP'],
       ['you', 'PRP'],
       ['have', 'VB'],
       ['any', 'DT'],
       ['vietnamese', 'JJ'],
       ['restaurants', 'NNS'],
       ['in', 'IN'],
       ['your', 'PRP$'],
       ['database', 'NN'],
       ['.', '.']], dtype='<U11'),
       array([['tell', 'VB'],
       ['me', 'PRP'],
       ['about', 'IN'],
       ['the', 'DT'],
       ['long', 'JJ'],
       ['life', 'NN'],
       ['vegi', 'NN'],
       ['house', 'NN'],
       ['.', '.']], dtype='<U5'),
       array([['i', 'PRP'],
       ["'d", 'MD'],
       ['like', 'VB'],
       ['to', 'TO'],
       ['go', 'VB'],
       ['no', 'DT'],
       ['further', 'NN'],
       ['than', 'IN'],
       ['two', 'CD'],
       ['miles', 'NNS'],
       ['.', '.']], dtype='<U7'),
       ...,
       array([['i', 'PRP'],
       ["'d", 'MD'],
       ['like', 'VB'],
       ['the', 'DT'],
       ['previous', 'JJ'],
       ['list', 'NN'],
       ['please', 'UH'],
       ['.', '.']], dtype='<U8'),
       array([['i', 'PRP'],
  

### Make Train/Test Files

In [None]:
def write_new_stuff(sentences, dataset_key):
    """Write sentences to "<dataset_key>_sentences_shuffled.txt".

    Each (token, tag) pair becomes one "index<TAB>token<TAB>tag" line, the
    index restarting at 1 for every sentence, and every sentence is followed
    by an empty line.

    Args:
        sentences: iterable of sentences, each an iterable of (token, tag) pairs.
        dataset_key: prefix of the output file name.
    """
    target_path = "{}_sentences_shuffled.txt".format(dataset_key)
    with open(target_path, "w") as outfile:
        for sentence in sentences:
            outfile.writelines(
                "{}\t{}\t{}\n".format(position, token, tag)
                for position, (token, tag) in enumerate(sentence, 1)
            )
            # Blank line marks the end of the sentence.
            outfile.write("\n")

In [None]:
# Persist both splits; each call writes "<key>_sentences_shuffled.txt" relative
# to the current working directory.
write_new_stuff(train_sentences, "training")
write_new_stuff(test_sentences, "test")

## Unique Sets

In [26]:
# def get_unique_stuff(sentence, get="token"):
#     idx = 0 if get == "token" else 1
#     unique_raw = np.array([np.unique(ele[:,idx]) for ele in sentence])

#     flat_X = []
#     for ele in unique_raw:
#         flat_X += list(ele)
#     unique_set = set(np.unique(flat_X))
    
#     if get == "token":
#         unique_set.add()
#     else:
#         unique_set.add()
    
#     return unique_set

In [None]:
# comb_sentences = np.concatenate([train_sentences, test_sentences], axis=0)
# unique_total_tokens = get_unique_stuff(comb_sentences, get="token")
# unique_total_tags = get_unique_stuff(comb_sentences, get="tags")
# unique_total_tokens.shape, unique_total_tags.shape

In [None]:
# unique_train_tokens = get_unique_stuff(train_sentences, get="token")
# unique_train_tags = get_unique_stuff(train_sentences, get="tags")
# unique_train_tokens.shape, unique_train_tags.shape

In [None]:
# unique_test_tokens = get_unique_stuff(test_sentences, get="token")
# unique_test_tags = get_unique_stuff(test_sentences, get="tags")
# unique_test_tokens.shape, unique_test_tags.shape

# -----------//Mike---------

After splitting data into training & test set:
    1. Build word & tag sets for entire data & training data
    2. Words in entire data but not training data are turned into UNK & counted 
    3. Use training data and the UNK counts to build transition and emission matrices
    4. Load test set & turn tokens into a list, preserve "\n" (the blank lines that separate sentences)
    5. Run viterbi function on test set, get list of tags 
    6. Write output file with idx-token-tag
    7. Compare output file and test set & calculate accuracy

In [None]:
### 1.
# Read the three corpora as lists of raw lines: the complete corpus plus the
# shuffled training and test splits.
with open('berp-POS-training.txt', 'r') as entire_file:
    entire_set = list(entire_file)

with open('training_sentences_shuffled.txt', 'r') as train_file:
    train_set = list(train_file)

with open('test_sentences_shuffled.txt', 'r') as test_file:
    test_set = list(test_file)

# Tag set, word set and running word list of each corpus.
entire_tags, entire_words, entire_total_word_list = build_words_tags_sets(entire_set)
train_tags, train_words, train_total = build_words_tags_sets(train_set)
test_tags, test_words, test_total = build_words_tags_sets(test_set)

In [None]:
### 2.
# Count the UNK occurrences: every token of the complete corpus whose word type
# never appears in the training split.
# Occurrences are tallied once up front instead of calling list.count() per word.
entire_word_freq = defaultdict(int)
for word in entire_total_word_list:
    entire_word_freq[word] += 1

unk_count = 0
unk_words = []
for word in entire_words:
    if word not in train_words:
        unk_words.append(word)
        unk_count += entire_word_freq[word]

### 4.
# Token sequence of the test split, with words unseen in training replaced by UNK.
sequence_tokens = []
for line in test_set:
    if line != '\n':
        word_tag = line.strip().split('\t')
        sequence_tokens.append(word_tag[1] if word_tag[1] in train_words else 'UNK')


In [None]:
# Parse the test split into parallel token / gold-tag columns.
XXX = np.array([line.strip().split("\t") for line in test_set if line.strip()])
tokens = XXX[:,1]
tags = XXX[:,2]

sentences_labels = []   # gold tags, one list per sentence
sentence_labels = []

sentences = []          # tokens (UNK-mapped), one list per sentence
sentence = []
for token, tag in zip(tokens, tags):
    # Words never seen in training are mapped to the shared UNK symbol.
    if token not in train_words:
        token = "UNK"

    # A "." token closes the current token sentence.
    sentence.append(token)
    if token == '.':
        sentences.append(sentence)
        sentence = []

    # A "." tag closes the current label sentence.
    sentence_labels.append(tag)
    if tag == '.':
        sentences_labels.append(sentence_labels)
        sentence_labels = []

In [None]:
sentences_labels

In [None]:
# sequence_tokens = [['i', "'d", 'like', 'food', '.'], ['as', 'far', 'away', 'as', 'we', 'can', 'get', '.']]

### 3. 
# Estimate the transition and emission probabilities from the training split,
# passing the UNK count computed above.
list_of_tags, list_of_words, word_count, transition_probs, emission_probs = build_lists_and_probabilities(train_set, train_tags, train_words, unk_count)

### 5. 
# Decode every test sentence with Viterbi and collect one tag list per sentence.
# NOTE: backtrace() is called without a tag list here, so it relies on the
# notebook-level `list_of_tags` assigned just above.
sequence_tags = []
for seq in sentences:
    p, back, best_path_pointer, best_path_prob = viterbi_pos_tagger(list_of_tags, seq, transition_probs, emission_probs)
    output_tags = backtrace(back, best_path_pointer, seq)
    sequence_tags.append(output_tags)

sequence_tags   


In [None]:
sequence_tags

In [None]:
flat_list = [tag for tag_list in sequence_tags for tag in tag_list]
flat_list

In [None]:
flat_list = [tag_list for tag_list in sequence_tags]



In [None]:
def flatten_list(target_list):
    """Concatenate the items of every inner list of ``target_list`` into one flat list."""
    return list(itertools.chain.from_iterable(target_list))

In [None]:
flat_list = flatten_list(sequence_tags)

In [None]:
flat_list = []
for tag_list in sequence_tags:
    for tag in tag_list:
        flat_list.append(tag)

In [None]:
# Flatten lists
sentences_labels_flat = flatten_list(sentences_labels)
sequence_tags_flat = flatten_list(sequence_tags)
len(sentences_labels_flat)

In [None]:
# Token-level accuracy: share of positions where the predicted tag equals the gold tag.
count_correct = 0
for position, gold_tag in enumerate(sentences_labels_flat):
    if gold_tag == sequence_tags_flat[position]:
        count_correct += 1

count_correct / len(sentences_labels_flat)

In [None]:
# Test on single sentence
input_sentence = ['i', "'d", 'like', 'to', 'go', 'to', 'UNK', '.', '/n'] 

# Training data
with open('berp-POS-training.txt', 'r') as train_file:
    train_data = train_file.readlines()

# Get tags and words from the fixed lexicon of this train_data.
# build_words_tags_sets returns three values (tag set, word set, word list),
# so all three are unpacked here.
set_of_tags, set_of_words, total_list_of_words = build_words_tags_sets(train_data)

# Divide up data into train and test set
# Then get list of words from train_set
# Words not in train_set but in total set: process unseen & get counts

# Replace out-of-lexicon words with UNK and count them; the '/n' placeholder
# at the end of the sentence is dropped.
new_sentence = []
UNK_count = 0
for word in input_sentence:
    if word == '/n':
        continue
    if word not in set_of_words:
        new_sentence.append('UNK')
        UNK_count += 1
    else:
        new_sentence.append(word)

# Get probability matrices, run viterbi & get output
list_of_tags, list_of_words, word_count, transition_probs, emission_probs = build_lists_and_probabilities(train_data, set_of_tags, set_of_words, UNK_count)
p, back, best_path_pointer, best_path_prob = viterbi_pos_tagger(list_of_tags, new_sentence, transition_probs, emission_probs)
output_tags = backtrace(back, best_path_pointer, new_sentence)

In [None]:
# # Create a test set and try. 
# test_data = ""
# for line in train_data[:104]:
#     if len(line) == 1:
#         test_data += '\n'
#     elif len(line) > 1:
#         num_word = line.strip().split('\t')
#         num = num_word[0]
#         word = num_word[1]
#         new_line = num + '\t' + word
#         test_data += new_line + '\n'    
        
# # Write to file        
# with open('viterbi_testfile.txt', 'w') as testfile:
#     pass # Empty content before writing
#     testfile.write(test_data)

In [None]:
list2d = [[1,'AB',3],[4,5,6], [10], [8,9]]
merged = list(itertools.chain(*list2d))
merged