In [1]:
import numpy as np
from collections import defaultdict
import itertools
from itertools import product

<b> Step 1. Calculate transition and emission probabilities </b>

In [452]:
def build_words_tags_sets(data):
    # Create a set of tags and set of words to process unseen words & get counts

    set_of_tags = set()
    set_of_tags.add("<s>"); set_of_tags.add("</s>")
    set_of_words = set()
    total_list_of_words = []
    
    # Add in UNK word class 
    set_of_words.add("UNK")

    for line in data:
        if line != '\n':
            word_tag = line.strip().split('\t')
            set_of_tags.add(word_tag[2])
            set_of_words.add(word_tag[1])
            total_list_of_words.append(word_tag[1])
            
    return set_of_tags, set_of_words, total_list_of_words

In [160]:
def build_lists_and_probabilities(train_data, set_of_tags, set_of_words, unk_count):

    # Get counts of tag, words, word-tags
    word_tag_count = defaultdict(int) # C(t, w)
    word_count = defaultdict(int) # C(w)
    tag_count = defaultdict(int) # C(t)

    tag_sequence = ["<s>"]  # Sequence of tags, a sentence ends with .
    tag_sequence_count = 0 
    tag_tag_count = defaultdict(int) # C(t_{i-1}, t_i)

    # print('{}'.format(train_data[0][2:]))

    for line in train_data:
         if line != '\n':
            word_tag = line.strip().split('\t')

            word_tag_count[(word_tag[1], word_tag[2])] += 1
            word_count[word_tag[1]] += 1
            tag_count[word_tag[2]] += 2

            # Get all tags from all lines
            tag_sequence.append(word_tag[2])
            if word_tag[1] == ".":
                tag_sequence.append("</s>")
                for i in range(0, len(tag_sequence)-1):
                    tag_tag_count[(tag_sequence[i], tag_sequence[i+1])] += 1
                tag_sequence = ["<s>"]
                tag_sequence_count += 1

    tag_count["<s>"] = tag_sequence_count
    tag_count["</s>"] = tag_sequence_count
    
    # For UNK words, count from corpus
    word_count["UNK"] = UNK_count

    # Combine tag set and word set 
    all_word_tag_combos = list(product(set_of_words, set_of_tags))
    for word_tag_combo in all_word_tag_combos:
        if word_tag_combo not in word_tag_count.keys():
            word_tag_count[word_tag_combo] = 0

    # Same thing for tag_tag_count
    all_tag_tag_combos = list(product(set_of_tags, set_of_tags))
    for tag_tag_combo in all_tag_tag_combos:
        if tag_tag_combo not in tag_tag_count.keys():
            tag_tag_count[tag_tag_combo] = 0

    # Get lists and probs
    # To smooth or to not smooth - that is the question...
    transition_probs = laplace_smooth_transition(tag_tag_count, tag_count, word_count)
    emission_probs = laplace_smooth_emission(word_tag_count, tag_count, word_count) 
#     transition_probs = calculate_transition(tag_tag_count, tag_count)
#     emission_probs = calculate_emission(word_tag_count, tag_count)
    list_of_tags = list(sorted(set_of_tags))
    list_of_words = list(sorted(set_of_words))
    
    return list_of_tags, list_of_words, word_count, transition_probs, emission_probs


In [146]:
def calculate_transition(tag_tag_count, tag_count):
    # Calculate transition probabilities
    # P(t_i | t_{i-1}) = C(t_{i-1, t_i}) / C(t_{i-1})
    transition_probs = {}
    for tag_tag in tag_tag_count:
        count_tag1tag2 = tag_tag_count[tag_tag]
        count_tag1 = tag_count[tag_tag[0]]
        transition_probs[tag_tag] = count_tag1tag2 / count_tag1        
    return transition_probs

In [147]:
def calculate_emission(word_tag_count, tag_count):
    # Calculate emission probabilities
    # P(w | t) = C(t, w) / C(t)
    # word-tag pairings with 0 prob not included, will be 0 by default if not in dict
    # No smoothing yet
    emission_probs = {}
    for word_tag in word_tag_count:
        count_word_tag = word_tag_count[word_tag]
        count_tag = tag_count[word_tag[1]]
        emission_probs[word_tag] = count_word_tag / count_tag
    return emission_probs

In [148]:
def laplace_smooth_transition(tag_tag_count, tag_count, word_count):
    vocab_size = len(word_count)
    
    transition_probs = {}
    for tag_tag in tag_tag_count:
        count_tag1tag2 = tag_tag_count[tag_tag]
        count_tag1 = tag_count[tag_tag[0]]
        transition_probs[tag_tag] = (count_tag1tag2 + 1) / (count_tag1 + vocab_size)        
    return transition_probs

In [149]:
def laplace_smooth_emission(word_tag_count, tag_count, word_count):
    vocab_size = len(word_count)
    
    emission_probs = {}
    for word_tag in word_tag_count:
        count_word_tag = word_tag_count[word_tag]
        count_tag = tag_count[word_tag[1]]
        emission_probs[word_tag] = (count_word_tag + 1) / (count_tag + vocab_size)
    return emission_probs

<b> Step 2. Viterbi algorithm </b>

In [150]:
def viterbi_pos_tagger(list_of_tags, input_sentence, transition_probs, emission_probs):
    # Initializing matrices & vector
    # Matrix with tags vs words
    p = np.zeros(shape=(len(list_of_tags), len(input_sentence)))
    # Matrix for backtrace
    back = np.zeros(shape=(len(list_of_tags), len(input_sentence)), dtype=np.int)

    # Initializing step: P(tag|start) * P(word1|tag)
    for tag_i, tag in enumerate(list_of_tags):
        # Fill first col of matrix p & back matrices
        tag_given_start = ('<s>', tag)
        word_given_tag = (input_sentence[0], tag)
        p[tag_i, 0] = transition_probs[tag_given_start] * emission_probs[word_given_tag]
        back[tag_i, 0] = 0  # RECHECK this - not sure how to initialize back pointer

    # Recursion step - go through every tag for each token:
    for word_i in range(1, len(input_sentence)):
        for tagi1, tag1 in enumerate(list_of_tags):
            # For each tag, get its prob given all other tags:
            # Prev column * P(tag|all tags) * P(word|tag)
            # Fill in viterbi matrix
            p[tagi1, word_i] = np.max([p[tagi2, word_i - 1] * transition_probs[tag2, tag1] * emission_probs[input_sentence[word_i], tag1] for tagi2,tag2 in enumerate(list_of_tags)])
            # Fill in backpointer
            back[tagi1, word_i] = np.argmax([p[tagi2, word_i - 1] * transition_probs[tag2, tag1] * emission_probs[input_sentence[word_i], tag1] for tagi2,tag2 in enumerate(list_of_tags)])

    # Termination steps
    best_path_prob = np.max([p[tag_i, len(input_sentence)-1] for tag_i, tag in enumerate(list_of_tags)])        
    best_path_pointer = np.argmax([p[tag_i, len(input_sentence)-1] for tag_i, tag in enumerate(list_of_tags)])        

    return p, back, best_path_pointer, best_path_prob

In [587]:
def backtrace(back, best_path_pointer, input_sentence):
    path_idx = [best_path_pointer]
    for column_i, column in enumerate(back.T[::-1]): # Starts at end 
        max_tag_idx = max(column)
        path_idx.append(max_tag_idx)
    # print(path_idx)

    tag_seq = []
    for i in range(0,len(path_idx)-1):
        tag_seq.append(list_of_tags[path_idx[i]])
        
    tag_seq = tag_seq[::-1]

    # print('Input sentence: {}'.format(input_sentence))
    # print('Part of speech: {}'.format(tag_seq))
    
    return tag_seq

# MIKE

In [231]:
with open('berp-POS-training.txt', 'r') as train_file:
    train_data = train_file.readlines()

In [247]:
XXX = np.array([ele.strip().split("\t") for ele in train_data if ele.strip() != ""])
XXX[:12]

array([['1', 'i', 'PRP'],
       ['2', "'d", 'MD'],
       ['3', 'like', 'VB'],
       ['4', 'to', 'TO'],
       ['5', 'go', 'VB'],
       ['6', 'to', 'IN'],
       ['7', 'a', 'DT'],
       ['8', 'fancy', 'JJ'],
       ['9', 'restaurant', 'NN'],
       ['10', '.', '.'],
       ['1', 'i', 'PRP'],
       ['2', "'d", 'MD']], dtype='<U17')

In [240]:
XXX.shape

(152119, 3)

In [242]:
token_sentence_idxs = XXX[:,0]
tokens = XXX[:,1]
tags = XXX[:,2]

In [325]:
len(train_data)

167984

In [394]:
sentences = []
sentence = []
for token, tag in zip(tokens, tags):
    if token == '.':
        sentence.append(np.array([token, tag]))
        sentences.append(np.array(sentence))
        sentence = []
    else:
        sentence.append(np.array([token, tag]))

In [395]:
sentences = np.array(sentences)

In [396]:
sentences.shape[0]

15866

In [397]:
num_total_sentences = sentences.shape[0]
num_total_sentences

15866

In [477]:
np.random.shuffle(sentences)

In [478]:
train_test_split = 0.8

In [479]:
num_train_sentences = int(num_total_sentences * train_test_split)
num_train_sentences

12692

In [480]:
num_test_sentences = num_total_sentences - num_train_sentences
num_test_sentences

3174

In [481]:
(num_test_sentences + num_train_sentences) == num_total_sentences

True

In [482]:
train_sentences = sentences[:num_train_sentences]
test_sentences = sentences[num_train_sentences:]

In [483]:
train_sentences.shape[0] == num_train_sentences

True

In [484]:
test_sentences.shape[0] == num_test_sentences

True

In [485]:
train_sentences.shape

(12692,)

In [486]:
test_sentences.shape

(3174,)

## Unique Sets

In [487]:
def get_unique_stuff(sentence, get="token"):
    idx = 0 if get == "token" else 1
    unique_raw = np.array([np.unique(ele[:,idx]) for ele in sentence])

    flat_X = []
    for ele in unique_raw:
        flat_X += list(ele)
    unique_set = set(np.unique(flat_X))
    
    if get == "token":
        unique_set.add()
    else:
        unique_set.add()
    
    return unique_set

In [488]:
# comb_sentences = np.concatenate([train_sentences, test_sentences], axis=0)
# unique_total_tokens = get_unique_stuff(comb_sentences, get="token")
# unique_total_tags = get_unique_stuff(comb_sentences, get="tags")
# unique_total_tokens.shape, unique_total_tags.shape

In [491]:
# unique_train_tokens = get_unique_stuff(train_sentences, get="token")
# unique_train_tags = get_unique_stuff(train_sentences, get="tags")
# unique_train_tokens.shape, unique_train_tags.shape

In [492]:
# unique_test_tokens = get_unique_stuff(test_sentences, get="token")
# unique_test_tags = get_unique_stuff(test_sentences, get="tags")
# unique_test_tokens.shape, unique_test_tags.shape

### Make Train/Test Files

In [493]:
def write_new_stuff(sentences, dataset_key):
    with open("{}_sentences_shuffled.txt".format(dataset_key), "w") as outfile:
        for sentence in sentences:
            for i, (token, tag) in enumerate(sentence, 1):
                out_write = "{}\t{}\t{}\n".format(i, token, tag)
                outfile.write(out_write)
            out_write = "\n"
            outfile.write(out_write)

In [494]:
write_new_stuff(train_sentences, "training")
write_new_stuff(test_sentences, "test")

# -----------//Mike---------

After splitting data into training & test set:
    1. Build word & tag sets for entire data & training data
    2. Words in entire data but not training data are turned into UNK & counted 
    3. Use training data and the UNK counts to build transition and emission matrices
    4. Load test set & turn tokens into a list, preserve "/n"
    5. Run viterbi function on test set, get list of tags 
    6. Write output file with idx-token-tag
    7. Compare output file and test set & calculate accuracy

In [550]:
### 1. 
# Entire data
with open('berp-POS-training.txt', 'r') as entire_file:
    entire_set = entire_file.readlines()

# Train set
with open('training_sentences_shuffled.txt', 'r') as train_file:
    train_set = train_file.readlines()
    
# Test set
with open('test_sentences_shuffled.txt', 'r') as test_file:
    test_set = test_file.readlines()
    
# Build word & tag sets
entire_tags, entire_words, entire_total_word_list = build_words_tags_sets(entire_set)
train_tags, train_words, train_total = build_words_tags_sets(train_set)
test_tags, test_words, test_total = build_words_tags_sets(test_set)

In [551]:
### 2. 
# Get UNK count
unk_count = 0
unk_words = []
for word in entire_words:
    if word != '/n':
        if word not in train_words:
            unk_words.append(word)
            unk_count += entire_total_word_list.count(unseen)
            
### 4. 
# Get sequence of words + Replace unseen words in test set with UNK
sequence_tokens = []
for line in test_set:
    if line != '\n':
        word_tag = line.strip().split('\t')
        if word_tag[1] not in train_words:
            sequence_tokens.append('UNK')
        else:
            sequence_tokens.append(word_tag[1])


In [640]:
XXX = np.array([ele.strip().split("\t") for ele in test_set if ele.strip() != ""])
tokens = XXX[:,1]
tags = XXX[:,2]

sentences_labels = []
sentence_labels = []

sentences = []
sentence = []
for token, tag in zip(tokens, tags):
    if token not in train_words:
        token = "UNK"
    if token == '.':
        sentence.append(token)
        sentences.append(sentence)
        sentence = []
        
    else:
        sentence.append(token)
        
    if tag == '.':
        sentence_labels.append(tag)
        sentences_labels.append(sentence_labels)
        sentence_labels = []
    else:
        sentence_labels.append(tag)

In [641]:
sentences_labels

[['VB', 'PRP', 'IN', 'NN', 'HYPH', 'NN', '.'],
 ['UH', 'VB', 'PRP', 'JJ', 'NN', 'IN', 'DT', 'JJ', 'NN', '.'],
 ['PRP', 'MD', 'VB', 'TO', 'VB', 'IN', 'NNP', '.'],
 ['UH', 'VB', 'IN', 'CD', 'NNS', 'VB', 'CC', 'VB', '.'],
 ['IN', 'NN', '.'],
 ['JJ', 'JJ', 'UH', 'CD', 'TO', 'CD', 'NNS', '.'],
 ['PRP', 'MD', 'VB', 'RB', 'JJ', 'IN', 'NN', '.'],
 ['NN', 'NN', '.'],
 ['WRB', 'MD', 'PRP', 'VB', 'NN', 'IN', 'DT', 'CD', 'NNS', '.'],
 ['NN', 'IN', 'NN', '.'],
 ['VB',
  'PRP',
  'RBR',
  'IN',
  'JJ',
  'NN',
  'CC',
  'NNS',
  'PRP',
  'VBP',
  'PRP',
  'VBZ',
  '.'],
 ['NN',
  'NN',
  'JJ',
  'NN',
  'NN',
  'NN',
  'NN',
  'NN',
  'NN',
  'CC',
  'NNS',
  'NN',
  '.'],
 ['UH', 'RB', 'CD', 'NNS', '.'],
 ['VB', 'PRP', 'DT', 'NN', 'NNS', 'IN', 'NN', '.'],
 ['VBP', 'VBN', 'NN', '.'],
 ['RB',
  'HYPH',
  'FW',
  'FW',
  'FW',
  'NN',
  'NN',
  'POS',
  'NN',
  'NN',
  'NN',
  'POS',
  'NN',
  'JJ',
  'NN',
  '.'],
 ['JJR', 'IN', 'RB', 'NNS', 'VBP', '.'],
 ['PRP', 'MD', 'VB', 'DT', 'NN', 'IN', 'NN', '

In [611]:
# sequence_tokens = [['i', "'d", 'like', 'food', '.'], ['as', 'far', 'away', 'as', 'we', 'can', 'get', '.']]

### 3. 
# Use training data and the UNK counts to build transition and emission matrices
list_of_tags, list_of_words, word_count, transition_probs, emission_probs = build_lists_and_probabilities(train_set, train_tags, train_words, unk_count)

### 5. 
# Run viterbi function on test set, get list of tags 
sequence_tags = []
for seq in sentences:
    p, back, best_path_pointer, best_path_prob = viterbi_pos_tagger(list_of_tags, seq, transition_probs, emission_probs)
    output_tags = backtrace(back, best_path_pointer, seq)
    sequence_tags.append(output_tags)

sequence_tags   


[['VB', 'PRP', 'RB', 'NNP', 'HYPH', 'VBG', '.'],
 ['UH', 'VB', 'PRP', 'VBZ', 'NN', 'WDT', 'DT', 'VBZ', 'NN', '.'],
 ['PRP', 'MD', 'VB', 'TO', 'VB', 'TO', 'NNP', '.'],
 ['UH', 'WP', 'RB', 'CD', 'NNS', 'WDT', 'VBZ', 'VBP', '.'],
 ['WRB', 'UH', '.'],
 ['WRB', 'VBZ', 'VBP', 'TO', 'TO', 'VB', 'NNS', '.'],
 ['PRP', 'VBP', 'VB', 'RB', 'VB', 'TO', 'VB', '.'],
 ['WRB', 'VBZ', '.'],
 ['WRB', 'VBZ', 'PRP', 'VBP', 'TO', 'WDT', 'DT', 'VBZ', 'NNS', '.'],
 ['VBZ', 'WDT', 'NN', '.'],
 ['VB',
  'PRP',
  'VBP',
  'IN',
  'UH',
  'VBZ',
  'CC',
  'UH',
  'PRP',
  'VBP',
  'TO',
  'VBZ',
  '.'],
 ['WRB',
  'VBZ',
  'WDT',
  'TO',
  'VBZ',
  'VBZ',
  'VBZ',
  'VBZ',
  'VBZ',
  'CC',
  'UH',
  'VBP',
  '.'],
 ['UH', 'VB', 'DT', 'NNS', '.'],
 ['VB', 'PRP', 'VBP', 'TO', 'NNS', 'WDT', 'NNP', '.'],
 ['VBP', 'VBG', 'NN', '.'],
 ['WRB',
  'JJR',
  'NN',
  'NN',
  'NN',
  'NN',
  'VBZ',
  'VBZ',
  'NN',
  'VBZ',
  'VBZ',
  'VBZ',
  'NN',
  'WDT',
  'NN',
  '.'],
 ['WRB', 'IN', 'VBG', 'VB', 'UH', '.'],
 ['PRP', 'VB

In [618]:
sequence_tags

[['VB', 'PRP', 'RB', 'NNP', 'HYPH', 'VBG', '.'],
 ['UH', 'VB', 'PRP', 'VBZ', 'NN', 'WDT', 'DT', 'VBZ', 'NN', '.'],
 ['PRP', 'MD', 'VB', 'TO', 'VB', 'TO', 'NNP', '.'],
 ['UH', 'WP', 'RB', 'CD', 'NNS', 'WDT', 'VBZ', 'VBP', '.'],
 ['WRB', 'UH', '.'],
 ['WRB', 'VBZ', 'VBP', 'TO', 'TO', 'VB', 'NNS', '.'],
 ['PRP', 'VBP', 'VB', 'RB', 'VB', 'TO', 'VB', '.'],
 ['WRB', 'VBZ', '.'],
 ['WRB', 'VBZ', 'PRP', 'VBP', 'TO', 'WDT', 'DT', 'VBZ', 'NNS', '.'],
 ['VBZ', 'WDT', 'NN', '.'],
 ['VB',
  'PRP',
  'VBP',
  'IN',
  'UH',
  'VBZ',
  'CC',
  'UH',
  'PRP',
  'VBP',
  'TO',
  'VBZ',
  '.'],
 ['WRB',
  'VBZ',
  'WDT',
  'TO',
  'VBZ',
  'VBZ',
  'VBZ',
  'VBZ',
  'VBZ',
  'CC',
  'UH',
  'VBP',
  '.'],
 ['UH', 'VB', 'DT', 'NNS', '.'],
 ['VB', 'PRP', 'VBP', 'TO', 'NNS', 'WDT', 'NNP', '.'],
 ['VBP', 'VBG', 'NN', '.'],
 ['WRB',
  'JJR',
  'NN',
  'NN',
  'NN',
  'NN',
  'VBZ',
  'VBZ',
  'NN',
  'VBZ',
  'VBZ',
  'VBZ',
  'NN',
  'WDT',
  'NN',
  '.'],
 ['WRB', 'IN', 'VBG', 'VB', 'UH', '.'],
 ['PRP', 'VB

In [626]:
flat_list = [tag for tag_list in sequence_tags for tag in tag_list]
flat_list

['VB',
 'PRP',
 'RB',
 'NNP',
 'HYPH',
 'VBG',
 '.',
 'UH',
 'VB',
 'PRP',
 'VBZ',
 'NN',
 'WDT',
 'DT',
 'VBZ',
 'NN',
 '.',
 'PRP',
 'MD',
 'VB',
 'TO',
 'VB',
 'TO',
 'NNP',
 '.',
 'UH',
 'WP',
 'RB',
 'CD',
 'NNS',
 'WDT',
 'VBZ',
 'VBP',
 '.',
 'WRB',
 'UH',
 '.',
 'WRB',
 'VBZ',
 'VBP',
 'TO',
 'TO',
 'VB',
 'NNS',
 '.',
 'PRP',
 'VBP',
 'VB',
 'RB',
 'VB',
 'TO',
 'VB',
 '.',
 'WRB',
 'VBZ',
 '.',
 'WRB',
 'VBZ',
 'PRP',
 'VBP',
 'TO',
 'WDT',
 'DT',
 'VBZ',
 'NNS',
 '.',
 'VBZ',
 'WDT',
 'NN',
 '.',
 'VB',
 'PRP',
 'VBP',
 'IN',
 'UH',
 'VBZ',
 'CC',
 'UH',
 'PRP',
 'VBP',
 'TO',
 'VBZ',
 '.',
 'WRB',
 'VBZ',
 'WDT',
 'TO',
 'VBZ',
 'VBZ',
 'VBZ',
 'VBZ',
 'VBZ',
 'CC',
 'UH',
 'VBP',
 '.',
 'UH',
 'VB',
 'DT',
 'NNS',
 '.',
 'VB',
 'PRP',
 'VBP',
 'TO',
 'NNS',
 'WDT',
 'NNP',
 '.',
 'VBP',
 'VBG',
 'NN',
 '.',
 'WRB',
 'JJR',
 'NN',
 'NN',
 'NN',
 'NN',
 'VBZ',
 'VBZ',
 'NN',
 'VBZ',
 'VBZ',
 'VBZ',
 'NN',
 'WDT',
 'NN',
 '.',
 'WRB',
 'IN',
 'VBG',
 'VB',
 'UH',
 '.',
 'PRP'

In [None]:
flat_list = [tag_list for tag_list in sequence_tags]



In [629]:
flatten_list = lambda target_list : [tag for tag_list in target_list for tag in tag_list]

In [631]:
flat_list = flatten_list(sequence_tags)

In [632]:
flat_list = []
for tag_list in sequence_tags:
    for tag in tag_list:
        flat_list.append(tag)

In [648]:
# Flatten lists
sentences_labels_flat = flatten_list(sentences_labels)
sequence_tags_flat = flatten_list(sequence_tags)
len(sentences_labels_flat)

30783

In [650]:
count_correct = 0
for tag in range(0, len(sentences_labels_flat)):
    if sentences_labels_flat[tag] == sequence_tags_flat[tag]:
        count_correct += 1
        
count_correct / len(sentences_labels_flat)

0.596725465354254

In [534]:
# Test on single sentence
input_sentence = ['i', "'d", 'like', 'to', 'go', 'to', 'UNK', '.', '/n'] 

# Training data
with open('berp-POS-training.txt', 'r') as train_file:
    train_data = train_file.readlines()
    
# Get tags and words from fixed lexicon from this train_data
set_of_tags, set_of_words = build_words_tags_sets(train_data)        

# Divide up data into train and test set
# Then get list of words from train_set
# Words not in train_set but in total set: process unseen & get counts

# Process unseen words & get cumulative counts
new_sentence = []
UNK_count = 0
for word in input_sentence:
    if (word != '/n'):
        if (word not in set_of_words):
            new_sentence.append('UNK')
            UNK_count += 1
        else:
            new_sentence.append(word)
        
# Get probability matrices, run viterbi & get output       
list_of_tags, list_of_words, word_count, transition_probs, emission_probs = build_lists_and_probabilities(train_data, set_of_tags, set_of_words, UNK_count)
p, back, best_path_pointer, best_path_prob = viterbi_pos_tagger(list_of_tags, new_sentence, transition_probs, emission_probs)
output_tags = backtrace(back, best_path_pointer, new_sentence)

ValueError: too many values to unpack (expected 2)

In [None]:
# # Create a test set and try. 
# test_data = ""
# for line in train_data[:104]:
#     if len(line) == 1:
#         test_data += '\n'
#     elif len(line) > 1:
#         num_word = line.strip().split('\t')
#         num = num_word[0]
#         word = num_word[1]
#         new_line = num + '\t' + word
#         test_data += new_line + '\n'    
        
# # Write to file        
# with open('viterbi_testfile.txt', 'w') as testfile:
#     pass # Empty content before writing
#     testfile.write(test_data)

In [612]:
list2d = [[1,'AB',3],[4,5,6], [10], [8,9]]
merged = list(itertools.chain(*list2d))
merged

[1, 'AB', 3, 4, 5, 6, 10, 8, 9]