**Import Libraries**


In [2]:
import os
import math
import pandas as pd
import numpy as np

**Functions for Part 1**

In [3]:
def output_prediction(prediction, data, path):
    assert(len(prediction) == len(data))
    file = open(path, "w", encoding="utf-8")
    n = len(data)
    print("Writing", n, "lines")
    for i in range(n):
        assert(len(data[i]) == len(prediction[i]))
        m = len(data[i])
        for j in range(m):
            file.write(data[i][j] + " " + prediction[i][j] + "\n")
        file.write("\n")
    print("Wrote predictions to", path)

def get_training_set_words(data):
    words = set()
    for i in data:
        if len(data) > 1:
            words.add(i[0])
    return words

def dev_open(path):
  out = [[]]
  f = open(path, "r", encoding="utf-8")
  lines_in = f.readlines()
  for word in lines_in:
    if word == "\n":
      out.append([])
    else:
      out[-1].append(word.rstrip())
  return out[:-1]


def count_words_not_in_train(dev_data, train_words):
    words_not_in_train = 0
    for sentence in dev_data:
        for word in sentence:
            if word not in train_words:
                words_not_in_train += 1
    return words_not_in_train


def read_data(path):
  dataset = []
  f = open(path,"r", encoding="utf-8")
  training_set = f.readlines()
  for line in training_set:
    if len(line) == 1:
      dataset.append("\n")
    else:
      line = line.rstrip('\n')
      line = line.rpartition(' ')
      line = list(line)
      del line[1]
      if line != ['', '']:
        dataset.append(line)
  Edataset = [ele for ele in dataset]
  return Edataset


def count_tags(training_set):
  unique_tag_count = {'START':0,'O':0,'B-positive':0,'B-neutral':0,'B-negative':0,'I-positive':0,'I-neutral':0,'I-negative':0,'STOP':0}
  for data_pair in training_set:
    if len(data_pair) > 1:
      if data_pair[1] in unique_tag_count.keys():
        unique_tag_count[data_pair[1]] += 1
    elif len(data_pair)==1:
      unique_tag_count['START'] += 1 
      unique_tag_count['STOP'] += 1
  return unique_tag_count


def count_words_for_each_tag(training_set):
  label_generate_all = {'O':{},'B-positive':{},'B-neutral':{},'B-negative':{},'I-positive':{},'I-neutral':{},'I-negative':{}}
  for data in training_set:
    if len(data) > 1:
        if data[0] not in label_generate_all[data[1]].keys():
          label_generate_all[data[1]][data[0]] = 1
        else:
          label_generate_all[data[1]][data[0]] += 1 

  return label_generate_all


def get_tags(data):
    unique_labels = set()
    for sentence in data:
        for _, label in sentence:
            unique_labels.add(label)
    return list(unique_labels)


def estimate_emission_params(count_tags, count_tag_words, smoothing_factor=1):
    emission_params = {}
    for tag_tuple, tag_word_counts in count_tag_words.items():
        tag_estimations = {}
        for word, word_count in tag_word_counts.items():
            estimated_value = word_count / (count_tags[tag_tuple] + smoothing_factor)
            tag_estimations[word] = estimated_value
        tag_estimations['#UNK#'] = smoothing_factor / (count_tags[tag_tuple] + smoothing_factor)
        emission_params[tag_tuple] = tag_estimations
    
    return emission_params


def predict_sentiment(words, e_params, word_set):
    sentiment_labels = ['O', 'B-positive', 'B-neutral', 'B-negative', 'I-positive', 'I-neutral', 'I-negative']
    predictions = []
    for word in words:
        max_label = 'O'
        max_probability = 0
        if word not in word_set:
            word = "#UNK#"
        for label in sentiment_labels:
            if word not in e_params[label]:
                continue
            current_prob = e_params[label][word]
            if current_prob > max_probability:
                max_label = label
                max_probability = current_prob
        
        predictions.append(max_label)
    
    return predictions



def make_predictions(data, e_params, word_set):
    all_predictions = []
    for sentence in data:
        all_predictions.append(predict_sentiment(sentence, e_params, word_set))
    
    return all_predictions

**Calling functions, estimating emission params and tagging words for RU dev.in**

In [4]:
train_path_RU = "RU/train"
test_path_RU = "RU/dev.in"
output_path_RU = "RU/dev.p1.out"
train_data_RU = read_data(train_path_RU)
train_words_RU = get_training_set_words(train_data_RU)
tag_counts_RU = count_tags(train_data_RU)
test_data_RU = dev_open(test_path_RU)
tags_RU = count_tags(train_data_RU)
tag_word_counts_RU = count_words_for_each_tag(train_data_RU)
emission_params_RU = estimate_emission_params(tag_counts_RU, tag_word_counts_RU)
predictions_RU = make_predictions(test_data_RU, emission_params_RU, train_words_RU)
output_prediction(predictions_RU, test_data_RU, output_path_RU)

Writing 437 lines
Wrote predictions to RU/dev.p1.out


**Calling functions, estimating emission params and tagging words for ES dev.in**

In [5]:
train_path_ES = "ES/train"
test_path_ES = "ES/dev.in"
output_path_ES = "ES/dev.p1.out"
output_path_part_2_ES = "ES/dev.p2.out"
train_data_ES = read_data(train_path_ES)
train_words_ES = get_training_set_words(train_data_ES)
tag_counts_ES = count_tags(train_data_ES)
test_data_ES = dev_open(test_path_ES)
tags_ES = count_tags(train_data_ES)
tag_word_counts_ES = count_words_for_each_tag(train_data_ES)
emission_params_ES = estimate_emission_params(tag_counts_ES, tag_word_counts_ES)
predictions_ES = make_predictions(test_data_ES, emission_params_ES, train_words_ES)
output_prediction(predictions_ES, test_data_ES, output_path_ES)

Writing 266 lines
Wrote predictions to ES/dev.p1.out


**Functions for Part 2**

In [6]:
def process_data(path, unk_token):
    tokens = []
    labels = []

    with open(path, encoding='utf-8') as f:
        raw = f.read()
        # array of sentences
        sentences = raw.strip().split('\n\n')

    for sentence in sentences:
        pairs = sentence.split('\n')
        inner_tokens = []
        inner_labels = []
        for pair in pairs:
            try:
                token, label = pair.split(' ')
            except:
                pass
            inner_tokens.append(token)
            inner_labels.append(label)

        tokens.append(inner_tokens)
        labels.append(inner_labels)

    unique_tokens = get_unique(tokens)
    unique_tokens = unique_tokens + [unk_token]
    unique_labels = get_unique(labels)

    return tokens, labels, unique_tokens, unique_labels

def get_unique(data):
    unique_items = set()
    for item in data:
        unique_items.update(item)
    return list(unique_items)



In [7]:
train_file_path_RU = "RU/train"
unk_token = '#UNK#'
tokens_RU, labels_RU, unique_tokens_RU, unique_labels_RU = process_data(train_file_path_RU, unk_token)
print(tokens_RU)

[['Еда', 'вкусная', ',', 'но', 'отдельно', 'хочу', 'отметить', 'красивую', 'сервировку', 'блюд', ';', '.'], ['Филадельфию', 'мне', 'удалось', 'только', 'попробовать', ',', 'сделали', 'на', 'отлично', ',', 'хороший', 'кусок', 'лосося-филадельфии-чуток', 'риса', ',', 'смело', 'возьму', 'в', 'следующий', 'раз', 'Десерт', 'Тирамису', 'просто', 'таял', 'во', 'рту', ')', ')', ')'], ['Очень', 'благодарны', 'персоналу', 'за', 'качественное', 'обслуживание', '.'], ['Были', 'здесь', 'первый', 'раз', 'и', ',', 'точно', ',', 'не', 'последний', '!'], ['Резервировать', 'всё', 'помещение', 'не', 'было', 'необходимости', 'из-за', 'небольшого', 'количества', 'посетителей', '.'], ['А', 'вот', 'рыба', 'в', 'беконе', '-', 'порадовала', ')', ')', ')'], ['Обслуживала', 'нас', 'очень', 'приятная', 'девушка-официант', '.'], ['Ну', 'уж', 'очень', 'хотелось', 'на', 'диван', ')', ')'], ['Гулял', 'с', 'девушкой', 'своей', 'по', 'Крестовскому', ',', 'решили', 'зайти', 'посидеть', 'кофе', 'попить', '.'], ['Сделали'

In [8]:
train_file_path_ES = "ES/train"
unk_token = '#UNK#'
tokens_ES, labels_ES, unique_tokens_ES, unique_labels_ES = process_data(train_file_path_ES, unk_token)
print(tokens_ES)

[['Estuvimos', 'hace', 'poco', 'mi', 'pareja', 'y', 'yo', 'comiendo', 'y', 'resultó', 'todo', 'muy', 'bien', ',', 'tanto', 'la', 'comida', ',', 'el', 'vino', ',', 'el', 'trato', ',', 'la', 'decoración', '…', 'nos', 'gustó', 'todo', 'mucho', '.'], ['Por', 'poner', 'algún', 'pero', ',', 'quizá', 'el', 'jamón', 'no', 'era', 'todo', 'lo', '"', 'ibérico', '"', 'que', 'cabía', 'esperar', '.'], ['Bien', 'lo', 'sabe', 'el', 'autor', 'del', 'blog', '.', ')'], ['Comida', 'exquisita', '.'], ['Restaurante', 'diferente', ',', 'creativo', 'y', 'agradable', '.'], ['Si', 'no', 'has', 'probado', 'sus', 'carnes', 'te', 'estas', 'perdiendo', 'algo', 'muy', 'grande', '!'], ['En', 'resumen', ',', 'comida', 'bien-muy', 'bien', ',', 'servicio', 'correcto', 'y', 'profesional'], ['02-12-', '2012', 'elegimos', 'este', 'restaurante', 'por', 'los', 'comentarios', ',', 'pero', 'ha', 'sido', 'una', 'Grandisima', 'Decepción', '.'], ['Salimos', 'encantadas', 'del', 'restaurante', '.'], ['Ubicación'], ['Comimos', 'muy


**Forming Estimation and Transition as Matrix tables**


In [73]:
import numpy as np
import copy

import numpy as np
import copy

def estimate_e(unique_labels, unique_tokens, tokens, labels, unk_token="#UNK#", k=1):
    e_table = np.zeros((len(unique_labels), len(unique_tokens)+1))
    
    # Calculate the total count of each tag
    tag_counts = np.zeros(len(unique_labels))
    for label_seq in labels:
        for label in label_seq:
            tag_counts[unique_labels.index(label)] += 1
    
    for token_seq, label_seq in zip(tokens, labels):
        for token, label in zip(token_seq, label_seq):
            if token in unique_tokens:
                token_index = unique_tokens.index(token)
            else:
                token_index = unique_tokens.index(unk_token)  
            e_table[unique_labels.index(label)][token_index] += 1

    for i in range(len(unique_labels)):
        e_table[i, -1] += k / (tag_counts[i] + 1)  # Calculate emission probability for #UNK# based on k/count(tag) + 1

    e_table /= e_table.sum(axis=1)[:, np.newaxis]
    return e_table



def estimate_q(unique_labels, labels):
    q_table = np.zeros(
        (len(unique_labels)+1, len(unique_labels)+1))

    rows = ['START'] + unique_labels.copy()
    cols = unique_labels.copy() + ['STOP']

    for labels_seq in labels:
        x = copy.deepcopy(labels_seq)
        x.insert(0, 'START')
        x.append('STOP')

        for i in range(len(x)-1):
            cur_label = x[i]
            next_label = x[i+1]
            q_table[rows.index(cur_label)][cols.index(next_label)] += 1

    q_table /= q_table.sum(axis=1)[:, np.newaxis]
    return q_table


**Calling for RU**

In [75]:
unique_tokens_RU  = list(train_words_RU)
e_table_RU = estimate_e(unique_labels_RU, unique_tokens_RU, tokens_RU, labels_RU)
q_table_RU = estimate_q(unique_labels_RU, labels_RU)
print(e_table_RU)

[[0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 2.89829282e-07]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 2.13083316e-04]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 2.32250273e-05]
 ...
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 5.06121540e-06]
 [2.46755169e-05 2.46755169e-05 2.46755169e-05 ... 2.46755169e-05
  2.46755169e-05 6.08866112e-10]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 4.99425660e-05]]


**Calling for ES**

In [68]:
unique_tokens_ES  = list(train_words_ES)
e_table_ES = estimate_e(unique_labels_ES, unique_tokens_ES, tokens_ES, labels_ES)
q_table_ES = estimate_q(unique_labels_ES, labels_ES)

**Viterbi Algorithm**

In [69]:
# def viterbi_algorithm(sentence, unique_labels, unique_tokens, q_table, e_table, unk_token):
#     n = len(sentence)
#     sentence = [None] + sentence
#     m = len(unique_labels)
#     pi = np.zeros((n+2, m))

#     for j in range(n):
#         if sentence[j+1] in unique_tokens:
#             cur_word = sentence[j+1]
#         else:
#             cur_word = unk_token

#         for cur_index in range(0, m):
#             if cur_word in unique_tokens:
#                 current_e = e_table[cur_index, unique_tokens.index(cur_word)]
#             else:
#                 current_e = e_table[cur_index, -1] 
#             if j == 0:
#                 current_q = q_table[0, cur_index]
#                 pi[j+1, cur_index] = 1 * current_e * current_q
#             else:
#                 max_prob = 0
#                 for vIndex in range(0, m):
#                     current_q = q_table[vIndex+1, cur_index]
#                     cur_prob = pi[j, vIndex] * current_e * current_q

#                     if cur_prob > max_prob:
#                         max_prob = cur_prob
#                 pi[j+1, cur_index] = max_prob

#     max_prob = 0
#     for prev_index in range(0, m):
#         current_q = q_table[prev_index+1, -1]
#         cur_prob = pi[n, prev_index] * current_q
#         if cur_prob > max_prob:
#             max_prob = cur_prob
#     pi[n+1, -1] = max_prob

#     y_star = [unique_labels.index("O")] * (n+1)
#     max_prob = 0

#     for cur_index in range(0, m):
#         current_q = q_table[cur_index+1, -1]
#         cur_prob = pi[n, cur_index] * current_q

#         if cur_prob > max_prob:
#             max_prob = cur_prob
#             y_star[n] = cur_index

#     for j in range(n-1, 0, -1):
#         max_prob = 0
#         for cur_index in range(0, m):
#             current_q = q_table[cur_index+1, y_star[j+1]]
#             cur_prob = pi[j, cur_index] * current_q
#             if cur_prob > max_prob:
#                 max_prob = cur_prob
#                 y_star[j] = cur_index

#     labelled_preds = [unique_labels[y] for y in y_star[1:]]
#     return labelled_preds

# def predict_p2(input_path, output_path, unique_labels, unique_tokens, q_table, e_table, unk_token):
#     total_preds = []
#     data = dev_open(input_path)

#     for sentence in data:
#         preds = viterbi_algorithm(sentence, unique_labels, unique_tokens, q_table, e_table, unk_token)
#         total_preds.append(preds)

#     with open(output_path, 'w', encoding='utf-8') as outp:
#         for _, (token, label) in enumerate(zip(data, total_preds)):
#             for _, (word, pos) in enumerate(zip(token, label)):
#                 result = word + " " + pos + "\n"
#                 outp.write(result)
#             outp.write('\n')



In [81]:
import numpy as np

def viterbi_algorithm(sentence, unique_labels, unique_tokens, q_table, e_table, unk_token):
    n = len(sentence)
    sentence = [None] + sentence
    m = len(unique_labels)
    pi = np.zeros((n+2, m))

    unk_index = unique_tokens.index(unk_token)
    unk_emission_probs = e_table[:, unk_index] 

    for j in range(n):
        if sentence[j+1] in unique_tokens:
            cur_word = sentence[j+1]
            cur_word_index = unique_tokens.index(cur_word)
        else:
            cur_word = unk_token
            cur_word_index = unk_index

        for cur_index in range(m):
            current_e = unk_emission_probs[cur_index] if cur_word == unk_token else e_table[cur_index, cur_word_index]
            if j == 0:
                current_q = q_table[0, cur_index]
                pi[j+1, cur_index] = 1 * current_e * current_q
            else:
                max_prob = 0
                for vIndex in range(m):
                    current_q = q_table[vIndex+1, cur_index]
                    cur_prob = pi[j, vIndex] * current_e * current_q

                    if cur_prob > max_prob:
                        max_prob = cur_prob
                pi[j+1, cur_index] = max_prob
    
    max_prob = 0
    for prev_index in range(0, m):
        current_q = q_table[prev_index+1, -1]
        cur_prob = pi[n, prev_index] * current_q
        if cur_prob > max_prob:
            max_prob = cur_prob
    pi[n+1, -1] = max_prob
    
    y_star = [np.argmax(unk_emission_probs)] * (n+1)  # Initialize with the tag that has highest UNK emission probability
    max_prob = 0
    
    for cur_index in range(0, m):
        current_q = q_table[cur_index+1, -1]
        cur_prob = pi[n, cur_index] * current_q
        
        if cur_prob > max_prob:
            max_prob = cur_prob
            y_star[n] = cur_index
    
    for j in range(n-1, 0, -1):
        max_prob = 0
        for cur_index in range(0, m):
            current_q = q_table[cur_index+1, y_star[j+1]]
            cur_prob = pi[j, cur_index] * current_q
            if cur_prob > max_prob:
                max_prob = cur_prob
                y_star[j] = cur_index
    
    labelled_preds = [unique_labels[y] for y in y_star[1:]]
    return labelled_preds

def predict_p2(input_path, output_path, unique_labels, unique_tokens, q_table, e_table, unk_token):
    total_preds = []
    data = dev_open(input_path) 

    for sentence in data:
        preds = viterbi_algorithm(sentence, unique_labels, unique_tokens, q_table, e_table, unk_token)
        total_preds.append(preds)

    with open(output_path, 'w', encoding='utf-8') as outp:
        for i, (token, label) in enumerate(zip(data, total_preds)):
            for j, (word, pos) in enumerate(zip(token, label)):
                result = word + " " + pos + "\n"
                outp.write(result)
            outp.write('\n')


**Calling functions for part 2 RU**

In [83]:
input_path_RU = "RU/dev.in"
output_path_RU = "RU/dev.p11.out"
unk_token = "#UNK#"
# unique_tokens_RU.append(unk_token)
output_RU = predict_p2(input_path_RU, output_path_RU, unique_labels_RU, unique_tokens_RU, q_table_RU, e_table_RU, unk_token)

**Calling functions for part 2 ES**

In [None]:
input_path_ES = "ES/dev.in"
output_path_ES = "ES/dev.p2.out"
unk_token = "#UNK#"
output_ES = predict_p2(input_path_ES, output_path_ES, unique_labels_ES, unique_tokens_ES, q_table_ES, e_table_ES, unk_token)

**Kth Best Viterbi**


In [None]:
import numpy as np

def viterbi_kthbest(sentence, unique_labels, unique_tokens, q_table, e_table, unk_token, rank):
    n = len(sentence)
    sentence = [None] + sentence
    m = len(unique_labels)
    pi = np.zeros((n+2, m, rank))

    for j in range(n):
        if sentence[j+1] in unique_tokens:
            cur_word = sentence[j+1]
        else:
            cur_word = unk_token

        for cur_index in range(0, m):
            if cur_word in unique_tokens:
                current_e = e_table[cur_index, unique_tokens.index(cur_word)]
            else:
                current_e = e_table[cur_index, -1] 
            if j == 0:
                current_q = q_table[0, cur_index]
                pi[j+1, cur_index, :] = 1 * current_e * current_q
            else:
                max_probs = []
                for prev_index in range(0, m):
                    for r in range(rank):
                        current_q = q_table[prev_index+1, cur_index]
                        cur_prob = pi[j, prev_index, r] * current_e * current_q
                        max_probs.append(cur_prob)
                max_probs.sort(reverse=True)

                if len(max_probs) > rank:
                    max_probs = max_probs[:rank]
                pi[j+1, cur_index] = max_probs

    max_probs = []
    for prev_index in range(0, m):
        for r in range(rank):
            current_q = q_table[prev_index+1, -1]
            cur_prob = pi[-1, prev_index, r] * current_q
            max_probs.append(cur_prob)

    max_probs.sort(reverse=True)
    if len(max_probs) > rank:
        max_probs = max_probs[:rank]
    pi[n+1, -1] = max_probs

    yxs = np.zeros((n+1, rank), dtype=int) + unique_labels.index("O")
    max_probs = []

    def take_last(elem):
        return elem[-1]

    for prev_index in range(0, m):
        for r in range(rank):
            current_q = q_table[prev_index+1, -1]
            cur_prob = pi[-1, prev_index, r] * current_q
            max_probs.append([prev_index, cur_prob])
    max_probs.sort(reverse=True, key=take_last)

    def removeRepeated(lst):
        new = []
        for elem in lst:
            if elem[1] != 0 and elem not in new:
                new.append(elem)
        return new

    max_probs = removeRepeated(max_probs)

    if len(max_probs) > rank:
        max_probs = max_probs[:rank]

    parents = [i[0] for i in max_probs]

    yxs[n, :len(max_probs)] = parents

    for j in range(n-1, 0, -1):
        max_probs = []
        for yx in yxs[j+1]:
            for cur_index in range(0, m):
                for r in range(rank):
                    current_q = q_table[cur_index+1, yx]
                    cur_prob = pi[j, cur_index, r] * current_q
                    max_probs.append([cur_index, cur_prob])

        max_probs.sort(reverse=True, key=take_last)
        max_probs = removeRepeated(max_probs)

        if len(max_probs) > rank:
            max_probs = max_probs[:rank]

        parents = [i[0] for i in max_probs]
        yxs[j, :len(max_probs)] = parents

    labelled_preds = [unique_labels[y] for y in yxs.T[-1][1:]]
    return labelled_preds

def predict_p3(input_path, output_path, unique_labels, unique_tokens, q_table, e_table, unk_token):
    rank = 8
    total_preds = []
    data = dev_open(input_path) 

    for sentence in data:
        preds = viterbi_kthbest(sentence, unique_labels, unique_tokens, q_table, e_table, unk_token, rank)
        total_preds.append(preds)

    with open(output_path, 'w', encoding='utf-8') as outp:
        for _, (token, label) in enumerate(zip(data, total_preds)):
            for _, (word, pos) in enumerate(zip(token, label)):
                result = word + " " + pos + "\n"
                outp.write(result)
            outp.write('\n')

In [None]:
input_path_RU = "RU/dev.in"
output_path_RU = "RU/dev.p3.out"
predict_p3(input_path_RU, output_path_RU, unique_labels_RU, unique_tokens_RU, q_table_RU, e_table_RU, unk_token)

In [None]:
input_path_ES = "ES/dev.in"
output_path_ES = "ES/dev.p3.out"
predict_p3(input_path_ES, output_path_ES, unique_labels_ES, unique_tokens_ES, q_table_ES, e_table_ES, unk_token)