**Import Libraries**


In [308]:
import os
import math
import pandas as pd
import numpy as np

**Functions for Part 1**

In [309]:
def output_prediction(prediction, data, path):
    assert(len(prediction) == len(data))
    file = open(path, "w", encoding="utf-8")
    n = len(data)
    print("Writing", n, "lines")
    for i in range(n):
        assert(len(data[i]) == len(prediction[i]))
        m = len(data[i])
        for j in range(m):
            file.write(data[i][j] + " " + prediction[i][j] + "\n")
        file.write("\n")
    print("Wrote predictions to", path)

def get_training_set_words(data):
    words = set()
    for i in data:
        if len(data) > 1:
            words.add(i[0])
    return words

def dev_open(path):
  out = [[]]
  f = open(path, "r", encoding="utf-8")
  lines_in = f.readlines()
  for word in lines_in:
    if word == "\n":
      out.append([])
    else:
      out[-1].append(word.rstrip())
  return out[:-1]


def count_words_not_in_train(dev_data, train_words):
    words_not_in_train = 0
    for sentence in dev_data:
        for word in sentence:
            if word not in train_words:
                words_not_in_train += 1
    return words_not_in_train


def read_data(path):
  dataset = []
  f = open(path,"r", encoding="utf-8")
  training_set = f.readlines()
  for line in training_set:
    if len(line) == 1:
      dataset.append("\n")
    else:
      line = line.rstrip('\n')
      line = line.rpartition(' ')
      line = list(line)
      del line[1]
      if line != ['', '']:
        dataset.append(line)
  Edataset = [ele for ele in dataset]
  return Edataset


def count_tags(training_set):
  unique_tag_count = {'START':0,'O':0,'B-positive':0,'B-neutral':0,'B-negative':0,'I-positive':0,'I-neutral':0,'I-negative':0,'STOP':0}
  for data_pair in training_set:
    if len(data_pair) > 1:
      if data_pair[1] in unique_tag_count.keys():
        unique_tag_count[data_pair[1]] += 1
    elif len(data_pair)==1:
      unique_tag_count['START'] += 1 
      unique_tag_count['STOP'] += 1
  return unique_tag_count


def count_words_for_each_tag(training_set):
  label_generate_all = {'O':{},'B-positive':{},'B-neutral':{},'B-negative':{},'I-positive':{},'I-neutral':{},'I-negative':{}}
  for data in training_set:
    if len(data) > 1:
        if data[0] not in label_generate_all[data[1]].keys():
          label_generate_all[data[1]][data[0]] = 1
        else:
          label_generate_all[data[1]][data[0]] += 1 

  return label_generate_all


def get_tags(data):
    unique_labels = set()
    for sentence in data:
        for _, label in sentence:
            unique_labels.add(label)
    return list(unique_labels)


def estimate_emission_params(count_tags, count_tag_words, smoothing_factor=1):
    emission_params = {}
    for tag_tuple, tag_word_counts in count_tag_words.items():
        tag_estimations = {}
        for word, word_count in tag_word_counts.items():
            estimated_value = word_count / (count_tags[tag_tuple] + smoothing_factor)
            tag_estimations[word] = estimated_value
        tag_estimations['#UNK#'] = smoothing_factor / (count_tags[tag_tuple] + smoothing_factor)
        emission_params[tag_tuple] = tag_estimations
    
    return emission_params


def predict_sentiment(words, e_params, word_set):
    sentiment_labels = ['O', 'B-positive', 'B-neutral', 'B-negative', 'I-positive', 'I-neutral', 'I-negative']
    predictions = []
    for word in words:
        max_label = 'O'
        max_probability = 0
        if word not in word_set:
            word = "#UNK#"
        for label in sentiment_labels:
            if word not in e_params[label]:
                continue
            current_prob = e_params[label][word]
            if current_prob > max_probability:
                max_label = label
                max_probability = current_prob
        
        predictions.append(max_label)
    
    return predictions



def make_predictions(data, e_params, word_set):
    all_predictions = []
    for sentence in data:
        all_predictions.append(predict_sentiment(sentence, e_params, word_set))
    
    return all_predictions

**Calling functions, estimating emission params and tagging words for RU dev.in**

In [310]:
train_path_RU = "RU/train"
test_path_RU = "RU/dev.in"
output_path_RU = "RU/dev.p1.out"
train_data_RU = read_data(train_path_RU)
train_words_RU = get_training_set_words(train_data_RU)
tag_counts_RU = count_tags(train_data_RU)
test_data_RU = dev_open(test_path_RU)
tags_RU = count_tags(train_data_RU)
tag_word_counts_RU = count_words_for_each_tag(train_data_RU)
emission_params_RU = estimate_emission_params(tag_counts_RU, tag_word_counts_RU)
predictions_RU = make_predictions(test_data_RU, emission_params_RU, train_words_RU)
output_prediction(predictions_RU, test_data_RU, output_path_RU)

Writing 437 lines
Wrote predictions to RU/dev.p1.out


**Calling functions, estimating emission params and tagging words for ES dev.in**

In [311]:
train_path_ES = "ES/train"
test_path_ES = "ES/dev.in"
output_path_ES = "ES/dev.p1.out"
output_path_part_2_ES = "ES/dev.p2.out"
train_data_ES = read_data(train_path_ES)
train_words_ES = get_training_set_words(train_data_ES)
tag_counts_ES = count_tags(train_data_ES)
test_data_ES = dev_open(test_path_ES)
tags_ES = count_tags(train_data_ES)
tag_word_counts_ES = count_words_for_each_tag(train_data_ES)
emission_params_ES = estimate_emission_params(tag_counts_ES, tag_word_counts_ES)
predictions_ES = make_predictions(test_data_ES, emission_params_ES, train_words_ES)
output_prediction(predictions_ES, test_data_ES, output_path_ES)

Writing 266 lines
Wrote predictions to ES/dev.p1.out


**Functions for Part 2**

In [312]:
def estimate_transition_parameters(data, tag_counts):
    transition_counts = {'START': {}, 'O': {}, 'B-positive': {}, 'B-neutral': {}, 'B-negative': {}, 'I-positive': {}, 'I-neutral': {}, 'I-negative': {}}
    
    for label in transition_counts.keys():
        transition_counts[label] = {'O': 0, 'B-positive': 0, 'B-neutral': 0, 'B-negative': 0, 'I-positive': 0, 'I-neutral': 0, 'I-negative': 0, 'STOP': 0}
    
    last_position = 'START'
    
    for line in data:
        if line == "\n":
            transition_counts[last_position]["STOP"] += 1
            last_position = 'START'
        else:
            next_label = line[1]
            transition_counts[last_position][next_label] += 1
            last_position = next_label
    
    transition_probabilities = {'START': {}, 'O': {}, 'B-positive': {}, 'B-neutral': {}, 'B-negative': {}, 'I-positive': {}, 'I-neutral': {}, 'I-negative': {}, 'STOP': {}}
    
    for label_in in transition_probabilities.keys():
        transition_probabilities[label_in] = {'START': 0, 'O': 0, 'B-positive': 0, 'B-neutral': 0, 'B-negative': 0, 'I-positive': 0, 'I-neutral': 0, 'I-negative': 0, 'STOP': 0}
    
    for label_in, t_counts in transition_counts.items():
        for label_out, count in t_counts.items():
            transition_probabilities[label_in][label_out] = count / tag_counts[label_in]
    
    return transition_probabilities





def viterbi_algo(data, trans_probs, emiss_probs, vocab):
    n = len(data)
    labels = ['O', 'B-positive', 'B-neutral', 'B-negative', 'I-positive', 'I-neutral', 'I-negative', 'START']

    n_inf = -math.inf
    memo = [{'START': [n_inf, None],
             'STOP': [n_inf, None], 
             'O': [n_inf, None],
             'B-positive': [n_inf, None],
             'B-neutral': [n_inf, None],
             'B-negative': [n_inf, None],
             'I-positive': [n_inf, None],
             'I-neutral': [n_inf, None],
             'I-negative': [n_inf, None]} for i in range(n + 2)]

    memo[0]['START'][0] = 0 

    for j in range(0, n):
        next_wd = data[j]

        for u in labels:
            max_val = n_inf
            max_lbl = None
            for v in labels:
                if (memo[j][v][0] == n_inf or trans_probs[v][u] == 0):
                    continue
                prev_val = memo[j][v][0]
                if next_wd in vocab:
                    if next_wd not in emiss_probs[u].keys():
                        continue
                    else:
                        em_prob = emiss_probs[u][next_wd]
                else:
                    em_prob = emiss_probs[u]['#UNK#']
                trans_prob = trans_probs[v][u]
                prob = prev_val + math.log(em_prob) + math.log(trans_prob)
                if max_val < prob:
                    max_val = prob
                    max_lbl = v
            
            if max_val == n_inf:
                continue
            memo[j + 1][u][0] = max_val
            memo[j + 1][u][1] = max_lbl
    
    max_val = n_inf
    max_lbl = None
    for v in labels:
        prev_val = memo[n][v][0]
        trans_prob = trans_probs[v]['STOP']
        if (prev_val == 0 or trans_prob == 0):
            continue
        prob = prev_val + math.log(trans_prob)
        if max_val < prob:
            max_val = prob
            max_lbl = v

    if max_val != n_inf:
        memo[n + 1]['STOP'][0] = max_val
        memo[n + 1]['STOP'][1] = max_lbl

    output = ['' for i in range(n)]

    if max_lbl == None:
        max_lbl = "O"

    for j in range(n + 1, 1, -1):
        max_lbl = memo[j][max_lbl][1]
        if max_lbl == None:
            max_lbl = "O"
        output[j - 2] = max_lbl
    
    return output

def viterbi_call(sep_data, trans_probs, emiss_probs, vocab):
    final_result = []
    for doc in sep_data:
        final_result.append(viterbi_algo(doc, trans_probs, emiss_probs, vocab))
    return final_result


def viterbi_to_file(test, predictions, output_path):
    tags = []
    text = []
    
    for prediction in predictions:
        for tag in prediction:
            tags.append(tag)

    for words in test:
        for word in words:
            text.append(word)
    
    df = pd.DataFrame({
        0: text,
        1: tags
    })
    
    df.to_csv(output_path, sep='\t', index=False)
    print(f"Output written to {output_path}")
    
    return df

In [313]:
def process_data(path, unk_token):
    tokens = []
    labels = []

    with open(path, encoding='utf-8') as f:
        raw = f.read()
        # array of sentences
        sentences = raw.strip().split('\n\n')

    for sentence in sentences:
        pairs = sentence.split('\n')
        inner_tokens = []
        inner_labels = []
        for pair in pairs:
            try:
                token, label = pair.split(' ')
            except:
                pass
            inner_tokens.append(token)
            inner_labels.append(label)

        tokens.append(inner_tokens)
        labels.append(inner_labels)

    unique_tokens = get_unique(tokens)
    unique_tokens = unique_tokens + [unk_token]
    unique_labels = get_unique(labels)

    return tokens, labels, unique_tokens, unique_labels

def get_unique(data):
    unique_items = set()
    for item in data:
        unique_items.update(item)
    return list(unique_items)

train_file_path = "RU/train"
unk_token = '#UNK#'
tokens, labels, unique_tokens, unique_labels = process_data(train_file_path, unk_token)
print(tokens)

[['Еда', 'вкусная', ',', 'но', 'отдельно', 'хочу', 'отметить', 'красивую', 'сервировку', 'блюд', ';', '.'], ['Филадельфию', 'мне', 'удалось', 'только', 'попробовать', ',', 'сделали', 'на', 'отлично', ',', 'хороший', 'кусок', 'лосося-филадельфии-чуток', 'риса', ',', 'смело', 'возьму', 'в', 'следующий', 'раз', 'Десерт', 'Тирамису', 'просто', 'таял', 'во', 'рту', ')', ')', ')'], ['Очень', 'благодарны', 'персоналу', 'за', 'качественное', 'обслуживание', '.'], ['Были', 'здесь', 'первый', 'раз', 'и', ',', 'точно', ',', 'не', 'последний', '!'], ['Резервировать', 'всё', 'помещение', 'не', 'было', 'необходимости', 'из-за', 'небольшого', 'количества', 'посетителей', '.'], ['А', 'вот', 'рыба', 'в', 'беконе', '-', 'порадовала', ')', ')', ')'], ['Обслуживала', 'нас', 'очень', 'приятная', 'девушка-официант', '.'], ['Ну', 'уж', 'очень', 'хотелось', 'на', 'диван', ')', ')'], ['Гулял', 'с', 'девушкой', 'своей', 'по', 'Крестовскому', ',', 'решили', 'зайти', 'посидеть', 'кофе', 'попить', '.'], ['Сделали'


VITERBI CHANGE

In [314]:
import numpy as np
import copy

def estimate_e(unique_labels, unique_tokens, tokens, labels):
    e_table = np.zeros(
        (len(unique_labels), len(unique_tokens)+1))
    
    for token_seq, label_seq in zip(tokens, labels):
        for token, label in zip(token_seq, label_seq):
            e_table[unique_labels.index(label)][unique_tokens.index(token)] += 1

    for i in range(len(unique_labels)):
        e_table[i, -1] += 1

    e_table /= e_table.sum(axis=1)[:, np.newaxis]
    return e_table

def estimate_q(unique_labels, labels):
    q_table = np.zeros(
        (len(unique_labels)+1, len(unique_labels)+1))

    rows = ['START'] + unique_labels.copy()
    cols = unique_labels.copy() + ['STOP']

    for labels_seq in labels:
        x = copy.deepcopy(labels_seq)
        x.insert(0, 'START')
        x.append('STOP')

        for i in range(len(x)-1):
            cur_label = x[i]
            next_label = x[i+1]
            q_table[rows.index(cur_label)][cols.index(next_label)] += 1

    q_table /= q_table.sum(axis=1)[:, np.newaxis]
    return q_table


In [315]:
unique_labels = unique_labels
unique_tokens  = list(train_words_RU)
unk_token = "#UNK#"
labels = labels
tokens = tokens
e_table = estimate_e(unique_labels, unique_tokens, tokens, labels)
q_table = estimate_q(unique_labels, labels)

In [316]:
def viterbi_algorithm(sentence, unique_labels, unique_tokens, q_table, e_table, unk_token):
    n = len(sentence)
    sentence = [None] + sentence
    m = len(unique_labels)
    pi = np.zeros((n+2, m))

    for j in range(n):
        if sentence[j+1] in unique_tokens:
            cur_word = sentence[j+1]
        else:
            cur_word = unk_token

        for cur_index in range(0, m):
            if cur_word in unique_tokens:
                current_e = e_table[cur_index, unique_tokens.index(cur_word)]
            else:
                current_e = e_table[cur_index, -1]  # Use the last column for unk
            if j == 0:
                current_q = q_table[0, cur_index]
                pi[j+1, cur_index] = 1 * current_e * current_q
            else:
                max_prob = 0
                for vIndex in range(0, m):
                    current_q = q_table[vIndex+1, cur_index]
                    cur_prob = pi[j, vIndex] * current_e * current_q

                    if cur_prob > max_prob:
                        max_prob = cur_prob
                pi[j+1, cur_index] = max_prob

    max_prob = 0
    for prev_index in range(0, m):
        current_q = q_table[prev_index+1, -1]
        cur_prob = pi[n, prev_index] * current_q
        if cur_prob > max_prob:
            max_prob = cur_prob
    pi[n+1, -1] = max_prob

    y_star = [unique_labels.index("O")] * (n+1)
    max_prob = 0

    for cur_index in range(0, m):
        current_q = q_table[cur_index+1, -1]
        cur_prob = pi[n, cur_index] * current_q

        if cur_prob > max_prob:
            max_prob = cur_prob
            y_star[n] = cur_index

    for j in range(n-1, 0, -1):
        max_prob = 0
        for cur_index in range(0, m):
            current_q = q_table[cur_index+1, y_star[j+1]]
            cur_prob = pi[j, cur_index] * current_q
            if cur_prob > max_prob:
                max_prob = cur_prob
                y_star[j] = cur_index

    labelled_preds = [unique_labels[y] for y in y_star[1:]]
    return labelled_preds

def predict_p2(input_path, output_path, unique_labels, unique_tokens, q_table, e_table, unk_token):
    total_preds = []
    data = dev_open(input_path)

    for sentence in data:
        preds = viterbi_algorithm(sentence, unique_labels, unique_tokens, q_table, e_table, unk_token)
        total_preds.append(preds)

    with open(output_path, 'w', encoding='utf-8') as outp:
        for _, (token, label) in enumerate(zip(data, total_preds)):
            for _, (word, pos) in enumerate(zip(token, label)):
                result = word + " " + pos + "\n"
                outp.write(result)
            outp.write('\n')



In [317]:
unique_labels = unique_labels
unique_tokens  = unique_tokens
input_path = "RU/dev.in"
output_path = "RU/dev.p14.out"
unk_token = "#UNK#"
output = predict_p2(input_path, output_path, unique_labels, unique_tokens, q_table, e_table, unk_token)

**Calling functions for part 2 RU**

In [318]:
output_path_part_2_RU = "RU/dev.p2.out"
transition_params_RU = estimate_transition_parameters(train_data_RU, tag_counts_RU)
prediction_RU = viterbi_call(test_data_RU, transition_params_RU, emission_params_RU, train_words_RU)
result_RU = viterbi_to_file(test_data_RU, prediction_RU, output_path_part_2_RU)

Output written to RU/dev.p2.out


**Calling functions for part 2 ES**

In [319]:
output_path_part_2_RU = "RU/dev.p2.out"
transition_params_RU = estimate_transition_parameters(train_data_RU, tag_counts_RU)
prediction_RU = viterbi_call(test_data_RU, transition_params_RU, emission_params_RU, train_words_RU)
result_RU = viterbi_to_file(test_data_RU, prediction_RU, output_path_part_2_RU)

Output written to RU/dev.p2.out
