# Part 1

### Imports for Part 1

In [2]:
from copy import deepcopy

### 1a and 1b: Function to estimate emission params using MLE

In [3]:
# Get emission counts from a given file list
def get_emission_counts (file):
    emission_count = {} # Stores the count of tag corresponding to each work
    # Returns a dict of format {'Word1':{'O':count,'B-positive':count, ...}, 'Word2':{'O':count,'B-positive':count, ...}, ... }
    tag_count = {'O':0,'B-positive':0,'B-neutral':0,'B-negative':0,'I-positive':0,'I-neutral':0,'I-negative':0} # Stores the total count of each tag
    new_count = {'O':0,'B-positive':0,'B-neutral':0,'B-negative':0,'I-positive':0,'I-neutral':0,'I-negative':0} # Stores the counts for each line
    for line in file:   # [Word Tag\n] 
        if line != "\n":
            wordlist = line.split(" ")  # ["Word","Tag\n"] 
       
            word = ''.join([_ for _ in wordlist[:-1]])  # "Word"
            tag = wordlist[-1].strip() # Remove trailing \n  # "Tag"
            if word not in emission_count:
                emission_count[word] = deepcopy(new_count)
            emission_count[word][tag] += 1
            tag_count[tag] += 1
    return emission_count, tag_count

# Estimate the emission parameters using the given formula.
# k is used to include words not appearing in the training set.
def estimate_emission_params (file, k):
    emission_count, tag_count = get_emission_counts(file)
    emission_params = {}
    # Returns a dict of format {'Word1':{'O':param,'B-positive':param, ...}, 'Word2':{'O':param,'B-positive':param, ...}, ... }
    tag_prob = {'O':0,'B-positive':0,'B-neutral':0,'B-negative':0,'I-positive':0,'I-neutral':0,'I-negative':0} # Stores the tag probability of each word
    emission_params["#UNK#"] = deepcopy(tag_prob)
    for tag in tag_count.keys():
        den = tag_count[tag] + k
        for word in emission_count.keys():
            if word not in emission_params:
                emission_params[word] = deepcopy(tag_prob)
            num = 0
            num += emission_count[word][tag]
            emission_params[word][tag] = num/den
        emission_params["#UNK#"][tag] = k/den
    return emission_params

# Using the emission params obtained, perform simple sentiment analysis, returning a list containing the predicted outputs
def sentiment_analysis(file, emission_params):
    lines = []  # List containing "Word Tag\n"
    for line in file:
        add = ""
        word = line.strip()
        if line != "\n":
            if word not in emission_params:
                assigned_tag = max(emission_params["#UNK#"],key=emission_params["#UNK#"].get)
            else:
                assigned_tag = max(emission_params[word],key=emission_params[word].get)
            add = word + " " + assigned_tag
        lines.append(add)
    return lines


### Getting emission params

In [4]:

# Reading lines from files
with open('Data/ES/train', 'r', encoding="utf-8") as f:
    ES_train = f.readlines()
with open('Data/ES/dev.in', 'r', encoding="utf-8") as f:
    ES_devin = f.readlines()
with open('Data/ES/dev.out', 'r', encoding="utf-8") as f:
    ES_devout = f.readlines()
with open('Data/RU/train', 'r', encoding="utf-8") as f:
    RU_train = f.readlines()
with open('Data/RU/dev.in', 'r', encoding="utf-8") as f:
    RU_devin = f.readlines()
with open('Data/RU/dev.out', 'r', encoding="utf-8") as f:
    RU_devout = f.readlines()

# Estimating Emission Params
ES_train_emission_params = estimate_emission_params (ES_train,1)
RU_train_emission_params = estimate_emission_params (RU_train,1)


emission_count, tag_count = get_emission_counts(ES_train)
print(emission_count[":"])
print(tag_count)
print(ES_train_emission_params[":"])
print(RU_train_emission_params)

{'O': 68, 'B-positive': 0, 'B-neutral': 0, 'B-negative': 0, 'I-positive': 0, 'I-neutral': 0, 'I-negative': 0}
{'O': 29035, 'B-positive': 1160, 'B-neutral': 72, 'B-negative': 381, 'I-positive': 314, 'I-neutral': 43, 'I-negative': 171}
{'O': 0.00234192037470726, 'B-positive': 0.0, 'B-neutral': 0.0, 'B-negative': 0.0, 'I-positive': 0.0, 'I-neutral': 0.0, 'I-negative': 0.0}
{'#UNK#': {'O': 2.467490808596738e-05, 'B-positive': 0.0005382131324004305, 'B-neutral': 0.004807692307692308, 'B-negative': 0.0022471910112359553, 'I-positive': 0.0016722408026755853, 'I-neutral': 0.014492753623188406, 'I-negative': 0.007042253521126761}, 'Еда': {'O': 2.467490808596738e-05, 'B-positive': 0.007534983853606028, 'B-neutral': 0.009615384615384616, 'B-negative': 0.0044943820224719105, 'I-positive': 0.0, 'I-neutral': 0.0, 'I-negative': 0.0}, 'вкусная': {'O': 0.0008882966910948257, 'B-positive': 0.0, 'B-neutral': 0.0, 'B-negative': 0.0, 'I-positive': 0.0, 'I-neutral': 0.0, 'I-negative': 0.0}, ',': {'O': 0.091

### Performing Sentiment Analysis and Writing to files

In [5]:
# Performing Sentiment analysis, returning lists of lines to be added
ES_devout_lines = sentiment_analysis(ES_devin,ES_train_emission_params)
RU_devout_lines = sentiment_analysis(RU_devin,RU_train_emission_params)

# Writing to Files
with open('Data/RU/dev.p1.out', 'w', encoding="utf-8") as f:
   f.write('\n'.join(RU_devout_lines))

with open('Data/ES/dev.p1.out', 'w', encoding="utf-8") as f:
   f.write('\n'.join(ES_devout_lines))

### Obtaining precision, recall, and F scores

In [6]:
# Reading lines from dev.p1.out files
with open('Data/ES/dev.p1.out', 'r', encoding="utf-8") as f:
    ES_p1_devout = f.readlines()
with open('Data/RU/dev.p1.out', 'r', encoding="utf-8") as f:
    RU_p1_devout = f.readlines()

a,b = get_emission_counts(ES_p1_devout)


# We then run the eval script provided to obtain the different scores

In [7]:
import subprocess
import os

output_es_p1 = os.path.join("Data", "ES", "dev.p1.out")
output_ru_p1 = os.path.join("Data", "RU", "dev.p1.out")
output_es = os.path.join("Data", "ES", "dev.out")
output_ru = os.path.join("Data", "RU", "dev.out")
evalresult = os.path.join("projet", "evalResult.py")

command = [
    "python",
    f"{evalresult}",
    f"{output_ru}",
    f"{output_ru_p1}",
]
command1 = [
    "python",
    f"{evalresult}",
    f"{output_es}",
    f"{output_es_p1}",
]

result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
result1 = subprocess.run(command1, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

print('RU:\n' + result.stdout)
print('ES:\n' + result1.stdout)
if result.stderr:
    print("Error:", result.stderr)
if result.stderr:
    print("Error:", result1.stderr)

RU:

#Entity in gold data: 389
#Entity in prediction: 1816

#Correct Entity : 266
Entity  precision: 0.1465
Entity  recall: 0.6838
Entity  F: 0.2413

#Correct Sentiment : 129
Sentiment  precision: 0.0710
Sentiment  recall: 0.3316
Sentiment  F: 0.1170

ES:

#Entity in gold data: 229
#Entity in prediction: 1466

#Correct Entity : 178
Entity  precision: 0.1214
Entity  recall: 0.7773
Entity  F: 0.2100

#Correct Sentiment : 97
Sentiment  precision: 0.0662
Sentiment  recall: 0.4236
Sentiment  F: 0.1145



### Q2
    

In [8]:
import pandas as pd 
import numpy as np

In [9]:
def estimate_transition_parameters(training_data):
    tag_transition_count = {}
    tag_count = {}

    previous_tag = "START"
    for line in training_data + ['\n']:  # Adding an extra newline to process the last sentence
        if line.strip() == '':
            # Handle end of sentence
            tag_transition_count[(previous_tag, "END")] = tag_transition_count.get((previous_tag, "END"), 0) + 1
            tag_count["END"] = tag_count.get("END", 0) + 1
            previous_tag = "START"
        else:
            tag = line.split(' ')[-1]
            tag = tag.strip()
            tag_transition_count[(previous_tag, tag)] = tag_transition_count.get((previous_tag, tag), 0) + 1
            tag_count[previous_tag] = tag_count.get(previous_tag, 0) + 1
            previous_tag = tag

    tag_count["START"] = len([line for line in training_data if line.strip() == '']) + 1

    transition_parameters = {}
    for (prev_tag, tag), count in tag_transition_count.items():
        transition_parameters[(prev_tag, tag)] = 0 if tag_count[prev_tag] == 0 else count / tag_count[prev_tag]

    return transition_parameters


In [10]:
ES_train_transition_params= estimate_transition_parameters (ES_train)
RU_train_transition_params = estimate_transition_parameters (RU_train)
# print(ES_train_transition_params)
# print(RU_train_emission_params)

In [11]:
# print(ES_devin)
# print(ES_train_transition_params)
# print(RU_train_transition_params)
# print(ES_train_emission_params)
# print(RU_train_emission_params)


In [12]:
def viterbi(words, emission_params, transition_params):
    all_best_tags = []
    sentence = []
    for word in words + ['\n']:  # Adding an extra newline to process the last sentence
        if word.strip() == '':  # Sentence separator
            if sentence:  # If there are words in the sentence
                tags = list(emission_params["#UNK#"].keys())
                n = len(sentence)

                viterbi_matrix = [{tag: 0 for tag in tags} for _ in range(n+1)]  # Add 1 for START
                backpointers = [{tag: None for tag in tags} for _ in range(n+1)]

                # Initialization step (t=0)
                for tag in tags:
                    viterbi_matrix[0][tag] = transition_params.get(("START", tag), 0) * emission_params.get(sentence[0].strip(), emission_params["#UNK#"]).get(tag, 0)

                # Recursion step (t > 0)
                for t in range(1, n):
                    for tag in tags:
                        emission_prob = emission_params.get(sentence[t].strip(), emission_params["#UNK#"]).get(tag, 0)
                        max_score, prev_tag = max(
                            [(viterbi_matrix[t-1][prev_tag] * transition_params.get((prev_tag, tag), 0) * emission_prob, prev_tag) for prev_tag in tags]
                        )
                        viterbi_matrix[t][tag] = max_score
                        backpointers[t][tag] = prev_tag

                # Termination step (t=n)
                max_score, final_tag = max(
                    [(viterbi_matrix[n-1][tag] * transition_params.get((tag, "END"), 0), tag) for tag in tags]
                )

                # Trace back the best path
                best_tags = []  # Start with an empty line
                best_tags.append((sentence[-1].strip(), final_tag))
                for t in range(n-1, 0, -1):
                    prev_word, prev_tag = best_tags[-1]
                    best_tags.append((sentence[t-1].strip(), backpointers[t][prev_tag]))

                best_tags.append(('', ''))  # End with an empty line
                best_tags.reverse()
                all_best_tags.append(best_tags)

                sentence = []  # Reset the sentence
        else:
            sentence.append(word)

    return all_best_tags


In [13]:
best_tags_ES = sum(viterbi(ES_devin, ES_train_emission_params,  ES_train_transition_params), [])
print(best_tags_ES)
best_tags_RU = sum(viterbi(RU_devin, RU_train_emission_params,  RU_train_transition_params), [])
print(best_tags_RU)


[('', ''), ('Plato', 'B-negative'), ('degustación', 'I-negative'), (':', 'O'), ('un', 'O'), ('poco', 'O'), ('abundante', 'O'), ('de', 'O'), ('más', 'O'), (',', 'O'), ('pero', 'O'), ('bien', 'O'), ('cocinado', 'O'), ('.', 'O'), ('', ''), ('restaurante', 'O'), ('excelente', 'O'), ('con', 'O'), ('carne', 'B-positive'), ('de', 'O'), ('alta', 'O'), ('calidad', 'O'), ('.', 'O'), ('', ''), ('Las', 'O'), ('posibilidades', 'O'), ('en', 'O'), ('el', 'O'), ('restaurante', 'O'), ('son', 'O'), ('fundamentalmente', 'B-negative'), ('tres', 'I-negative'), (';', 'O'), ('carta', 'O'), ('normal', 'O'), (',', 'O'), ('menú', 'B-positive'), ('degustacion', 'I-positive'), ('y', 'O'), ('una', 'O'), ('opción', 'O'), ('intermedia', 'O'), ('que', 'O'), ('es', 'O'), ('una', 'O'), ('selección', 'O'), ('de', 'O'), ('primeros', 'O'), ('y', 'O'), ('postres', 'B-positive'), ('y', 'O'), ('carta', 'O'), ('para', 'O'), ('el', 'O'), ('segundo', 'O'), ('.', 'O'), ('', ''), ('No', 'O'), ('perderse', 'O'), ('el', 'O'), ('sor

In [14]:
# Assuming best_tags is a list of tuples containing (word, tag) pairs

with open('Data/ES/dev.p2.out', 'w', encoding="utf-8") as f:
    lines = ['{} {}'.format(word, tag) for word, tag in best_tags_ES[1:]]
    f.write('\n'.join(lines))

with open('Data/RU/dev.p2.out', 'w', encoding="utf-8") as f:
    lines = ['{} {}'.format(word, tag) for word, tag in best_tags_RU[1:]]
    f.write('\n'.join(lines))



In [15]:
import subprocess
import os

output_es_p2 = os.path.join("Data", "ES", "dev.p2.out")
output_ru_p2 = os.path.join("Data", "RU", "dev.p2.out")
output_es = os.path.join("Data", "ES", "dev.out")
output_ru = os.path.join("Data", "RU", "dev.out")
evalresult = os.path.join("projet", "evalResult.py")

command = [
    "python",
    f"{evalresult}",
    f"{output_ru}",
    f"{output_ru_p2}",
]
command1 = [
    "python",
    f"{evalresult}",
    f"{output_es}",
    f"{output_es_p2}",
]
result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
result1 = subprocess.run(command1, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

print('RU:\n' + result.stdout)
print('ES:\n' + result1.stdout)
if result.stderr:
    print("Error:", result.stderr)
if result.stderr:
    print("Error:", result1.stderr)


RU:

#Entity in gold data: 389
#Entity in prediction: 478

#Correct Entity : 188
Entity  precision: 0.3933
Entity  recall: 0.4833
Entity  F: 0.4337

#Correct Sentiment : 128
Sentiment  precision: 0.2678
Sentiment  recall: 0.3290
Sentiment  F: 0.2953

ES:

#Entity in gold data: 229
#Entity in prediction: 306

#Correct Entity : 127
Entity  precision: 0.4150
Entity  recall: 0.5546
Entity  F: 0.4748

#Correct Sentiment : 94
Sentiment  precision: 0.3072
Sentiment  recall: 0.4105
Sentiment  F: 0.3514



In [16]:
def kth_viterbi(words, emission_params, transition_params, k):
    all_best_tags = []
    sentence = []
    for word in words + ['\n']:
        if word.strip() == '':
            if sentence:
                tags = list(emission_params["#UNK#"].keys())
                n = len(sentence)

                viterbi_matrix = [[[float('-inf') for _ in range(k)] for tag in tags] for _ in range(n+1)]
                backpointers = [[[None for _ in range(k)] for tag in tags] for _ in range(n+1)]

                # Initialization step (t=0)
                for tag in tags:
                    for r in range(k):
                        viterbi_matrix[0][tag][r] = transition_params.get(("START", tag), 0) * emission_params.get(sentence[0].strip(), emission_params["#UNK#"]).get(tag, 0)

                # Recursion step (t > 0)
                for t in range(1, n):
                    for tag in tags:
                        scores = []
                        for prev_tag in tags:
                            emission_prob = emission_params.get(sentence[t].strip(), emission_params["#UNK#"]).get(tag, 0)
                            for r in range(k):
                                score = viterbi_matrix[t-1][prev_tag][r] * transition_params.get((prev_tag, tag), 0) * emission_prob
                                scores.append((score, prev_tag, r))
                        
                        scores.sort(reverse=True)
                        for r in range(k):
                            viterbi_matrix[t][tag][r], backpointers[t][tag][r] = scores[r][:2]

                # Termination step and traceback
                final_scores = []
                for tag in tags:
                    for r in range(k):
                        score = viterbi_matrix[n-1][tag][r] * transition_params.get((tag, "END"), 0)
                        final_scores.append((score, tag, r))

                final_scores.sort(reverse=True)

                # ...
                all_best_tags = []
                for r in range(k):
                    best_tags = []
                    _, final_tag, final_rank = final_scores[r]
                    best_tags.append((sentence[-1].strip(), final_tag))
                    for t in range(n-1, 0, -1):
                        _, prev_tag, prev_rank = best_tags[-1]
                        best_tags.append((sentence[t-1].strip(), backpointers[t][prev_tag][prev_rank]))

                    best_tags.append(('', ''))
                    best_tags.reverse()
                    all_best_tags.append(best_tags)



    return all_best_tags


In [18]:
k2best_tags_ES = kth_viterbi(ES_devin, ES_train_emission_params,  ES_train_transition_params, 2)
k8best_tags_ES = kth_viterbi(ES_devin, ES_train_emission_params,  ES_train_transition_params, 8)

print(best_tags_ES)
k2best_tags_RU = kth_viterbi(RU_devin, RU_train_emission_params,  RU_train_transition_params, 2)
k8best_tags_RU = kth_viterbi(RU_devin, RU_train_emission_params,  RU_train_transition_params, 8)

print(best_tags_RU)

[('', ''), ('Plato', 'B-negative'), ('degustación', 'I-negative'), (':', 'O'), ('un', 'O'), ('poco', 'O'), ('abundante', 'O'), ('de', 'O'), ('más', 'O'), (',', 'O'), ('pero', 'O'), ('bien', 'O'), ('cocinado', 'O'), ('.', 'O'), ('', ''), ('restaurante', 'O'), ('excelente', 'O'), ('con', 'O'), ('carne', 'B-positive'), ('de', 'O'), ('alta', 'O'), ('calidad', 'O'), ('.', 'O'), ('', ''), ('Las', 'O'), ('posibilidades', 'O'), ('en', 'O'), ('el', 'O'), ('restaurante', 'O'), ('son', 'O'), ('fundamentalmente', 'B-negative'), ('tres', 'I-negative'), (';', 'O'), ('carta', 'O'), ('normal', 'O'), (',', 'O'), ('menú', 'B-positive'), ('degustacion', 'I-positive'), ('y', 'O'), ('una', 'O'), ('opción', 'O'), ('intermedia', 'O'), ('que', 'O'), ('es', 'O'), ('una', 'O'), ('selección', 'O'), ('de', 'O'), ('primeros', 'O'), ('y', 'O'), ('postres', 'B-positive'), ('y', 'O'), ('carta', 'O'), ('para', 'O'), ('el', 'O'), ('segundo', 'O'), ('.', 'O'), ('', ''), ('No', 'O'), ('perderse', 'O'), ('el', 'O'), ('sor

In [None]:
import subprocess
import os

output_es_p3_2nd = os.path.join("Data", "ES", "dev.p3.2nd.out")
output_es_p3_8th = os.path.join("Data", "ES", "dev.p3.8th.out")
output_ru_p3_2nd = os.path.join("Data", "RU", "dev.p3.2nd.out")
output_ru_p3_8th= os.path.join("Data", "RU", "dev.p3.8th.out")
output_es = os.path.join("Data", "ES", "dev.out")
output_ru = os.path.join("Data", "RU", "dev.out")
evalresult = os.path.join("projet", "evalResult.py")

command = [
    "python",
    f"{evalresult}",
    f"{output_ru}",
    f"{output_ru_p3_2nd}",
]
command1 = [
    "python",
    f"{evalresult}",
    f"{output_ru}",
    f"{output_ru_p3_8th}",
]
command2 = [
    "python",
    f"{evalresult}",
    f"{output_es}",
    f"{output_es_p3_2nd}",
]
command3 = [
    "python",
    f"{evalresult}",
    f"{output_es}",
    f"{output_es_p3_8th}",
]
result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
result1 = subprocess.run(command1, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
result2 = subprocess.run(command2, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
result3 = subprocess.run(command3, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

print('RU_2nd:\n' + result.stdout)
print('RU_8th:\n' + result1.stdout)
print('ES_2nd:\n' + result2.stdout)
print('ES_8th:\n' + result3.stdout)

if result.stderr:
    print("Error:", result.stderr)
if result1.stderr:
    print("Error:", result1.stderr)
if result2.stderr:
    print("Error:", result2.stderr)
if result3.stderr:
    print("Error:", result3.stderr)
