# Part 1

### Imports for Part 1

In [None]:
from copy import deepcopy

### 1a and 1b: Function to estimate emission params using MLE

In [None]:
# Get emission counts from a given file list
def get_emission_counts (file):
    emission_count = {} # Stores the count of tag corresponding to each work
    # Returns a dict of format {'Word1':{'O':count,'B-positive':count, ...}, 'Word2':{'O':count,'B-positive':count, ...}, ... }
    tag_count = {'O':0,'B-positive':0,'B-neutral':0,'B-negative':0,'I-positive':0,'I-neutral':0,'I-negative':0} # Stores the total count of each tag
    new_count = {'O':0,'B-positive':0,'B-neutral':0,'B-negative':0,'I-positive':0,'I-neutral':0,'I-negative':0} # Stores the counts for each line
    for line in file:   # [Word Tag\n] 
        if line != "\n":
            wordlist = line.split(" ")  # ["Word","Tag\n"] 
       
            word = ''.join([_ for _ in wordlist[:-1]])  # "Word"
            tag = wordlist[-1].strip() # Remove trailing \n  # "Tag"
            if word not in emission_count:
                emission_count[word] = deepcopy(new_count)
            emission_count[word][tag] += 1
            tag_count[tag] += 1
    return emission_count, tag_count

# Estimate the emission parameters using the given formula.
# k is used to include words not appearing in the training set.
def estimate_emission_params (file, k):
    emission_count, tag_count = get_emission_counts(file)
    emission_params = {}
    # Returns a dict of format {'Word1':{'O':param,'B-positive':param, ...}, 'Word2':{'O':param,'B-positive':param, ...}, ... }
    tag_prob = {'O':0,'B-positive':0,'B-neutral':0,'B-negative':0,'I-positive':0,'I-neutral':0,'I-negative':0} # Stores the tag probability of each word
    emission_params["#UNK#"] = deepcopy(tag_prob)
    for tag in tag_count.keys():
        den = tag_count[tag] + k
        for word in emission_count.keys():
            if word not in emission_params:
                emission_params[word] = deepcopy(tag_prob)
            num = 0
            num += emission_count[word][tag]
            emission_params[word][tag] = num/den
        emission_params["#UNK#"][tag] = k/den
    return emission_params

# Using the emission params obtained, perform simple sentiment analysis, returning a list containing the predicted outputs
def sentiment_analysis(file, emission_params):
    lines = []  # List containing "Word Tag\n"
    for line in file:
        add = ""
        word = line.strip()
        if line != "\n":
            if word not in emission_params:
                assigned_tag = max(emission_params["#UNK#"],key=emission_params["#UNK#"].get)
            else:
                assigned_tag = max(emission_params[word],key=emission_params[word].get)
            add = word + " " + assigned_tag
        lines.append(add)
    return lines


### Getting emission params

In [None]:

# Reading lines from files
with open('Data/ES/train', 'r', encoding="utf-8") as f:
    ES_train = f.readlines()
with open('Data/ES/dev.in', 'r', encoding="utf-8") as f:
    ES_devin = f.readlines()
with open('Data/ES/dev.out', 'r', encoding="utf-8") as f:
    ES_devout = f.readlines()
with open('Data/RU/train', 'r', encoding="utf-8") as f:
    RU_train = f.readlines()
with open('Data/RU/dev.in', 'r', encoding="utf-8") as f:
    RU_devin = f.readlines()
with open('Data/RU/dev.out', 'r', encoding="utf-8") as f:
    RU_devout = f.readlines()

# Estimating Emission Params
ES_train_emission_params = estimate_emission_params (ES_train,1)
RU_train_emission_params = estimate_emission_params (RU_train,1)


emission_count, tag_count = get_emission_counts(ES_train)
print(emission_count[":"])
print(tag_count)
print(ES_train_emission_params[":"])
print(RU_train_emission_params)

### Performing Sentiment Analysis and Writing to files

In [None]:
# Performing Sentiment analysis, returning lists of lines to be added
ES_devout_lines = sentiment_analysis(ES_devin,ES_train_emission_params)
RU_devout_lines = sentiment_analysis(RU_devin,RU_train_emission_params)

# Writing to Files
with open('Data/RU/dev.p1.out', 'w', encoding="utf-8") as f:
   f.write('\n'.join(RU_devout_lines))

with open('Data/ES/dev.p1.out', 'w', encoding="utf-8") as f:
   f.write('\n'.join(ES_devout_lines))

### Obtaining precision, recall, and F scores

In [None]:
# Reading lines from dev.p1.out files
with open('Data/ES/dev.p1.out', 'r', encoding="utf-8") as f:
    ES_p1_devout = f.readlines()
with open('Data/RU/dev.p1.out', 'r', encoding="utf-8") as f:
    RU_p1_devout = f.readlines()

a,b = get_emission_counts(ES_p1_devout)
print(b)

# We then run the eval script provided to obtain the different scores

### Results for ES
#Entity in gold data: 229

#Entity in prediction: 1466

#Correct Entity : 178
- Entity  precision: 0.1214
- Entity  recall: 0.7773
- Entity  F: 0.2100

#Correct Sentiment : 97
- Sentiment  precision: 0.0662
- Sentiment  recall: 0.4236
- Sentiment  F: 0.1145

### Results for RU
#Entity in gold data: 389

#Entity in prediction: 1816

#Correct Entity : 266
- Entity  precision: 0.1465
- Entity  recall: 0.6838
- Entity  F: 0.2413

#Correct Sentiment : 129
- Sentiment  precision: 0.0710
- Sentiment  recall: 0.33

### Q2
    

In [None]:
import pandas as pd 
import numpy as np

In [None]:
def estimate_transition_parameters(training_data):
    tag_transition_count = {}
    tag_count = {}

    previous_tag = "START"
    for line in training_data:
           if line != "\n":

                wordlist = line.split(' ')    
                tag = wordlist[-1].strip()
                tag_transition_count[(previous_tag, tag)] = tag_transition_count.get((previous_tag, tag), 0) + 1
                tag_count[previous_tag] = tag_count.get(previous_tag, 0) + 1
                previous_tag = tag
    
    # Consider special cases: q(STOP|yn) and q(y1|START)
    tag_transition_count[("STOP", "START")] = 0
    tag_count["STOP"] = 0
    tag_count["START"] = len(training_data)

    transition_parameters = {}
    for (prev_tag, tag), count in tag_transition_count.items():
        transition_parameters[(prev_tag, tag)] = 0 if tag_count[prev_tag] == 0 else  count / tag_count[prev_tag]

    return transition_parameters


In [None]:
ES_train_transition_params = estimate_transition_parameters (ES_train)
RU_train_transition_params = estimate_transition_parameters (RU_train)


In [132]:
ES_devin = [word.strip() for word in ES_devin]
print(ES_devin)

print(ES_train_transition_params)

['Plato', 'degustación', ':', 'un', 'poco', 'abundante', 'de', 'más', ',', 'pero', 'bien', 'cocinado', '.', '', 'restaurante', 'excelente', 'con', 'carne', 'de', 'alta', 'calidad', '.', '', 'Las', 'posibilidades', 'en', 'el', 'restaurante', 'son', 'fundamentalmente', 'tres', ';', 'carta', 'normal', ',', 'menú', 'degustacion', 'y', 'una', 'opción', 'intermedia', 'que', 'es', 'una', 'selección', 'de', 'primeros', 'y', 'postres', 'y', 'carta', 'para', 'el', 'segundo', '.', '', 'No', 'perderse', 'el', 'sorbete', 'de', 'mojito', '.', '', 'para', 'mi', 'perfecto', '!', '', 'Devolucion', 'a', 'cocina', ',', 'amabilidad', 'de', 'camarera', ',', 'requerimiento', 'de', 'cuenta', 'y', 'adios', '.', '', 'Así', 'como', 'el', 'romesco', ',', 'que', 'era', 'un', 'poco', '"', 'de', 'bote', '"', '.', '', 'Destacar', 'los', 'arroces', ',', 'la', 'caldereta', 'de', 'bogavante', ',', 'las', 'zamburiñas', 'al', 'horno', 'y', 'los', 'platos', 'de', '"', 'picoteo', '"', 'y', 'los', 'pescados', 'en', 'general

In [119]:
print(ES_train_emission_params)

{'#UNK#': {'O': 3.444000551040088e-05, 'B-positive': 0.0008613264427217916, 'B-neutral': 0.0136986301369863, 'B-negative': 0.002617801047120419, 'I-positive': 0.0031746031746031746, 'I-neutral': 0.022727272727272728, 'I-negative': 0.005813953488372093}, 'Estuvimos': {'O': 0.00020664003306240529, 'B-positive': 0.0, 'B-neutral': 0.0, 'B-negative': 0.0, 'I-positive': 0.0, 'I-neutral': 0.0, 'I-negative': 0.0}, 'hace': {'O': 0.0008954401432704229, 'B-positive': 0.0, 'B-neutral': 0.0, 'B-negative': 0.0, 'I-positive': 0.0, 'I-neutral': 0.0, 'I-negative': 0.0}, 'poco': {'O': 0.0018942003030720485, 'B-positive': 0.0, 'B-neutral': 0.0, 'B-negative': 0.0, 'I-positive': 0.0, 'I-neutral': 0.0, 'I-negative': 0.0}, 'mi': {'O': 0.0024796803967488635, 'B-positive': 0.0, 'B-neutral': 0.0, 'B-negative': 0.0, 'I-positive': 0.0, 'I-neutral': 0.0, 'I-negative': 0.0}, 'pareja': {'O': 0.00044772007163521146, 'B-positive': 0.0, 'B-neutral': 0.0, 'B-negative': 0.0, 'I-positive': 0.0, 'I-neutral': 0.0, 'I-negati

In [None]:
def viterbi_algorithm(sentence, emission_parameters, transition_parameters):
    for word in sentence:
        word = word.strip() 
    sentence = [i for i in sentence if i]
    tags = list(set([tag for tag in emission_parameters.keys()]))
    n = len(sentence)
    
    # Initialize the Viterbi matrix and backpointers
    viterbi = [{} for _ in range(n)]
    backpointers = [{} for _ in range(n)]

    # Base case: Start probabilities
    for tag in tags:
        viterbi[0][tag] = transition_parameters.get(("START", tag), 0) * emission_parameters.get((sentence[0], tag), 0)
        backpointers[0][tag] = None

    # Recursive case
    for t in range(1, n):
        for tag in tags:
            max_prob = 0
            best_prev_tag = None

            for prev_tag in tags:
                prob = viterbi[t - 1][prev_tag] * transition_parameters.get((prev_tag, tag), 0) * emission_parameters.get((sentence[t], tag), 0)

                if prob > max_prob:
                    max_prob = prob
                    best_prev_tag = prev_tag

            viterbi[t][tag] = max_prob
            backpointers[t][tag] = best_prev_tag

    # Termination step: End probabilities
    max_prob = 0
    best_last_tag = None

    for tag in tags:
        prob = viterbi[n - 1][tag] * transition_parameters.get((tag, "STOP"), 0)

        if prob > max_prob:
            max_prob = prob
            best_last_tag = tag

    # Backtrace to find the best tag sequence
    best_tag_sequence = [best_last_tag]
    for t in range(n - 1, 0, -1):
        best_tag_sequence.insert(0, backpointers[t][best_tag_sequence[0]])

    return best_tag_sequence


In [190]:
def viterbi(sentence, emission_params, transition_params):
    tags = list(emission_params["#UNK#"].keys())  # List of tags from the #UNK# emission probabilities
    n = len(sentence)

    # Initialize the Viterbi matrix and backpointers
    viterbi_matrix = [{tag: 0 for tag in tags} for _ in range(n)]
    backpointers = [{tag: None for tag in tags} for _ in range(n)]

    # Initialization step (t=0)
    for tag in tags:
        emission_prob = emission_params.get(sentence[0], emission_params["#UNK#"]).get(tag, 0)
        viterbi_matrix[0][tag] = transition_params[("START", tag)] * emission_prob

    # Recursion step (t > 0)
    for t in range(1, n):
        for tag in tags:
            emission_prob = emission_params.get(sentence[t], emission_params["#UNK#"]).get(tag, 0)
            max_score, prev_tag = max(
                [(viterbi_matrix[t-1][prev_tag] * transition_params.get((prev_tag, tag), 0) * emission_prob, prev_tag) for prev_tag in tags]
            )
            viterbi_matrix[t][tag] = max_score
            backpointers[t][tag] = prev_tag

    # Termination step (t=n)
    max_score, final_tag = max(
        [(viterbi_matrix[n-1][tag] * transition_params.get((tag, "STOP"), 0), tag) for tag in tags]
    )

    # Trace back the best path
    best_tags = [(sentence[-1], final_tag)]  # Start with the last word and its final tag
    for t in range(n-2, -1, -1):  # Updated to start from t=n-2 and go backward to t=0
        prev_word, prev_tag = best_tags[-1]
        best_tags.append((sentence[t], backpointers[t+1][prev_tag]))  # Store the word and its tag as a tuple

    best_tags.reverse()

    return best_tags


In [191]:

best_tags = viterbi(ES_devin, ES_train_emission_params,  ES_train_transition_params)
print(best_tags)

[('Plato', 'O'), ('degustación', 'O'), (':', 'O'), ('un', 'O'), ('poco', 'O'), ('abundante', 'O'), ('de', 'O'), ('más', 'O'), (',', 'O'), ('pero', 'O'), ('bien', 'O'), ('cocinado', 'O'), ('.', 'O'), ('', 'O'), ('restaurante', 'O'), ('excelente', 'O'), ('con', 'O'), ('carne', 'B-positive'), ('de', 'O'), ('alta', 'O'), ('calidad', 'O'), ('.', 'O'), ('', 'O'), ('Las', 'O'), ('posibilidades', 'O'), ('en', 'O'), ('el', 'O'), ('restaurante', 'O'), ('son', 'O'), ('fundamentalmente', 'B-negative'), ('tres', 'I-negative'), (';', 'O'), ('carta', 'O'), ('normal', 'O'), (',', 'O'), ('menú', 'B-positive'), ('degustacion', 'I-positive'), ('y', 'O'), ('una', 'O'), ('opción', 'O'), ('intermedia', 'O'), ('que', 'O'), ('es', 'O'), ('una', 'O'), ('selección', 'O'), ('de', 'O'), ('primeros', 'O'), ('y', 'O'), ('postres', 'B-positive'), ('y', 'O'), ('carta', 'O'), ('para', 'O'), ('el', 'O'), ('segundo', 'O'), ('.', 'O'), ('', 'B-positive'), ('No', 'I-positive'), ('perderse', 'I-positive'), ('el', 'O'), ('s

In [193]:
# Assuming best_tags is a list of tuples containing (word, tag) pairs

with open('Data/ES/dev.p2.out', 'w', encoding="utf-8") as f:
    lines = ['{} {}'.format(word, tag) for word, tag in best_tags]
    f.write('\n'.join(lines))
