# Part 1

### Imports for Part 1

In [129]:
from copy import deepcopy

### 1a and 1b: Function to estimate emission params using MLE

In [130]:
# Get emission counts from a given file list
def get_emission_counts (file):
    emission_count = {} # Stores the count of tag corresponding to each work
    # Returns a dict of format {'Word1':{'O':count,'B-positive':count, ...}, 'Word2':{'O':count,'B-positive':count, ...}, ... }
    tag_count = {'O':0,'B-positive':0,'B-neutral':0,'B-negative':0,'I-positive':0,'I-neutral':0,'I-negative':0} # Stores the total count of each tag
    new_count = {'O':0,'B-positive':0,'B-neutral':0,'B-negative':0,'I-positive':0,'I-neutral':0,'I-negative':0} # Stores the counts for each line
    for line in file:   # [Word Tag\n] 
        if line != "\n":
            wordlist = line.split(" ")  # ["Word","Tag\n"] 
       
            word = ''.join([_ for _ in wordlist[:-1]])  # "Word"
            tag = wordlist[-1].strip() # Remove trailing \n  # "Tag"
            if word not in emission_count:
                emission_count[word] = deepcopy(new_count)
            emission_count[word][tag] += 1
            tag_count[tag] += 1
    return emission_count, tag_count

# Estimate the emission parameters using the given formula.
# k is used to include words not appearing in the training set.
def estimate_emission_params (file, k):
    emission_count, tag_count = get_emission_counts(file)
    emission_params = {}
    # Returns a dict of format {'Word1':{'O':param,'B-positive':param, ...}, 'Word2':{'O':param,'B-positive':param, ...}, ... }
    tag_prob = {'O':0,'B-positive':0,'B-neutral':0,'B-negative':0,'I-positive':0,'I-neutral':0,'I-negative':0} # Stores the tag probability of each word
    emission_params["#UNK#"] = deepcopy(tag_prob)
    for tag in tag_count.keys():
        den = tag_count[tag] + k
        for word in emission_count.keys():
            if word not in emission_params:
                emission_params[word] = deepcopy(tag_prob)
            num = 0
            num += emission_count[word][tag]
            emission_params[word][tag] = num/den
        emission_params["#UNK#"][tag] = k/den
    return emission_params

# Using the emission params obtained, perform simple sentiment analysis, returning a list containing the predicted outputs
def sentiment_analysis(file, emission_params):
    lines = []  # List containing "Word Tag\n"
    for line in file:
        add = ""
        word = line.strip()
        if line != "\n":
            if word not in emission_params:
                assigned_tag = max(emission_params["#UNK#"],key=emission_params["#UNK#"].get)
            else:
                assigned_tag = max(emission_params[word],key=emission_params[word].get)
            add = word + " " + assigned_tag
        lines.append(add)
    return lines


### Getting emission params

In [131]:

# Reading lines from files
with open('Data/ES/train', 'r', encoding="utf-8") as f:
    ES_train = f.readlines()
with open('Data/ES/dev.in', 'r', encoding="utf-8") as f:
    ES_devin = f.readlines()
with open('Data/ES/dev.out', 'r', encoding="utf-8") as f:
    ES_devout = f.readlines()
with open('Data/RU/train', 'r', encoding="utf-8") as f:
    RU_train = f.readlines()
with open('Data/RU/dev.in', 'r', encoding="utf-8") as f:
    RU_devin = f.readlines()
with open('Data/RU/dev.out', 'r', encoding="utf-8") as f:
    RU_devout = f.readlines()

# Estimating Emission Params
ES_train_emission_params = estimate_emission_params (ES_train,1)
RU_train_emission_params = estimate_emission_params (RU_train,1)


emission_count, tag_count = get_emission_counts(ES_train)
print(emission_count[":"])
print(tag_count)
print(ES_train_emission_params[":"])
print(RU_train_emission_params)

{'O': 68, 'B-positive': 0, 'B-neutral': 0, 'B-negative': 0, 'I-positive': 0, 'I-neutral': 0, 'I-negative': 0}
{'O': 29035, 'B-positive': 1160, 'B-neutral': 72, 'B-negative': 381, 'I-positive': 314, 'I-neutral': 43, 'I-negative': 171}
{'O': 0.00234192037470726, 'B-positive': 0.0, 'B-neutral': 0.0, 'B-negative': 0.0, 'I-positive': 0.0, 'I-neutral': 0.0, 'I-negative': 0.0}
{'#UNK#': {'O': 2.467490808596738e-05, 'B-positive': 0.0005382131324004305, 'B-neutral': 0.004807692307692308, 'B-negative': 0.0022471910112359553, 'I-positive': 0.0016722408026755853, 'I-neutral': 0.014492753623188406, 'I-negative': 0.007042253521126761}, 'Еда': {'O': 2.467490808596738e-05, 'B-positive': 0.007534983853606028, 'B-neutral': 0.009615384615384616, 'B-negative': 0.0044943820224719105, 'I-positive': 0.0, 'I-neutral': 0.0, 'I-negative': 0.0}, 'вкусная': {'O': 0.0008882966910948257, 'B-positive': 0.0, 'B-neutral': 0.0, 'B-negative': 0.0, 'I-positive': 0.0, 'I-neutral': 0.0, 'I-negative': 0.0}, ',': {'O': 0.091

### Performing Sentiment Analysis and Writing to files

In [132]:
# Performing Sentiment analysis, returning lists of lines to be added
ES_devout_lines = sentiment_analysis(ES_devin,ES_train_emission_params)
RU_devout_lines = sentiment_analysis(RU_devin,RU_train_emission_params)

# Writing to Files
with open('Data/RU/dev.p1.out', 'w', encoding="utf-8") as f:
   f.write('\n'.join(RU_devout_lines))

with open('Data/ES/dev.p1.out', 'w', encoding="utf-8") as f:
   f.write('\n'.join(ES_devout_lines))

### Obtaining precision, recall, and F scores

In [133]:
# Reading lines from dev.p1.out files
with open('Data/ES/dev.p1.out', 'r', encoding="utf-8") as f:
    ES_p1_devout = f.readlines()
with open('Data/RU/dev.p1.out', 'r', encoding="utf-8") as f:
    RU_p1_devout = f.readlines()

a,b = get_emission_counts(ES_p1_devout)
print(b)

# We then run the eval script provided to obtain the different scores

{'O': 2622, 'B-positive': 169, 'B-neutral': 77, 'B-negative': 166, 'I-positive': 391, 'I-neutral': 764, 'I-negative': 123}


### Results for ES
#Entity in gold data: 229

#Entity in prediction: 1466

#Correct Entity : 178
- Entity  precision: 0.1214
- Entity  recall: 0.7773
- Entity  F: 0.2100

#Correct Sentiment : 97
- Sentiment  precision: 0.0662
- Sentiment  recall: 0.4236
- Sentiment  F: 0.1145

### Results for RU
#Entity in gold data: 389

#Entity in prediction: 1816

#Correct Entity : 266
- Entity  precision: 0.1465
- Entity  recall: 0.6838
- Entity  F: 0.2413

#Correct Sentiment : 129
- Sentiment  precision: 0.0710
- Sentiment  recall: 0.33

### Q2
    

In [134]:
import pandas as pd 
import numpy as np

In [157]:
def estimate_transition_parameters(training_data):
    tag_transition_count = {}
    tag_count = {}

    previous_tag = "START"
    for line in training_data:
           if line != "\n":

                wordlist = line.split(' ')    
                tag = wordlist[-1].strip()
                tag_transition_count[(previous_tag, tag)] = tag_transition_count.get((previous_tag, tag), 0) + 1
                tag_count[previous_tag] = tag_count.get(previous_tag, 0) + 1
                previous_tag = tag
    
    # Consider special cases: q(STOP|yn) and q(y1|START)
    tag_transition_count[("STOP", "START")] = 0
    tag_count["STOP"] = 0
    tag_count["START"] = len(training_data)

    transition_parameters = {}
    for (prev_tag, tag), count in tag_transition_count.items():
        transition_parameters[(prev_tag, tag)] = 0 if tag_count[prev_tag] == 0 else  count / tag_count[prev_tag]

    return transition_parameters


In [136]:
ES_train_transition_params = estimate_transition_parameters (ES_train)
RU_train_transition_params = estimate_transition_parameters (RU_train)


In [147]:
# ES_devin = [word.strip() for word in ES_devin]
# print(ES_devin)
print(ES_train_transition_params)
print(RU_train_transition_params)
print(ES_train_emission_params)
print(RU_train_emission_params)


{('START', 'O'): 3.027275754548482e-05, ('O', 'O'): 0.9445822139560516, ('O', 'B-positive'): 0.03984983123234828, ('B-positive', 'O'): 0.8801724137931034, ('O', 'B-negative'): 0.0131225459805745, ('B-negative', 'O'): 0.821522309711286, ('O', 'B-neutral'): 0.002445408831025694, ('B-neutral', 'I-neutral'): 0.20833333333333334, ('I-neutral', 'I-neutral'): 0.6511627906976745, ('I-neutral', 'O'): 0.3488372093023256, ('B-positive', 'I-positive'): 0.11637931034482758, ('I-positive', 'I-positive'): 0.5700636942675159, ('I-positive', 'O'): 0.4299363057324841, ('B-neutral', 'O'): 0.7916666666666666, ('B-negative', 'I-negative'): 0.1784776902887139, ('I-negative', 'O'): 0.39766081871345027, ('I-negative', 'I-negative'): 0.6023391812865497, ('B-positive', 'B-neutral'): 0.0008620689655172414, ('B-positive', 'B-positive'): 0.002586206896551724, ('STOP', 'START'): 0, ('START', 'B-positive'): 0, ('START', 'B-neutral'): 0, ('START', 'B-negative'): 0, ('START', 'I-positive'): 0, ('START', 'I-neutral'): 

In [154]:
def viterbi(sentence, emission_params, transition_params):
    tags = list(emission_params["#UNK#"].keys())  # List of tags from the #UNK# emission probabilities
    n = len(sentence)

    # Initialize the Viterbi matrix and backpointers
    viterbi_matrix = [{tag: 0 for tag in tags} for _ in range(n)]
    backpointers = [{tag: None for tag in tags} for _ in range(n)]

    # Initialization step (t=0)
    for tag in tags:
        
        emission_prob = emission_params.get(sentence[0], emission_params["#UNK#"]).get(tag)
        transition_params[("START", tag)] = transition_params.get(("START", tag), 0)
        viterbi_matrix[0][tag] = transition_params[("START", tag)] * emission_prob

    # Recursion step (t > 0)
    for t in range(1, n):
      
        for tag in tags:
            
            emission_prob = emission_params.get(sentence[t], emission_params["#UNK#"]).get(tag)
            max_score, prev_tag = max(
                [(viterbi_matrix[t-1][prev_tag] * transition_params.get((prev_tag, tag), 0) * emission_prob, prev_tag) for prev_tag in tags]
            )
            viterbi_matrix[t][tag] = max_score
            backpointers[t][tag] = prev_tag 

    # Termination step (t=n)
    max_score, final_tag = max(
        [(viterbi_matrix[n-1][tag] * transition_params.get((tag, "STOP"), 0), tag) for tag in tags]
    )

    # Trace back the best path
    best_tags = [(sentence[-1], final_tag)]  # Start with the last word and its final tag
    for t in range(n-2, -1, -1):  # Updated to start from t=n-2 and go backward to t=0
        prev_word, prev_tag = best_tags[-1]
      
        best_tags.append((sentence[t], backpointers[t+1][prev_tag]))  # Store the word and its tag as a tuple

    best_tags.reverse()

    best_tags = [ ('','') if word.strip() == '' else (word.strip(),tag) for word, tag in best_tags]

    return best_tags


In [155]:

best_tags_ES = viterbi(ES_devin, ES_train_emission_params,  ES_train_transition_params)
# print(best_tags)
best_tags_RU = viterbi(RU_devin, RU_train_emission_params,  RU_train_transition_params)
# print(best_tags_RU)


In [156]:
# Assuming best_tags is a list of tuples containing (word, tag) pairs

with open('Data/ES/dev.p2.out', 'w', encoding="utf-8") as f:
    lines = ['{} {}'.format(word, tag) for word, tag in best_tags]
    f.write('\n'.join(lines))

with open('Data/RU/dev.p2.out', 'w', encoding="utf-8") as f:
    lines = ['{} {}'.format(word, tag) for word, tag in best_tags_RU]
    f.write('\n'.join(lines))



### ES
#Entity in gold data: 229
#Entity in prediction: 12

#Correct Entity : 2
Entity  precision: 0.1667
Entity  recall: 0.0087
Entity  F: 0.0166

#Correct Sentiment : 1
Sentiment  precision: 0.0833
Sentiment  recall: 0.0044
Sentiment  F: 0.0083

### RU

#Entity in gold data: 389
#Entity in prediction: 9

#Correct Entity : 1
Entity  precision: 0.1111
Entity  recall: 0.0026
Entity  F: 0.0050

#Correct Sentiment : 1
Sentiment  precision: 0.1111
Sentiment  recall: 0.0026
Sentiment  F: 0.0050