# Part 1

### Imports for Part 1

In [None]:
from copy import deepcopy

### 1a and 1b: Function to estimate emission params using MLE

In [None]:
# Get emission counts from a given file list
def get_emission_counts (file):
    emission_count = {} # Stores the count of tag corresponding to each work
    # Returns a dict of format {'Word1':{'O':count,'B-positive':count, ...}, 'Word2':{'O':count,'B-positive':count, ...}, ... }
    tag_count = {'O':0,'B-positive':0,'B-neutral':0,'B-negative':0,'I-positive':0,'I-neutral':0,'I-negative':0} # Stores the total count of each tag
    new_count = {'O':0,'B-positive':0,'B-neutral':0,'B-negative':0,'I-positive':0,'I-neutral':0,'I-negative':0} # Stores the counts for each line
    for line in file:   # [Word Tag\n] 
        if line != "\n":
            wordlist = line.split(" ")  # ["Word","Tag\n"] 
       
            word = ''.join([_ for _ in wordlist[:-1]])  # "Word"
            tag = wordlist[-1].strip() # Remove trailing \n  # "Tag"
            if word not in emission_count:
                emission_count[word] = deepcopy(new_count)
            emission_count[word][tag] += 1
            tag_count[tag] += 1
    return emission_count, tag_count

# Estimate the emission parameters using the given formula.
# k is used to include words not appearing in the training set.
def estimate_emission_params (file, k):
    emission_count, tag_count = get_emission_counts(file)
    emission_params = {}
    # Returns a dict of format {'Word1':{'O':param,'B-positive':param, ...}, 'Word2':{'O':param,'B-positive':param, ...}, ... }
    tag_prob = {'O':0,'B-positive':0,'B-neutral':0,'B-negative':0,'I-positive':0,'I-neutral':0,'I-negative':0} # Stores the tag probability of each word
    emission_params["#UNK#"] = deepcopy(tag_prob)
    for tag in tag_count.keys():
        den = tag_count[tag] + k
        for word in emission_count.keys():
            if word not in emission_params:
                emission_params[word] = deepcopy(tag_prob)
            num = 0
            num += emission_count[word][tag]
            emission_params[word][tag] = num/den
        emission_params["#UNK#"][tag] = k/den
    return emission_params

# Using the emission params obtained, perform simple sentiment analysis, returning a list containing the predicted outputs
def sentiment_analysis(file, emission_params):
    lines = []  # List containing "Word Tag\n"
    for line in file:
        add = ""
        word = line.strip()
        if line != "\n":
            if word not in emission_params:
                assigned_tag = max(emission_params["#UNK#"],key=emission_params["#UNK#"].get)
            else:
                assigned_tag = max(emission_params[word],key=emission_params[word].get)
            add = word + " " + assigned_tag
        lines.append(add)
    return lines


### Getting emission params

In [None]:

# Reading lines from files
with open('Data/ES/train', 'r', encoding="utf-8") as f:
    ES_train = f.readlines()
with open('Data/ES/dev.in', 'r', encoding="utf-8") as f:
    ES_devin = f.readlines()
with open('Data/ES/dev.out', 'r', encoding="utf-8") as f:
    ES_devout = f.readlines()
with open('Data/RU/train', 'r', encoding="utf-8") as f:
    RU_train = f.readlines()
with open('Data/RU/dev.in', 'r', encoding="utf-8") as f:
    RU_devin = f.readlines()
with open('Data/RU/dev.out', 'r', encoding="utf-8") as f:
    RU_devout = f.readlines()

# Estimating Emission Params
ES_train_emission_params = estimate_emission_params (ES_train,1)
RU_train_emission_params = estimate_emission_params (RU_train,1)


emission_count, tag_count = get_emission_counts(ES_train)
print(emission_count[":"])
print(tag_count)
print(ES_train_emission_params[":"])
print(RU_train_emission_params)

### Performing Sentiment Analysis and Writing to files

In [None]:
# Performing Sentiment analysis, returning lists of lines to be added
ES_devout_lines = sentiment_analysis(ES_devin,ES_train_emission_params)
RU_devout_lines = sentiment_analysis(RU_devin,RU_train_emission_params)

# Writing to Files
with open('Data/RU/dev.p1.out', 'w', encoding="utf-8") as f:
   f.write('\n'.join(RU_devout_lines))

with open('Data/ES/dev.p1.out', 'w', encoding="utf-8") as f:
   f.write('\n'.join(ES_devout_lines))

### Obtaining precision, recall, and F scores

In [None]:
# Reading lines from dev.p1.out files
with open('Data/ES/dev.p1.out', 'r', encoding="utf-8") as f:
    ES_p1_devout = f.readlines()
with open('Data/RU/dev.p1.out', 'r', encoding="utf-8") as f:
    RU_p1_devout = f.readlines()

a,b = get_emission_counts(ES_p1_devout)


# We then run the eval script provided to obtain the different scores

In [None]:
import subprocess

command = [
    "python",
    "C:\\Users\\user\\Documents\\SUTD\\term 5\\ml\\Machinelearning\\projet\\evalResult.py",
    "C:\\Users\\user\\Documents\\SUTD\\term 5\\ml\\Machinelearning\\Data\\RU\\dev.out",
    "C:\\Users\\user\\Documents\\SUTD\\term 5\\ml\\Machinelearning\\Data\\RU\\dev.p1.out",
]
command1 = [
    "python",
    "C:\\Users\\user\\Documents\\SUTD\\term 5\\ml\\Machinelearning\\projet\\evalResult.py",
    "C:\\Users\\user\\Documents\\SUTD\\term 5\\ml\\Machinelearning\\Data\\ES\\dev.out",
    "C:\\Users\\user\\Documents\\SUTD\\term 5\\ml\\Machinelearning\\Data\\ES\\dev.p1.out",
]

result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
result1 = subprocess.run(command1, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

print('RU:\n' + result.stdout)
print('ES:\n' + result1.stdout)
if result.stderr:
    print("Error:", result.stderr)
if result.stderr:
    print("Error:", result1.stderr)

### Q2
    

In [None]:
import pandas as pd 
import numpy as np

In [None]:
def estimate_transition_parameters(training_data):
    tag_transition_count = {}
    tag_count = {}

    previous_tag = "START"
    for line in training_data + ['\n']:  # Adding an extra newline to process the last sentence
        if line.strip() == '':
            # Handle end of sentence
            tag_transition_count[(previous_tag, "END")] = tag_transition_count.get((previous_tag, "END"), 0) + 1
            tag_count["END"] = tag_count.get("END", 0) + 1
            previous_tag = "START"
        else:
            tag = line.split(' ')[-1]
            tag = tag.strip()
            tag_transition_count[(previous_tag, tag)] = tag_transition_count.get((previous_tag, tag), 0) + 1
            tag_count[previous_tag] = tag_count.get(previous_tag, 0) + 1
            previous_tag = tag

    tag_count["START"] = len([line for line in training_data if line.strip() == '']) + 1

    transition_parameters = {}
    for (prev_tag, tag), count in tag_transition_count.items():
        transition_parameters[(prev_tag, tag)] = 0 if tag_count[prev_tag] == 0 else count / tag_count[prev_tag]

    return transition_parameters


In [None]:
ES_train_transition_params= estimate_transition_parameters (ES_train)
RU_train_transition_params = estimate_transition_parameters (RU_train)
# print(ES_train_transition_params)
# print(RU_train_emission_params)

In [None]:
# print(ES_devin)
# print(ES_train_transition_params)
# print(RU_train_transition_params)
# print(ES_train_emission_params)
# print(RU_train_emission_params)


In [None]:
def viterbi(words, emission_params, transition_params):
    all_best_tags = []
    sentence = []
    for word in words + ['\n']:  # Adding an extra newline to process the last sentence
        if word.strip() == '':  # Sentence separator
            if sentence:  # If there are words in the sentence
                tags = list(emission_params["#UNK#"].keys())
                n = len(sentence)

                viterbi_matrix = [{tag: 0 for tag in tags} for _ in range(n+1)]  # Add 1 for START
                backpointers = [{tag: None for tag in tags} for _ in range(n+1)]

                # Initialization step (t=0)
                for tag in tags:
                    viterbi_matrix[0][tag] = transition_params.get(("START", tag), 0) * emission_params.get(sentence[0].strip(), emission_params["#UNK#"]).get(tag, 0)

                # Recursion step (t > 0)
                for t in range(1, n):
                    for tag in tags:
                        emission_prob = emission_params.get(sentence[t].strip(), emission_params["#UNK#"]).get(tag, 0)
                        max_score, prev_tag = max(
                            [(viterbi_matrix[t-1][prev_tag] * transition_params.get((prev_tag, tag), 0) * emission_prob, prev_tag) for prev_tag in tags]
                        )
                        viterbi_matrix[t][tag] = max_score
                        backpointers[t][tag] = prev_tag

                # Termination step (t=n)
                max_score, final_tag = max(
                    [(viterbi_matrix[n-1][tag] * transition_params.get((tag, "END"), 0), tag) for tag in tags]
                )

                # Trace back the best path
                best_tags = []  # Start with an empty line
                best_tags.append((sentence[-1].strip(), final_tag))
                for t in range(n-1, 0, -1):
                    prev_word, prev_tag = best_tags[-1]
                    best_tags.append((sentence[t-1].strip(), backpointers[t][prev_tag]))

                best_tags.append(('', ''))  # End with an empty line
                best_tags.reverse()
                all_best_tags.append(best_tags)

                sentence = []  # Reset the sentence
        else:
            sentence.append(word)

    return all_best_tags


In [None]:

best_tags_ES = sum(viterbi(ES_devin, ES_train_emission_params,  ES_train_transition_params), [])
print(best_tags_ES)
best_tags_RU = sum(viterbi(RU_devin, RU_train_emission_params,  RU_train_transition_params), [])
print(best_tags_RU)


In [None]:
# Assuming best_tags is a list of tuples containing (word, tag) pairs

with open('Data/ES/dev.p2.out', 'w', encoding="utf-8") as f:
    lines = ['{} {}'.format(word, tag) for word, tag in best_tags_ES]
    f.write('\n'.join(lines))

with open('Data/RU/dev.p2.out', 'w', encoding="utf-8") as f:
    lines = ['{} {}'.format(word, tag) for word, tag in best_tags_RU]
    f.write('\n'.join(lines))



In [15]:
import subprocess

command = [
    "python",
    "C:\\Users\\user\\Documents\\SUTD\\term 5\\ml\\Machinelearning\\projet\\evalResult.py",
    "C:\\Users\\user\\Documents\\SUTD\\term 5\\ml\\Machinelearning\\Data\\RU\\dev.out",
    "C:\\Users\\user\\Documents\\SUTD\\term 5\\ml\\Machinelearning\\Data\\RU\\dev.p2.out",
]
command1 = [
    "python",
    "C:\\Users\\user\\Documents\\SUTD\\term 5\\ml\\Machinelearning\\projet\\evalResult.py",
    "C:\\Users\\user\\Documents\\SUTD\\term 5\\ml\\Machinelearning\\Data\\ES\\dev.out",
    "C:\\Users\\user\\Documents\\SUTD\\term 5\\ml\\Machinelearning\\Data\\ES\\dev.p2.out",
]

result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
result1 = subprocess.run(command1, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

print('RU:\n' + result.stdout)
print('ES:\n' + result1.stdout)
if result.stderr:
    print("Error:", result.stderr)
if result.stderr:
    print("Error:", result1.stderr)


RU:

#Entity in gold data: 389
#Entity in prediction: 478

#Correct Entity : 18
Entity  precision: 0.0377
Entity  recall: 0.0463
Entity  F: 0.0415

#Correct Sentiment : 12
Sentiment  precision: 0.0251
Sentiment  recall: 0.0308
Sentiment  F: 0.0277

ES:

#Entity in gold data: 229
#Entity in prediction: 306

#Correct Entity : 11
Entity  precision: 0.0359
Entity  recall: 0.0480
Entity  F: 0.0411

#Correct Sentiment : 6
Sentiment  precision: 0.0196
Sentiment  recall: 0.0262
Sentiment  F: 0.0224

