In [None]:
import re
import numpy as np
import math
from collections import defaultdict

In [None]:
class NaiveBayesWSD:
    """Multinomial Naive Bayes classifier for word-sense disambiguation.

    Contexts are lower-cased and tokenised with the regex ``\\w+``.  A sense is
    scored as ``log P(sense) + sum(log P(word | sense))`` over the context
    words that were seen during training, where ``P(word | sense)`` uses
    add-one smoothing over the training vocabulary.
    """

    def __init__(self):
        # sense -> P(sense); refreshed at the end of every train() call
        self.priors = defaultdict(float)
        # word -> sense -> occurrences of the word in that sense's contexts
        self.word_counts = defaultdict(lambda: defaultdict(float))
        # sense -> number of training examples labelled with that sense
        self.sense_counts = defaultdict(float)
        # sense -> total number of word tokens in that sense's contexts
        self.sense_token_counts = defaultdict(float)

    def train(self, training_data):
        """Accumulate counts from ``(context, sense)`` pairs and update priors.

        Args:
            training_data: iterable of ``(context_string, sense_label)`` pairs.

        Counts accumulate across calls, so calling ``train`` again adds to the
        existing model rather than replacing it.
        """
        for context, sense in training_data:
            self.sense_counts[sense] += 1
            words = re.findall(r'\w+', context.lower())
            for word in words:
                self.word_counts[word][sense] += 1
            self.sense_token_counts[sense] += len(words)

        # Normalise by everything seen so far (not just this call's length) so
        # the priors still sum to 1 after incremental training.
        total_examples = sum(self.sense_counts.values())
        for sense, count in self.sense_counts.items():
            self.priors[sense] = count / total_examples

    def classify(self, context):
        """Return the most probable sense label for ``context``.

        Words that never occurred in training are ignored.  Ties go to the
        sense that was seen first during training.

        Raises:
            ValueError: if the classifier has not been trained.
        """
        if not self.sense_counts:
            raise ValueError("classify() called before train(): no senses are known")

        # Membership test on the outer dict does not create entries.
        known_words = [
            word
            for word in re.findall(r'\w+', context.lower())
            if word in self.word_counts
        ]
        vocabulary_size = len(self.word_counts)

        sense_probabilities = {}
        for sense in self.sense_counts:
            sense_prob = math.log(self.priors[sense])
            # Add-one smoothing: every vocabulary word gets one pseudo-count.
            denominator = self.sense_token_counts.get(sense, 0.0) + vocabulary_size
            for word in known_words:
                # .get() avoids inserting a zero entry into the inner defaultdict.
                word_sense_count = self.word_counts[word].get(sense, 0.0)
                sense_prob += math.log((word_sense_count + 1) / denominator)
            sense_probabilities[sense] = sense_prob
        return max(sense_probabilities, key=sense_probabilities.get)

In [None]:
def convert_to_list_of_words(character_list):
    """Split markup text into a flat list of tag tokens and word tokens.

    Args:
        character_list: iterable of single characters (a ``str`` works).

    Returns:
        List of non-empty strings.  Everything between ``<`` and ``>`` becomes
        one token (spaces kept, brackets dropped).  Outside a tag, text is
        split on spaces and newlines.

    A newline outside a tag ends the current word, so the last word of one
    line is not glued to the first word of the next.  A newline inside a tag
    is still dropped without splitting the tag token.
    """
    tokens = []
    current = []  # characters of the token being built
    inside_brackets = False

    def flush():
        # Emit the pending token, if any, and start a new one.
        if current:
            tokens.append(''.join(current))
            current.clear()

    for char in character_list:
        if char == '<':
            flush()
            inside_brackets = True
        elif char == '>':
            flush()
            inside_brackets = False
        elif char == '\n':
            if not inside_brackets:
                flush()
        elif char == ' ' and not inside_brackets:
            flush()
        else:
            current.append(char)

    flush()  # text after the last separator is still a token
    return tokens

In [None]:
def clump_context_words(word_list):
    """Collapse each ``context`` ... ``/context`` run into one space-joined string.

    Tokens outside a context block are passed through unchanged.  A second
    ``context`` token before the closing one discards what was collected so
    far; a ``/context`` with no open block is dropped; a block that is never
    closed contributes nothing to the result.
    """
    result = []
    pending = None  # None while outside a context block, else the collected tokens

    for token in word_list:
        if token == 'context':
            pending = []
        elif token == '/context':
            if pending is not None:
                result.append(" ".join(pending))
                pending = None
        elif pending is None:
            result.append(token)
        else:
            pending.append(token)

    return result


In [None]:
def organize_instances(word_list):
    """Group tokens into one list per ``/instance``-terminated run.

    A token equal to ``'instance'`` discards whatever has been collected so
    far.  A ``'/instance'`` token closes the current group, which is kept only
    if it is non-empty.  Tokens after the last ``'/instance'`` are dropped.
    """
    groups = []
    pending = []

    for token in word_list:
        if token == '/instance':
            if pending:
                groups.append(pending)
                pending = []
        elif token == 'instance':
            pending = []
        else:
            pending.append(token)

    return groups


In [None]:
def pre_process(text):
    """Turn raw corpus text into a list of ``(context, sense_id)`` pairs.

    The text is tokenised, context blocks are clumped into single strings and
    the tokens are grouped per instance.  For every instance, element 2 is
    taken as the context and element 1 as the answer tag.

    NOTE(review): this assumes every instance group is laid out as
    [instance tag, answer tag, context] and that the third whitespace-separated
    field of the answer tag looks like ``senseid="..."/`` -- confirm against
    the corpus file.
    """
    tokens = convert_to_list_of_words(text)
    instances = organize_instances(clump_context_words(tokens))

    input_data = []
    for instance in instances:
        # Drop the head markers but keep the head word itself.
        kept_words = [
            word for word in instance[2].split()
            if word not in ('head', '/head')
        ]
        answer_fields = instance[1].split()
        # Strip the last 2 and the first 9 characters of the third field.
        sense_id = answer_fields[2][:-2][9:]
        input_data.append((' '.join(kept_words), sense_id))
    return input_data

In [None]:
import re
import math
from collections import defaultdict
# Path of the corpus file that the following cell opens and parses.
filename = 'plant.wsd'

In [None]:
filename = 'plant.wsd'
# Read the corpus inside a context manager so the file handle is closed as
# soon as the text is in memory.
with open(filename, 'r') as corpus_file:
    text = corpus_file.read()
# List of (context, sense_id) pairs, one per parsed instance.
input_data = pre_process(text)

In [None]:
# Fit a classifier on every parsed (context, sense_id) pair.
wsd = NaiveBayesWSD()
wsd.train(input_data)

In [None]:
def divide_data_into_folds(data, num_folds):
    """Cut ``data`` into ``num_folds`` consecutive slices.

    Every fold holds ``len(data) // num_folds`` items except the last one,
    which also receives the remainder.  When there are fewer items than
    folds, all folds but the last are empty.
    """
    total = len(data)
    base_size = total // num_folds
    last_index = num_folds - 1

    return [
        data[index * base_size:(total if index == last_index else (index + 1) * base_size)]
        for index in range(num_folds)
    ]

In [None]:
# Re-read the corpus; the context manager closes the file handle afterwards.
with open('plant.wsd', 'r') as corpus_file:
    text = corpus_file.read()

In [None]:
# Show the first parsed (context, sense_id) pair as a quick sanity check.
print(pre_process(text)[0])

In [None]:
num_folds = 5
# Split the parsed (context, sense_id) pairs.  This cell used to read
# `training_data` before anything had assigned it, which fails on a fresh
# kernel, and it printed the entire dataset.
folds = divide_data_into_folds(input_data, num_folds)
accuracy_list = []

for i in range(num_folds):
    # Use one fold as the test data and the rest as training data
    test_data = folds[i]
    # Flatten the other folds into a single list of examples; the held-out
    # fold must not leak into training.
    training_data = [
        example
        for j, fold in enumerate(folds)
        if j != i
        for example in fold
    ]

    if not test_data:
        # Accuracy is undefined for an empty fold; skip it rather than divide by zero.
        print(f"Fold {i + 1} skipped: no test examples")
        continue

    nb_wsd = NaiveBayesWSD()
    nb_wsd.train(training_data)

    correct_predictions = sum(
        1 for context, sense in test_data if nb_wsd.classify(context) == sense
    )
    total_predictions = len(test_data)

    accuracy = correct_predictions / total_predictions
    accuracy_list.append(accuracy)

    print(f"Fold {i + 1} Accuracy: {accuracy:.2%}")

if accuracy_list:
    # Average over the folds that were actually evaluated.
    average_accuracy = sum(accuracy_list) / len(accuracy_list)
    print(f"Average Accuracy: {average_accuracy:.2%}")
else:
    average_accuracy = None
    print("Average Accuracy: n/a (no fold had test examples)")