In [71]:
# import libraries

from bs4 import BeautifulSoup
import re
import sys
from nltk.util import ngrams
from nltk import ConditionalFreqDist
from nltk.stem.snowball import SnowballStemmer
from nltk.probability import ConditionalProbDist, ELEProbDist
from collections import defaultdict

In [62]:
# Function to process XML files using BeautifulSoup

def process_xml_file(file_path):
    with open(file_path, 'r') as f:
        content = f.read()
    soup = BeautifulSoup(content, 'xml')
    return soup

### CHANGE TO YOUR PATH ###

desktop_path = '/Users/viruthika/Desktop'
file1_path = os.path.join(desktop_path, 'line-test.xml')
file2_path = os.path.join(desktop_path, 'line-train.xml')

soup1 = process_xml_file(file1_path)
soup2 = process_xml_file(file2_path)

In [63]:
# Preprocess the train and test data by converting to lowercase, standardizing plural lines, 
# and removing special characters

def preprocess_data(soup):
    instances = soup.find_all('instance')
    data_dict = {}
    
    for instance in instances:
        instance_id = instance['id']
        answer = instance.find('answer')
        if answer:
            sense = answer['senseid']
        else:
            sense = " "
        # try context = instance.get_text() if below doesn't work # 
        context = instance.context.get_text()
        
        context = context.lower()
        context = re.sub(r'[^\w\s]', ' ', context)
        
        tokens = context.split()
        stemmer = SnowballStemmer('english')
        stemmed_tokens = [stemmer.stem(token) for token in tokens]
        context = ' '.join(stemmed_tokens)
        
#3 Create a dictionary with id, sense, and list of tokens to use for further operations        
        data_dict[instance_id] = {'sense': sense, 'tokens': stemmed_tokens}
        
    return data_dict


In [64]:
# Function to create decision list

def create_decision_list(train_data):
    cfdist = ConditionalFreqDist()
    
    #iterate over data and update cfdist
    for instance_id, data in train_data.items():
        sense = data['sense']
        tokens = data['tokens']
        
        # window size two words to the left, one word to the right
        for i, token in enumerate(tokens):
            if token == 'line':
                left_tokens = tokens[i - 2:i] if i >= 2 else tokens[:1]
                right_tokens = tokens[i + 1:i + 2]
                
                for lt in left_tokens:
                    cfdist[sense][lt] += 1
                for rt in right_tokens:
                    cfdist[sense][rt] += 1
                    
    cpdist = ConditionalProbDist(cfdist, ELEProbDist, bins = 290)
    print(cpdist['run'].max)
    print(cpdist['run'].prob('NN'))
    
    pdists = {}
    for sense in cfdist:
        fd = cfdist[sense]
        pd = ELEProbDist(fd)
        pdists[sense] = pd
                          
    # print decision list
    for sense, freqdist in cfdist.items():
        print(f"Senses: {sense}")
        for word, freq in freqdist.items():
            print(f"\tWord: {word} Frequency: {freq}")
        
    return cfdist

In [65]:
# apply decision list to test data and write output to predicted_labels_file

def apply_decision_list(decision_list, test_data):
    predicted_labels = {}
    
    # iterate over test instances and predict label using decision tree
    for instance_id, data in test_data.items():
        tokens = data['tokens']
        label = None
        max_freq = 0
        
        # iterate over senes in dec. list and find the sense with highest freq
        for sense, freqdist in decision_list.items():
            freq = 0
            for token in tokens:
                freq += freqdist[token]
            if freq > max_freq:
                label = sense
                max_freq = freq
                
        predicted_labels[instance_id] = label
        
    with open(predicted_labels_file, 'w') as f:
        for instance_id, label in predicted_labels.items():
            f.write(instance_id + ' ' + label + '\n')
    print(predicted_labels)

In [67]:
# main function to call process_xml, preprocess_data, and apply_decision_list
# export decision list as text file

def main(train_file, test_file, decision_list_file, predicted_labels_file):
    train_soup = process_xml_file(train_file)
    test_soup = process_xml_file(test_file)
    
    train_data = preprocess_data(train_soup)
    test_data = preprocess_data(test_soup)
    
    decision_list = create_decision_list(train_data)
    predicted_labels = apply_decision_list(decision_list, test_data)
    
    with open(decision_list_file, 'w') as f:
        for instance_id, data in train_data.items():
            sense = data['sense']
            tokens = ' '.join(data['tokens'])
            
            f.write(f'<answer instance="{instance_id}" senseid="{sense}"/>\n')
            
# specifiy file names

"""if __name__ == '__main__':
    
    train_file = 'line-train.xml'
    test_file = 'line-test.xml'
    
    decision_list_file = 'my-line-answers.txt'
    predicted_labels_file = 'my-predicted-labels.txt'
    
    main(train_file, test_file, decision_list_file, predicted_labels_file)"""

if __name__ == '__main__':
    desktop_path = '/Users/viruthika/Desktop'

    train_file = os.path.join(desktop_path, 'line-train.xml')
    test_file = os.path.join(desktop_path, 'line-test.xml')
    decision_list_file = os.path.join(desktop_path, 'my-line-answers.txt')
    predicted_labels_file = os.path.join(desktop_path, 'my-predicted-labels.txt')

    main(train_file, test_file, decision_list_file, predicted_labels_file)

    

<bound method LidstoneProbDist.max of <ELEProbDist based on 0 samples>>
0.0034482758620689655
Senses: phone
	Word: 5 Frequency: 1
	Word: access Frequency: 19
	Word: growth Frequency: 3
	Word: 000 Frequency: 8
	Word: new Frequency: 13
	Word: to Frequency: 21
	Word: on Frequency: 26
	Word: the Frequency: 77
	Word: gab Frequency: 8
	Word: have Frequency: 2
	Word: telephon Frequency: 42
	Word: tip Frequency: 3
	Word: that Frequency: 9
	Word: trader Frequency: 1
	Word: thousand Frequency: 2
	Word: keep Frequency: 3
	Word: busi Frequency: 5
	Word: residenti Frequency: 2
	Word: increas Frequency: 4
	Word: cellular Frequency: 1
	Word: mobil Frequency: 1
	Word: grew Frequency: 5
	Word: distanc Frequency: 1
	Word: or Frequency: 3
	Word: dedic Frequency: 3
	Word: make Frequency: 2
	Word: privat Frequency: 7
	Word: and Frequency: 26
	Word: switch Frequency: 1
	Word: was Frequency: 12
	Word: an Frequency: 4
	Word: outsid Frequency: 2
	Word: up Frequency: 1
	Word: but Frequency: 3
	Word: largest Fre

In [70]:
# read key and return dict mapping instance ids to their correct sense
def read_key_file(key_file):
    key = {}
    with open(key_file, 'r') as f:
        for line in f: 
            answer, instance_id, sense = line.strip().split()[0], line.strip().split()[1], line.strip().split()[2]
            key[instance_id[10:-1]] = sense[9:-3]
    return key

# read tagged file and return dict mapping instance ids to their predicted sense
def read_tagged_file(tagged_file):
    tagged = {}
    with open(tagged_file, 'r') as f:
        for line in f:
            answer, instance_id, sense = line.strip().split()[0], line.strip().split()[1], line.strip().split()[2]
            tagged[instance_id[10:-1]] = sense[9:-3]
    return tagged

# compare key and tagged dicts and return overall accuracy and confusion matrix
def evaluate(key, tagged):
    confusion_matrix = defaultdict(lambda: defaultdict(int))
    correct = 0
    total = len(key)
    for instance_id, key_sense in key.items():
        tagged_sense = tagged.get(instance_id)
        if not tagged_sense:
            continue
        confusion_matrix[key_sense][tagged_sense] += 1
        if tagged_sense == key_sense:
            correct += 1
    return (correct / total), confusion_matrix

## general path ##
"""if __name__ == '__main__':
    tagged_file = 'my-line-answers.txt'
    key_file = '/Users/viruthika/Desktop/line-answers.txt' 
    
    key = read_key_file(key_file)
    tagged = read_tagged_file(tagged_file)
    
    accuracy, confusion_matrix = evaluate(key, tagged)
    print('Accuracy:', accuracy)
    print('Confusion matrix:')
    for true_label in confusion_matrix:
        row = confusion_matrix[true_label]
        print(true_label, end='\t')
    print()"""

#change as needed for your file path
if __name__ == '__main__':
    desktop_path = '/Users/viruthika/Desktop'

    train_file = os.path.join(desktop_path, 'line-train.xml')
    test_file = os.path.join(desktop_path, 'line-test.xml')
    decision_list_file = os.path.join(desktop_path, 'my-line-answers.txt')
    predicted_labels_file = os.path.join(desktop_path, 'my-predicted-labels.txt')

    main(train_file, test_file, decision_list_file, predicted_labels_file)

    tagged_file = os.path.join(desktop_path, 'my-line-answers.txt')
    key_file = os.path.join(desktop_path, 'line-answers.txt')

    key = read_key_file(key_file)
    tagged = read_tagged_file(tagged_file)

    accuracy, confusion_matrix = evaluate(key, tagged)
    print('Accuracy:', accuracy)
    print('Confusion matrix:')
    for true_label in confusion_matrix:
        row = confusion_matrix[true_label]
        print(true_label, end='\t')
    print()



<bound method LidstoneProbDist.max of <ELEProbDist based on 0 samples>>
0.0034482758620689655
Senses: phone
	Word: 5 Frequency: 1
	Word: access Frequency: 19
	Word: growth Frequency: 3
	Word: 000 Frequency: 8
	Word: new Frequency: 13
	Word: to Frequency: 21
	Word: on Frequency: 26
	Word: the Frequency: 77
	Word: gab Frequency: 8
	Word: have Frequency: 2
	Word: telephon Frequency: 42
	Word: tip Frequency: 3
	Word: that Frequency: 9
	Word: trader Frequency: 1
	Word: thousand Frequency: 2
	Word: keep Frequency: 3
	Word: busi Frequency: 5
	Word: residenti Frequency: 2
	Word: increas Frequency: 4
	Word: cellular Frequency: 1
	Word: mobil Frequency: 1
	Word: grew Frequency: 5
	Word: distanc Frequency: 1
	Word: or Frequency: 3
	Word: dedic Frequency: 3
	Word: make Frequency: 2
	Word: privat Frequency: 7
	Word: and Frequency: 26
	Word: switch Frequency: 1
	Word: was Frequency: 12
	Word: an Frequency: 4
	Word: outsid Frequency: 2
	Word: up Frequency: 1
	Word: but Frequency: 3
	Word: largest Fre