# PREPROCESSING

In [1]:
# import ElementTree to parse the XML training corpus
import xml.etree.ElementTree as ET

In [2]:
# Collect every XML file in the Train-corpus folder.
# glob.glob() returns its matches in arbitrary, filesystem-dependent order, so
# the list is sorted to make the order in which the files are processed (and
# the line order of anything written from them) reproducible between runs.
# NOTE(review): the directory in the path is spelled "Train-corups" -- confirm
# that this matches the folder name on disk.
import glob
path = '/home/kaustubh/Dropbox/Sem5/AI/Assignments/Train-corups/*.xml'
files = sorted(glob.glob(path))

In [3]:
# Write the cleaned data: one "word_TAG" line per <w> element of every file.
# The with-statement guarantees train_data.txt is flushed and closed even if
# one of the XML files fails to parse part-way through the loop.
with open("train_data.txt", "w+") as writer:

    # iterate over all files
    for file in files:

        # parse each XML file
        tree = ET.parse(file)
        root = tree.getroot()

        # get w elements in the file
        for word in root.iter('w'):
            # Element.text is None when the element has no text before its
            # first child; calling .strip() on it would raise AttributeError,
            # so such elements are skipped.
            if word.text is None:
                continue
            writer.write("%s_%s\n" % (word.text.strip(), word.attrib['pos'].strip()))

# WORD+TAG FREQUENCY COUNT

In [4]:
# Count how many times each "word_TAG" line occurs in the cleaned data file.
freq = {}

# The with-statement closes the file even if reading raises part-way through.
with open("train_data.txt", "r") as reader:
    for line in reader:
        word_tag = line.strip()     # strip once; removes the trailing newline
        freq[word_tag] = freq.get(word_tag, 0) + 1

In [5]:
# display the mapping from each "word_TAG" string to its number of occurrences
freq

{'USING_VERB': 3,
 'THE_ART': 219,
 'RULES_SUBST': 1,
 'TO_PREP': 49,
 'HELP_VERB': 5,
 'YOU_PRON': 26,
 'WIN_VERB': 9,
 'It_PRON': 2123,
 'is_VERB': 10777,
 'surprising_ADJ': 33,
 'how_ADV': 822,
 'many_ADJ': 693,
 'people_SUBST': 1015,
 'take_VERB': 583,
 'part_SUBST': 464,
 'in_PREP': 14009,
 'competition_SUBST': 161,
 'without_PREP': 463,
 'knowing_VERB': 67,
 'the_ART': 46249,
 'rules_SUBST': 63,
 'even_ADV': 833,
 'at_PREP': 4222,
 'élite_UNC': 24,
 'level_SUBST': 156,
 'This_ADJ': 1175,
 'chapter_SUBST': 49,
 'will_VERB': 2500,
 'examine_VERB': 14,
 'of_PREP': 25860,
 'from_PREP': 3561,
 'competitors_SUBST': 27,
 "'_UNC": 431,
 'point_SUBST': 345,
 'view_SUBST': 206,
 'rather_ADV': 390,
 'than_CONJ': 1214,
 'referees_SUBST': 11,
 'show_VERB': 136,
 'a_ART': 19068,
 'knowledge_SUBST': 174,
 'can_VERB': 2719,
 'be_VERB': 6167,
 'used_VERB': 432,
 'to_PREP': 23319,
 'help_VERB': 292,
 'your_PRON': 2218,
 'tournament_SUBST': 41,
 'training_SUBST': 339,
 'by_PREP': 3827,
 'outlining_

# WORD AND TAG DISTRIBUTION IN THE CORPUS

In [6]:
# Tally how often each word and each tag occurs in the cleaned data file.
word_f = {}
tag_f = {}

# The with-statement closes the file even if a line fails to parse.
with open("train_data.txt", "r") as reader:
    for line in reader:
        # Each line has the form "<word>_<tag>". Split on the LAST underscore
        # only, so a word that itself contains '_' stays in one piece and the
        # text after the final '_' is always taken as the tag.
        components = line.rsplit("_", 1)
        tag = components[1].strip()     # drop the trailing newline
        word_f[components[0]] = word_f.get(components[0], 0) + 1     # update word frequency
        tag_f[tag] = tag_f.get(tag, 0) + 1       # update tag frequency

In [7]:
# Ten most frequent words: order the (word, count) pairs by ascending count,
# flip the list so the highest counts come first, then show the head.
sorted_word = list(word_f.items())
sorted_word.sort(key=lambda entry: entry[1])
sorted_word = sorted_word[::-1]
sorted_word[:10]

[('the', 46249),
 ('of', 25860),
 ('to', 23320),
 ('and', 22704),
 ('a', 19111),
 ('in', 14513),
 ('is', 10777),
 ('I', 8428),
 ('that', 7996),
 ('it', 7087)]

In [8]:
# Ten most frequent tags: order the (tag, count) pairs by ascending count,
# flip the list so the highest counts come first, then show the head.
sorted_tags = list(tag_f.items())
sorted_tags.sort(key=lambda entry: entry[1])
sorted_tags = sorted_tags[::-1]
sorted_tags[:10]

[('SUBST', 211526),
 ('VERB', 159864),
 ('PREP', 110905),
 ('ADJ', 98153),
 ('ART', 76658),
 ('PRON', 75307),
 ('ADV', 59998),
 ('CONJ', 51824),
 ('UNC', 6664),
 ('INTERJ', 1249)]

# COMPUTE PROBABILITIES

In [9]:
import numpy as np