# PREPROCESSING

In [1]:
# import ElementTree to parse the XML training corpus
import xml.etree.ElementTree as ET

In [2]:
# get all files in the Train-corpus folder
import glob
# Glob pattern matching every XML file of the training corpus.
# NOTE(review): this is an absolute, machine-specific path, and the folder is
# spelled 'Train-corups' here -- confirm it matches the directory on disk.
path = '/home/kaustubh/Dropbox/Sem5/AI/Assignments/Train-corups/*.xml'

# glob.glob() gives no ordering guarantee (it depends on the file system), so
# sort the matches: the files are then processed in the same order on every
# run, which keeps dictionary insertion order and tie order in later rankings
# reproducible.
files = sorted(glob.glob(path))

In [3]:
# Flatten the corpus into "train_data.txt", one "<word>_<tag>" pair per line.
# The context manager closes the file even if parsing one of the XML files
# raises part-way through.
# NOTE(review): the file is opened with the platform default encoding, as are
# the cells that read it back; the corpus contains non-ASCII words.
with open("train_data.txt", "w+") as writer:

    # iterate over all files
    for file in files:

        # parse each XML file
        tree = ET.parse(file)
        root = tree.getroot()

        # visit every <w> element in the file, at any depth
        for word in root.iter('w'):
            text = word.text
            pos = word.get('pos')

            # An element with no text has text None, and a <w> without a
            # 'pos' attribute has no tag. Neither can form a word_tag pair,
            # so skip it instead of raising AttributeError / KeyError.
            if text is None or pos is None:
                continue

            writer.write("%s_%s\n" % (text.strip(), pos.strip()))

# WORD+TAG FREQUENCY COUNT

In [4]:
# Maps each "<word>_<tag>" line of train_data.txt to its number of occurrences.
freq = {}

# 'with' guarantees the file is closed even if reading fails part-way.
with open("train_data.txt", "r") as reader:
    for line in reader:
        pair = line.strip()      # strip once; removes the trailing newline
        freq[pair] = freq.get(pair, 0) + 1

In [5]:
# Display the word_tag -> count dictionary built in the previous cell.
# NOTE(review): this echoes the entire dictionary; consider previewing a slice.
freq

{'USING_VERB': 3,
 'THE_ART': 219,
 'RULES_SUBST': 1,
 'TO_PREP': 49,
 'HELP_VERB': 5,
 'YOU_PRON': 26,
 'WIN_VERB': 9,
 'It_PRON': 2123,
 'is_VERB': 10777,
 'surprising_ADJ': 33,
 'how_ADV': 822,
 'many_ADJ': 693,
 'people_SUBST': 1015,
 'take_VERB': 583,
 'part_SUBST': 464,
 'in_PREP': 14009,
 'competition_SUBST': 161,
 'without_PREP': 463,
 'knowing_VERB': 67,
 'the_ART': 46249,
 'rules_SUBST': 63,
 'even_ADV': 833,
 'at_PREP': 4222,
 'élite_UNC': 24,
 'level_SUBST': 156,
 'This_ADJ': 1175,
 'chapter_SUBST': 49,
 'will_VERB': 2500,
 'examine_VERB': 14,
 'of_PREP': 25860,
 'from_PREP': 3561,
 'competitors_SUBST': 27,
 "'_UNC": 431,
 'point_SUBST': 345,
 'view_SUBST': 206,
 'rather_ADV': 390,
 'than_CONJ': 1214,
 'referees_SUBST': 11,
 'show_VERB': 136,
 'a_ART': 19068,
 'knowledge_SUBST': 174,
 'can_VERB': 2719,
 'be_VERB': 6167,
 'used_VERB': 432,
 'to_PREP': 23319,
 'help_VERB': 292,
 'your_PRON': 2218,
 'tournament_SUBST': 41,
 'training_SUBST': 339,
 'by_PREP': 3827,
 'outlining_

# WORD AND TAG DISTRIBUTION IN THE CORPUS

In [6]:
# Occurrence counts over the whole training file:
#   word_f: word -> number of lines carrying that word
#   tag_f:  tag  -> number of lines carrying that tag
word_f = {}
tag_f = {}

# 'with' closes the file even if a malformed line raises below.
with open("train_data.txt", "r") as reader:
    for line in reader:
        # The word key is the text before the FIRST underscore. This is kept
        # as-is because the probability cell looks words up with
        # key.split("_")[0]; changing it here alone would break that lookup.
        # NOTE(review): a word that itself contains '_' is therefore truncated.
        word = line.split("_", 1)[0]

        # The tag is the text after the LAST underscore, so an underscore
        # inside the word can no longer be mistaken for the separator.
        # A line with no underscore still raises IndexError, as before.
        tag = line.rsplit("_", 1)[1].strip()

        word_f[word] = word_f.get(word, 0) + 1     # update word frequency
        tag_f[tag] = tag_f.get(tag, 0) + 1         # update tag frequency

In [7]:
# Rank (word, count) pairs from most to least frequent.
# The ascending sort is flipped end-to-end rather than sorted with
# reverse=True: the two differ in how entries with equal counts are ordered.
sorted_word = sorted(word_f.items(), key=lambda pair: pair[1])[::-1]

# the ten most frequent words
sorted_word[:10]

[('the', 46249),
 ('of', 25860),
 ('to', 23320),
 ('and', 22704),
 ('a', 19111),
 ('in', 14513),
 ('is', 10777),
 ('I', 8428),
 ('that', 7996),
 ('it', 7087)]

In [8]:
# Rank (tag, count) pairs from most to least frequent.
# The ascending sort is flipped end-to-end rather than sorted with
# reverse=True: the two differ in how entries with equal counts are ordered.
sorted_tags = sorted(tag_f.items(), key=lambda pair: pair[1])[::-1]

# the ten most frequent tags
sorted_tags[:10]

[('SUBST', 211526),
 ('VERB', 159864),
 ('PREP', 110905),
 ('ADJ', 98153),
 ('ART', 76658),
 ('PRON', 75307),
 ('ADV', 59998),
 ('CONJ', 51824),
 ('UNC', 6664),
 ('INTERJ', 1249)]

# COMPUTE PROBABILITIES

In [9]:
# For every word_tag pair: (count of the pair) / (count of the word),
# i.e. the share of that word's occurrences that carry this tag.
probab_w_t = {}

for pair, pair_count in freq.items():
    # the word is the text before the first underscore, matching the
    # keys used when word_f was built
    word = pair.split("_")[0]
    probab_w_t[pair] = pair_count / word_f.get(word)

# display the resulting mapping
probab_w_t

{'USING_VERB': 1.0,
 'THE_ART': 1.0,
 'RULES_SUBST': 1.0,
 'TO_PREP': 1.0,
 'HELP_VERB': 0.29411764705882354,
 'YOU_PRON': 1.0,
 'WIN_VERB': 1.0,
 'It_PRON': 0.9976503759398496,
 'is_VERB': 1.0,
 'surprising_ADJ': 1.0,
 'how_ADV': 1.0,
 'many_ADJ': 1.0,
 'people_SUBST': 1.0,
 'take_VERB': 0.9965811965811966,
 'part_SUBST': 0.9957081545064378,
 'in_PREP': 0.9652725142975264,
 'competition_SUBST': 1.0,
 'without_PREP': 0.9935622317596566,
 'knowing_VERB': 0.9054054054054054,
 'the_ART': 1.0,
 'rules_SUBST': 0.984375,
 'even_ADV': 0.995221027479092,
 'at_PREP': 1.0,
 'élite_UNC': 1.0,
 'level_SUBST': 0.8524590163934426,
 'This_ADJ': 0.9991496598639455,
 'chapter_SUBST': 1.0,
 'will_VERB': 0.9819324430479183,
 'examine_VERB': 1.0,
 'of_PREP': 1.0,
 'from_PREP': 1.0,
 'competitors_SUBST': 1.0,
 "'_UNC": 1.0,
 'point_SUBST': 0.905511811023622,
 'view_SUBST': 0.9279279279279279,
 'rather_ADV': 1.0,
 'than_CONJ': 1.0,
 'referees_SUBST': 1.0,
 'show_VERB': 0.6445497630331753,
 'a_ART': 0.997749

In [10]:
# Ask for the tag to filter on. Surrounding whitespace is stripped because the
# stored tags carry none, so an input such as "VERB " would otherwise match
# nothing in the filtering cell.
input_tag = input("Give tag input...").strip()

Give tag input...VERB


In [14]:
# Print every word_tag pair whose tag equals input_tag, with its probability.
for key, probability in probab_w_t.items():
    # The tag is the text after the LAST underscore, so a word that itself
    # contains '_' is still matched on its real tag.
    tag = key.rsplit("_", 1)[-1]
    if tag == input_tag:
        print(key, probability)

USING_VERB 1.0
HELP_VERB 0.29411764705882354
WIN_VERB 1.0
is_VERB 1.0
take_VERB 0.9965811965811966
knowing_VERB 0.9054054054054054
will_VERB 0.9819324430479183
examine_VERB 1.0
show_VERB 0.6445497630331753
can_VERB 0.9956060051263274
be_VERB 1.0
used_VERB 0.9152542372881356
help_VERB 0.5971370143149284
outlining_VERB 1.0
need_VERB 0.7610474631751227
scored_VERB 1.0
explaining_VERB 1.0
failed_VERB 0.9310344827586207
score_VERB 0.41935483870967744
felt_VERB 0.9879032258064516
should_VERB 1.0
have_VERB 1.0
are_VERB 1.0
dealt_VERB 1.0
ensure_VERB 1.0
make_VERB 0.9941451990632318
may_VERB 1.0
shrink_VERB 1.0
fit_VERB 0.6464646464646465
miscalculate_VERB 1.0
crowded_VERB 0.3333333333333333
imperilled_VERB 1.0
missing_VERB 0.7575757575757576
marked_VERB 0.6571428571428571
highlighted_VERB 1.0
using_VERB 1.0
alert_VERB 0.391304347826087
approach_VERB 0.13970588235294118
pick_VERB 0.9491525423728814
stepping_VERB 1.0
makes_VERB 0.9834254143646409
get_VERB 1.0
sparring_VERB 0.5
crowd_VERB 0.0243

fits_VERB 0.9166666666666666
supinate_VERB 1.0
aiding_VERB 1.0
omitted_VERB 1.0
overtrain_VERB 1.0
correcting_VERB 1.0
overemphasised_VERB 1.0
reducing_VERB 1.0
tearing_VERB 1.0
eased_VERB 1.0
reapplied_VERB 1.0
suggests_VERB 1.0
attain_VERB 1.0
hurts_VERB 1.0
release_VERB 0.3333333333333333
overstretch_VERB 1.0
weightlifting_VERB 1.0
trains_VERB 0.25
brewed_VERB 1.0
granted_VERB 1.0
reserved_VERB 0.8333333333333334
letting_VERB 1.0
ferment_VERB 0.75
crush_VERB 0.5
transforming_VERB 0.75
ends_VERB 0.3829787234042553
harvested_VERB 1.0
soaked_VERB 0.2857142857142857
transferred_VERB 0.9444444444444444
heated_VERB 0.5
unlocked_VERB 1.0
milled_VERB 1.0
mixed_VERB 0.4358974358974359
dissolve_VERB 1.0
slotted_VERB 1.0
sprinkled_VERB 1.0
sparging_VERB 0.5
boiled_VERB 0.5
boiling_VERB 0.625
hopped_VERB 1.0
flows_VERB 1.0
cooled_VERB 1.0
fermenting_VERB 0.6666666666666666
confronted_VERB 1.0
sink_VERB 0.30434782608695654
saved_VERB 0.9285714285714286
sold_VERB 0.9555555555555556
continues_VERB

prickling_VERB 1.0
depicting_VERB 1.0
foresaw_VERB 1.0
Writing_VERB 0.48
clarified_VERB 1.0
fly_VERB 0.9117647058823529
interview_VERB 0.06976744186046512
collects_VERB 1.0
volunteer_VERB 0.375
muttered_VERB 1.0
overlook_VERB 1.0
forking_VERB 1.0
hovered_VERB 1.0
gripped_VERB 1.0
regretted_VERB 1.0
grazed_VERB 1.0
sizzled_VERB 1.0
shrinking_VERB 0.5
avoiding_VERB 1.0
roamed_VERB 1.0
transformed_VERB 1.0
adjusted_VERB 1.0
unzipped_VERB 1.0
inspect_VERB 1.0
peeked_VERB 1.0
Wearing_VERB 1.0
admiring_VERB 0.375
propel_VERB 1.0
Holding_VERB 1.0
guessed_VERB 1.0
Looks_VERB 1.0
pursed_VERB 1.0
trailing_VERB 0.23076923076923078
toads_VERB 0.25
glowed_VERB 1.0
retreating_VERB 0.6666666666666666
toad_VERB 0.043478260869565216
gnash_VERB 1.0
foam_VERB 0.1
attracts_VERB 1.0
silvering_VERB 1.0
munching_VERB 1.0
dissolved_VERB 0.5
nested_VERB 1.0
Following_VERB 1.0
swarm_VERB 0.25
reseeded_VERB 1.0
clinging_VERB 0.8333333333333334
flourished_VERB 1.0
bloomed_VERB 1.0
twined_VERB 1.0
rambled_VERB 1.0

voicing_VERB 1.0
entitle_VERB 1.0
implement_VERB 1.0
marks_VERB 0.2222222222222222
accelerate_VERB 1.0
forecast_VERB 0.3333333333333333
display_VERB 0.1956521739130435
reassess_VERB 1.0
ensured_VERB 1.0
neutered_VERB 1.0
under-used_VERB 1.0
desire_VERB 0.0379746835443038
anticipated_VERB 1.0
pours_VERB 1.0
neglect_VERB 0.23076923076923078
proclaims_VERB 1.0
penalizes_VERB 1.0
stigmatizes_VERB 1.0
diffuse_VERB 1.0
circulated_VERB 1.0
co-operate_VERB 1.0
Research_VERB 0.04
distribute_VERB 1.0
mooted_VERB 1.0
sided_VERB 1.0
collating_VERB 1.0
circulating_VERB 1.0
justify_VERB 1.0
costed_VERB 1.0
embodied_VERB 1.0
exhibited_VERB 0.8
deride_VERB 1.0
negates_VERB 1.0
compiled_VERB 1.0
forms_VERB 0.18811881188118812
channel_VERB 0.2857142857142857
authorize_VERB 1.0
endow_VERB 1.0
cascading_VERB 0.5
proffer_VERB 1.0
encompassing_VERB 1.0
elaborate_VERB 0.2857142857142857
denies_VERB 1.0
disciplined_VERB 0.625
retires_VERB 1.0
groaned_VERB 1.0
reiterates_VERB 1.0
owing_VERB 1.0
authorised_VERB

dawdle_VERB 1.0
Cheer_VERB 1.0
Score_VERB 1.0
screamed_VERB 1.0
sail_VERB 0.5
Admit_VERB 1.0
shrieked_VERB 1.0
trusting_VERB 0.6
GONE_VERB 1.0
Passed_VERB 1.0
heaving_VERB 0.4
Corrupted_VERB 1.0
Felled_VERB 1.0
reborn_VERB 0.75
Noting_VERB 1.0
quivered_VERB 1.0
Blew_VERB 1.0
Laughed_VERB 1.0
soared_VERB 1.0
Sent_VERB 1.0
dwelled_VERB 1.0
reserve_VERB 0.2727272727272727
underlined_VERB 1.0
SWEEP_VERB 1.0
flashing_VERB 0.625
enraged_VERB 1.0
snatched_VERB 1.0
lament_VERB 0.5
immersing_VERB 1.0
marvel_VERB 1.0
braved_VERB 1.0
chew_VERB 1.0
severing_VERB 1.0
striking_VERB 0.13793103448275862
deflowering_VERB 1.0
shutting_VERB 1.0
pinched_VERB 0.7
transport_VERB 0.16666666666666666
bubbled_VERB 1.0
conjured_VERB 1.0
promising_VERB 0.11764705882352941
disinfecting_VERB 1.0
spit_VERB 0.4
grownups_VERB 1.0
echoing_VERB 0.3333333333333333
reverted_VERB 1.0
wheeling_VERB 0.5
rearing_VERB 1.0
throbbing_VERB 0.5
interrupt_VERB 1.0
dumbfounded_VERB 1.0
resorting_VERB 1.0
trample_VERB 1.0
vent_VERB 

WEDD_VERB 1.0
rediscovered_VERB 1.0
feted_VERB 1.0
rivalled_VERB 1.0
poking_VERB 1.0
Formed_VERB 1.0
WISH_VERB 1.0
WERE_VERB 1.0
brave_VERB 0.07692307692307693
GET_VERB 1.0
LAID_VERB 1.0
KILLS_VERB 1.0
scripted_VERB 1.0
SCREENING_VERB 0.15
utilise_VERB 1.0
Featuring_VERB 1.0
Needs_VERB 1.0
unearthed_VERB 1.0
hosting_VERB 1.0
slots_VERB 1.0
Happening_VERB 1.0
BE_VERB 1.0
LIMITED_VERB 0.5
highlighting_VERB 1.0
CLOUD_VERB 1.0
refracted_VERB 1.0
reveres_VERB 1.0
talkies_VERB 0.5
recapture_VERB 0.5
fleeting_VERB 0.2222222222222222
experimenting_VERB 1.0
GIVES_VERB 1.0
REPRODUCED_VERB 1.0
booking_VERB 0.2222222222222222
BUY_VERB 1.0
POINTS_VERB 1.0
relay_VERB 1.0
heralded_VERB 1.0
Pioneered_VERB 1.0
bid_VERB 0.2
price_VERB 0.037037037037037035
nurture_VERB 0.5
campaigns_VERB 0.09090909090909091
bids_VERB 0.3333333333333333
occupying_VERB 1.0
Nominated_VERB 1.0
Heading_VERB 1.0
LIVING_VERB 1.0
HANG_VERB 1.0
UPLINE_VERB 1.0
SELLING_VERB 1.0
Armisted_VERB 1.0
SHOOT_VERB 0.5
KILL_VERB 1.0
script

invert_VERB 1.0
chaff_VERB 1.0
sieving_VERB 1.0
Seal_VERB 1.0
Store_VERB 0.5
perforated_VERB 0.25
turnips_VERB 0.5
regulated_VERB 0.5
photosynthesise_VERB 1.0
Striving_VERB 1.0
tackling_VERB 1.0
Understanding_VERB 0.8
tolerating_VERB 1.0
scorching_VERB 1.0
overhanging_VERB 1.0
Capitalise_VERB 1.0
honeysuckle_VERB 0.25
mulch_VERB 0.16666666666666666
foliar_VERB 1.0
brightening_VERB 1.0
orphan_VERB 0.125
Swinging_VERB 0.5
deepening_VERB 0.5
Wuthering_VERB 0.5
rumoured_VERB 1.0
pigtails_VERB 0.5
languishes_VERB 1.0
Tortured_VERB 1.0
publicise_VERB 1.0
fireproof_VERB 1.0
glimpsed_VERB 0.5
imparted_VERB 1.0
heightened_VERB 0.5714285714285714
deserts_VERB 0.5
cancels_VERB 1.0
discerns_VERB 1.0
records_VERB 0.13043478260869565
diminish_VERB 1.0
distinguishes_VERB 1.0
threatens_VERB 1.0
envelop_VERB 1.0
buggers_VERB 1.0
exchanges_VERB 1.0
hark_VERB 1.0
kneel_VERB 1.0
dabbling_VERB 1.0
litter_VERB 0.5
looted_VERB 1.0
subdued_VERB 0.5
enslaved_VERB 1.0
necessitated_VERB 1.0
narrated_VERB 1.0
tre

transduces_VERB 1.0
enduring_VERB 0.3333333333333333
conserved_VERB 1.0
bumps_VERB 0.5
differentiating_VERB 1.0
distrusting_VERB 1.0
Recall_VERB 1.0
contrasting_VERB 0.2
switches_VERB 0.3333333333333333
by-passed_VERB 1.0
blinked_VERB 1.0
parsing_VERB 0.4
parse_VERB 1.0
Believing_VERB 1.0
slotting_VERB 1.0
deliberate_VERB 0.06666666666666667
overstating_VERB 1.0
canalized_VERB 1.0
disbelieve_VERB 1.0
computes_VERB 1.0
theorize_VERB 1.0
interact_VERB 1.0
input_VERB 0.04878048780487805
Functionlust_VERB 1.0
glance_VERB 0.18181818181818182
ascribing_VERB 1.0
scanning_VERB 1.0
tiled_VERB 0.3333333333333333
willed_VERB 1.0
highfalutin'_VERB 1.0
flourishing_VERB 0.3333333333333333
posit_VERB 1.0
espousing_VERB 1.0
constructs_VERB 0.14285714285714285
underlying_VERB 0.35714285714285715
retrieving_VERB 1.0
retrieves_VERB 1.0
witnesses_VERB 0.1
transposing_VERB 1.0
disprove_VERB 1.0
sketching_VERB 1.0
swamps_VERB 1.0
co-ordinated_VERB 0.3333333333333333
misinterpret_VERB 1.0
misinterpreting_VER