In [None]:
#Importing libraries
import nltk
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split
import pprint , time


# Download the two NLTK resources used below: the treebank corpus and the
# universal tagset.
for resource in ('treebank', 'universal_tagset'):
    nltk.download(resource)


# Read the Treebank tagged sentences, with tags mapped to the universal tagset,
# and materialise them into a plain list.
nltk_data = list(
    nltk.corpus.treebank.tagged_sents(tagset = 'universal')
)


# Print the first two sentences; each one is a list of (word, tag) pairs.
print(nltk_data[:2])


[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.


[[('Pierre', 'NOUN'), ('Vinken', 'NOUN'), (',', '.'), ('61', 'NUM'), ('years', 'NOUN'), ('old', 'ADJ'), (',', '.'), ('will', 'VERB'), ('join', 'VERB'), ('the', 'DET'), ('board', 'NOUN'), ('as', 'ADP'), ('a', 'DET'), ('nonexecutive', 'ADJ'), ('director', 'NOUN'), ('Nov.', 'NOUN'), ('29', 'NUM'), ('.', '.')], [('Mr.', 'NOUN'), ('Vinken', 'NOUN'), ('is', 'VERB'), ('chairman', 'NOUN'), ('of', 'ADP'), ('Elsevier', 'NOUN'), ('N.V.', 'NOUN'), (',', '.'), ('the', 'DET'), ('Dutch', 'NOUN'), ('publishing', 'VERB'), ('group', 'NOUN'), ('.', '.')]]


In [None]:
# Print each (word, tag) pair of the first two sentences, one pair per line.
# The loop variable is intentionally not named `tuple`: a top-level loop
# variable stays bound after the loop ends, so that name would shadow the
# builtin `tuple` for every cell executed afterwards.
for sent in nltk_data[:2]:
    for tagged_word in sent:
        print(tagged_word)

('Pierre', 'NOUN')
('Vinken', 'NOUN')
(',', '.')
('61', 'NUM')
('years', 'NOUN')
('old', 'ADJ')
(',', '.')
('will', 'VERB')
('join', 'VERB')
('the', 'DET')
('board', 'NOUN')
('as', 'ADP')
('a', 'DET')
('nonexecutive', 'ADJ')
('director', 'NOUN')
('Nov.', 'NOUN')
('29', 'NUM')
('.', '.')
('Mr.', 'NOUN')
('Vinken', 'NOUN')
('is', 'VERB')
('chairman', 'NOUN')
('of', 'ADP')
('Elsevier', 'NOUN')
('N.V.', 'NOUN')
(',', '.')
('the', 'DET')
('Dutch', 'NOUN')
('publishing', 'VERB')
('group', 'NOUN')
('.', '.')


In [None]:
# Split the sentences 80:20 into training and validation sets.
# random_state is fixed so the same split is produced on every run.
train_set, test_set = train_test_split(
    nltk_data,
    train_size=0.80,
    test_size=0.20,
    random_state = 101,
)


# Flatten each split from a list of sentences into one list of tagged words.
train_tagged_words = []
for sent in train_set:
    train_tagged_words.extend(sent)

test_tagged_words = []
for sent in test_set:
    test_tagged_words.extend(sent)

print(len(train_tagged_words))
print(len(test_tagged_words))

80310
20366


In [None]:
# Display the first five (word, tag) pairs of the flattened training data.
train_tagged_words[:5]

[('Drink', 'NOUN'),
 ('Carrier', 'NOUN'),
 ('Competes', 'VERB'),
 ('With', 'ADP'),
 ('Cartons', 'NOUN')]

In [None]:
# Collect the distinct tags that occur in the training data; a set drops
# duplicates, so its length is the number of unique tags.
tags = set(tag for _, tag in train_tagged_words)
print(len(tags))
print(tags)

# Collect the distinct words that occur in the training data (the vocabulary).
vocab = set(word for word, _ in train_tagged_words)

12
{'NUM', 'DET', 'ADV', 'ADP', 'CONJ', 'PRON', 'X', '.', 'NOUN', 'ADJ', 'VERB', 'PRT'}


In [None]:
# Counts needed for the emission probability P(word | tag)
def word_given_tag(word, tag, train_bag = train_tagged_words):
    """Count how often `tag` occurs and how often it is attached to `word`.

    Args:
        word: the word to look for.
        tag: the tag to look for.
        train_bag: iterable of (word, tag) pairs to scan.

    Returns:
        A tuple (count_w_given_tag, count_tag): the number of pairs in
        train_bag whose tag is `tag` and whose word is `word`, and the number
        of pairs whose tag is `tag`. The counts are returned as-is; the
        division is left to the caller.
    """
    count_tag = 0
    count_w_given_tag = 0
    for pair in train_bag:
        if pair[1] == tag:
            # every occurrence of the tag counts towards the denominator
            count_tag += 1
            if pair[0] == word:
                # ...and towards the numerator only when the word matches too
                count_w_given_tag += 1

    return (count_w_given_tag, count_tag)


# Counts needed for the transition probability P(t2 | t1)
def t2_given_t1(t2, t1, train_bag = train_tagged_words):
    """Count how often tag `t1` occurs and how often `t2` directly follows it.

    Args:
        t2: the tag in the second position of the pair.
        t1: the tag in the first position of the pair.
        train_bag: sequence of (word, tag) pairs; only the tags are used.

    Returns:
        A tuple (count_t2_t1, count_t1): the number of adjacent positions in
        the tag sequence where `t1` is immediately followed by `t2`, and the
        total number of occurrences of `t1`. The counts are returned as-is;
        the division is left to the caller.
    """
    tag_sequence = [pair[1] for pair in train_bag]
    count_t1 = sum(1 for current in tag_sequence if current == t1)
    # Pair every tag with its successor; the last tag has no successor.
    count_t2_t1 = sum(
        1
        for current, following in zip(tag_sequence, tag_sequence[1:])
        if current == t1 and following == t2
    )
    return (count_t2_t1, count_t1)


# Build the t x t transition matrix of tags, where t is the number of tags.
# tags_matrix[i, j] holds P(j-th tag immediately after the i-th tag).

# `tags` is a set, so it is materialised once and the same list is used for
# both the rows and the columns.
tag_order = list(tags)
tags_matrix = np.zeros((len(tag_order), len(tag_order)), dtype='float32')
for i, t1 in enumerate(tag_order):
    for j, t2 in enumerate(tag_order):
        # Call t2_given_t1 once per cell: every call re-scans the whole
        # training set, so calling it separately for the numerator and the
        # denominator would double the run time for the same result.
        count_t2_t1, count_t1 = t2_given_t1(t2, t1)
        tags_matrix[i, j] = count_t2_t1/count_t1

print(tags_matrix)




[[1.84219927e-01 3.57015361e-03 3.57015361e-03 3.74866128e-02
  1.42806144e-02 1.42806140e-03 2.02427700e-01 1.19243130e-01
  3.51660132e-01 3.53445187e-02 2.07068902e-02 2.60621198e-02]
 [2.28546783e-02 6.03708485e-03 1.20741697e-02 9.91806854e-03
  4.31220367e-04 3.30602261e-03 4.51343954e-02 1.73925534e-02
  6.35906279e-01 2.06410810e-01 4.02472317e-02 2.87480245e-04]
 [2.98681147e-02 7.13731572e-02 8.14584941e-02 1.19472459e-01
  6.98215654e-03 1.20248254e-02 2.28859577e-02 1.39255241e-01
  3.21955010e-02 1.30721495e-01 3.39022487e-01 1.47401085e-02]
 [6.32751212e-02 3.20931405e-01 1.45532778e-02 1.69577319e-02
  1.01240189e-03 6.96026310e-02 3.45482156e-02 3.87243740e-02
  3.23588967e-01 1.07061505e-01 8.47886596e-03 1.26550242e-03]
 [4.06147093e-02 1.23490669e-01 5.70801310e-02 5.59824370e-02
  5.48847427e-04 6.03732169e-02 9.33040585e-03 3.51262353e-02
  3.49066973e-01 1.13611415e-01 1.50384188e-01 4.39077942e-03]
 [6.83371304e-03 9.56719834e-03 3.69020514e-02 2.23234631e-02
  5