In [1]:
%config IPCompleter.greedy = True

In [2]:
import warnings
# Suppress every warning category from here on. This keeps the cell outputs
# clean, but it also hides deprecation warnings raised by the libraries
# imported in the following cells.
warnings.filterwarnings('ignore')

In [3]:
import numpy as np
import pandas as pd
import random
import pprint

import nltk
from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split

from collections import defaultdict

### 1. Load NLTK and Test Dataset
***

#### Loading Treebank tagged sentences using **universal** tagset

- `VERB` - verbs (all tenses and modes)
- `NOUN` - nouns (common and proper)
- `PRON` - pronouns
- `ADJ` - adjectives
- `ADV` - adverbs
- `ADP` - adpositions (prepositions and postpositions)
- `CONJ` - conjunctions
- `DET` - determiners
- `NUM` - cardinal numbers
- `PRT` - particles or other function words
- `X` - other: foreign words, typos, abbreviations
- `.` - punctuation

In [4]:
# Materialise the Treebank sentences as a plain list so they can be split
# later; each sentence is a list of (word, universal_tag) tuples.
treebank_sents = nltk.corpus.treebank.tagged_sents(tagset='universal')
nltk_data = list(treebank_sents)

In [5]:
# peek at the first two tagged sentences of the corpus
print(nltk_data[:2])

[[('Pierre', 'NOUN'), ('Vinken', 'NOUN'), (',', '.'), ('61', 'NUM'), ('years', 'NOUN'), ('old', 'ADJ'), (',', '.'), ('will', 'VERB'), ('join', 'VERB'), ('the', 'DET'), ('board', 'NOUN'), ('as', 'ADP'), ('a', 'DET'), ('nonexecutive', 'ADJ'), ('director', 'NOUN'), ('Nov.', 'NOUN'), ('29', 'NUM'), ('.', '.')], [('Mr.', 'NOUN'), ('Vinken', 'NOUN'), ('is', 'VERB'), ('chairman', 'NOUN'), ('of', 'ADP'), ('Elsevier', 'NOUN'), ('N.V.', 'NOUN'), (',', '.'), ('the', 'DET'), ('Dutch', 'NOUN'), ('publishing', 'VERB'), ('group', 'NOUN'), ('.', '.')]]


#### Loading Test Data

In [6]:
# Read the whole test file in one go. The context manager closes the handle
# as soon as the read finishes (or fails), instead of leaving it open for
# the lifetime of the kernel.
with open(r"test-sentences.txt", "r", encoding="latin1") as file_object:
    test_data = file_object.read()
test_data

"Android is a mobile operating system developed by Google.\nAndroid has been the best-selling OS worldwide on smartphones since 2011 and on tablets since 2013.\nGoogle and Twitter made a deal in 2015 that gave Google access to Twitter's firehose.\nTwitter is an online news and social networking service on which users post and interact with messages known as tweets.\nBefore entering politics, Donald Trump was a domineering businessman and a television personality.\nThe 2018 FIFA World Cup is the 21st FIFA World Cup, an international football tournament contested once every four years.\nThis is the first World Cup to be held in Eastern Europe and the 11th time that it has been held in Europe.\nShow me the cheapest round trips from Dallas to Atlanta\nI would like to see flights from Denver to Philadelphia.\nShow me the price of the flights leaving Atlanta at about 3 in the afternoon and arriving in San Francisco.\nNASA invited social media users to experience the launch of ICESAT-2 Satell

In [7]:
# tokenise the raw test text, then report how many tokens it produced
test_data_words = word_tokenize(test_data)
len(test_data_words)

181

#### Tagging Test Dataset With NLTK POS Tagger

In [8]:
# NLTK's own tagger supplies the reference tags for the test sentences.
test_tagged = nltk.pos_tag(test_data_words, tagset='universal')
universal_tagset = [
    'VERB', 'NOUN', 'PRON', 'ADJ', 'ADV',
    'ADP', 'CONJ', 'DET', 'NUM', 'PRT', 'X', '.'
]

# Group the distinct words by tag in a single pass over the tagged tokens
# rather than rescanning the whole list once per tag.
words_by_tag = defaultdict(set)
for word, tag in test_tagged:
    words_by_tag[tag].add(word)

# Plain dict keyed by exactly the twelve universal tags; a tag that never
# occurs maps to an empty list. `.get` is used so the lookup does not insert
# new keys into the defaultdict.
test_tagged_words = {
    utag: sorted(words_by_tag.get(utag, ()))
    for utag in universal_tagset
}

# Spot check: show the words of one randomly chosen tag. The choice is not
# seeded, so a different tag may be shown on every run.
i = random.randrange(len(universal_tagset))

print('words tagged with {}'.format(universal_tagset[i]))
pprint.pprint(test_tagged_words[universal_tagset[i]])

'words with tagged with ADJ'
['best-selling',
 'cheapest',
 'domineering',
 'first',
 'international',
 'mobile',
 'online',
 'social']


### 2. Split data into train and validation datasets
***

In [9]:
# Hold out 5% of the sentences for validation; the fixed random_state makes
# the split repeatable across runs.
train_set, validation_set = train_test_split(
    nltk_data, test_size=0.05, random_state=1234)

n_train_sentences = len(train_set)
n_validation_sentences = len(validation_set)

print('Number of sentences in train dataset : {0}'.format(n_train_sentences))
print('Number of sentences in validation dataset : {0}'.format(n_validation_sentences))

Number of sentences in train dataset : 3718
Number of sentences in validation dataset : 196


In [10]:
# Flatten the training sentences into one long list of (word, tag) pairs.
train_tagged_words = []
for sent in train_set:
    train_tagged_words.extend(sent)

In [11]:
# Every word occurrence in the training data, in corpus order.
tokens = [tagged_word[0] for tagged_word in train_tagged_words]
n_tokens = len(tokens)
print('total number of words in the training set : {0}'.format(n_tokens))

# The distinct word forms; used later to detect unknown words.
vocabulary = set(tokens)
n_unique_tokens = len(vocabulary)
print('total number of unique words in the training set: {0}'.format(n_unique_tokens))

total number of words in the training set : 95799
total number of unique words in the training set: 12073


In [12]:
# Tag of every training token, in corpus order.
all_tags = [tagged_word[1] for tagged_word in train_tagged_words]

# Distinct tags in a stable (alphabetical) order.
unique_tags = list(set(all_tags))
unique_tags.sort()

print('number of tags in the universal tagset : {}'.format(len(unique_tags)))
print(unique_tags)

number of tags in the universal tagset : 12
['.', 'ADJ', 'ADP', 'ADV', 'CONJ', 'DET', 'NOUN', 'NUM', 'PRON', 'PRT', 'VERB', 'X']


### 3. Helper Functions
***

#### Store number of times a tag 'T' appears in the training dataset

In [13]:
# Count how often each tag occurs with a single pass over the training
# tokens. Seeding the dict from unique_tags keeps the keys in the same
# (sorted) order as before.
tag_count_dict = {utag: 0 for utag in unique_tags}

for tagged_word in train_tagged_words:
    tag = tagged_word[1]
    if tag in tag_count_dict:
        tag_count_dict[tag] += 1

print(tag_count_dict)

{'.': 11130, 'ADJ': 6063, 'ADP': 9387, 'ADV': 3052, 'CONJ': 2144, 'DET': 8269, 'NOUN': 27471, 'NUM': 3364, 'PRON': 2619, 'PRT': 3070, 'VERB': 12910, 'X': 6320}


#### List of Unknown Words in Validation Dataset

In [14]:
# Collect every validation token whose word form never occurs in training.
val_data_unknown_words = []
for val_sent in validation_set:
    for val_word, _val_tag in val_sent:
        if val_word not in vocabulary:
            val_data_unknown_words.append(val_word)

# Report distinct unknown word forms, not occurrences.
n_val_unknown = len(set(val_data_unknown_words))
print('number of unknown words in validation data set : {0}'.format(n_val_unknown))

number of unknown words in validation data set : 335


#### List of Unknown Words in Test Dataset

In [15]:
# Collect every test token whose word form never occurs in training.
test_data_unknown_words = []
for test_word in test_data_words:
    if test_word not in vocabulary:
        test_data_unknown_words.append(test_word)

# Report distinct unknown word forms, not occurrences.
n_test_unknown = len(set(test_data_unknown_words))
print('number of unknown words in test data set : {0}'.format(n_test_unknown))

number of unknown words in test data set : 28


#### Calculate Number of Words correctly tagged in Test Dataset

In [17]:
def calc_test_dataset_accuracy(tagged_test_set):
    """Print how many (word, tag) pairs agree with the reference tagging.

    A pair counts as correct when ``word`` is listed under ``tag`` in the
    module-level ``test_tagged_words`` mapping. A tag that is not a key of
    that mapping never matches. The comparison is by word form only, so a
    word that the reference lists under several tags is accepted with any
    of them.

    Args:
        tagged_test_set: iterable of ``(word, tag)`` pairs to score.

    Returns:
        None. The totals and the accuracy are printed. For an empty input
        the accuracy is reported as 0.0 instead of raising
        ZeroDivisionError.
    """
    total_words = 0
    correct_tagged_words = 0

    for word, tag in tagged_test_set:
        total_words += 1

        # Unknown tags fall back to an empty collection, i.e. "no match".
        if word in test_tagged_words.get(tag, ()):
            correct_tagged_words += 1

    # Guard the division so scoring an empty result does not crash.
    accuracy = correct_tagged_words / total_words if total_words else 0.0

    print('total words - {0}. correctly tagged words - {1}. accuracy - {2}'.
          format(total_words, correct_tagged_words, accuracy))

### 4. Learning HMM Model Parameters
***

#### Emission Probabilities

In [18]:
def word_given_tag(word, tag, train_bag=train_tagged_words):
    """Count the pairs in ``train_bag`` that are exactly ``(word, tag)``.

    Args:
        word: word form to look for (compared with element 0 of each pair).
        tag: tag to look for (compared with element 1 of each pair).
        train_bag: iterable of ``(word, tag)`` pairs; defaults to the
            flattened training data bound when this cell was executed.

    Returns:
        The raw co-occurrence count as an int. This is a count, not a
        probability; no normalisation is applied here.
    """
    return sum(
        1 for pair in train_bag
        if pair[0] == word and pair[1] == tag
    )

#### Transition Probabilities

In [19]:
def t2_given_t1(t2, t1, train_bag=train_tagged_words):
    """Count how often tag ``t1`` is immediately followed by tag ``t2``.

    The tag sequence is taken from ``train_bag`` itself, so passing a
    different bag changes the result. Adjacency is evaluated over the bag
    as one flat sequence; with the flattened training data this includes
    pairs that straddle a sentence boundary.

    Args:
        t2: the following tag.
        t1: the preceding tag.
        train_bag: iterable of ``(word, tag)`` pairs; defaults to the
            flattened training data bound when this cell was executed.

    Returns:
        The raw bigram count as an int. This is a count, not a probability;
        no normalisation is applied here.
    """
    tags = [pair[1] for pair in train_bag]

    # Pair every tag with its successor; the last tag has none.
    return sum(
        1 for prev_tag, next_tag in zip(tags, tags[1:])
        if prev_tag == t1 and next_tag == t2
    )

In [20]:
# Build the tag-transition matrix: entry [i, j] is
# count(unique_tags[i] followed by unique_tags[j]) / count(unique_tags[i]).
#
# All bigram counts are gathered in ONE pass over the tag sequence instead
# of rescanning the whole corpus for each of the len(unique_tags)**2 cells.
n_tags = len(unique_tags)
tag_position = {tag: pos for pos, tag in enumerate(unique_tags)}

transition_counts = np.zeros((n_tags, n_tags), dtype='int64')
for prev_tag, next_tag in zip(all_tags, all_tags[1:]):
    transition_counts[tag_position[prev_tag], tag_position[next_tag]] += 1

# Row-wise normalisation by the number of occurrences of the preceding tag;
# divide in float64 and only then narrow to float32.
tag_totals = np.array([tag_count_dict[tag] for tag in unique_tags],
                      dtype='float64')
tags_matrix = (transition_counts / tag_totals[:, np.newaxis]).astype('float32')

In [21]:
# Label the transition matrix: rows are the preceding tag, columns are the
# following tag, both in unique_tags order.
tag_labels = list(unique_tags)
df_tag = pd.DataFrame(data=tags_matrix, index=tag_labels, columns=tag_labels)

df_tag

Unnamed: 0,.,ADJ,ADP,ADV,CONJ,DET,NOUN,NUM,PRON,PRT,VERB,X
.,0.09407,0.044654,0.090386,0.051932,0.057772,0.173226,0.223091,0.080593,0.065768,0.002336,0.088769,0.027314
ADJ,0.065809,0.065314,0.077519,0.004948,0.016658,0.004948,0.698499,0.021112,0.00066,0.010886,0.012205,0.021442
ADP,0.039842,0.105785,0.016512,0.013849,0.000959,0.322893,0.322893,0.062001,0.070203,0.001491,0.008522,0.035048
ADV,0.134666,0.129751,0.118611,0.081258,0.006881,0.06848,0.031127,0.031455,0.0154,0.014744,0.344364,0.023263
CONJ,0.033116,0.118937,0.052705,0.05597,0.000466,0.11847,0.348881,0.041511,0.057369,0.005131,0.158582,0.008862
DET,0.017777,0.203652,0.009191,0.012698,0.000484,0.005442,0.63865,0.022373,0.003749,0.000242,0.039545,0.046197
NOUN,0.239307,0.012231,0.177023,0.017182,0.042263,0.01325,0.264898,0.009537,0.004769,0.043974,0.146336,0.029231
NUM,0.115933,0.032402,0.035672,0.002973,0.013377,0.002973,0.354637,0.184899,0.001486,0.027051,0.018133,0.210464
PRON,0.040473,0.073692,0.023291,0.032837,0.004582,0.009164,0.207331,0.007255,0.007637,0.011837,0.487972,0.093929
PRT,0.041694,0.084039,0.021173,0.009772,0.00228,0.099674,0.247883,0.056678,0.017915,0.001954,0.402932,0.014007


#### Start Probabilities

In [22]:
# Row '.' of the transition table (values for "tag that follows a '.' tag")
# is displayed here as the start distribution.
# NOTE(review): in this tagset '.' marks every punctuation token (the sample
# output above tags ',' as '.'), not only sentence ends - confirm that this
# approximation of sentence-start probabilities is intended.
df_tag.loc['.', :]

.       0.094070
ADJ     0.044654
ADP     0.090386
ADV     0.051932
CONJ    0.057772
DET     0.173226
NOUN    0.223091
NUM     0.080593
PRON    0.065768
PRT     0.002336
VERB    0.088769
X       0.027314
Name: ., dtype: float32