# Assignment 2
### Daniel Mehta

## Exercise 1: [POS (Part-of-Speech) Tagging with Hidden Markov Model](https://www.mygreatlearning.com/blog/pos-tagging/)

In [1]:
import nltk
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split
import pprint, time

In [None]:
# Fetch the `treebank` corpus and the universal tagset mapping, then load
# every sentence as a list of (word, tag) pairs using the universal tags.
nltk.download('treebank')
nltk.download('universal_tagset')
nltk_data = list(nltk.corpus.treebank.tagged_sents(tagset='universal'))

# Preview the first two sentences. The loop variable is deliberately not
# called `tuple`, so the builtin stays usable in later cells.
for sent in nltk_data[:2]:
    for tagged_word in sent:
        print(tagged_word)

In [None]:
# Hold out 20% of the sentences for evaluation; the fixed random_state keeps
# the split identical from run to run.
train_set, test_set = train_test_split(
    nltk_data,
    train_size=0.80,
    test_size=0.20,
    random_state=101,
)

In [None]:
# Flatten the sentence lists into flat lists of (word, tag) pairs.
train_tagged_words = [pair for sentence in train_set for pair in sentence]
test_tagged_words = [pair for sentence in test_set for pair in sentence]

# Number of tagged tokens in each split.
print(len(train_tagged_words))
print(len(test_tagged_words))

In [None]:
# Show the first five (word, tag) pairs of the flattened training data.
train_tagged_words[:5]

In [None]:
# Distinct tags seen in the training data. They are kept as a sorted list
# rather than a set so that the row/column order of the transition matrix
# built from `tags` is the same in every kernel session.
tags = sorted({tag for word, tag in train_tagged_words})
print(len(tags))
print(tags)

In [None]:
# Every distinct word form that occurs in the training data.
vocab = set(word for word, _ in train_tagged_words)

In [None]:
#compute emission probability
def word_given_tag(word, tag, train_bag=None):
    """Count how often `word` is emitted by `tag` in a tagged corpus.

    Parameters
    ----------
    word : the word form to look for (compared with ``==``).
    tag : the tag whose occurrences are counted.
    train_bag : iterable of (word, tag) pairs. When omitted, the notebook's
        current ``train_tagged_words`` is used. It is looked up at call time,
        so re-running the train/test split cell does not leave this function
        pointing at a stale list.

    Returns
    -------
    (count_w_given_tag, count_tag) : number of pairs equal to (word, tag) and
        number of pairs carrying `tag`. The caller divides the two to get the
        emission probability; both are 0 when `tag` never occurs.
    """
    if train_bag is None:
        train_bag = train_tagged_words

    count_tag = 0
    count_w_given_tag = 0
    # Single pass over the corpus instead of building two intermediate lists.
    for pair in train_bag:
        if pair[1] == tag:
            count_tag += 1
            if pair[0] == word:
                count_w_given_tag += 1

    return (count_w_given_tag, count_tag)

In [None]:
# compute transition probability
def t2_given_t1(t2, t1, train_bag=None):
    """Count how often tag `t2` directly follows tag `t1` in a tagged corpus.

    Parameters
    ----------
    t2 : the following tag.
    t1 : the preceding tag.
    train_bag : iterable of (word, tag) pairs read as one continuous tag
        sequence. When omitted, the notebook's current ``train_tagged_words``
        is used, looked up at call time so a re-run split is picked up.

    Returns
    -------
    (count_t2_t1, count_t1) : number of adjacent (t1, t2) tag pairs and the
        total number of occurrences of `t1`. The caller divides the two to
        get the transition probability; `count_t1` is 0 if `t1` never occurs.
    """
    if train_bag is None:
        train_bag = train_tagged_words

    # Local name differs from the notebook-level `tags` to avoid confusion.
    tag_seq = [pair[1] for pair in train_bag]
    count_t1 = tag_seq.count(t1)
    # Pair every tag with its successor and count the (t1, t2) bigrams.
    count_t2_t1 = sum(
        1 for prev_tag, next_tag in zip(tag_seq, tag_seq[1:])
        if prev_tag == t1 and next_tag == t2
    )
    return (count_t2_t1, count_t1)

In [None]:
# Transition matrix: entry [i, j] is P(tag_j | tag_i), i.e. how often tag j
# directly follows tag i divided by how often tag i occurs.
tags_matrix = np.zeros((len(tags), len(tags)), dtype='float32')
for i, t1 in enumerate(list(tags)):
    for j, t2 in enumerate(list(tags)):
        # One corpus scan per cell (the counts were previously computed twice).
        count_t2_t1, count_t1 = t2_given_t1(t2, t1)
        # A tag that never occurs gets probability 0 instead of raising
        # ZeroDivisionError.
        tags_matrix[i, j] = count_t2_t1 / count_t1 if count_t1 else 0.0

print(tags_matrix)

In [None]:
# Label the transition matrix with tag names: rows are the previous tag,
# columns are the next tag.
tags_df = pd.DataFrame(data=tags_matrix, index=list(tags), columns=list(tags))
display(tags_df)

In [None]:
def Viterbi(words, train_bag = train_tagged_words):
    """Tag `words` one at a time with the HMM emission/transition estimates.

    For each word, every candidate tag is scored as
    ``P(word | tag) * P(tag | previous tag)`` and the best-scoring tag is
    kept. Only the single previously chosen tag is carried forward, so this
    is a greedy left-to-right decode rather than a full lattice search.

    Parameters
    ----------
    words : sequence of word strings to tag.
    train_bag : iterable of (word, tag) pairs supplying the candidate tags
        and the emission counts. Transition probabilities are read from the
        notebook-level ``tags_df``.

    Returns
    -------
    list of (word, tag) pairs, one per input word, in input order.
    """
    state = []
    # Sorted so that ties (e.g. an unseen word scoring 0 for every tag) are
    # broken the same way in every kernel session instead of by set order.
    T = sorted({pair[1] for pair in train_bag})

    for key, word in enumerate(words):
        # The first word is treated as if it followed a '.' token.
        prev_tag = '.' if key == 0 else state[-1]

        p = []
        for tag in T:
            transition_p = tags_df.loc[prev_tag, tag]

            # Emission counts come from the same corpus that produced T, so
            # count_tag is never 0 here. One scan per tag instead of two.
            count_w_given_tag, count_tag = word_given_tag(word, tag, train_bag)
            emission_p = count_w_given_tag / count_tag

            p.append(emission_p * transition_p)

        pmax = max(p)
        state.append(T[p.index(pmax)])

    return list(zip(words, state))

In [None]:
random.seed(42)
# Draw 10 sentence indices (with replacement). randrange(n) yields 0..n-1;
# the previous randint(1, n) could return n (IndexError) and never picked
# sentence 0.
rndom = [random.randrange(len(test_set)) for _ in range(10)]
test_run = [test_set[i] for i in rndom]
# Gold (word, tag) pairs for the sampled sentences.
test_run_base = [tup for sent in test_run for tup in sent]
# NOTE: from here on `test_tagged_words` holds only the bare words of the
# sampled sentences (no tags); later cells pass it to the taggers as input.
test_tagged_words = [tup[0] for sent in test_run for tup in sent]

In [None]:
# Time the tagger on the sampled test words.
start = time.time()
tagged_seq = Viterbi(test_tagged_words)
end = time.time()
difference = end - start
print("Time taken in seconds: ", difference)

# A prediction counts as correct when the whole (word, tag) pair matches the
# gold pair at the same position.
check = [
    predicted
    for predicted, gold in zip(tagged_seq, test_run_base)
    if predicted == gold
]
accuracy = len(check) / len(tagged_seq)
print('Viterbi Algorithm Accuracy: ', accuracy * 100)

In [None]:
# Evaluation on the complete test set, disabled because it takes too long to
# run. The code is kept inside a string literal so it is never executed.
# NOTE(review): if this is re-enabled, the `check` line below zips the gold
# (word, tag) pairs with the bare words, so no element can ever compare
# equal; it should compare `tagged_seq` with `test_tagged_words` instead.
'''
test_tagged_words = [tup for sent in test_set for tup in sent]
test_untagged_words = [tup[0] for sent in test_set for tup in sent]
test_untagged_words

start = time.time()
tagged_seq = Viterbi(test_untagged_words)
end = time.time()
difference = end-start
 
print("Time taken in seconds: ", difference)

check = [i for i, j in zip(test_tagged_words, test_untagged_words) if i == j] 
 
accuracy = len(check)/len(tagged_seq)
print('Viterbi Algorithm Accuracy: ',accuracy*100)
'''

In [None]:
# Suffix/shape rules used as a fallback tagger. The order matters: per the
# NLTK RegexpTagger documentation the first matching pattern wins, so the
# specific rules come first and the catch-all NOUN rule comes last.
patterns = [
    (r'.*ing$', 'VERB'),              # gerund, e.g. "running"
    (r'.*ed$', 'VERB'),               # past tense, e.g. "walked"
    (r'.*es$', 'VERB'),               # verb ending in -es; listed before the plural rule, so "-es" words never reach it
    (r'.*\'s$', 'NOUN'),              # possessive nouns
    (r'.*s$', 'NOUN'),                # plural nouns
    (r'\*T?\*?-[0-9]+$', 'X'),        # trace-like tokens such as "*T*-1" or "*-2" -> X
    (r'^-?[0-9]+(.[0-9]+)?$', 'NUM'), # cardinal numbers; NOTE(review): the '.' is unescaped, so any separator character matches (e.g. "1,000" as well as "1.5")
    (r'.*', 'NOUN')                   # default: everything else is tagged NOUN
]

# Tagger that assigns each token the tag of the first pattern it matches.
rule_based_tagger = nltk.RegexpTagger(patterns)

In [None]:
def Viterbi_rule_based(words, train_bag = train_tagged_words):
    """Greedy HMM tagging with a regex-rule fallback.

    Each word is scored against every candidate tag as
    ``P(word | tag) * P(tag | previous tag)``. The best-scoring tag is used
    unless either
      * every score is 0 (e.g. the word never occurs in `train_bag`), or
      * the rule-based tagger labels the word 'X',
    in which case the tag from ``rule_based_tagger`` is used instead.

    Parameters
    ----------
    words : sequence of word strings to tag.
    train_bag : iterable of (word, tag) pairs supplying the candidate tags
        and the emission counts. Transition probabilities are read from the
        notebook-level ``tags_df``.

    Returns
    -------
    list of (word, tag) pairs, one per input word, in input order.
    """
    state = []
    # Sorted so that equal scores are resolved identically in every session.
    T = sorted({pair[1] for pair in train_bag})

    for key, word in enumerate(words):
        # The first word is treated as if it followed a '.' token.
        prev_tag = '.' if key == 0 else state[-1]

        p = []
        for tag in T:
            transition_p = tags_df.loc[prev_tag, tag]

            # Emission counts use the same corpus as T (count_tag is never 0)
            # and are computed once per tag.
            count_w_given_tag, count_tag = word_given_tag(word, tag, train_bag)
            emission_p = count_w_given_tag / count_tag

            p.append(emission_p * transition_p)

        pmax = max(p)
        # The rule-based tag is needed in both fallback cases; compute it once.
        rule_tag = rule_based_tagger.tag([word])[0][1]

        if pmax == 0 or rule_tag == 'X':
            state_max = rule_tag
        else:
            state_max = T[p.index(pmax)]

        state.append(state_max)

    return list(zip(words, state))

In [None]:
# Time the rule-assisted tagger on the same sampled test words.
start = time.time()
tagged_seq = Viterbi_rule_based(test_tagged_words)
end = time.time()
difference = end - start

print("Time taken in seconds: ", difference)

# Keep the predictions whose (word, tag) pair equals the gold pair at the
# same position.
check = [
    predicted
    for predicted, gold in zip(tagged_seq, test_run_base)
    if predicted == gold
]

accuracy = len(check) / len(tagged_seq)
print('Viterbi Algorithm Accuracy: ', accuracy * 100)

In [None]:
# Tag one hand-written sentence with both taggers for a side-by-side look.
test_sent = "Will can see Marry"

# With the regex-rule fallback.
pred_tags_rule = Viterbi_rule_based(test_sent.split())
# Plain HMM scores only.
pred_tags_withoutRules = Viterbi(test_sent.split())

print(pred_tags_rule)
print(pred_tags_withoutRules)

## Exercise 2:

- Find a new text dataset
- Convert it into csv format
- Redo the same exercise


### a & b)

In [5]:
import nltk.corpus

In [9]:
# Fetch the Brown corpus and the universal tagset mapping used to load it
# with tagset='universal' in the cells below.
nltk.download('brown')
nltk.download('universal_tagset')

[nltk_data] Downloading package brown to
[nltk_data]     /Users/danielmehta/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /Users/danielmehta/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

In [11]:
# Preview the Brown sentences as lists of (word, universal tag) pairs.
nltk.corpus.brown.tagged_sents(tagset='universal')

[[('The', 'DET'), ('Fulton', 'NOUN'), ('County', 'NOUN'), ('Grand', 'ADJ'), ('Jury', 'NOUN'), ('said', 'VERB'), ('Friday', 'NOUN'), ('an', 'DET'), ('investigation', 'NOUN'), ('of', 'ADP'), ("Atlanta's", 'NOUN'), ('recent', 'ADJ'), ('primary', 'NOUN'), ('election', 'NOUN'), ('produced', 'VERB'), ('``', '.'), ('no', 'DET'), ('evidence', 'NOUN'), ("''", '.'), ('that', 'ADP'), ('any', 'DET'), ('irregularities', 'NOUN'), ('took', 'VERB'), ('place', 'NOUN'), ('.', '.')], [('The', 'DET'), ('jury', 'NOUN'), ('further', 'ADV'), ('said', 'VERB'), ('in', 'ADP'), ('term-end', 'NOUN'), ('presentments', 'NOUN'), ('that', 'ADP'), ('the', 'DET'), ('City', 'NOUN'), ('Executive', 'ADJ'), ('Committee', 'NOUN'), (',', '.'), ('which', 'DET'), ('had', 'VERB'), ('over-all', 'ADJ'), ('charge', 'NOUN'), ('of', 'ADP'), ('the', 'DET'), ('election', 'NOUN'), (',', '.'), ('``', '.'), ('deserves', 'VERB'), ('the', 'DET'), ('praise', 'NOUN'), ('and', 'CONJ'), ('thanks', 'NOUN'), ('of', 'ADP'), ('the', 'DET'), ('City

In [13]:
# Work with only the first 500 tagged sentences of the Brown corpus.
brown_tagged = nltk.corpus.brown.tagged_sents(tagset='universal')[:500]

In [17]:
# One (word, tag) row per token, in corpus order.
flattened = [
    (token, pos)
    for sentence in brown_tagged
    for token, pos in sentence
]

In [19]:
# Two-column table of tokens and their tags, saved as CSV without the index.
df = pd.DataFrame(data=flattened, columns=['Word', 'Tag'])
df.to_csv('brown_pos.csv', index=False)

### c)

In [55]:
# 80/20 sentence-level split of the Brown subset; the fixed random_state
# makes the split repeatable.
train_sents, test_sents = train_test_split(
    brown_tagged,
    train_size=0.8,
    random_state=5501,
)

In [23]:
# Flatten both splits into flat lists of (word, tag) pairs. This rebinds the
# names used by the first exercise to the Brown data.
train_tagged_words = [pair for sentence in train_sents for pair in sentence]
test_tagged_words = [pair for sentence in test_sents for pair in sentence]

In [25]:
# Report how many tagged tokens ended up in each split.
print("Train:", len(train_tagged_words))
print("Test:", len(test_tagged_words))

Train: 9492
Test: 2219


In [27]:
def word_given_tag(word, tag, train_bag=None):
    """Count how often `word` is emitted by `tag` in a tagged corpus.

    Parameters
    ----------
    word : the word form to look for (compared with ``==``).
    tag : the tag whose occurrences are counted.
    train_bag : iterable of (word, tag) pairs. When omitted, the notebook's
        current ``train_tagged_words`` is used. It is looked up at call time
        rather than frozen when this cell runs, so the function always sees
        the most recent training split.

    Returns
    -------
    (count_w_given_tag, count_tag) : number of pairs equal to (word, tag) and
        number of pairs carrying `tag`. Both are 0 when `tag` never occurs.
    """
    if train_bag is None:
        train_bag = train_tagged_words

    count_tag = 0
    count_w_given_tag = 0
    # Single pass over the corpus; no intermediate lists are built.
    for pair in train_bag:
        if pair[1] == tag:
            count_tag += 1
            if pair[0] == word:
                count_w_given_tag += 1

    return (count_w_given_tag, count_tag)

In [29]:
def t2_given_t1(t2, t1, train_bag=None):
    """Count how often tag `t2` directly follows tag `t1` in a tagged corpus.

    Parameters
    ----------
    t2 : the following tag.
    t1 : the preceding tag.
    train_bag : iterable of (word, tag) pairs read as one continuous tag
        sequence. When omitted, the notebook's current ``train_tagged_words``
        is used, looked up at call time rather than frozen when this cell
        runs.

    Returns
    -------
    (count_t2_t1, count_t1) : number of adjacent (t1, t2) tag pairs and the
        total number of occurrences of `t1` (0 if `t1` never occurs).
    """
    if train_bag is None:
        train_bag = train_tagged_words

    # Local name differs from the notebook-level `tags` to avoid confusion.
    tag_seq = [pair[1] for pair in train_bag]
    count_t1 = tag_seq.count(t1)
    # Walk the sequence as (tag, successor) pairs and count the matches.
    count_t2_t1 = sum(
        1 for prev_tag, next_tag in zip(tag_seq, tag_seq[1:])
        if prev_tag == t1 and next_tag == t2
    )
    return (count_t2_t1, count_t1)

In [33]:
# Distinct training tags in alphabetical order, giving the transition matrix
# a stable row/column order.
tags = sorted({tag for _, tag in train_tagged_words})

In [35]:
# Transition matrix: entry [i, j] is P(tag_j | tag_i), i.e. how often tag j
# directly follows tag i divided by how often tag i occurs.
tags_matrix = np.zeros((len(tags), len(tags)), dtype='float32')
for i, t1 in enumerate(list(tags)):
    for j, t2 in enumerate(list(tags)):
        # One corpus scan per cell (the counts were previously computed twice).
        count_t2_t1, count_t1 = t2_given_t1(t2, t1)
        # A tag that never occurs gets probability 0 instead of raising
        # ZeroDivisionError.
        tags_matrix[i, j] = count_t2_t1 / count_t1 if count_t1 else 0.0

print(tags_matrix)

[[0.10965795 0.04426559 0.09959759 0.04627766 0.05633803 0.18812877
  0.23138833 0.02615694 0.07645875 0.02012073 0.10060363 0.        ]
 [0.05322581 0.06129032 0.0483871  0.00322581 0.02741935 0.00322581
  0.75       0.01129032 0.         0.02580645 0.01451613 0.0016129 ]
 [0.01071723 0.08244023 0.02555647 0.00741962 0.0016488  0.4286892
  0.30008245 0.05441055 0.03050289 0.00824402 0.05028854 0.        ]
 [0.08       0.12       0.15272728 0.06909091 0.01454545 0.06909091
  0.02181818 0.00727273 0.04       0.04       0.38545454 0.        ]
 [0.00921659 0.14285715 0.05990783 0.05529954 0.         0.16129032
  0.33179724 0.01382488 0.04147466 0.02304148 0.16129032 0.        ]
 [0.01358696 0.21014492 0.01086957 0.00905797 0.         0.00543478
  0.6512681  0.02173913 0.00724638 0.00271739 0.06793478 0.        ]
 [0.22210708 0.017962   0.23039724 0.01692573 0.03937824 0.01761658
  0.25423142 0.00725389 0.01139896 0.02901554 0.1537133  0.        ]
 [0.14857143 0.10285714 0.14857143 0.00571

In [39]:
# Rebuild the transition matrix: entry [i, j] is the share of occurrences of
# tags[i] that are directly followed by tags[j].
tags_matrix = np.zeros((len(tags), len(tags)), dtype='float32')

for i, t1 in enumerate(tags):
    for j, t2 in enumerate(tags):
        count_t2_t1, count_t1 = t2_given_t1(t2, t1)
        # A tag with no occurrences contributes a zero row entry.
        tags_matrix[i, j] = 0.0 if count_t1 == 0 else count_t2_t1 / count_t1

# Rows are the previous tag, columns the next tag.
tags_df = pd.DataFrame(tags_matrix, index=tags, columns=tags)
print(tags_df)

             .       ADJ       ADP       ADV      CONJ       DET      NOUN  \
.     0.109658  0.044266  0.099598  0.046278  0.056338  0.188129  0.231388   
ADJ   0.053226  0.061290  0.048387  0.003226  0.027419  0.003226  0.750000   
ADP   0.010717  0.082440  0.025556  0.007420  0.001649  0.428689  0.300082   
ADV   0.080000  0.120000  0.152727  0.069091  0.014545  0.069091  0.021818   
CONJ  0.009217  0.142857  0.059908  0.055300  0.000000  0.161290  0.331797   
DET   0.013587  0.210145  0.010870  0.009058  0.000000  0.005435  0.651268   
NOUN  0.222107  0.017962  0.230397  0.016926  0.039378  0.017617  0.254231   
NUM   0.148571  0.102857  0.148571  0.005714  0.040000  0.028571  0.417143   
PRON  0.064935  0.004329  0.051948  0.047619  0.008658  0.021645  0.000000   
PRT   0.038462  0.008547  0.068376  0.021368  0.004274  0.042735  0.038462   
VERB  0.069843  0.045039  0.172977  0.072454  0.009138  0.171671  0.143603   
X     0.000000  0.000000  0.000000  0.000000  0.000000  0.000000

In [41]:
def Viterbi(words, train_bag=train_tagged_words):
    """Tag `words` one at a time with the HMM emission/transition estimates.

    For each word, every candidate tag is scored as
    ``P(word | tag) * P(tag | previous tag)`` and the best-scoring tag is
    kept. Only the single previously chosen tag is carried forward, so this
    is a greedy left-to-right decode rather than a full lattice search.

    Parameters
    ----------
    words : sequence of word strings to tag.
    train_bag : iterable of (word, tag) pairs supplying the candidate tags
        and the emission counts. Transition probabilities are read from the
        notebook-level ``tags_df``.

    Returns
    -------
    list of (word, tag) pairs, one per input word, in input order.
    """
    state = []
    # Sorted so that ties (e.g. an unseen word scoring 0 for every tag) are
    # broken the same way in every kernel session instead of by set order.
    T = sorted({tag for _, tag in train_bag})

    for i, word in enumerate(words):
        # The first word is treated as if it followed a '.' token.
        prev_tag = '.' if i == 0 else state[-1]
        # Does not depend on the candidate tag, so check it once per word.
        prev_known = prev_tag in tags_df.index

        p = []
        for tag in T:
            # Tiny fallback probability when the previous tag has no row.
            trans_p = tags_df.loc[prev_tag, tag] if prev_known else 1e-6

            # Emission: counted on the same corpus that produced T, so the
            # `train_bag` argument is honoured end to end.
            emission_count, tag_count = word_given_tag(word, tag, train_bag)
            emission_p = emission_count / tag_count if tag_count > 0 else 1e-6

            # Combined score for this candidate tag.
            p.append(emission_p * trans_p)

        max_p = max(p)
        state.append(T[p.index(max_p)])

    return list(zip(words, state))

In [53]:
# Tag every word of the Brown test split as one continuous sequence and time it.
start = time.time()

test_words = [pair[0] for pair in test_tagged_words]
tagged_seq = Viterbi(test_words)

end = time.time()
difference = end - start

print("Time taken in seconds:", round(difference, 4))

# A prediction is correct when its (word, tag) pair equals the gold pair at
# the same position.
correct = [
    predicted
    for predicted, gold in zip(tagged_seq, test_tagged_words)
    if predicted == gold
]
accuracy = len(correct) / len(tagged_seq)

print("Viterbi Algorithm Accuracy:", round(accuracy * 100, 2), "%")

Time taken in seconds: 12.6162
Viterbi Algorithm Accuracy: 79.77 %
