# Ex2 - NLP

In [1]:
import nltk
nltk.download("treebank")


[nltk_data] Downloading package treebank to /home/nbuser/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.


True

In [2]:
from collections import Counter, defaultdict
import random
import numpy as np
import operator
from nltk.corpus import treebank
len(treebank.tagged_sents())


3914

In [3]:
train_data = treebank.tagged_sents()[:3000]
test_data = treebank.tagged_sents()[3000:]
print(train_data[0])

[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')]


## simple tager

In [7]:
class simple_tagger:
    def __init__(self):
        self.map = {}
        self.pos_list = []
        self.words_result = 0
        self.sentences_result = 0
    
    def train(self, data):
        corpus_counts = defaultdict(Counter)
        for d in data:
            for word, pos in d:
                if pos not in self.pos_list:
                    self.pos_list.append(pos)
                corpus_counts[word][pos] +=1
        for word in corpus_counts:
            # takefind the most common pos, if there is tie - the tie breaker is random
            l = corpus_counts[word].most_common(1000)
            options = [l[0][0]]
            j=1
            while len(l)>j and l[j][1] == l[j-1][1]:
                options.append(l[j][0])
                j += 1
            value = random.choice(options)
            self.map[word] = value
            
        
    def evaluate(self, data):
        count = 0
        words_success = 0
        sentences_success = 0
        for d in data:
            sentence_flag = True
            for word, pos in d:
                count += 1
                if word in self.map:
                    suggested_pos = self.map[word]
                else:
                    suggested_pos = random.sample(self.pos_list, k=1)[0]
                if suggested_pos == pos:
                    words_success += 1
                else:
                    sentence_flag = False
            if sentence_flag:
                sentences_success += 1
        self.words_result = words_success/count # word accuracy 
        self.sentences_result = sentences_success/len(data) # sentence accuracy
                
        
c = simple_tagger()
c.train(train_data)
c.evaluate(test_data)


In [8]:
print("Simple tagger word result: " + str(c.words_result))
print("Simple tagger sentences result: " + str(c.sentences_result))


Simple tagger word result: 0.8598748111374919
Simple tagger sentences result:0.06892778993435449


## HMM tagger

In [16]:
class hmm_tagger:
    def __init__(self):
        self.pos_dict = {}
        self.words_dict = {}
        self.A = None
        self.B = None
        self.PI = None
        self.words_result = 0
        self.sentences_result = 0
        
    def get_b_matrix(self, corpus_counts, pos_count):
        B = np.zeros((len(self.pos_dict), len(self.words_dict)), dtype=float)
        for word in corpus_counts:
            word_index = self.words_dict[word]
            for pos in corpus_counts[word]:
                pos_index = self.pos_dict[pos]
                B[pos_index][word_index] = corpus_counts[word][pos]/pos_count[pos]
        return B
    
    def get_pi_matrix(self, pos_count):
        PI = np.zeros(len(self.pos_dict), dtype=float) # intilazie pi vector
        for pos in pos_count:
            PI[self.pos_dict[pos]] = pos_count[pos]
        PI = PI/sum(pos_count.values()) 
        return PI
    
    def get_a_matrix(self, data):
        A = np.zeros((len(self.pos_dict),len(self.pos_dict)), dtype=float) # intilazie A Matrix
        pos_counter = np.zeros(len(self.pos_dict))
        for line in data:
            last_pos = ""
            for word, pos in line:
                if (last_pos == ""):
                    last_pos = pos
                    continue
                A[self.pos_dict[last_pos]][self.pos_dict[pos]] += 1
                pos_counter[self.pos_dict[last_pos]] += 1
                last_pos = pos
        for pos, pos_index in self.pos_dict.items():
            A[pos_index] = A[pos_index]/pos_counter[pos_index] #normalize A
        return A
            
        
    def train(self, data): #train HMM 
        corpus_counts = defaultdict(Counter)
        pos_count = {}
        word_index = 0
        pos_index = 0
        for d in data:
            for word, pos in d:
                if pos not in self.pos_dict:
                    self.pos_dict[pos] = pos_index
                    pos_count[pos] = 0
                    pos_index = pos_index + 1 
                if word not in self.words_dict:
                    self.words_dict[word] = word_index
                    word_index = word_index + 1
                corpus_counts[word][pos] +=1
                pos_count[pos] += 1
        self.B = self.get_b_matrix(corpus_counts, pos_count)
        self.PI = self.get_pi_matrix(pos_count)
        self.A = self.get_a_matrix(data)
        
    def viterbi(self,word_list, A, B, Pi): #evlautate HMM
        # initialization
        T = len(word_list)
        N = A.shape[0] # number of tags

        delta_table = np.zeros((N, T)) # initialise delta table
        psi = np.zeros((N, T))  # initialise the best path table

        delta_table[:,0] = B[:, word_list[0]] * Pi

        for t in range(1, T):
            for s in range (0, N):
                trans_p = delta_table[:, t-1] * A[:, s]
                psi[s][t], delta_table[s][ t] = max(enumerate(trans_p), key=operator.itemgetter(1))
                delta_table[s][t] = delta_table[s][t] * B[s][word_list[t]]

        # Back tracking
        seq = np.zeros(T)
        seq[T-1] = delta_table[:, T-1].argmax()
        for t in range(T-1, 0, -1):
            seq[t-1] = psi[int(seq[t])][t]

        return seq
    
    def evaluate(self, data):
        sentence_success = 0
        sentence_counter = 0
        word_counter = 0 
        word_success = 0
        for line in data:
            list_of_pos = []
            list_of_words = []
            seq = None
            for word, pos in line:
                if word in self.words_dict:
                    list_of_words.append(self.words_dict[word])
                else:
                    list_of_words.append(np.random.choice(list(self.words_dict.values())))
                    if len(list_of_words)>0:
                        if seq is not None:
                            seq = np.append(seq, self.viterbi(list_of_words, self.A, self.B, self.PI))
                        else:
                            seq = self.viterbi(list_of_words, self.A, self.B, self.PI)
                    list_of_words = []
            if (seq is not None and len(list_of_words)>0):
                seq = np.append(seq, self.viterbi(list_of_words, self.A, self.B, self.PI))
            elif len(list_of_words)>0:
                seq = self.viterbi(list_of_words, self.A, self.B, self.PI)    
            sentences_success_flag = True
            index = 0
            for word, pos in line:
                word_counter += 1
                if int(self.pos_dict[pos]) == int(seq[index]):
                    word_success += 1
                else:
                    sentences_success_flag = False
                index += 1
            if sentences_success_flag:
                sentence_success += 1
        self.words_result = word_success/word_counter # word accuracy 
        self.sentences_result = sentence_success/len(data) # sentence accuracy    
                                    
hmm = hmm_tagger()
hmm.train(train_data)


In [17]:
hmm.evaluate(test_data)
print("HMM tagger word result: " + str(hmm.words_result))
print("HMM tagger sentences result: " + str(hmm.sentences_result))

HMM tagger word result: 0.8682926829268293
HMM tagger sentences result: 0.10393873085339168


## results

In [6]:
from nltk.tag import tnt
tnt_pos_tagger = tnt.TnT()
tnt_pos_tagger.train(train_data)
print("MEMM tagger tagger word result: "+ str(tnt_pos_tagger.evaluate(test_data)))
counter=0
for line in test_data:
    if tnt_pos_tagger.evaluate([line])>0.95:
        counter += 1
print("MEMM tagger tagger sentences result: "+ str(counter/len(test_data)))



MEMM tagger tagger word result: 0.875545003237643
MEMM tagger tagger sentences result: 0.17067833698030635


EOF