# Ex2 - NLP

In [1]:
# Download the NLTK "treebank" corpus package; later cells load it through nltk.corpus.treebank.
import nltk
nltk.download("treebank")


[nltk_data] Downloading package treebank to /home/nbuser/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.


True

In [17]:
from collections import Counter, defaultdict
import random
import numpy as np
from nltk.corpus import treebank
# Number of tagged sentences in the corpus (displayed as the cell output).
len(treebank.tagged_sents())


3914

In [3]:
# Split the corpus once: the first TRAIN_SIZE sentences are used for training,
# everything after them is held out for evaluation.
TRAIN_SIZE = 3000
tagged_sents = treebank.tagged_sents()
train_data = tagged_sents[:TRAIN_SIZE]
test_data = tagged_sents[TRAIN_SIZE:]
# Show one training sentence to illustrate the (word, tag) format.
print(train_data[0])

[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')]


## Simple tagger

In [7]:
class simple_tagger:
    """Baseline tagger that gives every word its most frequent training tag.

    Attributes:
        map: dict mapping each word seen in training to the tag chosen for it.
        pos_list: list of the distinct tags seen in training, in order of
            first appearance.
        words_result: fraction of words tagged correctly by the last call to
            ``evaluate``.
        sentences_result: fraction of sentences tagged without a single
            mistake by the last call to ``evaluate``.
    """

    def __init__(self):
        self.map = {}
        self.pos_list = []
        self.words_result = 0
        self.sentences_result = 0

    def train(self, data):
        """Learn the most frequent tag of every word in ``data``.

        Args:
            data: iterable of sentences, each an iterable of (word, tag)
                pairs.
        """
        corpus_counts = defaultdict(Counter)
        # Set mirror of pos_list so the membership test is O(1) while
        # pos_list itself keeps the first-appearance order.
        known_tags = set(self.pos_list)
        for sentence in data:
            for word, pos in sentence:
                if pos not in known_tags:
                    known_tags.add(pos)
                    self.pos_list.append(pos)
                corpus_counts[word][pos] += 1
        for word, tag_counts in corpus_counts.items():
            # Take the most common tag; if several tags share the top count,
            # the tie is broken at random.
            top_count = max(tag_counts.values())
            options = [pos for pos, n in tag_counts.items() if n == top_count]
            self.map[word] = random.choice(options)

    def evaluate(self, data):
        """Tag ``data`` and store word-level and sentence-level accuracy.

        Words that were never seen in training receive a tag drawn uniformly
        at random from ``pos_list``. If the tagger has not been trained at
        all, such words are simply counted as wrong.

        The scores are written to ``words_result`` and ``sentences_result``;
        both are 0.0 when ``data`` contains nothing to score.

        Args:
            data: iterable of sentences, each an iterable of (word, tag)
                pairs holding the gold tags.
        """
        word_total = 0
        words_success = 0
        sentence_total = 0
        sentences_success = 0
        for sentence in data:
            sentence_total += 1
            sentence_flag = True
            for word, pos in sentence:
                word_total += 1
                if word in self.map:
                    suggested_pos = self.map[word]
                elif self.pos_list:
                    suggested_pos = random.choice(self.pos_list)
                else:
                    # Untrained tagger: there is no tag to guess from.
                    suggested_pos = None
                if suggested_pos == pos:
                    words_success += 1
                else:
                    sentence_flag = False
            if sentence_flag:
                sentences_success += 1
        # Guard against empty input instead of dividing by zero.
        self.words_result = words_success / word_total if word_total else 0.0
        self.sentences_result = (
            sentences_success / sentence_total if sentence_total else 0.0
        )
                
        
# Seed the global RNG so the random tie-breaks made during training and the
# random guesses for unseen words give the same scores on every run.
random.seed(0)
c = simple_tagger()
c.train(train_data)
c.evaluate(test_data)


In [8]:
# Report the word-level and sentence-level accuracy stored on the tagger.
print("Simple tagger word result: ", c.words_result, sep="")
print("Simple tagger sentences result: ", c.sentences_result, sep="")


Simple tagger word result: 0.8598748111374919
Simple tagger sentences result:0.06892778993435449


## HMM tagger

In [53]:
class hmm_tagger:
    """HMM part-of-speech tagger; currently only parameter estimation exists.

    Attributes:
        pos_dict: dict mapping each tag to its index in PI, A and the
            columns of B.
        words_dict: dict mapping each word to its row index in B.
        PI: 1-D array of initial-tag frequencies, or None before ``train``.
        A: square tag-to-tag transition matrix, or None before ``train``.
        B: word-by-tag matrix, or None before ``train``.
    """

    def __init__(self):
        self.pos_dict = {}
        self.words_dict = {}
        # Model parameters; filled in by train().
        self.PI = None
        self.A = None
        self.B = None

    def get_b_matrix(self, corpus_counts):
        """Build the word-by-tag matrix from co-occurrence counts.

        Args:
            corpus_counts: mapping word -> Counter of tag -> count. Every
                word and tag must already be in ``words_dict``/``pos_dict``.

        Returns:
            Array of shape (len(words_dict), len(pos_dict)) where
            ``B[w][t] = count(w, t) / count(w)``.
        """
        B = np.zeros((len(self.words_dict), len(self.pos_dict)), dtype=float)
        for word, tag_counts in corpus_counts.items():
            # NOTE(review): rows are normalised per word, i.e. P(tag | word).
            # A textbook HMM emission matrix is normalised per tag,
            # P(word | tag) - confirm which one the decoder should use.
            total = sum(tag_counts.values())
            if not total:
                continue
            row = B[self.words_dict[word]]
            for pos, n in tag_counts.items():
                row[self.pos_dict[pos]] = n / total
        return B

    def get_pi_matrix(self, data):
        """Estimate how often each tag starts a sentence.

        Args:
            data: iterable of sentences, each a sequence of (word, tag)
                pairs. Empty sentences are ignored.

        Returns:
            1-D array of length len(pos_dict) holding, for each tag, the
            fraction of non-empty sentences that begin with it (all zeros
            when there is no such sentence).
        """
        PI = np.zeros(len(self.pos_dict))
        starts = 0
        for line in data:
            if not line:
                # An empty sentence has no first tag to count.
                continue
            _, pos = line[0]
            PI[self.pos_dict[pos]] += 1
            starts += 1
        return PI / starts if starts else PI

    def get_a_matrix(self, data):
        """Estimate tag-to-tag transition frequencies.

        Args:
            data: iterable of sentences, each a sequence of (word, tag)
                pairs.

        Returns:
            Array of shape (len(pos_dict), len(pos_dict)) where ``A[i][j]``
            is the fraction of transitions out of tag i that go to tag j.
            Rows of tags that are never followed by another tag stay zero.
        """
        n_tags = len(self.pos_dict)
        A = np.zeros((n_tags, n_tags))
        for line in data:
            tags = [self.pos_dict[pos] for _, pos in line]
            for prev_index, next_index in zip(tags, tags[1:]):
                A[prev_index][next_index] += 1
        row_totals = A.sum(axis=1, keepdims=True)
        # Normalise each row in place, skipping all-zero rows to avoid 0/0.
        np.divide(A, row_totals, out=A, where=row_totals > 0)
        return A

    def train(self, data):
        """Index the vocabulary and tag set, then estimate B, PI and A.

        Args:
            data: sequence of sentences, each a sequence of (word, tag)
                pairs. It is iterated more than once, so it must not be a
                one-shot generator.
        """
        corpus_counts = defaultdict(Counter)
        for sentence in data:
            for word, pos in sentence:
                # The next free index is the current dict size, so indices
                # stay unique even if train() is called more than once.
                self.pos_dict.setdefault(pos, len(self.pos_dict))
                self.words_dict.setdefault(word, len(self.words_dict))
                corpus_counts[word][pos] += 1
        self.B = self.get_b_matrix(corpus_counts)
        self.PI = self.get_pi_matrix(data)
        self.A = self.get_a_matrix(data)

    def evaluate(self, data):
        """Placeholder: decoding and scoring are not implemented yet."""
        # TODO: decode ``data`` with PI, A and B and compute accuracy.
        print("dor")
# Instantiate the HMM tagger and fit it on the training split.
hmm = hmm_tagger()
hmm.train(train_data)

[0.17666667 0.         0.00866667 0.04266667 0.03633333 0.00033333
 0.00066667 0.228      0.03666667 0.13266667 0.         0.003
 0.00333333 0.05566667 0.         0.00133333 0.02433333 0.04733333
 0.00166667 0.071      0.00066667 0.00066667 0.         0.
 0.00766667 0.00133333 0.         0.08766667 0.00533333 0.00033333
 0.00466667 0.00266667 0.003      0.007      0.00166667 0.00166667
 0.         0.002      0.         0.00033333 0.00033333 0.
 0.00033333 0.         0.00233333 0.        ]
0.9999999999999998
