In [2]:
import re
from conllu import parse
import numpy as np
from collections import Counter, OrderedDict, defaultdict
import nltk
from nltk.corpus import conll2000

In [3]:
train = 'en_ewt-ud-train.conllu'
with open (train, encoding = 'utf-8') as f:
    data_train = f.read()
train_sent = parse(data_train)

In [4]:
test = 'en_ewt-ud-test.conllu'
with open (test, encoding = 'utf-8') as f:
    data_test = f.read()
test_sent = parse(data_train)

In [5]:
train_sents = []
for sent in train_sent:
    sent_list = []
    for token in sent:
        sent_list.append(tuple((token['form'], token['upostag'])))
    train_sents.append(sent_list)

In [6]:
test_sents = []
for sent in test_sent:
    sent_list = []
    for token in sent:
        sent_list.append(tuple((token['form'], token['upostag'])))
    test_sents.append(sent_list)

In [15]:
class BigramTagger:
    def __init__(self, train_sents):
        self.alpha = 1 ## чтобы не делить на ноль. Может, не один надо добавлять?..
        tags = [pair[1] for sent in train_sents for pair in sent]
        self.tagset = list(set(tags))
        self.tags = {tag:i for i, tag in enumerate(['<b>'] + self.tagset + ['<e>'] + ['UNK'])} ##переводит теги в индексы в матрице
        words = [pair[0] for sent in train_sents for pair in sent]
        vocabulary = list(set(words))
        self.vocab = {word:i for i, word in enumerate(vocabulary)} ##переводит слова в индексы в матрице
        self.emission = np.zeros((len(self.tags), (len(self.vocab)))) ##[tag][worrd]
        self.transition = np.zeros((len(self.tags), len(self.tags))) ## [tag_i-1][tag_i]
        self.update_trans(train_sents)
        self.update_emis(train_sents)
        self.emission = self.normalize(self.emission)
        self.transition = self.normalize(self.transition)
        #self.classify_sent(test_sents)
        
            
        
    def update_trans(self, train_sents):
        for sent in train_sents:
            tags = [pair[1] for pair in sent]
            tags_seq = [tag for tag in (['<b>'] + tags + ['<e>'] + ['UNK'])]
            for i in range(1, len(tags_seq)):
                t_i = self.tags[tags_seq[i]]
                t_imin1 = self.tags[tags_seq[i-1]]
                self.transition[t_imin1][t_i] +=1
        self.transition += self.alpha
                
    def update_emis(self, train_sents):
        for sent in train_sents:
            for pair in sent:
                word = self.vocab[pair[0]]
                tag = self.tags[pair[1]]
                self.emission[tag][word] +=1
        self.emission += self.alpha
        
    def normalize(self, trellis):
        trellis = [[trellis[t][w]/np.sum(tag)  for w, word in enumerate(tag)] for t, tag in enumerate(trellis)]
        return trellis
    
    def classify_ex(self, word_i, tag_imin1): #на вход слово i ('word') и tag i-1
        if word_i not in self.vocab:
            tag = 'UNK'
        else:
            word_ind = self.vocab[word_i]
            max_prob = 0
            for t in self.tagset:
                tag_i_ind = self.tags[t]
                tag_imin1_ind = self.tags[tag_imin1]
                prob = self.emission[tag_i_ind][word_ind] * self.transition[tag_imin1_ind][tag_i_ind] ## word i tag i, tag i tag i-1
                if prob > max_prob:
                    max_prob, tag = prob, t
        return tag
    
    def classify_sent(self, sent): #на вход список слов ['word1', 'word2'...]
        tags_seq = ['<b>']
        for i in range(len(sent)):
            tag = self.classify_ex(sent[i],tags_seq[i]) #одинаковые индексы! потому что тэги начинаются с <b> а предложение нет
            tags_seq.append(tag)
        tags_seq.remove('<b>')
        return list(zip(sent, tags_seq))
    
    #def classify(self, test_sents):
    #    all_tags = []
    #    for sent in test_sents:
    #        sent_tags = self.classify_sent(sent)
    #        all_tags.append(sent_tags)
    #    return all_tags
        
        



In [16]:
model = BigramTagger(train_sents)

In [9]:
def accuracy(test_sents, postagger):
    errors = 0
    length = 0
    for sent in test_sents:
        length += len(sent)
        sent, real_tags = zip(*sent)  # что тут произошло?
        my_tags = postagger.classify_sent(sent)
        for i in range(len(my_tags)):
            if my_tags[i][1] != real_tags[i]:
                errors += 1
    return 1 - errors / length

In [17]:
acc_ud = accuracy(test_sents, model)

In [18]:
train_nltk = conll2000.tagged_sents()[:8000]
test_nltk = conll2000.tagged_sents()[8000:]

model_nltk = BigramTagger(train_nltk)
acc_nltk = accuracy(test_nltk, model_nltk)

In [19]:
print('Точность модели, обученной и протестированной на корпусе UD: %s' % acc_ud)
print('Точность модели, обученной и протестированной на корпусе  conll2000: %s' % acc_nltk)

Точность модели, обученной и протестированной на корпусе UD: 0.8522728938892609
Точность модели, обученной и протестированной на корпусе  conll2000: 0.8614593239387913
