### Тема «POS-tagger и NER»

#### Задание. Написать теггер на данных с русским языком

1) Проверить UnigramTagger, BigramTagger, TrigramTagger и их комбинации
2) Написать свой теггер как на занятии, попробовать разные векторайзеры, добавить знание не только букв но и слов
3) Сравнить все реализованные методы, сделать выводы

In [1]:
import pandas as pd

import pyconll

import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import DefaultTagger, UnigramTagger, BigramTagger, TrigramTagger

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

import xgboost as xgb

import matplotlib
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [2]:
train = pyconll.load_from_file('datasets/ru_syntagrus-ud-train-c.conllu')
test = pyconll.load_from_file('datasets/ru_syntagrus-ud-dev.conllu')

Задание 1

In [3]:
train_data = []
for sent in train[:]:
    train_data.append([(token.form, token.upos) for token in sent])

test_data = []
for sent in test[:]:
    test_data.append([(token.form, token.upos) for token in sent])

test_sents=[]      
for sent in test[:]:
    test_sents.append([token.form for token in sent])

In [4]:
default_tagger = nltk.DefaultTagger('NN')
default_tagger.evaluate(test_data)

0.0

In [5]:
unigram_tagger = UnigramTagger(train_data)
unigram_tagger.evaluate(test_data)

0.8013933198775962

In [6]:
bigram_tagger = BigramTagger(train_data, backoff=unigram_tagger)
bigram_tagger.evaluate(test_data)

0.8057946480890683

In [7]:
trigram_tagger = TrigramTagger(train_data, backoff=bigram_tagger)
trigram_tagger.evaluate(test_data)

0.8043687740087245

In [8]:
def backoff_tagger(train_sents, tagger_classes, backoff=None):
    for cls in tagger_classes:
        backoff = cls(train_sents, backoff=backoff)
    return backoff


backoff = DefaultTagger('NN') 
tag = backoff_tagger(train_data,  
                     [UnigramTagger, BigramTagger, TrigramTagger],  
                     backoff = backoff) 
  
tag.evaluate(test_data)

0.8046292076307051

Задание 2

In [9]:
train_tok = []
train_label = []

for sent in train_data[:]:
    for tok in sent:
        train_tok.append('NO_TOKEN' if tok[0] is None else tok[0])
        train_label.append('NO_TAG' if tok[1] is None else tok[1])
        
test_tok = []
test_label = []

for sent in test_data[:]:
    for tok in sent:
        test_tok.append('NO_TOKEN' if tok[0] is None else tok[0])
        test_label.append('NO_TAG' if tok[1] is None else tok[1])

In [10]:
le = LabelEncoder()

train_enc_labels = le.fit_transform(train_label)
test_enc_labels = le.transform(test_label)

In [11]:
def calculate_accuracy(Vectorizer, Regression, ngram=(1, 5), analyzer='char'):
    vectorizer = Vectorizer(ngram_range=ngram, analyzer=analyzer)

    X_train = vectorizer.fit_transform(train_tok)
    X_test = vectorizer.transform(test_tok)

    r = Regression
    r.fit(X_train, train_enc_labels)
    
    pred = r.predict(X_test)
      
    accuracy = accuracy_score(test_enc_labels, pred)

    return accuracy

In [12]:
hashing_vectorizer_ngram15 = calculate_accuracy(HashingVectorizer, LogisticRegression(random_state=0), ngram=(1, 5))
hashing_vectorizer_ngram25 = calculate_accuracy(HashingVectorizer, LogisticRegression(random_state=0), ngram=(2, 5))
count_vectorizer = calculate_accuracy(CountVectorizer, LogisticRegression(random_state=0), ngram=(1, 5))
tfidf_vectorizer = calculate_accuracy(TfidfVectorizer, LogisticRegression(random_state=0), ngram=(1, 5))
count_vectorizer_word = calculate_accuracy(CountVectorizer, LogisticRegression(random_state=0), ngram=(1, 2), analyzer='word')
tfidf_vectorizer_word = calculate_accuracy(TfidfVectorizer, LogisticRegression(random_state=0), ngram=(1, 2), analyzer='word')
count_vectorizer_xgb = calculate_accuracy(CountVectorizer, xgb.XGBClassifier(verbosity=0), ngram=(1, 5))
tfidf_vectorizer_xgb = calculate_accuracy(TfidfVectorizer, xgb.XGBClassifier(verbosity=0), ngram=(1, 5))

Задание 3

In [13]:
vectorizers = ['hashing_vectorizer_ngram15', 'hashing_vectorizer_ngram25', 'count_vectorizer', 'tfidf_vectorizer', 
               'count_vectorizer_word', 'tfidf_vectorizer_word', 'count_vectorizer_xgb', 'tfidf_vectorizer_xgb']
accuracy_scores = [hashing_vectorizer_ngram15, hashing_vectorizer_ngram25, count_vectorizer, tfidf_vectorizer, 
                   count_vectorizer_word, tfidf_vectorizer_word, count_vectorizer_xgb, tfidf_vectorizer_xgb]

result = pd.DataFrame({'Vectorizer': vectorizers, 
                       'Accuracy': accuracy_scores})

result.sort_values('Accuracy', ascending=False)

Unnamed: 0,Vectorizer,Accuracy
2,count_vectorizer,0.931213
3,tfidf_vectorizer,0.919155
7,tfidf_vectorizer_xgb,0.911941
0,hashing_vectorizer_ngram15,0.909603
6,count_vectorizer_xgb,0.90627
1,hashing_vectorizer_ngram25,0.825822
5,tfidf_vectorizer_word,0.61269
4,count_vectorizer_word,0.60834
