# Part-of-Speech разметка, NER, извлечение отношений

## Задание 1. Написать теггер на данных с русским языком

1. проверить UnigramTagger, BigramTagger, TrigramTagger и их комбмнации
2. написать свой теггер как на занятии, попробовать разные векторайзеры, добавить знание не только букв но и слов
3. сравнить все реализованные методы сделать выводы

In [4]:
pip install pyconll

Collecting pyconll
  Downloading pyconll-3.1.0-py3-none-any.whl (26 kB)
Installing collected packages: pyconll
Successfully installed pyconll-3.1.0
You should consider upgrading via the '/usr/local/bin/python3.9 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [5]:
import pyconll

In [None]:
!mkdir datasets
!wget -O ./datasets/ru_syntagrus-ud-train.conllu https://raw.githubusercontent.com/UniversalDependencies/UD_Russian-SynTagRus/master/ru_syntagrus-ud-train.conllu
!wget -O ./datasets/ru_syntagrus-ud-dev.conllu https://raw.githubusercontent.com/UniversalDependencies/UD_Russian-SynTagRus/master/ru_syntagrus-ud-dev.conllu

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

from gensim.models import Word2Vec, FastText

from nltk.tag import DefaultTagger
from nltk.tag import UnigramTagger
from nltk.tag import BigramTagger, TrigramTagger
from nltk.tag import RegexpTagger
from nltk.corpus import names
import nltk
nltk.download('names')

ModuleNotFoundError: No module named 'matplotlib'

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

from scipy.sparse import hstack

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
full_train = pyconll.load_from_file('datasets/ru_syntagrus-ud-train.conllu')
full_test = pyconll.load_from_file('datasets/ru_syntagrus-ud-dev.conllu')

In [None]:
for sent in full_train[:2]:
    for token in sent:
        print(token.form, token.upos)
    print()

### 1.1 проверить UnigramTagger, BigramTagger, TrigramTagger и их комбмнации

In [None]:
fdata_train = []
for sent in full_train[:]:
    fdata_train.append([(token.form, token.upos) for token in sent])
    
fdata_test = []
for sent in full_test[:]:
    fdata_test.append([(token.form, token.upos) for token in sent])
    
fdata_sent_test = []
for sent in full_test[:]:
    fdata_sent_test.append([token.form for token in sent])
    
    
MAX_SENT_LEN = max(len(sent) for sent in full_train)
MAX_ORIG_TOKEN_LEN = max(len(token.form) for sent in full_train for token in sent)
print('Наибольшая длина предложения', MAX_SENT_LEN)
print('Наибольшая длина токена', MAX_ORIG_TOKEN_LEN)

In [None]:
default_tagger = nltk.DefaultTagger('NOUN')
default_tagger.evaluate(fdata_test)

In [None]:
unigram_tagger = UnigramTagger(fdata_train)
unigram_tagger.evaluate(fdata_test)

In [None]:
bigram_tagger = BigramTagger(fdata_train, backoff=unigram_tagger)
bigram_tagger.evaluate(fdata_test)

In [None]:
trigram_tagger = TrigramTagger(fdata_train, backoff=bigram_tagger)
trigram_tagger.evaluate(fdata_test)

In [None]:
def backoff_tagger(train_sents, tagger_classes, backoff=None):
    for cls in tagger_classes:
        backoff = cls(train_sents, backoff=backoff)
    return backoff


backoff = DefaultTagger('NOUN') 
tag = backoff_tagger(fdata_train,  
                     [
                      UnigramTagger, 
                      BigramTagger, 
                      TrigramTagger
                     ],  
                     backoff = backoff) 
  
tag.evaluate(fdata_test)

## 1.2 написать свой теггер как на занятии, попробовать разные векторайзеры, добавить знание не только букв но и слов

In [None]:
train_tok = []
train_label = []
for sent in fdata_train[:]:
    for tok in sent:
        train_tok.append(tok[0])
        train_label.append('NO_TAG' if tok[1] is None else tok[1])
        
test_tok = []
test_label = []
for sent in fdata_test[:]:
    for tok in sent:
        test_tok.append(tok[0])
        test_label.append('NO_TAG' if tok[1] is None else tok[1])
        
        
le = LabelEncoder()
train_enc_labels = le.fit_transform(train_label) 
test_enc_labels = le.transform(test_label)
le.classes_

In [None]:
for vectorizer in [CountVectorizer, HashingVectorizer, TfidfVectorizer]:

    scaler = StandardScaler(with_mean=False)
    coder = vectorizer(ngram_range=(1, 5), analyzer='char')
    

    X_train = coder.fit_transform(train_tok)
    X_test = coder.transform(test_tok)
    
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.fit_transform(X_test)    
    
    
    print(X_train.shape)
    lr = LogisticRegression(random_state=0, max_iter = 100, n_jobs=7)
    lr.fit(X_train, train_enc_labels)

    pred = lr.predict(X_test)

    print(vectorizer, accuracy_score(test_enc_labels, pred))

In [None]:
for vectorizer in [CountVectorizer, HashingVectorizer, TfidfVectorizer]:

    scaler = StandardScaler(with_mean=False)
    coder = vectorizer(ngram_range=(1, 5), analyzer='word')
    

    X_train = coder.fit_transform(train_tok)
    X_test = coder.transform(test_tok)
    
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.fit_transform(X_test)    
    
    
    print(X_train.shape)
    lr = LogisticRegression(random_state=0, max_iter = 100, n_jobs=7)
    lr.fit(X_train, train_enc_labels)

    pred = lr.predict(X_test)

    print(vectorizer, accuracy_score(test_enc_labels, pred))

In [None]:
scaler = StandardScaler(with_mean=False)
coder_1 = TfidfVectorizer(ngram_range=(1, 5), analyzer='char')
coder_2 = HashingVectorizer(ngram_range=(1, 5), analyzer='word')

X_train_1 = coder_1.fit_transform(train_tok)
X_test_1 = coder_1.transform(test_tok)

X_train_2 = coder_2.fit_transform(train_tok)
X_test_2 = coder_2.transform(test_tok)


X_train = hstack((X_train_1,X_train_2))
X_test = hstack((X_test_1,X_test_2))

X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)    


print(X_train.shape)
lr = LogisticRegression(random_state=0, max_iter = 100, n_jobs=7)
lr.fit(X_train, train_enc_labels)

pred = lr.predict(X_test)

print('TfidfVectorizer_char + HashingVectorizer_word :', accuracy_score(test_enc_labels, pred))

__Выводы:__

Для nltk.tag лучший вариант это: Комбинация из DefaultTagger UnigramTagger BigramTagger TrigramTagger
0.9119991237825633

Для Vectorizer лучший вариант это: Комбинация из LogisticRegression поверх TfidfVectorizer при условии analyzer='char'
0.9487749806221144

## Задание 2. Проверить насколько хорошо работает NER
данные брать из http://www.labinform.ru/pub/named_entities/
1. взять нер из nltk
2. проверить deeppavlov
3. написать свой нер попробовать разные подходы:
 - передаём в сетку токен и его соседей
 - передаём в сетку только токен
4. сделать выводы по вашим экспериментам какой из подходов успешнее справляется


## 2.1 взять нер из nltk

In [None]:
!pip install corus

In [None]:
import corus

In [None]:
from corus import load_ne5

In [None]:
!wget http://www.labinform.ru/pub/named_entities/collection5.zip

In [None]:
import nltk
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

In [None]:
!ls

In [None]:
# !unzip collection5.zip

In [None]:
path = 'Collection5/'
records = load_ne5(path)

In [None]:
document = next(records).text

In [None]:
document

In [None]:
{(' '.join(c[0] for c in chunk), chunk.label() ) for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(document))) if hasattr(chunk, 'label') }

## 2.2. проверить deeppavlov

In [None]:
# !pip install deeppavlov

In [None]:
# не пошла установка

## 2.3 написать свой нер попробовать разные подходы:
- передаём в сетку токен и его соседей
- передаём в сетку только токен

In [None]:
!pip install razdel

In [None]:
from razdel import tokenize

In [None]:
records = corus.load_ne5(path)
words_docs = []
for ix, rec in enumerate(records):
    words = []
    for token in tokenize(rec.text):
       
        result = 'None'        
        
        for item in rec.spans:            
            if (token.start >= item.start) and (token.stop <= item.stop) and (item.type == 'PER'):
                result = 'PER'
                break
            if (token.start >= item.start) and (token.stop <= item.stop) and (item.type == 'ORG'):
                result = 'ORG'
                break
            if (token.start >= item.start) and (token.stop <= item.stop) and (item.type == 'MEDIA'):
                result = 'MEDIA'
                break
            if (token.start >= item.start) and (token.stop <= item.stop) and (item.type == 'LOC'):
                result = 'LOC'
                break
            if (token.start >= item.start) and (token.stop <= item.stop) and (item.type == 'GEOPOLIT'):
                result = 'GEOPOLIT'
                break
                
    
        words.append([token.text, result])
    words_docs.extend(words)

In [None]:
df_words = pd.DataFrame(words_docs, columns=['word', 'tag'])

In [None]:
df_words['tag'].value_counts()

In [None]:
df_words.head(5)

In [None]:
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D, GlobalMaxPooling1D, Conv1D, GRU, LSTM, Dropout, Input
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [None]:
from sklearn import model_selection, preprocessing, linear_model

train_x, valid_x, train_y, valid_y = model_selection.train_test_split(df_words['word'], df_words['tag'])

# labelEncode целевую переменную
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)

In [None]:
train_data = tf.data.Dataset.from_tensor_slices((train_x, train_y))
valid_data = tf.data.Dataset.from_tensor_slices((valid_x, valid_y))

train_data = train_data.batch(16)
valid_data = valid_data.batch(16)

In [None]:
AUTOTUNE = tf.data.AUTOTUNE

train_data = train_data.cache().prefetch(buffer_size=AUTOTUNE)
valid_data = valid_data.cache().prefetch(buffer_size=AUTOTUNE)

In [None]:
def custom_standardization(input_data):
        return input_data

def data_prep(train_data, seq_len=1, vocab_size = 30000):    
    
    vocab_size = 30000
    #seq_len = 1

    vectorize_layer = TextVectorization(
        standardize=custom_standardization,
        max_tokens=vocab_size,
        output_mode='int',
        output_sequence_length=seq_len)


    # Make a text-only dataset (no labels) and call adapt to build the vocabulary.
    text_data = train_data.map(lambda x, y: x)
    vectorize_layer.adapt(text_data)
    return vectorize_layer

In [None]:
embedding_dim = 64

class modelNER(tf.keras.Model):
    def __init__(self):
        super(modelNER, self).__init__()
        self.emb = Embedding(vocab_size, embedding_dim)
        self.gPool = GlobalMaxPooling1D()
        self.fc1 = Dense(300, activation='relu')
        self.fc2 = Dense(50, activation='relu')
        self.fc3 = Dense(len(df_words['tag'].value_counts()), activation='softmax')

    def call(self, x):
        x = vectorize_layer(x)
        x = self.emb(x)
        pool_x = self.gPool(x)
        
        fc_x = self.fc1(pool_x)
        fc_x = self.fc2(fc_x)
        
        concat_x = tf.concat([pool_x, fc_x], axis=1)
        return self.fc3(concat_x)

In [None]:
vocab_size = 30000
vectorize_layer = data_prep(train_data, seq_len = 1, vocab_size = vocab_size)


mmodel = modelNER()
mmodel.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(),
              metrics=['accuracy'])
mmodel.fit(train_data, validation_data=valid_data, epochs=5)

In [None]:
vocab_size = 30000
vectorize_layer = data_prep(train_data, seq_len = 3, vocab_size = vocab_size)


mmodel = modelNER()
mmodel.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(),
              metrics=['accuracy'])
mmodel.fit(train_data, validation_data=valid_data, epochs=5)

#### Вывод.

Длина последовательности практически не влияет на результат