In [428]:
import pandas as pd
import pymystem3 
import spacy
nlp = spacy.load("ru_core_news_lg")

### Data Load

In [429]:
df_manual = []
with open("../edz_dataset.txt", "r", encoding="utf-8") as f:
    for line in f:
        line = line.replace("\ufeff" ,"")
        line = line.replace("\n" ,"")
        df_manual.append([line, 1])
        
with open("../edz_neg_dataset.txt", "r", encoding="utf-8") as f:
    for line in f:
        line = line.replace("\ufeff" ,"")
        line = line.replace("\n" ,"")
        df_manual.append([line, 0])
df_manual = pd.DataFrame(df_manual, columns=["text", "label"])


In [430]:
df_manual

Unnamed: 0,text,label
0,завтра буду учить английский весь день,1
1,Завтра я пойду на работу,1
2,послезавтра мы планируем ехать на дачу,1
3,к вечеру я планирую закончить отчет,1
4,в воскресенье после обеда мы пойдем в ресторан,1
...,...,...
434,Мой отец увлекается историей автомобилей и кол...,0
435,Знакомый одноклассник работает в издательстве ...,0
436,Моя сестра изучает психологию и помогает людям...,0
437,Твой отец увлекается историей автомобилей и ко...,0


#### SpaCy

In [460]:
tags = []
doc = nlp(df_manual.loc[0, "text"])
for token in doc:
    tags.append([token.text, token.lemma_, token.ent_type_, token.pos_, token.tag_, token.dep_, token.head])
postok = {str(i): list(doc).index(i) for i in doc}
pd.DataFrame(tags, columns = ["Token", "lemma", "entity", "full", "entity", "dep type", "head"])

Unnamed: 0,Token,lemma,entity,full,entity.1,dep type,head
0,завтра,завтра,,ADV,ADV,advmod,учить
1,буду,буду,,AUX,AUX,aux,учить
2,учить,учить,,VERB,VERB,ROOT,учить
3,английский,английский,,ADJ,ADJ,obj,учить
4,весь,весь,,DET,DET,det,день
5,день,день,,NOUN,NOUN,obl,учить


#### MyStem

In [461]:
stem = pymystem3.Mystem()
stem.analyze(df_manual.loc[0, "text"])

[{'analysis': [{'lex': 'завтра', 'wt': 0.9964678515, 'gr': 'ADV='}],
  'text': 'завтра'},
 {'text': ' '},
 {'analysis': [{'lex': 'быть',
    'wt': 0.8530947255,
    'gr': 'V,нп=непрош,ед,изъяв,1-л'}],
  'text': 'буду'},
 {'text': ' '},
 {'analysis': [{'lex': 'учить', 'wt': 1, 'gr': 'V,несов,пе=инф'}],
  'text': 'учить'},
 {'text': ' '},
 {'analysis': [{'lex': 'английский',
    'wt': 1,
    'gr': 'A=(вин,ед,полн,муж,неод|им,ед,полн,муж)'}],
  'text': 'английский'},
 {'text': ' '},
 {'analysis': [{'lex': 'весь',
    'wt': 0.9978909737,
    'gr': 'APRO=(вин,ед,муж,неод|им,ед,муж)'}],
  'text': 'весь'},
 {'text': ' '},
 {'analysis': [{'lex': 'день',
    'wt': 0.999920149,
    'gr': 'S,муж,неод=(вин,ед|им,ед)'}],
  'text': 'день'},
 {'text': '\n'}]

#### Natasha

In [433]:
import natasha

In [434]:
# не работает на новом питоне
from natasha import (
    Segmenter,
    MorphVocab,
    NewsEmbedding,
    NewsMorphTagger,
    NewsSyntaxParser,
    NewsNERTagger,
    PER,
    NamesExtractor,
    Doc
)


segmenter = Segmenter()
morph_vocab = MorphVocab()
emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)
syntax_parser = NewsSyntaxParser(emb)
ner_tagger = NewsNERTagger(emb)
names_extractor = NamesExtractor(morph_vocab)
text = df_manual.loc[23, "text"]
doc = Doc(text)

In [435]:
doc.segment(segmenter=segmenter)
doc.tag_morph(tagger=morph_tagger)
for token in doc.tokens:
    token.lemmatize(morph_vocab)

In [436]:
doc.sents[0].morph.print()

                   Я PRON|Case=Nom|Number=Sing|Person=1
               поеду VERB|Aspect=Perf|Mood=Ind|Number=Sing|Person=1|Tense=Fut|VerbForm=Fin|Voice=Act
                   в ADP
              отпуск NOUN|Animacy=Inan|Case=Acc|Gender=Masc|Number=Sing
                этим DET|Case=Ins|Gender=Neut|Number=Sing
               летом NOUN|Animacy=Inan|Case=Ins|Gender=Neut|Number=Sing
                  На ADP
                Бали PROPN|Animacy=Inan|Case=Loc|Gender=Masc|Number=Sing


In [437]:
doc.tag_ner(ner_tagger)

In [438]:
doc.ner

NERMarkup(
    text='Я поеду в отпуск этим летом На Бали',
    spans=[Span(
         start=31,
         stop=35,
         type='LOC'
     )]
)

У рута-глагола будет полезным выделить признак стоит ли глагол в Perfect времени

In [439]:
[doc.sents[0].morph.tokens[1].feats["Aspect"],
doc.sents[0].morph.tokens[1].feats["Tense"], 
doc.sents[0].morph.tokens[1].feats["VerbForm"]]

['Perf', 'Fut', 'Fin']

## ML модели

### BoW

In [440]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

Выделяем леммы

In [441]:
def preprocess(text):
    lemmas = []
    doc = nlp(text)
    for tok in doc:
        lemmas.append(tok.lemma_)
    return " ".join(lemmas)

In [442]:
df_manual["lemmas"] = df_manual["text"].apply(lambda x: preprocess(x))
model_bow = CountVectorizer()
X = model_bow.fit_transform(df_manual.lemmas).toarray()
y = df_manual.label
x_train, x_test, y_train, y_test = train_test_split(X, y, shuffle=True)

In [443]:
model_rfr = RandomForestClassifier()
model_rfr.fit(x_train, y_train)

In [444]:
from sklearn.metrics import classification_report
y_pred = model_rfr.predict(x_test)
print("Random Forest Classifier")
print(classification_report(y_test, y_pred))
print(f"Test ROC AUC: {roc_auc_score(y_test, y_pred)}")

Random Forest Classifier
              precision    recall  f1-score   support

           0       0.94      0.93      0.93        54
           1       0.93      0.95      0.94        56

    accuracy                           0.94       110
   macro avg       0.94      0.94      0.94       110
weighted avg       0.94      0.94      0.94       110

Test ROC AUC: 0.9361772486772486


In [445]:
from sklearn.ensemble import GradientBoostingClassifier
model_gb = GradientBoostingClassifier()
model_gb.fit(x_train, y_train)

In [446]:
from sklearn.metrics import classification_report
y_pred = model_gb.predict(x_test)
print("Gradient Boosting")
print(classification_report(y_test, y_pred))
print(f"Test ROC AUC: {roc_auc_score(y_test, y_pred)}")

Gradient Boosting
              precision    recall  f1-score   support

           0       0.90      0.96      0.93        54
           1       0.96      0.89      0.93        56

    accuracy                           0.93       110
   macro avg       0.93      0.93      0.93       110
weighted avg       0.93      0.93      0.93       110

Test ROC AUC: 0.9279100529100529


Потестируем наш подход на тестовых предложениях

Уже более сложные зависимости не ухватывает

In [447]:
model_gb.predict(model_bow.transform(["Мы на выходных вечером летим в Бали на 2 ночи"]))

array([1], dtype=int64)

### Word2Vec
We will use Navec library from natasha project

In [448]:
x_train, x_test, y_train, y_test = train_test_split(df_manual["lemmas"], df_manual["label"])

In [449]:
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
import tensorflow as tf

SENTENCE_LENGTH = 20
NUM = 200

def get_sequences(tokenizer, x):
    sequences = tokenizer.texts_to_sequences(x)
    return pad_sequences(sequences, maxlen=SENTENCE_LENGTH)

tokenizer = Tokenizer(num_words=NUM)
tokenizer.fit_on_texts(x_train)

# Отображаем каждый текст в массив идентификаторов токенов
x_train_seq = get_sequences(tokenizer, x_train)
x_test_seq = get_sequences(tokenizer, x_test)

In [450]:
from keras.layers import Input, Embedding, Dropout, Conv1D, Activation, Dense, GlobalMaxPooling1D, concatenate
from keras.models import Model
from keras.optimizers import Adam
from keras.losses import BinaryCrossentropy

input_layer = Input(shape=(SENTENCE_LENGTH,), dtype='int32')
x = Embedding(NUM, 50, input_length=SENTENCE_LENGTH,trainable=True)(input_layer)
branches =  []

for size in range(2, 5):
    for i in range(5):
        branch = Conv1D(filters=1, kernel_size=size, padding='valid', activation='relu')(x)
        branch = GlobalMaxPooling1D()(branch)
        branches.append(branch)

x = concatenate(branches, axis=1)
x = Dropout(0.25)(x)
x = Dense(16, activation='relu')(x)
output = Dense(1, activation='sigmoid')(x)

model_w2v = Model(inputs=[input_layer], outputs=[output])

model_w2v.compile(loss=BinaryCrossentropy(), optimizer=Adam(), metrics=['accuracy'])
model_w2v.summary()      

Model: "model_23"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_29 (InputLayer)          [(None, 20)]         0           []                               
                                                                                                  
 embedding_28 (Embedding)       (None, 20, 50)       10000       ['input_29[0][0]']               
                                                                                                  
 conv1d_675 (Conv1D)            (None, 19, 1)        101         ['embedding_28[0][0]']           
                                                                                                  
 conv1d_676 (Conv1D)            (None, 19, 1)        101         ['embedding_28[0][0]']           
                                                                                           

In [451]:
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=7, verbose=0, mode='auto')

history_w2v = model_w2v.fit(x_train_seq, y_train, validation_split=0.2, epochs=60,callbacks=early_stop)

Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60
Epoch 26/60
Epoch 27/60


In [452]:
y_pred = np.array(model_w2v.predict(x_test_seq) > 0.5)
print("CNN - 15 conv layers")
print(classification_report(y_test, y_pred))
print(f"Test ROC AUC: {roc_auc_score(y_test, y_pred)}")

CNN - 15 conv layers
              precision    recall  f1-score   support

           0       0.94      0.81      0.87        57
           1       0.82      0.94      0.88        53

    accuracy                           0.87       110
   macro avg       0.88      0.88      0.87       110
weighted avg       0.88      0.87      0.87       110

Test ROC AUC: 0.8752068851373718


In [453]:
from keras.layers import Input, Embedding, Dropout, Conv1D, Activation, Dense, GlobalMaxPooling1D, concatenate
from keras.models import Model
from keras.optimizers import Adam
from keras.losses import BinaryCrossentropy

input_layer = Input(shape=(SENTENCE_LENGTH,), dtype='int32')
x = Embedding(NUM, 50, input_length=SENTENCE_LENGTH,trainable=True)(input_layer)
branches =  []

for size in range(2, 5):
    for i in range(16):
        branch = Conv1D(filters=5, kernel_size=size, padding='valid', activation='relu')(x)
        branch = GlobalMaxPooling1D()(branch)
        branches.append(branch)

x = concatenate(branches, axis=1)
x = Dropout(0.25)(x)
x = Dense(64, activation='relu')(x)
x = Dense(10, activation='relu')(x)
output = Dense(1, activation='sigmoid')(x)

model_w2v = Model(inputs=[input_layer], outputs=[output])

model_w2v.compile(loss=BinaryCrossentropy(), optimizer=Adam(), metrics=['accuracy'])
model_w2v.summary()      

Model: "model_24"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_30 (InputLayer)          [(None, 20)]         0           []                               
                                                                                                  
 embedding_29 (Embedding)       (None, 20, 50)       10000       ['input_30[0][0]']               
                                                                                                  
 conv1d_690 (Conv1D)            (None, 19, 5)        505         ['embedding_29[0][0]']           
                                                                                                  
 conv1d_691 (Conv1D)            (None, 19, 5)        505         ['embedding_29[0][0]']           
                                                                                           

In [454]:
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=7, verbose=0, mode='auto')

history_w2v = model_w2v.fit(x_train_seq, y_train, validation_split=0.2, epochs=60,callbacks=early_stop)

Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60


In [455]:
y_pred = np.array(model_w2v.predict(x_test_seq) > 0.5)
print("CNN - 48 conv layers")
print(classification_report(y_test, y_pred))
print(f"Test ROC AUC: {roc_auc_score(y_test, y_pred)}")

CNN - 48 conv layers
              precision    recall  f1-score   support

           0       0.92      0.81      0.86        57
           1       0.82      0.92      0.87        53

    accuracy                           0.86       110
   macro avg       0.87      0.87      0.86       110
weighted avg       0.87      0.86      0.86       110

Test ROC AUC: 0.8657729228732208


### Тестируем модель с лучшим AUC ROC - Random Forest

Мы с друзьями собираемся завтра в поход вечером

In [457]:
model_rfr.predict(model_bow.transform(["Мы с друзьями собираемся завтра в поход вечером"]))

array([1], dtype=int64)

Я пойду на Васильевский остров на пробежку утром

In [458]:
model_rfr.predict(model_bow.transform(["Я пойду на Васильевский остров на пробежку утром"]))

array([1], dtype=int64)

Мне нравится каша с изюмом

In [459]:
model_rfr.predict(model_bow.transform(["Мне нравится каша с изюмом"]))

array([0], dtype=int64)