## Корпус
Корпус был собран с помощью запросов к НКРЯ. 
#### Глава1 - head.
+ глава      
    на расстоянии от -7 до 7 от район|отдел|муниципальный|районный|управление|сказать|говорить|заместитель|сообщать|сообщить|слово  
    
#### Глава2 - chapter
+ глава  
    на расстоянии от -7 до 7 от книга|читать|интересный|писать|повествовать|роман|произведение  

**Из выдачи по обоим запросам удаляются примеры, содержащие "во главе"**

In [12]:
import regex as re

In [52]:
import numpy as np
import pandas as pd
from gensim.models.word2vec import Word2Vec
from gensim.models import KeyedVectors

In [25]:
from preprocessing import tokenize

In [42]:
# Load the raw lines of the "head" sense export into memory.
with open('head.txt') as head_file:
    head_lines = list(head_file)

In [43]:
# Load the raw lines of the "chapter" sense export into memory.
with open('chapter.txt') as chapter_file:
    chapter_lines = list(chapter_file)

In [44]:
len(head_lines)

400

In [45]:
head_sentences = []
chapter_sentences = []

In [46]:
# Clean every odd-indexed line of the "head" export and keep it unless it
# contains the phrase "во главе".
# NOTE(review): even-indexed lines are skipped — presumably they hold
# per-example metadata; confirm against the export format.
for raw_line in head_lines[1::2]:
    # Raw string literal: '\[' in a plain literal is an invalid escape
    # sequence, which newer Python versions warn about.
    sentence = re.sub(r'\[.*?\]', ' ', raw_line)  # drop bracketed spans
    sentence = sentence.replace('←…→', '')
    # Lower-case before filtering so capitalised "Во главе" is caught too.
    sentence = sentence.lower()
    if 'во главе' not in sentence:
        head_sentences.append(sentence)

In [47]:
# Clean every odd-indexed line of the "chapter" export and keep it unless it
# contains the phrase "во главе".
# NOTE(review): even-indexed lines are skipped — presumably they hold
# per-example metadata; confirm against the export format.
for raw_line in chapter_lines[1::2]:
    # Raw string literal: '\[' in a plain literal is an invalid escape
    # sequence, which newer Python versions warn about.
    sentence = re.sub(r'\[.*?\]', ' ', raw_line)  # drop bracketed spans
    sentence = sentence.replace('←…→', '')
    # Lower-case before filtering so capitalised "Во главе" is caught too.
    sentence = sentence.lower()
    if 'во главе' not in sentence:
        chapter_sentences.append(sentence)

In [48]:
len(head_sentences)

190

In [49]:
len(chapter_sentences)

194

## Препроцессинг 
+ токенизация
+ лемматизация
+ удаление стоп-слов

In [50]:
# Run the project's `tokenize` helper over every cleaned sentence of each sense.
head_tokens = list(map(tokenize, head_sentences))
chapter_tokens = list(map(tokenize, chapter_sentences))

In [51]:
chapter_tokens[10]

['становиться',
 'писать',
 'глава',
 'который',
 'я',
 'прочитывать',
 'очень',
 'я',
 'трогать']

In [57]:
# Context window: up to 4 tokens to the left and to the right of the target.
# Class labels: head -> 0, chapter -> 1.
entries_with_classes = []
for label, corpus in enumerate([head_tokens, chapter_tokens]):
    for token_set in corpus:
        tokens = list(token_set)
        # The target is the first 'глава'; when it is absent, fall back to the
        # first 'гл'. `index` raises ValueError if neither token is present.
        target = 'глава' if 'глава' in tokens else 'гл'
        pos = tokens.index(target)
        # Slices clamp at the list end on their own; only the left bound
        # needs an explicit floor at 0.
        left_context = tokens[max(pos - 4, 0):pos]
        right_context = tokens[pos + 1:pos + 5]
        context = ' '.join(left_context + right_context)
        entries_with_classes.append((context, label))

##  Признаки - count vectorizer и tf-idf vectorizer

In [96]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

In [74]:
from sklearn.ensemble import RandomForestClassifier

In [69]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [60]:
# Split the (context, label) pairs into parallel feature and target lists.
X = [context for context, _ in entries_with_classes]
y = [label for _, label in entries_with_classes]

In [62]:
count_vect = CountVectorizer(analyzer='word')

In [97]:
tf_vect = TfidfVectorizer(analyzer='word')

In [63]:
def score_classifier(clf, metric='f1'):
    """Return the mean 5-fold cross-validation score of ``clf``.

    The estimator is evaluated on the notebook-level ``X`` and ``y``.

    Parameters
    ----------
    clf : estimator
        Estimator (or pipeline) handed to ``cross_val_score``.
    metric : str, default 'f1'
        Value passed as the ``scoring`` argument of ``cross_val_score``.

    Returns
    -------
    float
        Arithmetic mean of the per-fold scores.
    """
    # Score the estimator that was actually passed in. The previous version
    # ignored ``clf`` and evaluated whatever the global ``pipeline`` was bound
    # to at call time.
    scores = cross_val_score(clf, np.asarray(X), np.asarray(y), cv=5, scoring=metric)
    return sum(scores) / len(scores)

In [64]:
nb =  MultinomialNB()

In [76]:
rf = RandomForestClassifier()

In [73]:
# Count features + multinomial Naive Bayes, scored via score_classifier.
pipeline = Pipeline(steps=[
    ('vectorizer', count_vect),
    ('classifier', nb),
])
nb_count_score = score_classifier(pipeline)
print('F1 for Naive Bayes with CountVectorizer on cross-validation:', nb_count_score)

F1 for Naive Bayes with CountVectorizer on cross-validation: 0.921436612854


In [77]:
# Count features + random forest, scored via score_classifier.
pipeline = Pipeline(steps=[
    ('vectorizer', count_vect),
    ('classifier', rf),
])
rf_count_score = score_classifier(pipeline)
print('F1 for RandomForest with CountVectorizer on cross-validation:', rf_count_score)

F1 for RandomForest with CountVectorizer on cross-validation: 0.88600171484


In [98]:
# TF-IDF features + multinomial Naive Bayes, scored via score_classifier.
pipeline = Pipeline(steps=[
    ('vectorizer', tf_vect),
    ('classifier', nb),
])
nb_tfidf_score = score_classifier(pipeline)
print('F1 for Naive Bayes with TfidfVectorizer on cross-validation:', nb_tfidf_score)

F1 for Naive Bayes with TfidfVectorizer on cross-validation: 0.918010355657


In [99]:
# TF-IDF features + random forest, scored via score_classifier.
pipeline = Pipeline(steps=[
    ('vectorizer', tf_vect),
    ('classifier', rf),
])
rf_tfidf_score = score_classifier(pipeline)
print('F1 for RandomForest with TfidfVectorizer on cross-validation:', rf_tfidf_score)

F1 for RandomForest with TfidfVectorizer on cross-validation: 0.874339847603


In [82]:
X_vect = count_vect.fit_transform(X)

In [83]:
rf.fit(X_vect, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [91]:
# Vocabulary terms in column order, paired below with the forest's importances.
# `get_feature_names()` was removed in scikit-learn 1.2; prefer its
# replacement when available and fall back on older installations.
if hasattr(count_vect, 'get_feature_names_out'):
    # `.tolist()` keeps the result a plain list of str, as the old API returned.
    feature_names = count_vect.get_feature_names_out().tolist()
else:
    feature_names = count_vect.get_feature_names()
feature_importances = rf.feature_importances_

In [92]:
imp = zip(feature_names, feature_importances)

In [94]:
imp = sorted(imp, key=lambda x:x[1], reverse=True)

In [95]:
imp[:10]

[('книга', 0.15531678781661135),
 ('заместитель', 0.045383975057919808),
 ('район', 0.036928736915023137),
 ('читать', 0.033897454222876523),
 ('роман', 0.032149113463298927),
 ('писать', 0.029477321997815543),
 ('говорить', 0.028512194686581505),
 ('администрация', 0.025516321338822485),
 ('сообщать', 0.02521180757394521),
 ('первый', 0.0126189781396976)]

### Наиболее важные признаки, полученные с помощью Random Forest:
  Наличие в контексте слов:
+ ('книга', 0.15531678781661135) - chapter
+ ('заместитель', 0.045383975057919808) - head
+ ('район', 0.036928736915023137) - head
+ ('читать', 0.033897454222876523) - chapter
+ ('роман', 0.032149113463298927) - chapter
+ ('писать', 0.029477321997815543) - chapter
+ ('говорить', 0.028512194686581505) - head
+ ('администрация', 0.025516321338822485) - head
+ ('сообщать', 0.02521180757394521) - head
+ ('первый', 0.0126189781396976) - chapter
Большинство слов совпадает с теми, которые были использованы для сбора корпуса.