In [254]:
import re
import random
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [222]:
with open('anna.txt', encoding='utf-8') as f:
    anna = f.read()
with open('sonets.txt', encoding='utf-8') as f:
    sonets = f.read()

In [228]:
anna_sentences = re.split(r'(?:[.]\s*){3}|[.?!]', anna)
sonet_sentences = re.split(r'(?:[.]\s*){3}|[.?!]', sonets)

Смотрим на длину выборок

In [229]:
print(len(sonet_sentences), len(anna_sentences))

1378 21499


Она очень неровная, поэтому сделаем их одинаковыми

In [230]:
anna_sentences = random.sample(anna_sentences, len(sonet_sentences))

In [231]:
print(len(sonet_sentences), len(anna_sentences))

1378 1378


Делаем вектора

In [232]:
anna_sent_letters = [len([char for char in sent if char.isalpha()]) for sent in anna_sentences]
sonet_sent_letters = [len([char for char in sent if char.isalpha()]) for sent in sonet_sentences]

In [233]:
anna_unique_letters = [len(set([char for char in sent if char.isalpha()])) for sent in anna_sentences]
sonet_unique_letters = [len(set([char for char in sent if char.isalpha()])) for sent in sonet_sentences]

In [234]:
vowels = 'аоэуыяёеюи'

In [235]:
anna_sent_vowels = [len([char for char in sent if char in vowels]) for sent in anna_sentences]
sonet_sent_vowels = [len([char for char in sent if char in vowels]) for sent in sonet_sentences]

In [237]:
def lenwords(sentence):
    return [len(word) for word in sentence.split()] or [0]

In [238]:
anna_sentlens = [lenwords(sentence) for sentence in anna_sentences]
sonet_sentlens = [lenwords(sentence) for sentence in sonet_sentences]

In [239]:
anna_word_letters_median = [np.median(sent) for sent in anna_sentlens]
sonet_word_letters_median = [np.median(sent) for sent in sonet_sentlens]

In [240]:
def lenwords_vow(sentence):
    return [len([char for char in word if char in vowels]) for word in sentence.split()] or [0]

In [241]:
anna_sentlens_vow = [lenwords_vow(sentence) for sentence in anna_sentences]
sonet_sentlens_vow = [lenwords_vow(sentence) for sentence in sonet_sentences]

In [242]:
anna_word_vowels_median = [np.median(sent) for sent in anna_sentlens_vow]
sonet_word_vowels_median = [np.median(sent) for sent in sonet_sentlens_vow]

Собираем вектора в кучу

In [243]:
data = [
    ('sent_letters', anna_sent_letters + sonet_sent_letters),
    ('sent_letters_unique', anna_unique_letters + sonet_unique_letters),
    ('sent_vowels', anna_sent_vowels + sonet_sent_vowels),
    ('word_letters_median', anna_word_letters_median + sonet_word_letters_median),
    ('word_vowels_median', anna_word_vowels_median + sonet_word_vowels_median),
    ('class', [0 for _ in range(len(anna_sent_letters))] + [1 for _ in range(len(sonet_sent_letters))])
]

In [245]:
df = pd.DataFrame.from_items(data)

In [246]:
df.isnull().values.any()

False

In [247]:
X_all = df.loc[:,['sent_letters','sent_letters_unique', 'sent_vowels', 'word_letters_median', 'word_vowels_median']]

Делим на трейн и тест

In [249]:
X_train, X_test, y_train, y_test = train_test_split(X_all, df['class'], test_size=0.2)

Делаем для начала байеса

In [250]:
clf = MultinomialNB()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

a = accuracy_score(y_test, y_pred)
f = f1_score(y_test, y_pred)
print(a, f)

0.559782608696 0.636771300448


Результат не оч, попробуем рендом форест и логит

In [253]:
clf = RandomForestClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

a = accuracy_score(y_test, y_pred)
f = f1_score(y_test, y_pred)
print(a, f)

0.76268115942 0.749521988528


In [255]:
clf = LogisticRegression()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

a = accuracy_score(y_test, y_pred)
f = f1_score(y_test, y_pred)
print(a, f)

0.748188405797 0.745886654479


Уже лучше