**Модель 1**

In [None]:
import os
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Root folder of the corpus: load_data() treats every sub-directory of it as
# one author and reads the *.txt files inside as that author's samples.
DATA_DIR = 'data'

def load_data(data_dir=None):
    """Load the author-labelled text corpus into a DataFrame.

    Every immediate sub-directory of ``data_dir`` is treated as one author
    (the directory name is used as the label) and every ``*.txt`` file inside
    it becomes one sample. Non-directories at the top level and files without
    a ``.txt`` suffix are ignored.

    Args:
        data_dir: Root directory of the corpus. When omitted, the module-level
            ``DATA_DIR`` is used, so existing ``load_data()`` calls keep
            working unchanged.

    Returns:
        pandas.DataFrame with columns ``author`` and ``text``, one row per
        file, ordered by author directory name and then by file name.

    Raises:
        FileNotFoundError: If ``data_dir`` does not exist.
    """
    if data_dir is None:
        data_dir = DATA_DIR

    authors = []
    texts = []

    # os.listdir() yields entries in arbitrary order; sorting makes the row
    # order (and therefore anything derived from it) reproducible.
    for author_dir in sorted(os.listdir(data_dir)):
        author_path = os.path.join(data_dir, author_dir)
        if not os.path.isdir(author_path):
            continue

        for filename in sorted(os.listdir(author_path)):
            if not filename.endswith('.txt'):
                continue

            file_path = os.path.join(author_path, filename)
            with open(file_path, 'r', encoding='utf-8') as f:
                text = f.read()

            authors.append(author_dir)
            texts.append(text)

    return pd.DataFrame({'author': authors, 'text': texts})

def train_model(df):
    """Fit a TF-IDF + logistic-regression author classifier.

    Args:
        df: DataFrame providing a ``text`` column (documents) and an
            ``author`` column (labels).

    Returns:
        Tuple ``(clf, vectorizer)``: the fitted ``LogisticRegression`` and the
        fitted ``TfidfVectorizer`` needed to encode new texts for it.
    """
    tfidf = TfidfVectorizer()
    features = tfidf.fit_transform(df['text'])
    targets = df['author']

    model = LogisticRegression()
    model.fit(features, targets)
    return model, tfidf

def predict_author(clf, vectorizer, text):
    """Return the label ``clf`` predicts for one piece of text.

    The text is wrapped in a one-element list, encoded with
    ``vectorizer.transform`` and passed to ``clf.predict``; the first (and
    only) prediction is returned.
    """
    features = vectorizer.transform([text])
    return clf.predict(features)[0]

def evaluate_model(clf, vectorizer, X, y):
    """Print and return accuracy plus weighted precision/recall/F1 of ``clf``.

    Args:
        clf: Fitted classifier exposing ``predict``.
        vectorizer: Not used here; kept in the signature so existing call
            sites keep working.
        X: Feature matrix handed directly to ``clf.predict`` (i.e. already
            vectorized by the caller).
        y: True labels aligned with the rows of ``X``.

    Returns:
        dict with keys ``'accuracy'``, ``'precision'``, ``'recall'`` and
        ``'f1'`` holding the same values that are printed.
    """
    y_pred = clf.predict(X)

    # zero_division=0 yields the same score as the default for labels that are
    # never predicted (0), but without emitting UndefinedMetricWarning.
    accuracy = accuracy_score(y, y_pred)
    precision = precision_score(y, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y, y_pred, average='weighted', zero_division=0)

    print('Accuracy:', accuracy)
    print('Precision:', precision)
    print('Recall:', recall)
    print('F1 Score:', f1)

    return {'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f1': f1}

if __name__ == '__main__':
    # Train on whatever is currently on disk under DATA_DIR.
    df = load_data()
    clf, vectorizer = train_model(df)

    # Extra samples appended to the on-disk corpus. They are written AFTER
    # `df` was loaded, so they only take effect the next time the data is
    # loaded and the model retrained.
    new_authors = ['Franz Kafka', 'Charles Dickens', 'Agatha Christie', 'Lovecraft', 'Walter Scott', 'William Shakespeare']
    new_texts = ['the time carelessly, as they did in the golden world. OLIVER. What, you', 'brook such disgrace well as he shall run into, in that it is thing of his own']
    # zip() pairs authors with texts by position and stops at the shorter
    # list; indexing new_texts by author position raised IndexError because
    # there are fewer texts than authors.
    # NOTE(review): with the current lists the two excerpts are filed under
    # 'Franz Kafka' and 'Charles Dickens' - confirm these labels are intended
    # (the first excerpt contains the speaker tag "OLIVER.").
    for author, text in zip(new_authors, new_texts):
        author_dir = os.path.join(DATA_DIR, author)
        os.makedirs(author_dir, exist_ok=True)

        # Number the new file after the files already present for the author.
        filename = f'{author}_{len(os.listdir(author_dir))+1}.txt'
        file_path = os.path.join(author_dir, filename)

        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(text)

    # These metrics are computed on the same rows the model was fitted on,
    # so they describe training fit, not generalisation.
    X = vectorizer.transform(df['text'])
    y = df['author']
    evaluate_model(clf, vectorizer, X, y)

Accuracy: 0.75
Precision: 0.5833333333333333
Recall: 0.75
F1 Score: 0.65


  _warn_prf(average, modifier, msg_start, len(result))


**Модель 2**

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import re
from tensorflow.keras.layers import Dense, Input, Dropout, Embedding, Conv1D, MaxPooling1D, LSTM, Bidirectional
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import GRU
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix

with open('corpus.txt', 'r', encoding='utf-8') as f:
    corpus = f.read().splitlines()

authors = ['Charles Dickens', 'Jane Austen', 'William Shakespeare', 'Lovecraft', 'Walter Scott', 'Agatha Christie', 'Franz Kafka']

# Each usable corpus line has the form "(Author Name) text ...".
# Labels and texts are collected in ONE pass so that author_ids[i] always
# belongs to texts[i]; filtering them in two separate loops lets the lists
# drift apart as soon as a line passes one filter but not the other.
author_ids = []
texts = []
for text in corpus:
    if not text.strip():
        continue
    author_name_match = re.match(r"\s*\((\w+\s*\w+)\)", text)
    if not author_name_match or author_name_match.group(1) not in authors:
        print(f"Skipping text: {text.strip()}")
        continue
    # Take everything after the "(Author)" prefix. Splitting on ') ' and
    # keeping only element 1 would cut the text at the next ') ' inside it.
    text_without_author = text[author_name_match.end():].strip().lower()
    if not text_without_author:
        print(f"Skipping text: {text.strip()}")
        continue
    author_ids.append(authors.index(author_name_match.group(1)))
    texts.append(text_without_author)

# Print a summary instead of dumping the whole corpus into the cell output.
print(f"Loaded {len(texts)} labelled texts covering {len(set(author_ids))} of {len(authors)} authors")

# Every text needs exactly one label before the data is encoded and split.
assert len(texts) == len(author_ids), (
    f"{len(texts)} texts but {len(author_ids)} labels - corpus parsing is misaligned"
)

# Keep only the 10000 most frequent words when converting texts to sequences.
tokenizer = Tokenizer(num_words=10000)

tokenizer.fit_on_texts(texts)

sequences = tokenizer.texts_to_sequences(texts)

# Every sequence is padded/truncated to this fixed length.
maxlen = 1000

padded_sequences = pad_sequences(sequences, maxlen=maxlen)

# num_classes is pinned to len(authors): otherwise the one-hot width is
# max(author_ids) + 1, which is narrower than the Dense(len(authors)) output
# layer whenever the last author(s) in `authors` have no samples.
author_ids_one_hot = tf.keras.utils.to_categorical(author_ids, num_classes=len(authors))

X_train, X_test, y_train, y_test = train_test_split(padded_sequences, author_ids_one_hot, test_size=0.2, random_state=42)

# Embedding -> Conv1D/MaxPooling -> bidirectional LSTM -> softmax over authors.
model = Sequential([
    Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=128, input_length=maxlen),
    Conv1D(64, 5, activation='relu'),
    MaxPooling1D(pool_size=4),
    Bidirectional(LSTM(64, dropout=0.2, recurrent_dropout=0.2)),
    Dropout(0.3),
    Dense(len(authors), activation='softmax'),
])

# Labels are one-hot encoded, hence categorical (not sparse) cross-entropy.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

model.summary()

# A further 20% of the training split is held out for per-epoch validation.
history = model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    epochs=10,
    batch_size=32,
)

# evaluate() returns [loss, accuracy] given the metrics compiled above.
score = model.evaluate(X_test, y_test, batch_size=32)
print('Test accuracy:', score[1])

# Hand-picked excerpts used as a qualitative check of the trained model.
# Each list element is one sample; the comma after the "Robin Oig M'Combich."
# excerpt was missing, which made Python concatenate it with the following
# string and silently turned two samples into one.
new_texts = ["I know not why my dreams were so wild that night; but ere the waning and fantastically gibbous moon had risen far above the eastern plain, I was awake in a cold perspiration, determined to sleep no more.",
             "ORLANDO. I will not, till I please; you shall hear me. My father charg'd you in his will to give me good education: you have train'd me like a peasant, obscuring and hiding from me all gentleman-like qualities.",
             "They alighted at the street corner, and dismissing their conveyance, walked to the house. To their first knock at the door there was no response. A second met with the like result. But in answer to the third, which was of a more vigorous kind, the parlour window-sash was gently raised, and a musical voice cried:",
             "In the meanwhile Mr. Ireby found some amusement in detaining the northern drover at his ancient hall. He caused a cold round of beef to be placed before the Scot in the butler's pantry, together with a foaming tankard of home-brewed, and took pleasure in seeing the hearty appetite with which these unwonted edibles were discussed by Robin Oig M'Combich.",
             "An thony re joined his flock of sheep. Miss Tay lor, the youngest and most skit tish of the party, in stantly at tacked him. 'Oh, Mr Cade, was that an old friend of yours?'",
             "'You will probably think me very foolish, Monsieur Poirot, but Lord Cranshaw was telling me last night how wonderfully you cleared up the mystery of his nephew's death, and I felt that I just must have your advice. I dare say it's only a silly hoax - Gregory says so - but it's just worrying me to death.'",
             "This noble body, equipped with everything necessary, almost to the point of bursting, also appeared to carry freedom around with it. That seem to be located somewhere or other in its teeth, and its joy in living came with such strong passion from its throat that it was not easy for spectators to keep watching.",
             "He did not quite reject the idea that he should see a doctor the next time he had the chance, but whatever he did - and this was something on which he could advise himself - he wanted to spend all Sunday mornings in future better than he had spent this one."]
# Encode with the tokenizer fitted on the corpus and pad to the training length.
new_sequences = tokenizer.texts_to_sequences(new_texts)
new_padded_sequences = pad_sequences(new_sequences, maxlen=maxlen)

predictions = model.predict(new_padded_sequences)

# The index of the highest softmax output is the position in `authors`.
for i, text in enumerate(new_texts):
    print(text, 'is written by', authors[np.argmax(predictions[i])])

# One figure per tracked metric: training curve vs. validation curve.
for metric_key, axis_label, axis_range, legend_corner in (
    ('accuracy', 'Accuracy', [0.5, 1], 'lower right'),
    ('loss', 'Loss', [0, 2], 'upper right'),
):
    validation_key = 'val_' + metric_key
    plt.plot(history.history[metric_key], label=metric_key)
    plt.plot(history.history[validation_key], label=validation_key)
    plt.xlabel('Epoch')
    plt.ylabel(axis_label)
    plt.ylim(axis_range)
    plt.legend(loc=legend_corner)
    plt.show()

# Turn softmax outputs / one-hot rows back into class indices for the report.
test_probabilities = model.predict(X_test)
y_pred = np.argmax(test_probabilities, axis=-1)
y_true = np.argmax(y_test, axis=1)
report = classification_report(
    y_true,
    y_pred,
    labels=range(len(authors)),
    target_names=authors,
)
print(report)

score = model.evaluate(X_test, y_test, batch_size=32)
print('Test accuracy:', score[1])

FileNotFoundError: ignored

**Модель 3**

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import os
import re
from tensorflow.keras.layers import Dense, Input, Dropout, Embedding, Conv1D, MaxPooling1D, LSTM, Bidirectional
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import GRU
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv1D, MaxPooling1D
from keras.utils import to_categorical
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix

with open('corpus.txt', 'r', encoding='utf-8') as f:
    corpus = f.read().splitlines()

authors = ['Charles Dickens', 'Jane Austen', 'William Shakespeare', 'Lovecraft', 'Walter Scott', 'Agatha Christie', 'Franz Kafka']

# Each usable corpus line has the form "(Author Name) text ...".
# Labels and texts are collected in ONE pass so that author_ids[i] always
# belongs to texts[i]; filtering them in two separate loops lets the lists
# drift apart as soon as a line passes one filter but not the other.
author_ids = []
texts = []
for text in corpus:
    if not text.strip():
        continue
    author_name_match = re.match(r"\s*\((\w+\s*\w+)\)", text)
    if not author_name_match or author_name_match.group(1) not in authors:
        print(f"Skipping text: {text.strip()}")
        continue
    # Take everything after the "(Author)" prefix. Splitting on ') ' and
    # keeping only element 1 would cut the text at the next ') ' inside it.
    text_without_author = text[author_name_match.end():].strip().lower()
    if not text_without_author:
        print(f"Skipping text: {text.strip()}")
        continue
    author_ids.append(authors.index(author_name_match.group(1)))
    texts.append(text_without_author)

# Integer class ids (positions in `authors`), one per entry of `texts`.
labels = np.array(author_ids)

tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# Pad every sequence to the length of the longest one in the corpus.
max_len = max(len(seq) for seq in sequences)

x = pad_sequences(sequences, maxlen=max_len)

# Shuffling x and labels with the same index array is only meaningful when
# they have one entry per sample each.
assert len(labels) == x.shape[0], (
    f"{x.shape[0]} texts but {len(labels)} labels - corpus parsing is misaligned"
)

# Seeded permutation so the train/validation split is reproducible.
rng = np.random.default_rng(42)
indices = rng.permutation(x.shape[0])
x = x[indices]
labels = labels[indices]

# Hold out the last 20% for validation. Slicing at an explicit split point
# (instead of with -num_validation_samples) stays correct when the count is
# 0: x[:-0] would be empty and x[-0:] would be the whole array.
num_validation_samples = int(0.2 * x.shape[0])
split_at = x.shape[0] - num_validation_samples
x_train = x[:split_at]
y_train = labels[:split_at]
x_val = x[split_at:]
y_val = labels[split_at:]

# Embedding -> single Conv1D -> global max pooling -> dense softmax classifier.
model = Sequential([
    Embedding(len(tokenizer.word_index) + 1, 128, input_length=max_len),
    Conv1D(32, 7, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(len(authors), activation='softmax'),
])
# Labels are integer class ids, hence the sparse variant of the loss.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['acc'],
)
model.summary()

history = model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=20,
    batch_size=128,
)

# Hand-picked excerpts used as a qualitative check of the trained model.
# Each list element is one sample; the comma after the "Robin Oig M'Combich."
# excerpt was missing, which made Python concatenate it with the following
# string and silently turned two samples into one.
new_texts = ["I know not why my dreams were so wild that night; but ere the waning and fantastically gibbous moon had risen far above the eastern plain, I was awake in a cold perspiration, determined to sleep no more.",
             "ORLANDO. I will not, till I please; you shall hear me. My father charg'd you in his will to give me good education: you have train'd me like a peasant, obscuring and hiding from me all gentleman-like qualities.",
             "They alighted at the street corner, and dismissing their conveyance, walked to the house. To their first knock at the door there was no response. A second met with the like result. But in answer to the third, which was of a more vigorous kind, the parlour window-sash was gently raised, and a musical voice cried:",
             "In the meanwhile Mr. Ireby found some amusement in detaining the northern drover at his ancient hall. He caused a cold round of beef to be placed before the Scot in the butler's pantry, together with a foaming tankard of home-brewed, and took pleasure in seeing the hearty appetite with which these unwonted edibles were discussed by Robin Oig M'Combich.",
             "An thony re joined his flock of sheep. Miss Tay lor, the youngest and most skit tish of the party, in stantly at tacked him. 'Oh, Mr Cade, was that an old friend of yours?'",
             "'You will probably think me very foolish, Monsieur Poirot, but Lord Cranshaw was telling me last night how wonderfully you cleared up the mystery of his nephew's death, and I felt that I just must have your advice. I dare say it's only a silly hoax - Gregory says so - but it's just worrying me to death.'",
             "This noble body, equipped with everything necessary, almost to the point of bursting, also appeared to carry freedom around with it. That seem to be located somewhere or other in its teeth, and its joy in living came with such strong passion from its throat that it was not easy for spectators to keep watching.",
             "He did not quite reject the idea that he should see a doctor the next time he had the chance, but whatever he did - and this was something on which he could advise himself - he wanted to spend all Sunday mornings in future better than he had spent this one."]
# Encode with the tokenizer fitted on the corpus and pad to the training length.
new_sequences = tokenizer.texts_to_sequences(new_texts)
new_padded_sequences = pad_sequences(new_sequences, maxlen=max_len)

predictions = model.predict(new_padded_sequences)

# The index of the highest softmax output is the position in `authors`.
for i, text in enumerate(new_texts):
    print(text, 'is written by', authors[np.argmax(predictions[i])])

# One figure per tracked metric: training curve vs. validation curve.
for metric_key, axis_label, axis_range, legend_corner in (
    ('acc', 'Accuracy', [0.5, 1], 'lower right'),
    ('loss', 'Loss', [0, 2], 'upper right'),
):
    validation_key = 'val_' + metric_key
    plt.plot(history.history[metric_key], label=metric_key)
    plt.plot(history.history[validation_key], label=validation_key)
    plt.xlabel('Epoch')
    plt.ylabel(axis_label)
    plt.ylim(axis_range)
    plt.legend(loc=legend_corner)
    plt.show()