In [17]:
import pandas as pd
import os
import numpy as np
import re
import random
import math

In [18]:
df_description = pd.read_csv('description.csv', sep=';', names=['file_name', 'book_title', 'author', 'age_restriction', 'genre'], header=None)
df_description.head()

Unnamed: 0,file_name,book_title,author,age_restriction,genre
0,children_1.txt,Три четверти,Анна Красильщик,12,Детская проза|Детские книги
1,children_2.txt,Повести и рассказы,Аркадий Гайдар,12,Детская проза|Детские книги
2,children_3.txt,"Оленёнок Крапинка, или Бархатистый носик",Холли Вебб,6,Детская проза|Детские книги
3,children_4.txt,Сара и роль мечты,Холли Вебб,6,Детская проза|Детские книги
4,children_5.txt,Как Гитлер украл розового кролика,Джудит Керр,6,Детская проза|Детские книги


In [154]:
def clean_text(text):
	text = re.sub(r'[\u00A0\u202F\u2000-\u200A\u205F\u3000]', ' ', text)
	text_with_spaces = text.replace('\n', ' ')
	text = re.sub(r'\b\d+\.\s*', '', text)
	cleaned_text = re.sub(r'[^a-zA-Zа-яА-Я ,!?.]', '', text_with_spaces)
	cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
	return cleaned_text

def extract_sentences(text, number_of_sentences):
    text = clean_text(text)
    sentences = re.split(r'(?<=[.!?;]) +', text)
    sentences = sentences[math.floor(len(sentences)*0.1):]
    if len(sentences) <= number_of_sentences:
        sentences = list(map(lambda x: x.strip(), sentences))
        selected_text = ''.join(sentences)
    else:
        start_index = random.randint(0, len(sentences) - number_of_sentences)
        selected_sentences = []
        while(len(selected_sentences)<number_of_sentences and start_index<len(sentences)):
            sentence = sentences[start_index]
            start_index+=1
            if (len(sentence.split(' '))>5):
                selected_sentences.append(sentence)
        selected_sentences = list(map(lambda x: x.strip(), selected_sentences))
        selected_text = ''.join(selected_sentences)
    
    return selected_text

def determine_type(file_name):
    if "children" in file_name.lower():
        return "child"
    else:
        return "adult"

In [268]:
def append_data(data, text_fragments, row, fragment_length):
    for fragment in text_fragments:
        book_type = determine_type(row.file_name)
        data["text_fragment"].append(fragment)
        data["file_name"].append(row.file_name)
        data["book_title"].append(row.book_title)
        data["author"].append(row.author)
        data["age_restriction"].append(row.age_restriction)
        data["genre"].append(row.genre)
        data["type"].append(book_type) 
        data["fragment_length"].append(fragment_length)

data_test = {
    "text_fragment": [], "file_name": [], "book_title": [], "author": [], 
    "age_restriction": [], "genre": [], "type": [], "fragment_length": []
}

data_train = {
    "text_fragment": [], "file_name": [], "book_title": [], "author": [], 
    "age_restriction": [], "genre": [], "type": [], "fragment_length": []
}

sentence_lengths = [3, 4, 5, 6]
number_of_fragments = 2

test_path = 'test'
train_path = 'train'

In [269]:
for index, row in df_description.iterrows():
    file_path_test = os.path.join(test_path, row.file_name)
    file_path_train = os.path.join(train_path, row.file_name)
    
    if os.path.exists(file_path_test):
        file_path = file_path_test
        data = data_test
    elif os.path.exists(file_path_train):
        file_path = file_path_train
        data = data_train
    else:
        continue  # Skip if file does not exist in either location
    
    with open(file_path, 'r', encoding='utf-8') as file:
        text_content = file.read()
        
    # For each sentence length, extract 2 fragments and append them to the dataset
    if (len(re.split(r'(?<=[.!?]) +', text_content))<20):
        continue
    else:
        for length in sentence_lengths:
            text_fragments = [extract_sentences(text_content, length) for _ in range(number_of_fragments)]
            append_data(data, text_fragments, row, length)

# Conversion to DataFrames
df_test = pd.DataFrame(data_test)
df_train = pd.DataFrame(data_train)

# Display the shapes of the final DataFrames for a basic verification
print(df_test.shape)
print(df_train.shape)



(7720, 8)
(35072, 8)


In [270]:
df_train.head()

Unnamed: 0,text_fragment,file_name,book_title,author,age_restriction,genre,type,fragment_length
0,"Потом учительница показала наш будущий класс, ...",children_1.txt,Три четверти,Анна Красильщик,12,Детская проза|Детские книги,child,3
1,"Это Сыроежка, он ткнул пальцем на доедающую ко...",children_1.txt,Три четверти,Анна Красильщик,12,Детская проза|Детские книги,child,3
2,"Испанской нет, но наши далекие предки долго жи...",children_1.txt,Три четверти,Анна Красильщик,12,Детская проза|Детские книги,child,4
3,"Какая гадость, сказала Сыроежка, хотя вообщето...",children_1.txt,Три четверти,Анна Красильщик,12,Детская проза|Детские книги,child,4
4,Это когда человек не верит в Бога.Если ты вери...,children_1.txt,Три четверти,Анна Красильщик,12,Детская проза|Детские книги,child,5


In [271]:
df_test.head()

Unnamed: 0,text_fragment,file_name,book_title,author,age_restriction,genre,type,fragment_length
0,"Ну конечно же, это не сон, сказал Дедушка, Дав...",children_1504.txt,Зимние сказки,Чермошенцев Максим,6,Сказки|Детские книги,child,3
1,Немного побыв за столом с гостями Катя решила ...,children_1504.txt,Зимние сказки,Чермошенцев Максим,6,Сказки|Детские книги,child,3
2,Дед Мороз взял со стола на котором было много ...,children_1504.txt,Зимние сказки,Чермошенцев Максим,6,Сказки|Детские книги,child,4
3,Но потолок затмевал все что было в этой комнат...,children_1504.txt,Зимние сказки,Чермошенцев Максим,6,Сказки|Детские книги,child,4
4,И продолжил рассказ Все деревья были сильны и ...,children_1504.txt,Зимние сказки,Чермошенцев Максим,6,Сказки|Детские книги,child,5


In [272]:
def get_syllables(text):
        return sum([character.lower() in ['а', 'у', 'о', 'и', 'э', 'ы', 'я', 'ю', 'е', 'ё'] for character in text])

MIN_SYLLABLES_IN_COMPLEX_WORD = 3

class Readability:
    def __init__(self, text) -> None:
        self.text = text.replace('...', '').replace('..', '.')
        self.sentences = self._get_sentences()
        self.alpha_characters = sum([character.isalpha() for character in self.text])
        self.syllables = self._get_words_by_syllables()
        self.words = self._get_words()
        self.commas = self.text.count(',')
        self.complex_words = self._get_complex_words_number()
        self.total_syllables = sum([(n*len(w)) for n, w in self.syllables.items()])
        
    def _get_words_by_syllables(self):
        words_by_syllables = {1: [], 2: [], 3: [], 4:[], 5:[], 6:[], 7:[], 8:[]}
        for sentence in self.sentences:
            for word in sentence.split(' '):
                word = re.sub(r'\W+', '', word)
                syllables = get_syllables(word)
                if syllables<=1:
                    words_by_syllables[1].append(word)
                elif syllables>=8:
                    words_by_syllables[8].append(word)
                else:
                    words_by_syllables[syllables].append(word)
                

        return words_by_syllables

    def _get_words(self):
        total_words = 0
        for words in self.syllables.values():
           total_words+=len(words)
        return total_words

    def _get_sentences(self):
        return [sentence for sentence in re.split(r'(?<=[.!?;])+', self.text) if sentence!='']

    def _get_complex_words_number(self):
        complex_words = 0
        for s in range(MIN_SYLLABLES_IN_COMPLEX_WORD, 9):
            complex_words+=len(self.syllables[s])

        return complex_words
    
    def get_average_number_of_characters_in_word(self):
        return self.alpha_characters/self.words

    def get_average_number_of_words_in_sentence(self):
        return self.words/len(self.sentences)

    def get_complex_words_to_words_ratio(self):
        return (self.complex_words/self.words)
    
    def get_average_number_of_complex_words_in_sentence(self):
        return (self.complex_words/len(self.sentences))
    
    def get_average_number_of_syllables_in_word(self):
        return self.total_syllables/self.words
    
    def get_average_number_of_commas_per_sentence(self):
        return self.commas/len(self.sentences)
    
    def get_N_syllable_word_to_all_words_ratio(self, N):
        return len(self.syllables[N])/self.words

In [273]:
def create_readability_features_df(df):
    new_df = {
        "average_number_of_characters_in_word": [], "average_number_of_words_in_sentence": [], "complex_words_to_words_ratio": [], "average_number_of_complex_words_in_sentence": [], 
        "average_number_of_syllables_in_word": [], "average_number_of_commas_per_sentence": [], "one_syl_words_ratio": [], "two_syl_words_ratio": [],
        "three_syl_words_ratio": [], "four_syl_words_ratio": [], "five_syl_words_ratio": [], "six_syl_words_ratio": [], "seven_syl_words_ratio": [], 
        "eight_syl_words_ratio": [], "type": []
    }

    def append_data(row):
        r = Readability(row.text_fragment)
        new_df["average_number_of_characters_in_word"].append(r.get_average_number_of_characters_in_word())
        new_df["average_number_of_words_in_sentence"].append(r.get_average_number_of_words_in_sentence())
        new_df["complex_words_to_words_ratio"].append(r.get_complex_words_to_words_ratio())
        new_df["average_number_of_complex_words_in_sentence"].append(r.get_average_number_of_complex_words_in_sentence())
        new_df["average_number_of_syllables_in_word"].append(r.get_average_number_of_syllables_in_word())
        new_df["average_number_of_commas_per_sentence"].append(r.get_average_number_of_commas_per_sentence())

        n_syl_features = list(new_df.keys())[6:-1]
        for n in range(1, 9):
            new_df[n_syl_features[n-1]].append(r.get_N_syllable_word_to_all_words_ratio(n))


        new_df["type"].append(row.type)

    for index, row in df.iterrows():
        if row.file_name!='adults_1159.txt' and len(row.text_fragment)>10:
            append_data(row)
    
    return pd.DataFrame(new_df)

In [274]:
train_readability_features_df = create_readability_features_df(df_train)
train_readability_features_df.shape

(35059, 15)

In [275]:
train_readability_features_df.head()

Unnamed: 0,average_number_of_characters_in_word,average_number_of_words_in_sentence,complex_words_to_words_ratio,average_number_of_complex_words_in_sentence,average_number_of_syllables_in_word,average_number_of_commas_per_sentence,one_syl_words_ratio,two_syl_words_ratio,three_syl_words_ratio,four_syl_words_ratio,five_syl_words_ratio,six_syl_words_ratio,seven_syl_words_ratio,eight_syl_words_ratio,type
0,4.727273,14.666667,0.272727,4.0,2.068182,1.333333,0.363636,0.363636,0.159091,0.068182,0.045455,0.0,0.0,0.0,child
1,5.652174,7.666667,0.478261,3.666667,2.608696,1.333333,0.26087,0.26087,0.173913,0.26087,0.0,0.043478,0.0,0.0,child
2,4.897959,12.25,0.346939,4.25,2.061224,0.75,0.367347,0.285714,0.265306,0.081633,0.0,0.0,0.0,0.0,child
3,4.525424,14.75,0.322034,4.75,2.118644,2.0,0.423729,0.254237,0.135593,0.169492,0.0,0.016949,0.0,0.0,child
4,4.62963,10.8,0.296296,3.2,2.055556,0.4,0.333333,0.37037,0.203704,0.092593,0.0,0.0,0.0,0.0,child


In [276]:
test_readability_features_df = create_readability_features_df(df_test)
test_readability_features_df.shape

(7720, 15)

In [277]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import pandas as pd

label_encoder = LabelEncoder()
train_readability_features_df['type'] = label_encoder.fit_transform(train_readability_features_df['type'])
test_readability_features_df['type'] = label_encoder.transform(test_readability_features_df['type'])

X_train = train_readability_features_df.drop('type', axis=1)
y_train = train_readability_features_df['type']

X_test = test_readability_features_df.drop('type', axis=1)
y_test = test_readability_features_df['type']

In [301]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam

model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    BatchNormalization(),
    Dense(128, activation='relu'),
    #BatchNormalization(),
    #Dropout(0.1),
    Dense(128, activation='relu'),
    #BatchNormalization(),
    #Dropout(0.1),
    Dense(128, activation='relu'),
    #BatchNormalization(),
    Dense(1, activation='sigmoid') 
])

model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

In [302]:
history = model.fit(X_train, y_train, validation_split=0.2, epochs=100, batch_size=64, verbose=1, shuffle=True)

loss, accuracy = model.evaluate(X_train, y_train, verbose=1)
print(f"Test Accuracy: {accuracy*100:.2f}%")

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [303]:
loss, accuracy = model.evaluate(X_test, y_test, verbose=1)
print(f"Test Accuracy: {accuracy*100:.2f}%")

Test Accuracy: 56.98%


In [293]:
accuracy_by_length = {}
for length in df_train['fragment_length'].unique():
    df_filtered = df_train[df_train['fragment_length'] == length]
    
    df_filtered_features = create_readability_features_df(df_filtered)
    df_filtered_features['type'] = label_encoder.transform(df_filtered_features['type'])
    X_filtered = df_filtered_features.drop(['type'], axis=1)
    y_filtered = df_filtered_features['type']
    
    _, accuracy = model.evaluate(X_filtered, y_filtered, verbose=1)
    accuracy_by_length[length] = accuracy

accuracy_by_length





{3: 0.5948197245597839,
 4: 0.6122076511383057,
 5: 0.6052030920982361,
 6: 0.616130530834198}

In [304]:
accuracy_by_age_restriction = {}
for restriction in df_train['age_restriction'].unique():
    df_filtered = df_train[df_train['age_restriction'] == restriction]
    
    df_filtered_features = create_readability_features_df(df_filtered)
    df_filtered_features['type'] = label_encoder.transform(df_filtered_features['type'])
    X_filtered = df_filtered_features.drop(['type'], axis=1)
    y_filtered = df_filtered_features['type']
    
    _, accuracy = model.evaluate(X_filtered, y_filtered, verbose=1)
    accuracy_by_age_restriction[restriction] = accuracy

accuracy_by_age_restriction



{12: 0.7101699709892273,
 6: 0.8184172511100769,
 16: 0.4777902364730835,
 18: 0.4857421815395355,
 11: 0.75,
 8: 0.7386363744735718,
 3: 1.0}