### Preprocessing module

In [1]:
# Import libraries

import pandas as pd
import numpy as np
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.layers import Input, Embedding, Dense, Flatten, Concatenate
from tensorflow.keras.models import Sequential, Model

  from pandas.core import (





In [2]:
# Shared text-normalization resources used by the preprocessing step:
# a WordNet lemmatizer and the NLTK English stop-word list, held as a set
# so that membership tests are cheap.
lemmatizer = WordNetLemmatizer()
stop_words = {*stopwords.words('english')}

In [3]:
def preprocess_text(text):
    """Normalize a raw news string into space-separated, lemmatized tokens.

    Processing order:
      1. Rewrite country mentions ("U.S.", "U.S.A.", upper-case "US") to the
         single token "usa".
      2. Lowercase.
      3. Replace HTML tags and named HTML entities with a space.
      4. Drop every character that is not a-z or whitespace.
      5. Split on whitespace, drop stop-words, lemmatize each word.

    Args:
        text: Raw text. Any non-string value (for example the float NaN that
            pandas uses for a missing cell) is treated as empty text.

    Returns:
        str: The cleaned words joined by single spaces ('' if nothing is left).
    """
    if not isinstance(text, str):
        return ''

    # The country rewrite has to run BEFORE lowercasing: once the text is
    # lowercased, the abbreviation "US" cannot be told apart from the pronoun
    # "us". The dotted form is unambiguous, so it is matched case-insensitively.
    # No trailing \b is used after the final dot: a word boundary never exists
    # between '.' and a following space / end of string, so "U.S. " would
    # otherwise never match.
    text = re.sub(r'(?<![\w.])u\.s\.(?:a\b\.?)?', ' usa ', text, flags=re.IGNORECASE)
    text = re.sub(r'\bUS\b', ' usa ', text)

    text = text.lower()  # convert text to lowercase

    # Substitute a space (not the empty string) so that the words on either
    # side of a tag or entity are not glued together into one token.
    text = re.sub(r'<[^>]+>', ' ', text)  # remove HTML tags
    text = re.sub(r'&\w+;', ' ', text)  # remove named HTML entities

    text = re.sub(r'[^a-z\s]', '', text)  # keep only letters and whitespace

    # Tokenize on whitespace, filter stop-words, then lemmatize what remains.
    kept_words = [word for word in text.split() if word not in stop_words]
    return ' '.join(lemmatizer.lemmatize(word) for word in kept_words)

In [6]:
# Load the data
train_df = pd.read_csv('../data/agn_train.csv')
test_df = pd.read_csv('../data/agn_test.csv')

# Sizes and paths that must agree across tokenizer, embedding matrix and
# embedding layer are named once here instead of being repeated as literals.
MAX_VOCAB_SIZE = 5000   # tokenizer keeps word indices 1..MAX_VOCAB_SIZE-1
EMBEDDING_DIM = 100     # dimensionality of the glove.6B.100d vectors
GLOVE_PATH = 'glove.6B.100d.txt'

# Clean titles and descriptions separately. The tokenizer below is fitted on
# cleaned text, so the sequences fed to the model must be built from text that
# went through the same cleaning; otherwise the raw words (stop-words,
# unlemmatized forms) do not line up with the fitted vocabulary.
# Missing cells are replaced by '' so the cleaning function always gets a str.
for _frame in (train_df, test_df):
    _frame['clean_title'] = _frame['Title'].fillna('').astype(str).apply(preprocess_text)
    _frame['clean_description'] = _frame['Description'].fillna('').astype(str).apply(preprocess_text)
    # Combined text, kept for fitting the vocabulary on titles and descriptions together.
    _frame['clean_text'] = (_frame['clean_title'] + ' ' + _frame['clean_description']).str.strip()

# Tokenization: the vocabulary is learned from the training data only.
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)  # cap on the vocabulary size
tokenizer.fit_on_texts(train_df['clean_text'])

# Convert cleaned titles and descriptions to integer sequences
X_train_title_seq = tokenizer.texts_to_sequences(train_df['clean_title'])
X_test_title_seq = tokenizer.texts_to_sequences(test_df['clean_title'])

X_train_description_seq = tokenizer.texts_to_sequences(train_df['clean_description'])
X_test_description_seq = tokenizer.texts_to_sequences(test_df['clean_description'])

# Maximum sequence lengths, measured on the training data only
max_length_titles = max(len(seq) for seq in X_train_title_seq)
max_length_descriptions = max(len(seq) for seq in X_train_description_seq)

# Pad (or truncate) every sequence to the training maximum
X_train_title_pad = pad_sequences(X_train_title_seq, maxlen=max_length_titles)
X_test_title_pad = pad_sequences(X_test_title_seq, maxlen=max_length_titles)

X_train_description_pad = pad_sequences(X_train_description_seq, maxlen=max_length_descriptions)
X_test_description_pad = pad_sequences(X_test_description_seq, maxlen=max_length_descriptions)

# Labels: shift 'Class Index' down by one so the classes start at 0
y_train = train_df['Class Index'].values - 1
y_test = test_df['Class Index'].values - 1

# Load the pre-trained GloVe embeddings: one word followed by its vector per line
embedding_index = {}
with open(GLOVE_PATH, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs

# Build the embedding matrix for the tokenizer vocabulary. Row i holds the
# GloVe vector of the word with tokenizer index i; rows for words without a
# GloVe vector (and row 0, which pad_sequences uses for padding) stay zero.
embedding_matrix = np.zeros((MAX_VOCAB_SIZE, EMBEDDING_DIM))
for word, i in tokenizer.word_index.items():
    if i < MAX_VOCAB_SIZE:  # only indices the tokenizer can actually emit
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

# Frozen embedding layer initialised with the GloVe matrix (title-length input)
embedding_layer = Embedding(input_dim=MAX_VOCAB_SIZE,
                            output_dim=EMBEDDING_DIM,
                            weights=[embedding_matrix],  # pre-trained embedding matrix
                            input_length=max_length_titles,
                            trainable=False)




### Training module

In [7]:
# Read the embedding sizes from the matrix itself so the layers can never
# disagree with the matrix that was actually built.
vocab_size, embedding_dim = embedding_matrix.shape

# Number of output units: labels are 0-based, so the largest label + 1.
num_classes = int(np.max(y_train)) + 1


def build_text_branch(input_name, sequence_length):
    """Build one input branch: Input -> frozen GloVe Embedding -> Flatten.

    Args:
        input_name: Name given to the Keras Input layer.
        sequence_length: Length of the padded integer sequences fed to it.

    Returns:
        tuple: (input tensor, embedded tensor, flattened tensor).
    """
    branch_input = Input(shape=(sequence_length,), name=input_name)
    # The sequence length comes from the Input shape, so the deprecated
    # `input_length` argument is not passed.
    embedded = Embedding(input_dim=vocab_size,
                         output_dim=embedding_dim,
                         weights=[embedding_matrix],
                         trainable=False)(branch_input)
    return branch_input, embedded, Flatten()(embedded)


# Title branch
title_input, title_embedding, title_flatten = build_text_branch(
    'title_input', max_length_titles)

# Description branch
description_input, description_embedding, description_flatten = build_text_branch(
    'description_input', max_length_descriptions)

# Merge the title and description representations
merged = Concatenate()([title_flatten, description_flatten])

# Fully connected layers on top of the merged representation
dense_1 = Dense(128, activation='relu')(merged)
dense_2 = Dense(64, activation='relu')(dense_1)
output = Dense(num_classes, activation='softmax')(dense_2)

# Create the model
model = Model(inputs=[title_input, description_input], outputs=output)

# Compile the model (integer labels -> sparse categorical cross-entropy)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train with titles and descriptions fed through their own branches.
# NOTE(review): the test set is used as validation data here; if these
# validation metrics drive hyperparameter choices, the reported test score is
# no longer an unbiased estimate. Consider a held-out split of the training set.
# The History object is kept so the per-epoch metrics can be inspected later.
history = model.fit(
    [X_train_title_pad, X_train_description_pad],
    y_train,
    epochs=10,
    batch_size=32,
    validation_data=([X_test_title_pad, X_test_description_pad], y_test),
)


Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x1ca29117750>

### Hyperparameter selection module