In [None]:
!pip install pandas

In [None]:
!pip install scikit-learn

In [None]:
!pip install dask

In [None]:
!pip install kagglehub

In [None]:
!pip install torch

# Предобработка данных

In [None]:
import pandas as pd
import numpy as np
import kagglehub
from datetime import datetime, timedelta

In [None]:
# Download the dataset and read the event log plus both item-property parts
path = kagglehub.dataset_download("retailrocket/ecommerce-dataset")
events = pd.read_csv(f"{path}/events.csv")
item_properties_p1, item_properties_p2 = (
    pd.read_csv(f"{path}/item_properties_part{part}.csv") for part in (1, 2)
)
# Stack the two property parts row-wise into a single frame
item_properties = pd.concat([item_properties_p1, item_properties_p2])

In [None]:
# Data cleaning: drop exact duplicate rows, then rows missing any required field
events = (
    events
    .drop_duplicates()
    .dropna(subset=['event', 'itemid', 'visitorid', 'timestamp'])
)
item_properties = (
    item_properties
    .drop_duplicates()
    .dropna(subset=['itemid', 'timestamp'])
)

# Bring both timestamp columns to the same integer dtype and order events chronologically
item_properties = item_properties.astype({'timestamp': 'int64'})
events = (
    events
    .astype({'timestamp': 'int64'})
    .sort_values(['timestamp'])
    .reset_index(drop=True)
)

In [None]:
# Preview the first rows of the cleaned item-property table
item_properties.head()

In [None]:
# Extract the 'categoryid' property rows and keep one record per item.
# NOTE(review): keep='first' after sorting by timestamp retains only the earliest
# category record of each item — confirm that later category changes are meant to be ignored.
is_category_row = item_properties['property'] == 'categoryid'
categories = (
    item_properties.loc[is_category_row]
    .sort_values('timestamp')
    .drop_duplicates('itemid', keep='first')
    .rename(columns={'value': 'categoryid'})
    .drop(columns=['property'])
)
print(categories.head())

In [None]:
# Join events with categories: for every event take the latest category record
# of the same item whose timestamp is not later than the event (direction='backward')
merged_data = pd.merge_asof(
    left=events,
    right=categories,
    on='timestamp',
    by='itemid',
    direction='backward',
)
# Sanity check that the join did not duplicate rows
print("Было:", len(events), ",стало:", len(merged_data))
merged_data.head()

In [None]:
# Keep only the events for which the join found a category
has_category = merged_data['categoryid'].notna()
merged_data = merged_data[has_category]
print("Было:", len(events), ",стало:", len(merged_data))

In [None]:
# Build sessions (a visitor's actions separated by less than 30 minutes)
merged_data = merged_data.drop(columns=['transactionid'])
merged_data['timestamp'] = pd.to_datetime(merged_data['timestamp'], unit='ms')
merged_data = merged_data.sort_values(by=['visitorid', 'timestamp'])

# A new session starts when the visitor changes or the gap to the previous row is >= 30 minutes;
# the running count of such starts is the session id
new_visitor = merged_data['visitorid'].ne(merged_data['visitorid'].shift())
long_gap = merged_data['timestamp'].diff().ge(pd.Timedelta(minutes=30))
merged_data['session_id'] = (long_gap | new_visitor).cumsum()

print("Уникальных сессий:", merged_data['session_id'].nunique())
print("Уникальных пользователей:", merged_data['visitorid'].nunique())

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Map category ids (compared as strings) to consecutive integer codes
label_encoder_category = LabelEncoder()
category_as_text = merged_data['categoryid'].astype(str)
merged_data['category_encoded'] = label_encoder_category.fit_transform(category_as_text)
merged_data.head()

In [None]:
import os
import dask.dataframe as dd
import dask.array as da
from dask import delayed
from dask.diagnostics import ProgressBar
from sklearn.model_selection import train_test_split

In [None]:
# Build the Dask DataFrame (only the two columns needed for sequence building)
session_columns = ['session_id', 'category_encoded']
ddf = dd.from_pandas(merged_data[session_columns], npartitions=16)

In [None]:
# Collect the encoded categories of every session into one Python list
session_seqs = (
    ddf
    .groupby('session_id')['category_encoded']
    .apply(lambda group: group.tolist(), meta=('list', object))
    .compute()
)
# Keep only sessions with at least 5 events
session_seqs = [events_in_session for events_in_session in session_seqs
                if len(events_in_session) >= 5]

# Generate prefix -> label pairs (label = the category that follows the prefix)
sequences = [seq[:cut] for seq in session_seqs for cut in range(1, len(seq))]
labels = [seq[cut] for seq in session_seqs for cut in range(1, len(seq))]
# Pad / truncate to a fixed length
def pad_sequences(seqs, seq_length):
    """Convert variable-length sequences into a fixed-width int64 matrix.

    Sequences shorter than ``seq_length`` are left-aligned and right-padded with 0.
    Longer sequences are truncated to their LAST ``seq_length`` items, so the most
    recent events before the label are kept. (Keeping the first items instead would
    give every long prefix of one session the same input row.)

    Args:
        seqs: sequence of integer sequences (may contain empty sequences).
        seq_length: number of columns of the result.

    Returns:
        np.ndarray of shape ``(len(seqs), seq_length)`` and dtype int64.

    NOTE(review): 0 is used as the padding value; if 0 is also a valid item id the
    two cannot be told apart — confirm against the encoding used by the caller.
    """
    padded = np.zeros((len(seqs), seq_length), dtype=np.int64)
    for row, seq in enumerate(seqs):
        length = min(len(seq), seq_length)
        if length:
            # Explicit start index: seq[-length:] would return the whole sequence for length == 0
            padded[row, :length] = seq[len(seq) - length:]
    return padded

seq_length = 5
X = pad_sequences(sequences, seq_length)
y = np.asarray(labels, dtype=np.int64)

# Drop samples whose label occurs fewer than 2 times
unique, counts = np.unique(y, return_counts=True)
valid = unique[counts >= 2]
mask = np.isin(y, valid)
X = X[mask]
y = y[mask]

print(f"После фильтрации: всего сэмплов = {len(y)}, "
      f"уникальных классов = {len(np.unique(y))}")


# First hold out 20% as the test split, stratified by label
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

# Then take 25% of the remainder as the validation split
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.25, random_state=42, stratify=y_train_val
)
print(f"Размер X_train: {X_train.shape}")
print(f"Размер X_test:  {X_test.shape}")
print(f"Размер X_val:  {X_val.shape}")
print(f"Размер y_train: {y_train.shape}")
print(f"Размер y_test: {y_test.shape}")
print(f"Размер y_val:  {y_val.shape}")

# LRCN

In [None]:
import time
import numpy as np
import torch
import os
import psutil
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from torch.utils.data import DataLoader, TensorDataset, random_split, WeightedRandomSampler
import optuna

## Классический LRCN

In [None]:
def make_loader(X_arr, y_arr, shuffle=False):
    """Wrap feature and label arrays in a DataLoader with batches of 64.

    Both arrays are converted to ``torch.long`` tensors; ``shuffle`` is passed
    through to the DataLoader unchanged.
    """
    features = torch.tensor(X_arr, dtype=torch.long)
    targets = torch.tensor(y_arr, dtype=torch.long)
    return DataLoader(TensorDataset(features, targets), batch_size=64, shuffle=shuffle)

# Loaders for iterating over the three splits; only the training split is shuffled
train_loader = make_loader(X_train, y_train, shuffle=True)
val_loader, test_loader = (
    make_loader(features, targets, shuffle=False)
    for features, targets in ((X_val, y_val), (X_test, y_test))
)

In [None]:
"""
Класс модели
Наследуется от nn.Module (базовый класс для всех моделей в PyTorch)
"""
class LRCN(nn.Module):
    """Baseline LRCN: embedding -> two Conv1d/ReLU/MaxPool stages -> LSTM -> linear classifier.

    Args:
        vocab_size: number of rows of the embedding table (index 0 is the padding index).
        emb_dim: embedding width, also the input channel count of the first convolution.
        hidden_size: hidden size of the LSTM.
        num_layers: number of stacked LSTM layers.
        num_classes: number of output logits.
    """

    def __init__(self, vocab_size, emb_dim, hidden_size, num_layers, num_classes):
        super().__init__()
        # Embedding of the input category ids
        self.emb = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        # Convolutional part
        self.conv1 = nn.Conv1d(emb_dim, 128, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(128, 256, kernel_size=3, padding=1)
        self.relu = nn.ReLU()
        self.pool = nn.MaxPool1d(kernel_size=2, stride=2)
        # Recurrent part
        self.lstm = nn.LSTM(256, hidden_size, num_layers=num_layers, batch_first=True)
        # Classifier head
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return class logits for a batch of id sequences ``x``."""
        # Conv1d expects channels on axis 1, so move the embedding axis there
        feats = self.emb(x).transpose(1, 2)
        for conv in (self.conv1, self.conv2):
            feats = self.pool(self.relu(conv(feats)))
        # Back to (batch, steps, channels) for the batch_first LSTM
        seq_out, _ = self.lstm(feats.transpose(1, 2))
        # Classify from the LSTM output at the last time step
        return self.fc(seq_out[:, -1, :])

# Training routine
def train_model(model, loader, criterion, optimizer, num_epochs=20):
    """Train ``model`` for ``num_epochs`` passes over ``loader``.

    Batches are moved to the module-level ``device``. After every epoch the mean
    batch loss is printed. Nothing is returned; the model is updated in place.
    """
    model.train()
    for epoch in range(1, num_epochs + 1):
        batch_losses = []
        for inputs, targets in loader:
            inputs = inputs.to(device)
            targets = targets.to(device)
            optimizer.zero_grad()
            loss = criterion(model(inputs), targets)
            loss.backward()
            optimizer.step()
            batch_losses.append(loss.item())
        mean_loss = sum(batch_losses) / len(loader)
        print(f"Epoch {epoch}/{num_epochs} — loss={mean_loss:.4f}")

# Evaluation routine
def evaluate_model(model, loader):
    """Run ``model`` over ``loader`` and return ``(accuracy, precision, recall, f1)``.

    Predictions are the argmax over the logits. Precision, recall and F1 are
    weighted averages with ``zero_division=0``. Inputs are moved to the
    module-level ``device``.
    """
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for inputs, targets in loader:
            batch_preds = model(inputs.to(device)).argmax(dim=1)
            all_preds.extend(batch_preds.cpu().numpy())
            all_labels.extend(targets.numpy())
    weighted = dict(average='weighted', zero_division=0)
    return (
        accuracy_score(all_labels, all_preds),
        precision_score(all_labels, all_preds, **weighted),
        recall_score(all_labels, all_preds, **weighted),
        f1_score(all_labels, all_preds, **weighted),
    )

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Size the embedding table and the output layer from BOTH inputs and labels:
# a label id larger than every input id would otherwise fall outside the logits.
vocab_size = int(max(X.max(), y.max())) + 1
num_classes = vocab_size

# Record the resident memory of this process before training
process = psutil.Process(os.getpid())
mem_before = process.memory_info().rss

# Train and evaluate the baseline LRCN
lrcn = LRCN(vocab_size, emb_dim=128, hidden_size=128, num_layers=2, num_classes=num_classes).to(device)
opt = optim.Adam(lrcn.parameters(), lr=1e-3)
crit = nn.CrossEntropyLoss()

start = time.time()
train_model(lrcn, train_loader, crit, opt, num_epochs=20)
test_acc, precision, recall, f1 = evaluate_model(lrcn, test_loader)

# The reported time covers training plus the test evaluation above
print(f"Время работы: {time.time()-start:.1f}s")

# Resident memory after training
mem_after = process.memory_info().rss
mem_used = mem_after - mem_before

print(f"Потребление памяти процесса: {mem_used / (1024**2):.2f} MiB")
print("=== Оценка на тесте ===")
print(f"Accuracy = {test_acc:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

## Улучшенная LRCN с подобранными гиперпараметрами

In [None]:
def make_loader(X_arr, y_arr, batch_size=64, shuffle=False):
    """Build a DataLoader over ``(X_arr, y_arr)`` converted to ``torch.long`` tensors.

    ``batch_size`` and ``shuffle`` are forwarded to the DataLoader.
    """
    tensors = [torch.tensor(array, dtype=torch.long) for array in (X_arr, y_arr)]
    dataset = TensorDataset(*tensors)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

# Loaders for iterating over the three splits; only the training split is shuffled
train_loader = make_loader(X_train, y_train, batch_size=64, shuffle=True)
val_loader, test_loader = (
    make_loader(features, targets, batch_size=64, shuffle=False)
    for features, targets in ((X_val, y_val), (X_test, y_test))
)

class ResidualCNNBlock(nn.Module):
    """Residual block of two 1D convolutions with a skip connection.

    Args:
        channels: number of input and output channels (they are equal).
        kernel_size: convolution kernel size.
        padding: padding applied on both ends by each convolution.
    """

    def __init__(self, channels, kernel_size=3, padding=1):
        super().__init__()
        conv_args = dict(kernel_size=kernel_size, padding=padding)
        self.conv1 = nn.Conv1d(channels, channels, **conv_args)
        self.bn1 = nn.BatchNorm1d(channels)
        self.relu = nn.ReLU()
        self.conv2 = nn.Conv1d(channels, channels, **conv_args)
        self.bn2 = nn.BatchNorm1d(channels)

    def forward(self, x):
        """Return ``relu(bn2(conv2(relu(bn1(conv1(x))))) + x)``."""
        shortcut = x
        hidden = self.relu(self.bn1(self.conv1(x)))
        hidden = self.bn2(self.conv2(hidden))
        return self.relu(hidden + shortcut)

In [None]:
"""
Класс модели
Наследуется от nn.Module (базовый класс для всех моделей в PyTorch)
Улучшенная LRCN-модель:
      - Эмбеддинги + Dropout
      - Свёрточные слои + остаточные блоки
      - Адаптивный пуллинг
      - Двунаправленный LSTM
      - Глобальный пулинг + Dropout + полносвязный слой
"""
class ImprovedLRCN(nn.Module):
    """Improved LRCN classifier.

    Pipeline: embedding + dropout -> Conv1d/BatchNorm/ReLU -> residual block ->
    adaptive max pooling to half the input length -> Conv1d/BatchNorm -> residual block ->
    bidirectional LSTM -> mean over time steps -> dropout -> linear classifier.

    Args:
        vocab_size: number of rows of the embedding table (index 0 is the padding index).
        emb_dim: embedding width.
        hidden_size: hidden size of each LSTM direction.
        num_layers: number of stacked LSTM layers.
        num_classes: number of output logits.
        dropout_emb: dropout probability applied to the embeddings.
        dropout_lstm: dropout between LSTM layers (used only when ``num_layers > 1``).
        dropout_fc: dropout probability applied before the classifier.
    """

    def __init__(self, vocab_size, emb_dim, hidden_size, num_layers, num_classes,
                 dropout_emb=0.1, dropout_lstm=0.1, dropout_fc=0.3):
        super().__init__()
        # Embeddings + regularisation
        self.emb = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.emb_dropout = nn.Dropout(dropout_emb)
        # Input convolution + normalisation
        self.conv_in = nn.Conv1d(emb_dim, 128, kernel_size=3, padding=1)
        self.bn_in = nn.BatchNorm1d(128)
        self.relu = nn.ReLU()
        self.res1 = ResidualCNNBlock(128)
        # The pooling size is derived from the actual input length in forward(),
        # so the constructor no longer depends on a module-level `seq_len` variable.
        self.conv_mid = nn.Conv1d(128, 256, kernel_size=3, padding=1)
        self.bn_mid = nn.BatchNorm1d(256)
        self.res2 = ResidualCNNBlock(256)
        # Bidirectional LSTM; inter-layer dropout is only passed for stacked LSTMs
        self.lstm = nn.LSTM(input_size=256,
                            hidden_size=hidden_size,
                            num_layers=num_layers,
                            batch_first=True,
                            bidirectional=True,
                            dropout=dropout_lstm if num_layers > 1 else 0)
        self.fc_dropout = nn.Dropout(dropout_fc)
        # Both LSTM directions are concatenated, hence hidden_size * 2 input features
        self.fc = nn.Linear(hidden_size * 2, num_classes)

    def forward(self, x):
        """Return class logits for a batch of id sequences ``x``."""
        seq_len = x.size(1)
        x = self.emb_dropout(self.emb(x))
        # Conv1d expects channels on axis 1
        x = x.permute(0, 2, 1)
        x = self.relu(self.bn_in(self.conv_in(x)))
        x = self.res1(x)
        # Keep half of the time steps (at least one, so very short inputs still work)
        x = nn.functional.adaptive_max_pool1d(x, max(1, seq_len // 2))
        x = self.bn_mid(self.conv_mid(x))
        x = self.res2(x)
        # Back to (batch, steps, channels) for the batch_first LSTM
        x = x.permute(0, 2, 1)
        out, _ = self.lstm(x)
        # Global average over the time axis
        context = torch.mean(out, dim=1)

        h = self.fc_dropout(context)
        return self.fc(h)

# Training loop with validation, gradient clipping and early stopping
def train_model(model, train_loader, val_loader, criterion, optimizer,
                scheduler, num_epochs=30, clip_norm=5.0,
                early_stopping_patience=5, device='cpu'):
    """Train ``model`` and return it with the best-validation-loss weights loaded.

    Args:
        model: network to train; moved to ``device``.
        train_loader: iterable of ``(inputs, targets)`` training batches.
        val_loader: iterable of ``(inputs, targets)`` validation batches.
        criterion: loss callable ``criterion(logits, targets)``.
        optimizer: optimizer over the model parameters.
        scheduler: learning-rate scheduler; stepped once per training batch.
        num_epochs: maximum number of epochs.
        clip_norm: maximum gradient norm passed to ``clip_grad_norm_``.
        early_stopping_patience: stop after this many consecutive epochs without
            a new best validation loss.
        device: device the model and batches are moved to.

    Returns:
        The same ``model`` object. If at least one epoch produced a best validation
        loss, the weights from that epoch are restored — both when early stopping
        triggers and when all epochs complete.
    """
    model.to(device)
    best_val_loss = float('inf')
    best_state = None
    epochs_no_improve = 0

    for epoch in range(1, num_epochs+1):
        model.train()
        total_loss = 0.0
        # Optimisation pass over the training batches
        for Xb, yb in train_loader:
            Xb, yb = Xb.to(device), yb.to(device)
            optimizer.zero_grad()
            logits = model(Xb)
            loss = criterion(logits, yb)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=clip_norm)
            optimizer.step()
            scheduler.step()
            total_loss += loss.item()

        avg_train_loss = total_loss / len(train_loader)

        # Validation pass
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for Xb, yb in val_loader:
                Xb, yb = Xb.to(device), yb.to(device)
                logits = model(Xb)
                val_loss += criterion(logits, yb).item()
        avg_val_loss = val_loss / len(val_loader)

        print(f"Epoch {epoch}/{num_epochs} — train_loss={avg_train_loss:.4f}, val_loss={avg_val_loss:.4f}")

        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            epochs_no_improve = 0
            # state_dict() hands out the live parameter tensors; clone them so later
            # optimizer steps cannot overwrite the snapshot
            best_state = {name: tensor.detach().clone()
                          for name, tensor in model.state_dict().items()}
        else:
            epochs_no_improve += 1
            if epochs_no_improve >= early_stopping_patience:
                print(f"Early stopping at epoch {epoch}")
                break

    # Restore the best weights whether training stopped early or ran to the end
    if best_state is not None:
        model.load_state_dict(best_state)
    return model


# Model evaluation: returns accuracy, precision, recall and F1
def evaluate_model(model, loader, device='cpu'):
    """Evaluate ``model`` on ``loader`` and return ``(accuracy, precision, recall, f1)``.

    The model is moved to ``device`` and put in eval mode. Predictions are the
    argmax over the logits; precision, recall and F1 are weighted averages with
    ``zero_division=0``.
    """
    model.to(device)
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for inputs, targets in loader:
            batch_preds = model(inputs.to(device)).argmax(dim=1)
            all_preds.extend(batch_preds.cpu().numpy())
            all_labels.extend(targets.numpy())
    weighted = dict(average='weighted', zero_division=0)
    return (
        accuracy_score(all_labels, all_preds),
        precision_score(all_labels, all_preds, **weighted),
        recall_score(all_labels, all_preds, **weighted),
        f1_score(all_labels, all_preds, **weighted),
    )

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Size the embedding table and the output layer from the FULL data (inputs and labels):
# using X_train alone can leave validation/test ids or label ids out of range.
vocab_size = int(max(X.max(), y.max())) + 1
num_classes = vocab_size
seq_len = X_train.shape[1]

# Plain (unweighted) cross-entropy loss
crit = nn.CrossEntropyLoss()
model = ImprovedLRCN(vocab_size, emb_dim=128, hidden_size=128,
                     num_layers=2, num_classes=num_classes).to(device)
opt = optim.AdamW(model.parameters(), lr=3e-3, weight_decay=1e-5)
# OneCycleLR is stepped once per batch inside train_model, so `epochs` here
# must equal the `num_epochs` passed to train_model below
scheduler = torch.optim.lr_scheduler.OneCycleLR(
    opt, max_lr=3e-3,
    steps_per_epoch=len(train_loader), epochs=20,
    pct_start=0.1, anneal_strategy='cos'
)

start = time.time()
# Record the resident memory of this process before training
process = psutil.Process(os.getpid())
mem_before = process.memory_info().rss

model = train_model(model, train_loader, val_loader, crit, opt, scheduler,
                    num_epochs=20, clip_norm=5.0,
                    early_stopping_patience=5, device=device)

test_acc, precision, recall, f1 = evaluate_model(model, test_loader, device)
print(f"Время работы: {time.time()-start:.1f}s")

# Resident memory after training
mem_after = process.memory_info().rss
mem_used = mem_after - mem_before

print(f"Потребление памяти процесса: {mem_used / (1024**2):.2f} MiB")

print(f"=== Оценка на тесте ===")
print(f"Accuracy = {test_acc:.4f}")
print(f"Precision = {precision:.4f}")
print(f"Recall = {recall:.4f}")
print(f"F1-Score = {f1:.4f}")

# LSTM

In [None]:
import torch
import time
import torch.nn as nn
import numpy as np
import torch.optim as optim
from collections import Counter
from torch.utils.data import DataLoader, TensorDataset, WeightedRandomSampler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
def make_loader(X_arr, y_arr, shuffle=False):
    """Return a DataLoader (batch size 64) over the arrays converted to ``torch.long`` tensors."""
    as_long = lambda array: torch.tensor(array, dtype=torch.long)
    dataset = TensorDataset(as_long(X_arr), as_long(y_arr))
    return DataLoader(dataset, batch_size=64, shuffle=shuffle)

# Loaders for iterating over the three splits; only the training split is shuffled
train_loader = make_loader(X_train, y_train, shuffle=True)
val_loader, test_loader = (
    make_loader(features, targets, shuffle=False)
    for features, targets in ((X_val, y_val), (X_test, y_test))
)

In [None]:
"""
Класс модели
Наследуется от nn.Module (базовый класс для всех моделей в PyTorch)
"""

class LSTM(nn.Module):
    """Bidirectional LSTM next-item classifier.

    Pipeline: embedding -> bidirectional LSTM -> output at the last time step ->
    LayerNorm -> dropout -> linear layer with ``vocab_size`` logits.

    Args:
        vocab_size: size of the embedding table and number of output logits.
        emb_dim: embedding width.
        hidden_size: hidden size of each LSTM direction.
        num_layers: number of stacked LSTM layers.
        seq_length: accepted for interface compatibility; not used by the model.
        pad_idx: padding index of the embedding.
        dropout: dropout probability applied before the output layer.
    """

    def __init__(self, vocab_size, emb_dim, hidden_size, num_layers, seq_length, pad_idx=0, dropout=0.3):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            # Inter-layer dropout only exists between stacked layers; passing a
            # non-zero value with a single layer triggers a PyTorch warning
            dropout=0.2 if num_layers > 1 else 0,
            bidirectional=True
        )
        # Both directions are concatenated, hence hidden_size * 2 features
        self.layernorm = nn.LayerNorm(hidden_size * 2)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_size * 2, vocab_size)

    def forward(self, x):
        """Return logits over the vocabulary for a batch of id sequences ``x``."""
        emb = self.emb(x)
        out, _ = self.lstm(emb)
        # Use the LSTM output at the last time step as the sequence summary
        h = out[:, -1, :]
        h = self.layernorm(h)
        h = self.dropout(h)
        return self.fc(h)


In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Cover every id that occurs in the inputs AND in the labels: the model emits
# vocab_size logits, so a label id above X.max() would be out of range.
vocab_size = int(max(X.max(), y.max())) + 1

# Record the resident memory of this process before training
process = psutil.Process(os.getpid())
mem_before = process.memory_info().rss

lstm_model = LSTM(
    vocab_size=vocab_size,
    emb_dim=128,
    hidden_size=128,
    num_layers=2,
    seq_length=seq_length,
    pad_idx=0,
    dropout=0.3
).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(lstm_model.parameters(), lr=1e-3)

In [None]:
# Training loop; the checkpoint with the best validation accuracy is kept on disk
best_val_acc = 0.0
start = time.time()

for epoch in range(1, 21):
    # Optimisation pass
    lstm_model.train()
    for Xb, yb in train_loader:
        Xb = Xb.to(device)
        yb = yb.to(device)
        optimizer.zero_grad()
        loss = criterion(lstm_model(Xb), yb)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(lstm_model.parameters(), 1.0)
        optimizer.step()

    # Validation pass
    lstm_model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for Xb, yb in val_loader:
            batch_preds = lstm_model(Xb.to(device)).argmax(dim=1)
            all_preds.append(batch_preds.cpu().numpy())
            all_labels.append(yb.numpy())
    val_preds, val_labels = np.concatenate(all_preds), np.concatenate(all_labels)
    val_acc = accuracy_score(val_labels, val_preds)

    print(f"Epoch {epoch}/20 — val_accuracy={val_acc:.4f}")

    # Save the weights whenever validation accuracy improves
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(lstm_model.state_dict(), 'best_lstm.pt')

print(f"Время работы: {time.time()-start:.1f}s")

# Resident memory after training
mem_after = process.memory_info().rss
mem_used = mem_after - mem_before

print(f"Потребление памяти процесса: {mem_used / (1024**2):.2f} MiB")

In [None]:
# Evaluate the best checkpoint on the test split
# map_location keeps the load working if the checkpoint was written on another device
lstm_model.load_state_dict(torch.load('best_lstm.pt', map_location=device))
lstm_model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for Xb, yb in test_loader:
        Xb = Xb.to(device)
        preds = lstm_model(Xb).argmax(dim=1).cpu().numpy()
        all_preds.append(preds)
        all_labels.append(yb.numpy())

test_preds = np.concatenate(all_preds)
test_labels = np.concatenate(all_labels)
test_acc = accuracy_score(test_labels, test_preds)
# zero_division=0 matches the other evaluations in this notebook and silences the
# undefined-metric warnings for classes that are never predicted
precision = precision_score(test_labels, test_preds, average='weighted', zero_division=0)
recall = recall_score(test_labels, test_preds, average='weighted', zero_division=0)
f1 = f1_score(test_labels, test_preds, average='weighted', zero_division=0)

print("=== Оценка на тесте ===")
print(f"Accuracy = {test_acc:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

# Transformer

In [None]:
import torch
import time
import torch.nn as nn
import numpy as np
import torch.optim as optim
from collections import Counter
from torch.utils.data import DataLoader, TensorDataset, WeightedRandomSampler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
# Collect the encoded categories of every session into one Python list
session_seqs = (
    ddf
    .groupby('session_id')['category_encoded']
    .apply(lambda group: group.tolist(), meta=('list', object))
    .compute()
)
# Keep only sessions with at least 10 events
session_seqs = [events_in_session for events_in_session in session_seqs
                if len(events_in_session) >= 10]

# Generate prefix -> label pairs (label = the category that follows the prefix)
sequences = [seq[:cut] for seq in session_seqs for cut in range(1, len(seq))]
labels = [seq[cut] for seq in session_seqs for cut in range(1, len(seq))]

# Pad / truncate to a fixed length and shift ids by +1
def pad_and_shift(seqs, seq_length):
    """Build a right-aligned, zero-padded int64 matrix with every id shifted by +1.

    Each sequence is shifted by +1 so that 0 can serve as the padding value, then
    its last ``seq_length`` items are written into the right end of the row; the
    remaining leading positions stay 0. Empty sequences produce an all-zero row.

    Args:
        seqs: sequence of integer sequences.
        seq_length: number of columns of the result.

    Returns:
        np.ndarray of shape ``(len(seqs), seq_length)`` and dtype int64.
    """
    X = np.zeros((len(seqs), seq_length), dtype=np.int64)
    for i, seq in enumerate(seqs):
        arr = np.asarray(seq, dtype=np.int64) + 1
        keep = min(len(arr), seq_length)
        if keep:
            # Explicit offsets: a slice like X[i, -0:] would address the whole row
            X[i, seq_length - keep:] = arr[len(arr) - keep:]
    return X

seq_length = 10
X = pad_and_shift(sequences, seq_length)
# Labels get the same +1 shift as the inputs
y = np.asarray(labels, dtype=np.int64) + 1

# Drop samples whose label occurs fewer than 2 times
unique, counts = np.unique(y, return_counts=True)
valid = unique[counts >= 2]
mask = np.isin(y, valid)
X = X[mask]
y = y[mask]

print(f"После фильтрации: всего сэмплов = {len(y)}, "
      f"уникальных классов = {len(np.unique(y))}")


# First hold out 20% as the test split, stratified by label
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

# Then take 25% of the remainder as the validation split
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.25, random_state=42, stratify=y_train_val
)
print(f"Размер X_train: {X_train.shape}")
print(f"Размер X_test:  {X_test.shape}")
print(f"Размер X_val:  {X_val.shape}")
print(f"Размер y_train: {y_train.shape}")
print(f"Размер y_test: {y_test.shape}")
print(f"Размер y_val:  {y_val.shape}")

In [None]:
def make_loader(X_arr, y_arr, shuffle=False):
    """Create a DataLoader with batch size 64 from feature and label arrays.

    Both arrays are converted to ``torch.long`` tensors first.
    """
    inputs = torch.tensor(X_arr, dtype=torch.long)
    labels_t = torch.tensor(y_arr, dtype=torch.long)
    dataset = TensorDataset(inputs, labels_t)
    return DataLoader(dataset, shuffle=shuffle, batch_size=64)

# Loaders for iterating over the three splits; only the training split is shuffled
train_loader = make_loader(X_train, y_train, shuffle=True)
val_loader, test_loader = (
    make_loader(features, targets, shuffle=False)
    for features, targets in ((X_val, y_val), (X_test, y_test))
)

In [None]:
"""
Добавляет к входным эмбеддингам информацию о позиции токенов в последовательности.
"""
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a batch of embeddings.

    A table ``pe`` of shape ``(1, max_len, d_model)`` is precomputed: even feature
    columns hold sines and odd columns hold cosines of ``position * frequency``.
    It is registered as a buffer, so it follows the module across devices but is
    not a trainable parameter.

    Args:
        d_model: embedding width (even or odd).
        max_len: number of positions in the table; inputs passed to ``forward``
            must not be longer than this.
    """

    def __init__(self, d_model, max_len=500):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        pos = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div = torch.exp(torch.arange(0, d_model, 2).float() * (-torch.log(torch.tensor(10000.0)) / d_model))
        angles = pos * div
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one cosine column fewer than sine columns
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions."""
        return x + self.pe[:, :x.size(1)]

In [None]:
"""
Класс модели
Наследуется от nn.Module (базовый класс для всех моделей в PyTorch)
"""

class Transformer(nn.Module):
    """Transformer-encoder next-item classifier.

    Pipeline: embedding -> positional encoding -> TransformerEncoder (padding
    positions masked) -> output at the last position -> LayerNorm -> dropout ->
    linear layer with ``vocab_size`` logits.

    Args:
        vocab_size: size of the embedding table and number of output logits.
        d_model: embedding / model width.
        nhead: number of attention heads.
        dim_feedforward: width of the feed-forward sublayer.
        num_layers: number of encoder layers.
        seq_length: number of positions prepared in the positional-encoding table.
        pad_idx: id treated as padding (embedding padding index and attention mask).
        dropout: dropout probability in the encoder and before the output layer.
    """

    def __init__(self,
                 vocab_size: int,
                 d_model: int = 128,
                 nhead: int = 4,
                 dim_feedforward: int = 512,
                 num_layers: int = 2,
                 seq_length: int = 5,
                 pad_idx: int = 0,
                 dropout: float = 0.3):
        super().__init__()
        # Token embedding
        self.emb = nn.Embedding(vocab_size, d_model, padding_idx=pad_idx)
        # Positional encoding
        self.pos_enc = PositionalEncoding(d_model, max_len=seq_length)
        # Stack of identical encoder layers built from one template layer
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=d_model,
                nhead=nhead,
                dim_feedforward=dim_feedforward,
                dropout=dropout,
                batch_first=True,
            ),
            num_layers=num_layers,
        )
        self.layernorm = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        """Return logits over the vocabulary for a batch of id sequences ``x``."""
        # True where the token is padding; those positions are ignored by attention
        pad_mask = x.eq(self.emb.padding_idx)
        encoded = self.transformer(self.pos_enc(self.emb(x)), src_key_padding_mask=pad_mask)
        # Summarise the sequence by the encoder output at the last position
        last = self.layernorm(encoded[:, -1, :])
        return self.fc(self.dropout(last))


In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Cover every id that occurs in the inputs AND in the labels: the last item of a
# session is only ever used as a label, so y.max() can exceed X.max().
vocab_size = int(max(X.max(), y.max())) + 1

# Record the resident memory of this process before training
process = psutil.Process(os.getpid())
mem_before = process.memory_info().rss

tr_model = Transformer(
    vocab_size=vocab_size,
    d_model=128,
    nhead=4,
    dim_feedforward=512,
    num_layers=2,
    seq_length=seq_length,
    pad_idx=0,
    dropout=0.3
).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(tr_model.parameters(), lr=1e-3)

In [None]:
# Training loop; the checkpoint with the best validation accuracy is kept on disk
best_val_acc = 0.0
start = time.time()

for epoch in range(1, 21):
    # Optimisation pass
    tr_model.train()
    for Xb, yb in train_loader:
        Xb = Xb.to(device)
        yb = yb.to(device)
        optimizer.zero_grad()
        loss = criterion(tr_model(Xb), yb)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(tr_model.parameters(), 1.0)
        optimizer.step()

    # Validation pass
    tr_model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for Xb, yb in val_loader:
            batch_preds = tr_model(Xb.to(device)).argmax(dim=1)
            all_preds.append(batch_preds.cpu().numpy())
            all_labels.append(yb.numpy())
    val_preds, val_labels = np.concatenate(all_preds), np.concatenate(all_labels)
    val_acc = accuracy_score(val_labels, val_preds)

    print(f"Epoch {epoch}/20 — val_accuracy={val_acc:.4f}")

    # Save the weights whenever validation accuracy improves
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(tr_model.state_dict(), 'best_transformer.pt')

print(f"Время работы: {time.time()-start:.1f}s")

# Resident memory after training
mem_after = process.memory_info().rss
mem_used = mem_after - mem_before

print(f"Потребление памяти процесса: {mem_used / (1024**2):.2f} MiB")

In [None]:
# Evaluate the best checkpoint on the test split
tr_model.load_state_dict(torch.load('best_transformer.pt'))
tr_model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for Xb, yb in test_loader:
        batch_preds = tr_model(Xb.to(device)).argmax(dim=1)
        all_preds.append(batch_preds.cpu().numpy())
        all_labels.append(yb.numpy())

test_preds, test_labels = np.concatenate(all_preds), np.concatenate(all_labels)
test_acc = accuracy_score(test_labels, test_preds)
# Weighted-average precision, recall and F1 computed with identical settings
precision, recall, f1 = (
    metric(test_labels, test_preds, average='weighted', zero_division=0)
    for metric in (precision_score, recall_score, f1_score)
)

print("=== Оценка на тесте ===")
print(f"Accuracy = {test_acc:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

# FFNN

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import time
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
seq_length = 5

merged_data = merged_data.sort_values(['session_id', 'timestamp'])

# Rebuild the Dask frame from the re-sorted data (only the two columns needed)
ddf = dd.from_pandas(merged_data[['session_id', 'category_encoded']], npartitions=16)
# Collect the encoded categories of every session into one Python list
session_seqs = (
    ddf
    .groupby('session_id')['category_encoded']
    .apply(lambda group: group.tolist(), meta=('list', object))
    .compute()
)

# Keep only sessions with at least 5 events
session_seqs = [events_in_session for events_in_session in session_seqs
                if len(events_in_session) >= 5]

# Generate prefix -> label pairs (label = the category that follows the prefix)
sequences = [seq[:cut] for seq in session_seqs for cut in range(1, len(seq))]
labels = [seq[cut] for seq in session_seqs for cut in range(1, len(seq))]

# Right-aligned padding + shift of every category id by +1
def pad_and_shift(seqs, seq_length):
    """Build a right-aligned, zero-padded int64 matrix with every id shifted by +1.

    Each sequence is shifted by +1 so that 0 is reserved for padding, then its
    last ``seq_length`` items are written into the right end of the row; the
    remaining leading positions stay 0. Empty sequences produce an all-zero row.

    Args:
        seqs: sequence of integer sequences.
        seq_length: number of columns of the result.

    Returns:
        np.ndarray of shape ``(len(seqs), seq_length)`` and dtype int64.
    """
    X = np.zeros((len(seqs), seq_length), dtype=np.int64)  # 0 = padding
    for i, seq in enumerate(seqs):
        arr = np.asarray(seq, dtype=np.int64) + 1  # real categories start at 1
        keep = min(len(arr), seq_length)
        if keep:
            # Explicit offsets: a slice like X[i, -0:] would address the whole row
            X[i, seq_length - keep:] = arr[len(arr) - keep:]
    return X

# Use the seq_length defined at the top of this cell instead of repeating the literal
X = pad_and_shift(sequences, seq_length)
y = np.array(labels, dtype=np.int64) + 1   # labels get the same +1 shift

# Drop samples whose label occurs fewer than 2 times
unique, counts = np.unique(y, return_counts=True)
valid = unique[counts >= 2]
mask = np.isin(y, valid)
X, y = X[mask], y[mask]

print(f"После фильтрации: всего сэмплов = {len(y)}, "
      f"уникальных классов = {len(valid)}")

# Split into train/val/test: 20% test, then 25% of the remainder as validation
X_trval, X_test, y_trval, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)
X_train, X_val, y_train, y_val = train_test_split(
    X_trval, y_trval, test_size=0.25, random_state=42, stratify=y_trval
)
print(f"Размер X_train: {X_train.shape}")
print(f"Размер X_test:  {X_test.shape}")
print(f"Размер X_val:  {X_val.shape}")
print(f"Размер y_train: {y_train.shape}")
print(f"Размер y_test: {y_test.shape}")
print(f"Размер y_val:  {y_val.shape}")

In [None]:
# Inverse-frequency class weights computed from the training labels.
# minlength pads the count vector to one slot per id seen in inputs or labels, so the
# weight vector is not shorter than the model's output layer when the largest ids
# do not occur among the training labels.
num_label_slots = int(max(X.max(), y.max())) + 1
class_counts = np.bincount(y_train, minlength=num_label_slots)
class_weights = np.zeros_like(class_counts, dtype=np.float32)
nonzero = class_counts > 0
class_weights[nonzero] = 1.0 / class_counts[nonzero]

# Weight tensor for CrossEntropyLoss
weight_tensor = torch.from_numpy(class_weights).float()

# Per-sample weights for the sampler: every sample gets the weight of its class
sample_weights = class_weights[y_train]
sampler = WeightedRandomSampler(sample_weights,
                                num_samples=len(sample_weights),
                                replacement=True)

def make_loader(X_arr, y_arr, sampler=None, shuffle=False):
    """Create a DataLoader (batch size 64) over arrays converted to ``torch.long`` tensors.

    When ``sampler`` is given it determines the sample order and ``shuffle`` is
    ignored; otherwise ``shuffle`` is forwarded to the DataLoader.
    """
    features = torch.tensor(X_arr, dtype=torch.long)
    targets = torch.tensor(y_arr, dtype=torch.long)
    # Shuffling is only requested when no sampler was supplied
    use_shuffle = shuffle and sampler is None
    return DataLoader(TensorDataset(features, targets),
                      batch_size=64,
                      sampler=sampler,
                      shuffle=use_shuffle)

# Loaders for the three splits; the training loader draws samples through the weighted sampler
train_loader = make_loader(X_train, y_train, sampler=sampler)
val_loader, test_loader = (
    make_loader(features, targets, shuffle=False)
    for features, targets in ((X_val, y_val), (X_test, y_test))
)

In [None]:
"""
Класс модели
Наследуется от nn.Module (базовый класс для всех моделей в PyTorch)
"""

class FFNN(nn.Module):
    """Feed-forward next-item classifier over a fixed-length id sequence.

    Pipeline: embedding -> flatten all positions -> Linear -> BatchNorm -> ReLU ->
    dropout -> Linear with ``vocab_size`` logits.

    Args:
        vocab_size: size of the embedding table and number of output logits.
        emb_dim: embedding width.
        seq_length: number of positions per input; fixes the first layer's input size.
        hidden_dim: width of the hidden layer.
        dropout: dropout probability after the hidden activation.
        pad_idx: padding index of the embedding.
    """

    def __init__(self, vocab_size, emb_dim, seq_length, hidden_dim, dropout, pad_idx=0):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_dim, padding_idx=pad_idx)
        self.fc1 = nn.Linear(emb_dim * seq_length, hidden_dim)
        self.bn1 = nn.BatchNorm1d(hidden_dim)
        self.relu = nn.ReLU(inplace=True)
        self.do = nn.Dropout(dropout)
        self.fc2 = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        """Return logits over the vocabulary for a batch of id sequences ``x``."""
        embedded = self.emb(x)
        # Concatenate the embeddings of all positions into one feature vector per sample
        flat = embedded.view(embedded.size(0), -1)
        hidden = self.relu(self.bn1(self.fc1(flat)))
        return self.fc2(self.do(hidden))

In [None]:
device     = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Cover every id that occurs in the inputs AND in the labels
vocab_size = int(max(X.max(), y.max())) + 1
# Number of training epochs. OneCycleLR needs the total step count up front, so this
# must equal the number of epochs the training loop runs (range(1, 21) -> 20).
EPOCHS = 20

# Record the resident memory of this process before training
process = psutil.Process(os.getpid())
mem_before = process.memory_info().rss

# The first linear layer is sized from the actual input width rather than a literal
ffnn_model = FFNN(vocab_size, 128, X.shape[1], 256, 0.3).to(device)
criterion = nn.CrossEntropyLoss(weight=weight_tensor.to(device))
optimizer = optim.Adam(ffnn_model.parameters(), lr=1e-3)
scheduler = optim.lr_scheduler.OneCycleLR(
    optimizer,
    max_lr=1e-3 * 10,
    steps_per_epoch=len(train_loader),
    epochs=EPOCHS
)

In [None]:
# Training loop; the checkpoint with the best validation accuracy is kept on disk
best_val_acc = 0.0
start = time.time()

for epoch in range(1, 21):
    # Optimisation pass; the LR scheduler is advanced after every batch
    ffnn_model.train()
    for Xb, yb in train_loader:
        Xb = Xb.to(device)
        yb = yb.to(device)
        optimizer.zero_grad()
        loss = criterion(ffnn_model(Xb), yb)
        loss.backward()
        optimizer.step()
        scheduler.step()

    # Validation pass
    ffnn_model.eval()
    preds, labs = [], []
    with torch.no_grad():
        for Xb, yb in val_loader:
            batch_preds = ffnn_model(Xb.to(device)).argmax(dim=1)
            preds.append(batch_preds.cpu().numpy())
            labs.append(yb.numpy())
    val_preds, val_labels = np.concatenate(preds), np.concatenate(labs)
    val_acc = accuracy_score(val_labels, val_preds)

    print(f"Epoch {epoch}/{20} — val_acc={val_acc:.4f}")

    # Save the weights whenever validation accuracy improves
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(ffnn_model.state_dict(), 'best_ffnn.pt')

print(f"Время работы: {time.time()-start:.1f}s")

# Resident memory after training
mem_after = process.memory_info().rss
mem_used = mem_after - mem_before

print(f"Потребление памяти процесса: {mem_used / (1024**2):.2f} MiB")

In [None]:
# Evaluate the best checkpoint on the test split
ffnn_model.load_state_dict(torch.load('best_ffnn.pt'))
ffnn_model.eval()
preds, labs = [], []
with torch.no_grad():
    for Xb, yb in test_loader:
        batch_preds = ffnn_model(Xb.to(device)).argmax(dim=1)
        preds.append(batch_preds.cpu().numpy())
        labs.append(yb.numpy())

test_preds, test_labels = np.concatenate(preds), np.concatenate(labs)
test_acc = accuracy_score(test_labels, test_preds)
# Weighted-average precision, recall and F1 computed with identical settings
precision, recall, f1 = (
    metric(test_labels, test_preds, average='weighted', zero_division=0)
    for metric in (precision_score, recall_score, f1_score)
)

print("=== Оценка на тесте ===")
print(f"Accuracy = {test_acc:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

In [None]:
from sklearn.metrics import precision_score, recall_score
# Macro-averaged precision and recall (every class counts equally) on the test predictions
prec_macro, rec_macro = (
    metric(test_labels, test_preds, average='macro', zero_division=0)
    for metric in (precision_score, recall_score)
)
print(f"Macro Precision: {prec_macro:.4f}, Macro Recall: {rec_macro:.4f}")