In [None]:
"""
recsys_pointwise.py

Простой pipeline для задачи рекомендаций (pointwise). Делает следующее:
1. Читает train_data.pq и sample_submission.csv
2. Делит по времени: последние 7 дней как target (positive) для обучения/валидации
3. Генерирует кандидатов: история пользователя, популярные товары, случайные негативы
4. Строит признаки (user/item statistics, recency, user-item count)
5. Обучает LightGBM классификатор (pointwise) для предсказания вероятности клика
6. Генерирует топ-20 рекомендаций для пользователей из sample_submission
7. Сохраняет два варианта сабмишна: "submission_long.csv" (one row per user-item)
   и "submission_space.csv" (one row per user with space-separated 20 item_ids)

Требования: pandas, numpy, pyarrow (для чтения parquet), scikit-learn, lightgbm

Запуск:
python recsys_pointwise.py --train train_data.pq --sample sample_submission.csv --out_dir ./output

Примечание: это минимальный, но практичный baseline. Его легко улучшать: больше фич,
больше кандидатов, продвинутые модели (табличные NN / embedding + dot-product),
специфичная валидация по времени и т.д.
"""

import argparse
import os
import gc
from collections import defaultdict

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score

# lightgbm may not be installed in user's env; try import and give friendly error
try:
    import lightgbm as lgb
except Exception as e:
    lgb = None


def read_data(train_path, sample_path):
    print("Reading train parquet...")
    df = pd.read_parquet(train_path)
    print(f"Loaded interactions: {len(df):,}")
    sample = pd.read_csv(sample_path)
    print(f"Loaded sample submission: {len(sample):,} rows")
    return df, sample


def time_split_make_labels(df, days_for_target=7):
    # assume 'date' is integer day index
    max_day = int(df['date'].max())
    split_day = max_day - days_for_target
    print(f"Max day in data: {max_day}, using last {days_for_target} days as target (days > {split_day})")

    train_df = df[df['date'] <= split_day].copy()
    target_df = df[df['date'] > split_day].copy()
    print(f"Train interactions: {len(train_df):,}, target interactions: {len(target_df):,}")
    return train_df, target_df, split_day


def make_stats(train_df):
    print("Building basic user/item statistics...")
    user_clicks = train_df.groupby('user_id').size().rename('user_total_clicks')
    user_nunique_items = train_df.groupby('user_id')['item_id'].nunique().rename('user_nunique_items')
    item_clicks = train_df.groupby('item_id').size().rename('item_total_clicks')
    item_last_day = train_df.groupby('item_id')['date'].max().rename('item_last_day')

    user_stats = pd.concat([user_clicks, user_nunique_items], axis=1).reset_index()
    item_stats = pd.concat([item_clicks, item_last_day], axis=1).reset_index()

    return user_stats, item_stats


def generate_candidates(train_df, sample_users, item_stats, topk_popular=500, max_user_history=200, random_neg_per_user=200, seed=42):
    """
    Candidate generation strategy:
    - for each user: include their historical items (most recent first up to max_user_history)
    - add top popular items (global)
    - add some random negatives sampled from popular pool
    Returns dict: user_id -> set(item_ids)
    """
    rng = np.random.RandomState(seed)

    # top popular items globally
    popular_items = item_stats.sort_values('item_total_clicks', ascending=False)['item_id'].values
    popular_top = popular_items[:topk_popular]

    # user history (most recent first)
    user_history = train_df.sort_values(['user_id', 'date'], ascending=[True, False]).groupby('user_id')['item_id'].apply(list)

    candidates = {}
    pop_pool = list(popular_top)

    for uid in sample_users:
        cands = []
        if uid in user_history.index:
            # take last N
            hist = user_history.loc[uid][:max_user_history]
            cands.extend(hist)
        # add popular
        cands.extend(pop_pool[:200])
        # random negatives from popular pool
        neg = rng.choice(pop_pool, size=min(random_neg_per_user, len(pop_pool)), replace=False).tolist()
        cands.extend(neg)
        candidates[uid] = set(cands)

    print(f"Generated candidates for {len(candidates)} users; avg candidates per user ~ {np.mean([len(v) for v in candidates.values()]):.1f}")
    return candidates


def build_feature_table(train_df, target_df, candidates, user_stats, item_stats, split_day):
    """
    Build training table: for each user-item candidate produce features and label (1 if in target_df)
    """
    print("Building feature table (this may use memory)...")
    # map stats
    user_stats = user_stats.set_index('user_id')
    item_stats = item_stats.set_index('item_id')

    # build a set of positive pairs (from target_df)
    pos_pairs = set(zip(target_df['user_id'], target_df['item_id']))

    rows = []

    for i, (uid, items) in enumerate(candidates.items()):
        if (i+1) % 10000 == 0:
            print(f"Processed {i+1} users for features...")
        # prefetch user stats
        if uid in user_stats.index:
            u_total = int(user_stats.at[uid, 'user_total_clicks'])
            u_nitems = int(user_stats.at[uid, 'user_nunique_items'])
        else:
            u_total = 0
            u_nitems = 0

        # get user's last interaction day from train_df (if present)
        user_last_day = train_df[train_df['user_id'] == uid]['date'].max() if uid in train_df['user_id'].values else np.nan

        for iid in items:
            if iid in item_stats.index:
                i_total = int(item_stats.at[iid, 'item_total_clicks'])
                i_last = int(item_stats.at[iid, 'item_last_day'])
            else:
                i_total = 0
                i_last = -999

            # user-item historical count in train
            ui_count = ((train_df['user_id'] == uid) & (train_df['item_id'] == iid)).sum()

            recency = split_day - i_last  # how many days since last item click in train

            label = 1 if (uid, iid) in pos_pairs else 0

            rows.append((uid, iid, u_total, u_nitems, i_total, ui_count, recency, label))

    feats = pd.DataFrame(rows, columns=['user_id','item_id','user_total_clicks','user_nunique_items','item_total_clicks','ui_count','recency','label'])
    print(f"Feature table size: {len(feats):,}")
    return feats


def train_pointwise_model(feats, num_boost_round=1000, early_stopping_rounds=50):
    if lgb is None:
        raise ImportError("lightgbm is required to train the model. Please install it (pip install lightgbm)")

    X = feats[['user_total_clicks','user_nunique_items','item_total_clicks','ui_count','recency']]
    y = feats['label']

    # simple stratified split maintaining label distribution
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.15, random_state=42, stratify=y)

    dtrain = lgb.Dataset(X_train, label=y_train)
    dval = lgb.Dataset(X_val, label=y_val)

    params = {
        'objective': 'binary',
        'metric': 'auc',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'learning_rate': 0.1,
        'num_leaves': 63,
        'seed': 42,
    }

    print("Training LightGBM...")
    model = lgb.train(params, dtrain, valid_sets=[dtrain, dval], valid_names=['train','val'],
                      num_boost_round=num_boost_round, early_stopping_rounds=early_stopping_rounds, verbose_eval=50)

    # evaluate AP@20 in a rough way on validation users
    print("Evaluating on validation set (approx AP) ...")
    val_preds = model.predict(X_val)
    try:
        ap = average_precision_score(y_val, val_preds)
        print(f"Validation average precision (binary): {ap:.6f}")
    except Exception:
        print("Couldn't compute average_precision_score (maybe all labels same)")

    return model


def predict_topk(model, candidates, user_stats, item_stats, split_day, topk=20):
    print("Scoring candidates and producing top-k per user...")
    rows = []
    user_stats = user_stats.set_index('user_id')
    item_stats = item_stats.set_index('item_id')

    for i, (uid, items) in enumerate(candidates.items()):
        if (i+1) % 10000 == 0:
            print(f"Scored {i+1} users...")
        feats = []
        iids = []
        for iid in items:
            if uid in user_stats.index:
                u_total = int(user_stats.at[uid, 'user_total_clicks'])
                u_nitems = int(user_stats.at[uid, 'user_nunique_items'])
            else:
                u_total = 0
                u_nitems = 0
            if iid in item_stats.index:
                i_total = int(item_stats.at[iid, 'item_total_clicks'])
                i_last = int(item_stats.at[iid, 'item_last_day'])
            else:
                i_total = 0
                i_last = -999
            ui_count = 0  # we don't keep full train_df here for speed
            recency = split_day - i_last
            feats.append([u_total, u_nitems, i_total, ui_count, recency])
            iids.append(iid)
        X = np.array(feats)
        scores = model.predict(X)
        top_idx = np.argsort(-scores)[:topk]
        top_items = [iids[idx] for idx in top_idx]
        rows.append((uid, top_items))
    print("Done scoring all users")
    return rows


def save_submission_long(rows, out_path):
    # rows: list of (user_id, [item_ids])
    out_rows = []
    for uid, items in rows:
        for iid in items:
            out_rows.append((uid, iid))
    sub_df = pd.DataFrame(out_rows, columns=['user_id','item_id'])
    sub_df.to_csv(out_path, index=False)
    print(f"Saved long-format submission to {out_path}")


def save_submission_space(rows, out_path):
    out_rows = []
    for uid, items in rows:
        out_rows.append((uid, ' '.join(map(str, items))))
    sub_df = pd.DataFrame(out_rows, columns=['user_id','predictions'])
    sub_df.to_csv(out_path, index=False)
    print(f"Saved space-separated submission to {out_path}")


def main(args):
    df, sample = read_data(args.train, args.sample)
    train_df, target_df, split_day = time_split_make_labels(df, days_for_target=args.days_target)

    user_stats, item_stats = make_stats(train_df)

    sample_users = sample['user_id'].unique().tolist()

    candidates = generate_candidates(train_df, sample_users, item_stats,
                                     topk_popular=args.topk_popular,
                                     max_user_history=args.max_user_history,
                                     random_neg_per_user=args.random_neg)

    feats = build_feature_table(train_df, target_df, candidates, user_stats, item_stats, split_day)

    # balance down negative samples for faster training (optional)
    pos = feats[feats['label'] == 1]
    neg = feats[feats['label'] == 0]
    if len(neg) > 5 * len(pos):
        neg = neg.sample(n=min(len(neg), 5 * len(pos)), random_state=42)
    feats_bal = pd.concat([pos, neg]).sample(frac=1, random_state=42).reset_index(drop=True)

    model = train_pointwise_model(feats_bal, num_boost_round=args.num_boost_round, early_stopping_rounds=args.early_stop)

    # predict
    rows = predict_topk(model, candidates, user_stats, item_stats, split_day, topk=args.topk)

    os.makedirs(args.out_dir, exist_ok=True)
    save_submission_long(rows, os.path.join(args.out_dir, 'submission_long.csv'))
    save_submission_space(rows, os.path.join(args.out_dir, 'submission_space.csv'))

    print("All done.")


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--train', type=str, required=True, help='train_data.pq')
    parser.add_argument('--sample', type=str, required=True, help='sample_submission.csv')
    parser.add_argument('--out_dir', type=str, default='./output')
    parser.add_argument('--days_target', type=int, default=7)
    parser.add_argument('--topk_popular', type=int, default=500)
    parser.add_argument('--max_user_history', type=int, default=200)
    parser.add_argument('--random_neg', type=int, default=200)
    parser.add_argument('--topk', type=int, default=20)
    parser.add_argument('--num_boost_round', type=int, default=1000)
    parser.add_argument('--early_stop', type=int, default=50)
    args = parser.parse_args()
    main(args)

In [None]:
"""
recsys_pointwise_kaggle.py

Pointwise baseline для Kaggle Notebook.
1. Читает train_data.pq и sample_submission.csv (должны быть в /kaggle/input/...)
2. Делит по времени: последние 7 дней — как целевой период
3. Генерирует кандидатов: история пользователя, популярные товары, случайные негативы
4. Строит простые признаки (user/item статистики, recency)
5. Обучает LightGBM (binary pointwise)
6. Предсказывает топ-20 item_id для каждого user_id
7. Сохраняет submission.csv в /kaggle/working/

Требования: pandas, numpy, pyarrow, lightgbm, scikit-learn
"""

import os
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score

# === CONFIG ===
TRAIN_PATH = '/kaggle/input/reccomend/train_data.pq'  # изменить под структуру датасета
SAMPLE_PATH = '/kaggle/input/asddbfd/sample_submission (1).csv'
OUT_PATH = '/kaggle/working/submission.csv'

DAYS_TARGET = 7
TOPK_POPULAR = 500
MAX_USER_HISTORY = 200
RANDOM_NEG = 200
TOPK = 20
SEED = 42


# === STEP 1: LOAD DATA ===
print('Reading data...')
df = pd.read_parquet(TRAIN_PATH)
sample = pd.read_csv(SAMPLE_PATH)
print(f"Train shape: {df.shape}, Sample: {sample.shape}")

# === STEP 2: TIME SPLIT ===
max_day = int(df['date'].max())
split_day = max_day - DAYS_TARGET
train_df = df[df['date'] <= split_day].copy()
target_df = df[df['date'] > split_day].copy()
print(f"Train days ≤ {split_day}, target days > {split_day}")

# === STEP 3: BASIC STATS ===
user_clicks = train_df.groupby('user_id').size().rename('user_total_clicks')
user_nunique_items = train_df.groupby('user_id')['item_id'].nunique().rename('user_nunique_items')
item_clicks = train_df.groupby('item_id').size().rename('item_total_clicks')
item_last_day = train_df.groupby('item_id')['date'].max().rename('item_last_day')

user_stats = pd.concat([user_clicks, user_nunique_items], axis=1).reset_index()
item_stats = pd.concat([item_clicks, item_last_day], axis=1).reset_index()

# === STEP 4: CANDIDATES ===
np.random.seed(SEED)
popular_items = item_stats.sort_values('item_total_clicks', ascending=False)['item_id'].values[:TOPK_POPULAR]
user_history = train_df.sort_values(['user_id','date'], ascending=[True, False]).groupby('user_id')['item_id'].apply(list)

candidates = {}
for uid in sample['user_id'].unique():
    cands = []
    if uid in user_history.index:
        cands.extend(user_history.loc[uid][:MAX_USER_HISTORY])
    cands.extend(popular_items[:200])
    cands.extend(np.random.choice(popular_items, size=min(RANDOM_NEG, len(popular_items)), replace=False))
    candidates[uid] = list(set(cands))

print(f"Generated candidates for {len(candidates)} users")

# === STEP 5: FEATURE TABLE ===
user_stats = user_stats.set_index('user_id')
item_stats = item_stats.set_index('item_id')

pos_pairs = set(zip(target_df['user_id'], target_df['item_id']))

rows = []
for uid, items in candidates.items():
    if uid in user_stats.index:
        u_total = user_stats.at[uid, 'user_total_clicks']
        u_nitems = user_stats.at[uid, 'user_nunique_items']
    else:
        u_total, u_nitems = 0, 0
    for iid in items:
        if iid in item_stats.index:
            i_total = item_stats.at[iid, 'item_total_clicks']
            i_last = item_stats.at[iid, 'item_last_day']
        else:
            i_total, i_last = 0, -999
        recency = split_day - i_last
        ui_count = ((train_df['user_id'] == uid) & (train_df['item_id'] == iid)).sum()
        label = 1 if (uid, iid) in pos_pairs else 0
        rows.append((uid, iid, u_total, u_nitems, i_total, ui_count, recency, label))

feats = pd.DataFrame(rows, columns=['user_id','item_id','user_total_clicks','user_nunique_items','item_total_clicks','ui_count','recency','label'])
print(f"Feature table shape: {feats.shape}")

# Balance for faster training
pos = feats[feats['label'] == 1]
neg = feats[feats['label'] == 0]
if len(neg) > 5 * len(pos):
    neg = neg.sample(n=5 * len(pos), random_state=SEED)
feats_bal = pd.concat([pos, neg]).sample(frac=1, random_state=SEED)

# === STEP 6: TRAIN MODEL (RankNet) ===
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score
from tqdm.notebook import tqdm

# Подготовка данных
X = feats_bal[['user_total_clicks','user_nunique_items','item_total_clicks','ui_count','recency']]
y = feats_bal['label']
groups = feats_bal.groupby('user_id').size().values  # количество кандидатов на каждого пользователя

# Разделение по пользователям, а не по строкам
unique_users = feats_bal['user_id'].unique()
train_users, val_users = train_test_split(unique_users, test_size=0.15, random_state=SEED)

train_mask = feats_bal['user_id'].isin(train_users)
val_mask = feats_bal['user_id'].isin(val_users)

X_train, y_train = X[train_mask], y[train_mask]
X_val, y_val = X[val_mask], y[val_mask]

group_train = feats_bal[train_mask].groupby('user_id').size().values
group_val = feats_bal[val_mask].groupby('user_id').size().values

# LightGBM Datasets
dtrain = lgb.Dataset(X_train, label=y_train, group=group_train)
dval = lgb.Dataset(X_val, label=y_val, group=group_val)

params = {
    'objective': 'rank_xendcg',  # RankNet / LambdaRank / Xendcg (все поддерживаются)
    'metric': 'ndcg',
    'ndcg_eval_at': [5, 10],
    'learning_rate': 0.05,
    'num_leaves': 63,
    'min_data_in_leaf': 20,
    'boosting': 'gbdt',
    'verbosity': -1,
    'seed': SEED
}

EPOCHS = 1
pbar = tqdm(total=EPOCHS, desc="Training RankNet", position=0, leave=True)

def tqdm_callback(env):
    pbar.update(1)
    if env.iteration % 10 == 0:
        ndcg_val = env.evaluation_result_list[1][2]
        pbar.set_postfix({'iter': env.iteration, 'val_ndcg': f"{ndcg_val:.4f}"})

model = lgb.train(
    params,
    dtrain,
    valid_sets=[dtrain, dval],
    valid_names=['train','val'],
    num_boost_round=EPOCHS,
    early_stopping_rounds=30,
    verbose_eval=False,
    callbacks=[tqdm_callback]
)

pbar.close()
print("Best iteration:", model.best_iteration)



# === STEP 7: PREDICT TOPK ===
rows_pred = []
for uid, items in candidates.items():
    feats_user = []
    iids = []
    if uid in user_stats.index:
        u_total = user_stats.at[uid, 'user_total_clicks']
        u_nitems = user_stats.at[uid, 'user_nunique_items']
    else:
        u_total, u_nitems = 0, 0
    for iid in items:
        if iid in item_stats.index:
            i_total = item_stats.at[iid, 'item_total_clicks']
            i_last = item_stats.at[iid, 'item_last_day']
        else:
            i_total, i_last = 0, -999
        recency = split_day - i_last
        feats_user.append([u_total, u_nitems, i_total, 0, recency])
        iids.append(iid)
    X_pred = np.array(feats_user)
    scores = model.predict(X_pred)
    top_items = [iids[i] for i in np.argsort(-scores)[:TOPK]]
    rows_pred.append((uid, ' '.join(map(str, top_items))))

submission = pd.DataFrame(rows_pred, columns=['user_id','predictions'])
submission.to_csv(OUT_PATH, index=False)
print(f'Saved submission to {OUT_PATH}')


Reading data...
Train shape: (8777975, 3), Sample: (5864600, 2)
Train days ≤ 39, target days > 39
Generated candidates for 293230 users


KeyboardInterrupt: 

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm

# === CONFIG ===
TRAIN_PATH = '/kaggle/input/reccomend/train_data.pq'
SAMPLE_PATH = '/kaggle/input/asddbfd/sample_submission (1).csv'
OUT_PATH = '/kaggle/working/submission.csv'
SEED = 42
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
DAYS_TARGET = 7
TOPK = 20
TOPK_POPULAR = 3000
MAX_USER_HISTORY = 20
MAX_USERS = 50000      # ⚠️ обучаем только на части пользователей
CANDS_PER_USER = 200
RANDOM_NEG = 50
torch.manual_seed(SEED)
np.random.seed(SEED)

# === STEP 1: LOAD ===
print("Reading data...")
df = pd.read_parquet(TRAIN_PATH)
sample = pd.read_csv(SAMPLE_PATH)
print(f"Train shape: {df.shape}, Sample: {sample.shape}")

# === STEP 2: TIME SPLIT ===
max_day = int(df['date'].max())
split_day = max_day - DAYS_TARGET
train_df = df[df['date'] <= split_day]
target_df = df[df['date'] > split_day]

print(f"Train ≤ {split_day}, Target > {split_day}")

# === STEP 3: BASIC STATS ===
user_clicks = train_df.groupby('user_id').size().rename('user_total_clicks')
user_nunique_items = train_df.groupby('user_id')['item_id'].nunique().rename('user_nunique_items')
item_clicks = train_df.groupby('item_id').size().rename('item_total_clicks')
item_last_day = train_df.groupby('item_id')['date'].max().rename('item_last_day')

user_stats = pd.concat([user_clicks, user_nunique_items], axis=1)
item_stats = pd.concat([item_clicks, item_last_day], axis=1)

# ограничим пользователей
active_users = user_clicks.sort_values(ascending=False).head(MAX_USERS).index
sample = sample[sample['user_id'].isin(active_users)]
print(f"Using {len(sample)} most active users for training")

# === STEP 4: CANDIDATES (быстро) ===
popular_items = item_stats.sort_values('item_total_clicks', ascending=False)['item_id'].values[:TOPK_POPULAR]
user_history = (
    train_df[train_df['user_id'].isin(active_users)]
    .sort_values(['user_id', 'date'], ascending=[True, False])
    .groupby('user_id')['item_id']
    .apply(lambda x: x.iloc[:MAX_USER_HISTORY].tolist())
)

def get_candidates(uid):
    cands = []
    if uid in user_history:
        cands += user_history[uid]
    cands += list(popular_items[:100])
    if len(cands) < CANDS_PER_USER:
        cands += list(np.random.choice(popular_items, size=CANDS_PER_USER - len(cands), replace=False))
    return list(set(cands))

print("Generating candidates (subset)...")
candidates = {uid: get_candidates(uid) for uid in tqdm(sample['user_id'])}

# === STEP 5: FEATURE TABLE (векторизовано) ===
def build_features(uids, cands_dict):
    feats = []
    for uid in uids:
        items = cands_dict[uid]
        u_total = user_stats.at[uid, 'user_total_clicks'] if uid in user_stats.index else 0
        u_nitems = user_stats.at[uid, 'user_nunique_items'] if uid in user_stats.index else 0
        user_part = np.array([[uid, iid, u_total, u_nitems] for iid in items])
        feats.append(user_part)
    feats = np.vstack(feats)
    feats = pd.DataFrame(feats, columns=['user_id', 'item_id', 'user_total_clicks', 'user_nunique_items'])
    feats['item_total_clicks'] = item_stats.reindex(feats['item_id'].values)['item_total_clicks'].fillna(0).values
    feats['recency'] = split_day - item_stats.reindex(feats['item_id'].values)['item_last_day'].fillna(-999).values
    feats['ui_count'] = (
        train_df[['user_id','item_id']].value_counts().reindex(list(zip(feats['user_id'], feats['item_id'])))
        .fillna(0).values
    )
    feats['label'] = feats.apply(lambda r: 1 if (r['user_id'], r['item_id']) in set(zip(target_df['user_id'], target_df['item_id'])) else 0, axis=1)
    return feats

print("Building compact feature table...")
feats = build_features(sample['user_id'].values, candidates)
print(f"Feature table: {feats.shape}")

# Ограничим количество примеров для быстрой тренировки
feats_small = feats.sample(n=min(100_000, len(feats)), random_state=SEED).reset_index(drop=True)

# === STEP 6: MAKE PAIRS ===
def make_pairs_fast(df):
    pairs = []
    grouped = df.groupby('user_id')
    for uid, g in grouped:
        pos = g[g['label'] == 1]
        neg = g[g['label'] == 0]
        if len(pos) == 0 or len(neg) == 0:
            continue
        neg_sample = neg.sample(min(RANDOM_NEG, len(neg)), random_state=SEED)
        for _, p in pos.iterrows():
            for _, n in neg_sample.iterrows():
                pairs.append((p, n))
    return pairs

print("Generating pairs (subset)...")
pairs = make_pairs_fast(feats_small)
print(f"Pairs: {len(pairs)}")

def tensorize(rows):
    X1 = np.array([[r[0]['user_total_clicks'], r[0]['user_nunique_items'], r[0]['item_total_clicks'], r[0]['ui_count'], r[0]['recency']] for r in rows], dtype=np.float32)
    X2 = np.array([[r[1]['user_total_clicks'], r[1]['user_nunique_items'], r[1]['item_total_clicks'], r[1]['ui_count'], r[1]['recency']] for r in rows], dtype=np.float32)
    return torch.tensor(X1), torch.tensor(X2)

X1, X2 = tensorize(pairs)

# === STEP 7: RANKNET MODEL ===
class RankNet(nn.Module):
    def __init__(self, input_dim=5, hidden_dim=64):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.ReLU(),
            nn.Linear(hidden_dim // 2, 1)
        )

    def forward(self, x):
        return self.net(x)

model = RankNet().to(DEVICE)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.BCELoss()

# === STEP 8: TRAIN LOOP (SUBSET) ===
EPOCHS = 5
BATCH_SIZE = 1024

pbar = tqdm(range(EPOCHS), desc="Training RankNet")
for epoch in pbar:
    perm = torch.randperm(len(X1))
    total_loss = 0
    model.train()
    for i in range(0, len(X1), BATCH_SIZE):
        idx = perm[i:i+BATCH_SIZE]
        x1 = X1[idx].to(DEVICE)
        x2 = X2[idx].to(DEVICE)
        s1 = model(x1)
        s2 = model(x2)
        p = torch.sigmoid(s1 - s2)
        y = torch.ones_like(p)
        loss = criterion(p, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    pbar.set_postfix({'loss': f'{total_loss:.4f}'})

# === STEP 9: PREDICT TOPK ===
print("Predicting for submission (subset)...")
model.eval()
rows_pred = []
with torch.no_grad():
    for uid, items in tqdm(candidates.items()):
        feats_user = []
        iids = []
        if uid in user_stats.index:
            u_total = user_stats.at[uid, 'user_total_clicks']
            u_nitems = user_stats.at[uid, 'user_nunique_items']
        else:
            u_total, u_nitems = 0, 0
        for iid in items:
            if iid in item_stats.index:
                i_total = item_stats.at[iid, 'item_total_clicks']
                i_last = item_stats.at[iid, 'item_last_day']
            else:
                i_total, i_last = 0, -999
            recency = split_day - i_last
            feats_user.append([u_total, u_nitems, i_total, 0, recency])
            iids.append(iid)
        X_pred = torch.tensor(feats_user, dtype=torch.float32).to(DEVICE)
        scores = model(X_pred).squeeze().cpu().numpy()
        top_items = [iids[i] for i in np.argsort(-scores)[:TOPK]]
        rows_pred.append((uid, ' '.join(map(str, top_items))))

submission = pd.DataFrame(rows_pred, columns=['user_id','predictions'])
submission.to_csv(OUT_PATH, index=False)
print(f'Saved submission to {OUT_PATH}')

Reading data...
Train shape: (8777975, 3), Sample: (5864600, 2)
Train ≤ 39, Target > 39
Using 353680 most active users for training


KeyError: 'item_id'

In [None]:
# Установка необходимых библиотек
# !pip install pandas numpy scikit-surprise scikit-learn

import pandas as pd
import numpy as np
from surprise import SVDpp, Dataset, Reader
from surprise.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# === 1. Загрузка данных ===
train_df = pd.read_csv("/kaggle/input/hshussu/train.tsv", sep='\t')
test_df = pd.read_csv("/kaggle/input/hshussu/test.tsv", sep='\t')
# Попробуем безопасно загрузить файл отзывов
reviews_df = pd.read_csv(
    "/kaggle/input/hshussu/reviews.txv/reviews.tsv",
    sep='\t',
    on_bad_lines='skip',  # пропуск строк с ошибками
    quoting=3,            # ignore quotes
    engine='python'       # более устойчивый парсер
)

print(reviews_df.head())


# === 2. Подготовка признаков текстов ===
# Заменяем None/NaN на пустую строку
reviews_df['text'] = reviews_df['text'].fillna("")

# Объединяем все отзывы для каждого заведения
reviews_agg = reviews_df.groupby("id")['text'].apply(lambda x: " ".join(x)).reset_index()

print(reviews_agg.head())

# TF-IDF векторизация
tfidf = TfidfVectorizer(max_features=500)
reviews_tfidf = tfidf.fit_transform(reviews_agg["text"])
print(111)
# Объединяем TF-IDF с базовыми признаками
# Для простоты возьмём только категорию и географические признаки
feature_cols = ['category', 'traffic_300m', 'traffic_1000m', 'mean_income_300m', 'mean_income_1000m']
train_features = train_df[feature_cols].copy()
test_features = test_df[feature_cols].copy()
print(111)
# Кодируем категориальные признаки
train_features = pd.get_dummies(train_features, columns=['category'])
test_features = pd.get_dummies(test_features, columns=['category'])
print(111)
# Выравниваем колонки
train_features, test_features = train_features.align(test_features, join='left', axis=1, fill_value=0)

# === 3. Подготовка данных для SVD++ ===
# SVD++ в библиотеке surprise работает с "user-item-rating" форматом
# Здесь заведем условного пользователя "system", чтобы использовать SVD++ для заведений
svd_data = train_df[['id', 'target']].copy()
svd_data['user'] = 'system'  # фиксированный "пользователь"
print(111)
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(svd_data[['user', 'id', 'target']], reader)

trainset = data.build_full_trainset()
print(111)
# === 4. Обучение SVD++ ===
algo = SVDpp(n_factors=50, n_epochs=1, lr_all=0.005, reg_all=0.02,verbose=True)
algo.fit(trainset)

# === 5. Предсказание для тестового набора ===
test_df['target'] = test_df['id'].apply(lambda x: algo.predict('system', x).est)

# === 6. Сохранение результатов ===
test_df[['id', 'target']].to_csv("submission.csv", index=False)
print("Результаты сохранены в submission.csv")


      id                                               text
0  43591  Мармелад в целом неплохой, но цены завышены, м...
1  43591  Не нравится, что товар выложен открыто, слишко...
2  43591  Часто попадается сухой мармелад, дубовый впере...
3  43591  Персонал был одет в костюмы пиратов, а ассорти...
4  43591  Вкусный мармелад с широким ассортиментом форм,...
                                                  id text
0                   - ЧАСТЬ I (всё прошло хорошо) –"     
1   - широкий ассортимент пива, включающий не тол...     
2                  Но цена могла бы быть чуть ниже."     
3  !!!!! Здесь установлено современное оборудован...     
4                        !!!!!! Плохой сервис !!!!!!     
111
111
111
111
111
 processing epoch 0
Результаты сохранены в submission.csv


In [None]:
import pandas as pd
import numpy as np
from surprise import SVD, Dataset, Reader
from sklearn.feature_extraction.text import TfidfVectorizer

# === 1. Загрузка данных ===
train_df = pd.read_csv("/kaggle/input/hshussu/train.tsv", sep='\t')
test_df = pd.read_csv("/kaggle/input/hshussu/test.tsv", sep='\t')

# Попробуем безопасно загрузить файл отзывов
reviews_df = pd.read_csv(
    "/kaggle/input/hshussu/reviews.txv/reviews.tsv",
    sep='\t',
    on_bad_lines='skip',
    quoting=3,
    engine='python'
)

# === 2. Подготовка текстов ===
reviews_df['text'] = reviews_df['text'].fillna("")
reviews_agg = reviews_df.groupby("id")['text'].apply(lambda x: " ".join(x)).reset_index()

# TF-IDF векторизация
tfidf = TfidfVectorizer(max_features=500)
reviews_tfidf = tfidf.fit_transform(reviews_agg["text"])

# === 3. Подготовка данных для FunkSVD ===
svd_data = train_df[['id', 'target']].copy()
svd_data['user'] = 'system'  # фиктивный пользователь
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(svd_data[['user', 'id', 'target']], reader)
trainset = data.build_full_trainset()

# === 4. Обучение FunkSVD ===
algo = SVD(n_factors=50, n_epochs=20, lr_all=0.005, reg_all=0.02, verbose=True)
algo.fit(trainset)

# === 5. Предсказание ===
test_df['target'] = test_df['id'].apply(lambda x: algo.predict('system', x).est)

# === 6. Сохранение результатов ===
test_df[['id', 'target']].to_csv("submission_funksvd.csv", index=False)
print("✅ Результаты сохранены в submission_funksvd.csv")


Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
✅ Результаты сохранены в submission_funksvd.csv


In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor
from scipy.sparse import hstack

# === 1. Загрузка данных ===
train_df = pd.read_csv("/kaggle/input/hshussu/train.tsv", sep='\t')
test_df = pd.read_csv("/kaggle/input/hshussu/test.tsv", sep='\t')
reviews_df = pd.read_csv(
    "/kaggle/input/hshussu/reviews.txv/reviews.tsv",
    sep='\t',
    on_bad_lines='skip',
    quoting=3,
    engine='python'
)

# === 2. Подготовка текстов ===
reviews_df['text'] = reviews_df['text'].fillna("")
reviews_df['id'] = reviews_df['id'].astype(str)

reviews_agg = reviews_df.groupby("id")['text'].apply(lambda x: " ".join(x)).reset_index()

# Приводим id к одному типу и в train/test
train_df['id'] = train_df['id'].astype(str)
test_df['id'] = test_df['id'].astype(str)

# Присоединяем текстовые признаки
train_df = train_df.merge(reviews_agg, on='id', how='left')
test_df = test_df.merge(reviews_agg, on='id', how='left')
train_df['text'] = train_df['text'].fillna("")
test_df['text'] = test_df['text'].fillna("")


train_tfidf = tfidf.transform(train_df['text'])
test_tfidf = tfidf.transform(test_df['text'])

# === 3. Табличные признаки ===
feature_cols = ['category', 'traffic_300m', 'traffic_1000m', 'mean_income_300m', 'mean_income_1000m']
from scipy.sparse import hstack, csr_matrix

# Убедимся, что только числовые колонки
# === Кодирование категориальных признаков ===
train_features = pd.get_dummies(train_df[feature_cols], columns=['category'])
test_features = pd.get_dummies(test_df[feature_cols], columns=['category'])

# === Выравнивание фич ===
train_features, test_features = train_features.align(test_features, join='left', axis=1, fill_value=0)

# Приведение типов
train_features = train_features.astype('float32')
test_features = test_features.astype('float32')

# TF-IDF признаки
tfidf = TfidfVectorizer(max_features=500)
train_tfidf = tfidf.fit_transform(train_df["text"])
test_tfidf = tfidf.transform(test_df["text"])

# Объединение TF-IDF + числовые признаки
from scipy.sparse import hstack, csr_matrix

X_train = hstack([csr_matrix(train_features.values), train_tfidf]).tocsr()
X_test = hstack([csr_matrix(test_features.values), test_tfidf]).tocsr()

y_train = train_df['target']



# === 4. Обучение XGBoost ===
model = XGBRegressor(
    n_estimators=380,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    tree_method='hist',
    objective='reg:squarederror'
)

# Разделим немного для контроля качества
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)
model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], early_stopping_rounds=20, verbose=True)

# === 5. Предсказание ===
test_df['target'] = model.predict(X_test)
test_df['target'] = np.clip(test_df['target'], 1, 5)  # диапазон [1,5]

# === 6. Сохранение ===
test_df[['id', 'target']].to_csv("submission_xgboost.csv", index=False)
print("✅ Результаты сохранены в submission_xgboost.csv")




[0]	validation_0-rmse:1.15075
[1]	validation_0-rmse:1.13537
[2]	validation_0-rmse:1.12082
[3]	validation_0-rmse:1.10700
[4]	validation_0-rmse:1.09465
[5]	validation_0-rmse:1.08336
[6]	validation_0-rmse:1.07250
[7]	validation_0-rmse:1.06349
[8]	validation_0-rmse:1.05397
[9]	validation_0-rmse:1.04489
[10]	validation_0-rmse:1.03743
[11]	validation_0-rmse:1.02972
[12]	validation_0-rmse:1.02339
[13]	validation_0-rmse:1.01622
[14]	validation_0-rmse:1.01083
[15]	validation_0-rmse:1.00505
[16]	validation_0-rmse:0.99941
[17]	validation_0-rmse:0.99408
[18]	validation_0-rmse:0.98919
[19]	validation_0-rmse:0.98465
[20]	validation_0-rmse:0.98027
[21]	validation_0-rmse:0.97606
[22]	validation_0-rmse:0.97250
[23]	validation_0-rmse:0.96903
[24]	validation_0-rmse:0.96576
[25]	validation_0-rmse:0.96236
[26]	validation_0-rmse:0.95914
[27]	validation_0-rmse:0.95682
[28]	validation_0-rmse:0.95471
[29]	validation_0-rmse:0.95234
[30]	validation_0-rmse:0.94980
[31]	validation_0-rmse:0.94725
[32]	validation_0-

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

# ============ 1. Загрузка данных ============
train_df = pd.read_csv("/kaggle/input/hshussu/train.tsv", sep='\t')
test_df = pd.read_csv("/kaggle/input/hshussu/test.tsv", sep='\t')
reviews_df = pd.read_csv(
    "/kaggle/input/hshussu/reviews.txv/reviews.tsv",
    sep='\t',
    on_bad_lines='skip',
    quoting=3,
    engine='python'
)

# ============ 2. Подготовка текстов ============
reviews_df['text'] = reviews_df['text'].fillna("")
reviews_df['id'] = reviews_df['id'].astype(str)
train_df['id'] = train_df['id'].astype(str)
test_df['id'] = test_df['id'].astype(str)

reviews_agg = reviews_df.groupby("id")['text'].apply(lambda x: " ".join(x)).reset_index()

train_df = train_df.merge(reviews_agg, on='id', how='left')
test_df = test_df.merge(reviews_agg, on='id', how='left')

train_df['text'] = train_df['text'].fillna("")
test_df['text'] = test_df['text'].fillna("")

# ============ 3. TF-IDF ============
tfidf = TfidfVectorizer(max_features=512)
train_tfidf = tfidf.fit_transform(train_df["text"]).astype(np.float32)
test_tfidf = tfidf.transform(test_df["text"]).astype(np.float32)

# ============ 4. Табличные признаки ============
feature_cols = ['category', 'traffic_300m', 'traffic_1000m', 'mean_income_300m', 'mean_income_1000m']
train_features = pd.get_dummies(train_df[feature_cols], columns=['category'])
test_features = pd.get_dummies(test_df[feature_cols], columns=['category'])
train_features, test_features = train_features.align(test_features, join='left', axis=1, fill_value=0)

scaler = StandardScaler()
train_num = scaler.fit_transform(train_features)
test_num = scaler.transform(test_features)

# ============ 5. Torch Dataset ============
class RecDataset(Dataset):
    def __init__(self, tfidf, num, target=None):
        self.tfidf = torch.tensor(tfidf.toarray(), dtype=torch.float32)
        self.num = torch.tensor(num, dtype=torch.float32)
        self.target = torch.tensor(target, dtype=torch.float32) if target is not None else None

    def __len__(self):
        return len(self.num)

    def __getitem__(self, idx):
        if self.target is not None:
            return self.tfidf[idx], self.num[idx], self.target[idx]
        else:
            return self.tfidf[idx], self.num[idx]

from sklearn.model_selection import train_test_split
import numpy as np

# Создаем массив индексов
idx = np.arange(train_df.shape[0])
idx_train, idx_val = train_test_split(idx, test_size=0.1, random_state=42)

# Разделяем TF-IDF
X_tfidf_train = train_tfidf[idx_train]
X_tfidf_val = train_tfidf[idx_val]

# Разделяем числовые признаки
X_num_train = train_num[idx_train]
X_num_val = train_num[idx_val]

# Разделяем целевую переменную
y_train = train_df['target'].values[idx_train]
y_val = train_df['target'].values[idx_val]

# Создаем датасеты
train_dataset = RecDataset(X_tfidf_train, X_num_train, y_train)
val_dataset = RecDataset(X_tfidf_val, X_num_val, y_val)

test_dataset = RecDataset(test_tfidf, test_num)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=128, shuffle=False)

# ============ 6. Модель с Attention ============
class TransformerBlock(nn.Module):
    def __init__(self, embed_dim, num_heads, ff_hidden=128, dropout=0.1):
        super().__init__()
        self.attn = nn.MultiheadAttention(embed_dim, num_heads, dropout=dropout, batch_first=True)
        self.ff = nn.Sequential(
            nn.Linear(embed_dim, ff_hidden),
            nn.ReLU(),
            nn.Linear(ff_hidden, embed_dim)
        )
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        attn_out, _ = self.attn(x, x, x)
        x = self.norm1(x + self.dropout(attn_out))
        ff_out = self.ff(x)
        x = self.norm2(x + self.dropout(ff_out))
        return x

class HybridTransformer(nn.Module):
    def __init__(self, text_dim, num_dim, hidden=128):
        super().__init__()
        self.text_proj = nn.Linear(text_dim, hidden)
        self.num_proj = nn.Linear(num_dim, hidden)
        self.transformer = TransformerBlock(hidden, num_heads=4)
        self.fc_out = nn.Sequential(
            nn.Linear(hidden, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, text, num):
        text_emb = self.text_proj(text)
        num_emb = self.num_proj(num)
        # объединяем текст и табличные признаки в последовательность длиной 2
        seq = torch.stack([text_emb, num_emb], dim=1)
        out = self.transformer(seq)
        # берём среднее по токенам
        pooled = out.mean(dim=1)
        return self.fc_out(pooled).squeeze(1)

# ============ 7. Обучение ============
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = HybridTransformer(text_dim=train_tfidf.shape[1], num_dim=train_num.shape[1]).to(device)
optimizer = optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)
criterion = nn.L1Loss()  # MAE

for epoch in range(10):
    model.train()
    total_loss = 0
    for text, num, target in tqdm(train_loader, desc=f"Epoch {epoch+1}/5"):
        text, num, target = text.to(device), num.to(device), target.to(device)
        pred = model(text, num)
        loss = criterion(pred, target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Train loss: {total_loss / len(train_loader):.4f}")

# ============ 8. Предсказание ============
model.eval()
preds = []
with torch.no_grad():
    for text, num in DataLoader(test_dataset, batch_size=128):
        text, num = text.to(device), num.to(device)
        out = model(text, num)
        preds.extend(out.cpu().numpy())

test_df['target'] = np.clip(preds, 1, 5)
test_df[['id', 'target']].to_csv("submission_transformer.csv", index=False)
print("✅ Результаты сохранены в submission_transformer.csv")


Epoch 1/5: 100%|██████████| 579/579 [00:04<00:00, 119.65it/s]


Train loss: 0.5551


Epoch 2/5: 100%|██████████| 579/579 [00:04<00:00, 115.84it/s]


Train loss: 0.4733


Epoch 3/5: 100%|██████████| 579/579 [00:04<00:00, 115.87it/s]


Train loss: 0.4641


Epoch 4/5: 100%|██████████| 579/579 [00:04<00:00, 116.99it/s]


Train loss: 0.4598


Epoch 5/5: 100%|██████████| 579/579 [00:04<00:00, 118.86it/s]


Train loss: 0.4597


Epoch 6/5: 100%|██████████| 579/579 [00:04<00:00, 118.14it/s]


Train loss: 0.4605


Epoch 7/5: 100%|██████████| 579/579 [00:04<00:00, 119.31it/s]


Train loss: 0.4541


Epoch 8/5: 100%|██████████| 579/579 [00:04<00:00, 116.48it/s]


Train loss: 0.4549


Epoch 9/5: 100%|██████████| 579/579 [00:04<00:00, 116.35it/s]


Train loss: 0.4523


Epoch 10/5: 100%|██████████| 579/579 [00:05<00:00, 113.55it/s]


Train loss: 0.4517
✅ Результаты сохранены в submission_transformer.csv


In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# === 1. Подготовка ===
train_df = pd.read_csv("/kaggle/input/hshussu/train.tsv", sep='\t')
test_df = pd.read_csv("/kaggle/input/hshussu/test.tsv", sep='\t')
reviews_df = pd.read_csv("/kaggle/input/hshussu/reviews.txv/reviews.tsv", sep='\t', on_bad_lines='skip', quoting=3, engine='python')

reviews_df['text'] = reviews_df['text'].fillna("")
reviews_df = reviews_df.groupby("id")["text"].apply(lambda x: " ".join(x)).reset_index()
train_df = train_df.merge(reviews_df, on="id", how="left").fillna("")
test_df = test_df.merge(reviews_df, on="id", how="left").fillna("")

# === 2. Числовые признаки ===
num_cols = ['traffic_300m','traffic_1000m','mean_income_300m','mean_income_1000m']
scaler = StandardScaler()
train_num = scaler.fit_transform(train_df[num_cols])
test_num = scaler.transform(test_df[num_cols])

# === 3. Tokenizer ===
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

class POIDataset(Dataset):
    def __init__(self, texts, nums, targets=None):
        self.texts = texts
        self.nums = torch.tensor(nums, dtype=torch.float32)
        self.targets = torch.tensor(targets, dtype=torch.float32) if targets is not None else None
    def __len__(self): return len(self.texts)
    def __getitem__(self, idx):
        enc = tokenizer(self.texts[idx], truncation=True, padding="max_length", max_length=128, return_tensors="pt")
        out = {k: v.squeeze(0) for k, v in enc.items()}
        out["num"] = self.nums[idx]
        if self.targets is not None:
            out["target"] = self.targets[idx]
        return out

# === 4. Модель ===
class TransModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.text_model = AutoModel.from_pretrained("distilbert-base-uncased")
        self.fc_num = nn.Linear(4, 64)
        self.head = nn.Sequential(nn.Linear(64 + 768, 128), nn.ReLU(), nn.Linear(128, 1))
    def forward(self, input_ids, attention_mask, num):
        x_text = self.text_model(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state[:,0,:]
        x_num = self.fc_num(num)
        x = torch.cat([x_text, x_num], dim=1)
        return self.head(x).squeeze(1)

# === 5. Обучение ===
X_train, X_val, y_train, y_val = train_test_split(train_df["text"].tolist(), train_num, train_df["target"].values, test_size=0.1, random_state=42)
train_ds = POIDataset(X_train, y_train, y_train)
val_ds = POIDataset(X_val, y_val, y_val)
train_dl = DataLoader(train_ds, batch_size=8, shuffle=True)
val_dl = DataLoader(val_ds, batch_size=8)

model = TransModel().to("cuda")
opt = torch.optim.AdamW(model.parameters(), lr=2e-5)
loss_fn = nn.L1Loss()

for epoch in range(2):
    model.train()
    for batch in train_dl:
        opt.zero_grad()
        out = model(batch["input_ids"].to("cuda"), batch["attention_mask"].to("cuda"), batch["num"].to("cuda"))
        loss = loss_fn(out, batch["target"].to("cuda"))
        loss.backward()
        opt.step()
    print(f"Epoch {epoch+1} done")


ValueError: You are trying to merge on int64 and object columns for key 'id'. If you wish to proceed you should use pd.concat

In [None]:


import argparse
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

# ======================================================
# 0. CLI аргументы
# ======================================================

# ======================================================
# 1. Загрузка данных
# ======================================================
train_df = pd.read_csv("/kaggle/input/hshussu/train.tsv", sep="\t")
test_df = pd.read_csv("/kaggle/input/hshussu/test.tsv", sep="\t")
reviews_df = pd.read_csv("/kaggle/input/hshussu/reviews.txv/reviews.tsv", sep="\t",
                         on_bad_lines="skip", quoting=3, engine="python")

# агрегируем отзывы
reviews_df["text"] = reviews_df["text"].fillna("")
reviews_agg = reviews_df.groupby("id")["text"].apply(lambda x: " ".join(x)).reset_index()

# Приводим id к строковому типу везде
train_df["id"] = train_df["id"].astype(str)
test_df["id"] = test_df["id"].astype(str)
reviews_agg["id"] = reviews_agg["id"].astype(str)

# Теперь безопасно объединяем
train_df = train_df.merge(reviews_agg, on="id", how="left").fillna("")
test_df = test_df.merge(reviews_agg, on="id", how="left").fillna("")


# ======================================================
# 2. Общие числовые признаки
# ======================================================
num_cols = ["traffic_300m", "traffic_1000m", "mean_income_300m", "mean_income_1000m"]
for col in num_cols:
    train_df[col] = train_df[col].fillna(train_df[col].median())
    test_df[col] = test_df[col].fillna(train_df[col].median())

scaler = StandardScaler()
train_num = scaler.fit_transform(train_df[num_cols])
test_num = scaler.transform(test_df[num_cols])

# ======================================================
# 3. Модели
# ======================================================

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel

device = "cuda" if torch.cuda.is_available() else "cpu"
from transformers import AutoTokenizer

from huggingface_hub import login, hf_hub_download
import os

os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"



class POIDataset(Dataset):
    def __init__(self, texts, nums, targets=None):
        self.texts = texts
        self.nums = torch.tensor(nums, dtype=torch.float32)
        self.targets = torch.tensor(targets, dtype=torch.float32) if targets is not None else None
    def __len__(self): return len(self.texts)
    def __getitem__(self, idx):
        enc = tokenizer(self.texts[idx], truncation=True, padding="max_length",
                        max_length=128, return_tensors="pt")
        out = {k: v.squeeze(0) for k, v in enc.items()}
        out["num"] = self.nums[idx]
        if self.targets is not None:
            out["target"] = self.targets[idx]
        return out

class TransModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.text_model = AutoModel.from_pretrained("distilbert-base-uncased")
        self.fc_num = nn.Linear(4, 64)
        self.head = nn.Sequential(
            nn.Linear(768 + 64, 128),
            nn.ReLU(),
            nn.Linear(128, 1)
        )
    def forward(self, input_ids, attention_mask, num):
        x_text = self.text_model(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state[:, 0, :]
        x_num = self.fc_num(num)
        x = torch.cat([x_text, x_num], dim=1)
        return self.head(x).squeeze(1)

X_train = train_test_split(train_df["text"].tolist(), train_num, train_df["target"].values,test_size=0.1)
y_train = train_test_split(train_df["text"].tolist(), train_num, train_df["target"].values,test_size=0.1)
train_ds = POIDataset(X_train)
val_ds = POIDataset(X_val, y_val)
train_dl = DataLoader(train_ds, batch_size=8, shuffle=True)
val_dl = DataLoader(val_ds, batch_size=8)

model = TransModel().to(device)
opt = torch.optim.AdamW(model.parameters(), lr=2e-5)
loss_fn = nn.L1Loss()

for epoch in range(2):
    model.train()
    total_loss = 0
    for batch in train_dl:
        opt.zero_grad()
        out = model(batch["input_ids"].to(device),
                    batch["attention_mask"].to(device),
                    batch["num"].to(device))
        loss = loss_fn(out, batch["target"].to(device))
        loss.backward()
        opt.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}: loss={total_loss/len(train_dl):.4f}")

# предсказание
model.eval()
preds = []
test_ds = POIDataset(test_df["text"].tolist(), test_num)
test_dl = DataLoader(test_ds, batch_size=16)
with torch.no_grad():
    for batch in test_dl:
        out = model(batch["input_ids"].to(device),
                    batch["attention_mask"].to(device),
                    batch["num"].to(device))
        preds.extend(out.cpu().numpy())
test_df["target"] = np.clip(preds, 1, 5)

# ------------------------------------------------------



# ======================================================
# 4. Сохранение результата
# ======================================================
sub = test_df[["id", "target"]]
sub.to_csv("submission.csv", index=False)
print("✅ Saved submission.csv")


TypeError: POIDataset.__init__() missing 1 required positional argument: 'nums'

In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
import lightgbm as lgb
from catboost import CatBoostRegressor, Pool
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

# === Пути к данным ===
TRAIN_PATH = "/kaggle/input/hshussu/train.tsv"
TEST_PATH = "/kaggle/input/hshussu/test.tsv"
REVIEWS_PATH = "/kaggle/input/hshussu/reviews.txv/reviews.tsv"

# === Чтение данных ===
train = pd.read_csv(TRAIN_PATH, sep='\t')
test = pd.read_csv(TEST_PATH, sep='\t')
reviews = pd.read_csv(REVIEWS_PATH, sep='\t')

print(train.shape, test.shape, reviews.shape)
print(train.columns[:20])


(41105, 286) (9276, 285) (440082, 2)
Index(['id', 'name', 'coordinates', 'category', 'address', 'target',
       'traffic_300m', 'homes_300m', 'works_300m', 'female_300m',
       'train_ticket_order_300m', 'mortgage_300m', 'recipes_300m',
       'online_shops_300m', 'manga_300m', 'children_goods_300m',
       'language_courses_300m', 'commercial_real_estate_purchase_300m',
       'grocery_stores_300m', 'preschool_300m'],
      dtype='object')


In [None]:
# Целевая переменная
train['target']

0        4.1
1        3.6
2        3.5
3        4.0
4        4.2
        ... 
41100    3.5
41101    3.6
41102    0.0
41103    3.9
41104    3.7
Name: target, Length: 41105, dtype: float64

In [None]:
# Соединяем отзывы
reviews_grouped = reviews.groupby('id')['text'].apply(lambda x: ' '.join(x)).reset_index()
#train = train.merge(reviews_grouped, on='id', how='left')
#test = test.merge(reviews_grouped, on='id', how='left')


cat_cols = ['category']
for c in cat_cols:
    le = LabelEncoder()
    train[c] = le.fit_transform(train[c].astype(str))
    test[c] = le.transform(test[c].astype(str))

# Целевая переменная
train['target']
#train = train.drop(columns=['target', 'name', 'address'])
test['id']
#test = test.drop(columns=['name', 'address'])

# Удалим текст для табличных моделей
tab_cols = [c for c in train.columns if c not in ['id', 'text']]


In [None]:
# Используем text_features_train/test из предыдущего шага
X_tab = train[tab_cols].fillna(0).values
# Убедимся, что список признаков совпадает между train и test
tab_cols = [c for c in train.columns if c not in ['id', 'text']]
common_cols = [c for c in tab_cols if c in test.columns]

X_tab = train[common_cols].fillna(0).values
X_test_tab = test[common_cols].fillna(0).values

X_test_tab = test[tab_cols].fillna(0).values

X_train = np.hstack([X_tab, text_features_train])
X_test = np.hstack([X_test_tab, text_features_test])

X_train_t, X_val_t, y_train_t, y_val_t = train_test_split(X_train, y, test_size=0.2, random_state=42)

# Torch Dataset
class TabTextDataset(torch.utils.data.Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y.values, dtype=torch.float32)
    def __len__(self): return len(self.X)
    def __getitem__(self, idx): return self.X[idx], self.y[idx]

train_ds = TabTextDataset(X_train_t, y_train_t)
val_ds = TabTextDataset(X_val_t, y_val_t)

train_dl = torch.utils.data.DataLoader(train_ds, batch_size=256, shuffle=True)
val_dl = torch.utils.data.DataLoader(val_ds, batch_size=256)

# Модель
class MLP(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 512), nn.ReLU(),
            nn.Linear(512, 128), nn.ReLU(),
            nn.Linear(128, 1)
        )
    def forward(self, x): return self.net(x).squeeze(1)

model = MLP(X_train_t.shape[1])
opt = optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.L1Loss()

for epoch in range(10):
    model.train()
    for xb, yb in train_dl:
        opt.zero_grad()
        loss = loss_fn(model(xb), yb)
        loss.backward()
        opt.step()
    model.eval()
    val_loss = np.mean([loss_fn(model(xb), yb).item() for xb, yb in val_dl])
    print(f"Epoch {epoch+1}: val_MAE = {val_loss:.4f}")


KeyError: "['name', 'address', 'target'] not in index"

In [None]:
# =======================
#  MLP baseline for POI/Recommendation Task
# =======================

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# ========== 1. Dataset ==========

class RecDataset(Dataset):
    def __init__(self, X_tfidf, X_num, y=None):
        self.tfidf = torch.tensor(X_tfidf, dtype=torch.float32)
        self.num = torch.tensor(X_num, dtype=torch.float32)
        self.y = torch.tensor(y.values, dtype=torch.float32) if y is not None else None

    def __len__(self):
        return len(self.tfidf)

    def __getitem__(self, idx):
        if self.y is not None:
            return self.tfidf[idx], self.num[idx], self.y[idx]
        return self.tfidf[idx], self.num[idx]

# ========== 2. Model ==========

class MLPModel(nn.Module):
    def __init__(self, tfidf_dim, num_dim, hidden_dim=256):
        super().__init__()
        self.fc_tfidf = nn.Sequential(
            nn.Linear(tfidf_dim, hidden_dim),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_dim),
            nn.Dropout(0.3)
        )
        self.fc_num = nn.Sequential(
            nn.Linear(num_dim, hidden_dim // 2),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_dim // 2),
            nn.Dropout(0.2)
        )
        self.classifier = nn.Sequential(
            nn.Linear(hidden_dim + hidden_dim // 2, hidden_dim),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_dim),
            nn.Dropout(0.3),
            nn.Linear(hidden_dim, 1),
            nn.Sigmoid()
        )

    def forward(self, x_tfidf, x_num):
        t = self.fc_tfidf(x_tfidf)
        n = self.fc_num(x_num)
        x = torch.cat([t, n], dim=1)
        return self.classifier(x)

# ========== 3. Train function ==========

def train_epoch(model, loader, criterion, optimizer, device):
    model.train()
    total_loss = 0
    for x_tfidf, x_num, y in loader:
        x_tfidf, x_num, y = x_tfidf.to(device), x_num.to(device), y.to(device)
        optimizer.zero_grad()
        y_pred = model(x_tfidf, x_num).squeeze()
        loss = criterion(y_pred, y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * len(y)
    return total_loss / len(loader.dataset)

def eval_epoch(model, loader, device):
    model.eval()
    preds, targets = [], []
    with torch.no_grad():
        for x_tfidf, x_num, y in loader:
            x_tfidf, x_num = x_tfidf.to(device), x_num.to(device)
            y_pred = model(x_tfidf, x_num).squeeze().cpu().numpy()
            preds.extend(y_pred)
            targets.extend(y.numpy())
    return roc_auc_score(targets, preds)

# ========== 4. Example usage ==========

# допустим, у нас уже есть матрицы:
# train_tfidf, train_num, train_df["target"]

# Разделяем
X_train_tfidf, X_val_tfidf, X_train_num, X_val_num, y_train, y_val = train_test_split(
    train_tfidf, train_num, train_df["target"], test_size=0.1, random_state=42
)

scaler = StandardScaler()
X_train_num = scaler.fit_transform(X_train_num)
X_val_num = scaler.transform(X_val_num)

train_ds = RecDataset(X_train_tfidf, X_train_num, y_train)
val_ds = RecDataset(X_val_tfidf, X_val_num, y_val)

train_loader = DataLoader(train_ds, batch_size=512, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=1024, shuffle=False)

device = "cuda" if torch.cuda.is_available() else "cpu"
model = MLPModel(tfidf_dim=X_train_tfidf.shape[1], num_dim=X_train_num.shape[1]).to(device)
criterion = nn.BCELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)

# ========== 5. Training loop ==========

best_auc = 0
for epoch in range(10):
    train_loss = train_epoch(model, train_loader, criterion, optimizer, device)
    val_auc = eval_epoch(model, val_loader, device)
    best_auc = max(best_auc, val_auc)
    print(f"Epoch {epoch+1}: train_loss={train_loss:.4f}, val_auc={val_auc:.4f}")

print(f"✅ Best ROC-AUC: {best_auc:.4f}")


NameError: name 'tfidf' is not defined

In [None]:
# ===============================
# 💡 MLP Model for Recommendation Task
# ===============================
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import roc_auc_score
from tqdm import tqdm

# ===============================
# 1. Load Data
# ===============================

# Пример: поменяй пути на свои файлы
train_df = pd.read_csv("/kaggle/input/hshussu/train.tsv")
test_df = pd.read_csv("/kaggle/input/hshussu/test.tsv")

print("✅ Train shape:", train_df.shape)
print("✅ Test shape:", test_df.shape)

# Проверим столбцы
print("Columns:", list(train_df.columns))

# ===============================
# 2. Feature Engineering
# ===============================

# Пример: допустим, есть текстовые поля 'title', 'description'
TEXT_COLS = ["title", "description"]
NUM_COLS = ["price", "rating", "num_reviews"]  # замени на реальные числовые фичи

# Заполним пропуски
for col in TEXT_COLS:
    train_df[col] = train_df[col].fillna("")
    test_df[col] = test_df[col].fillna("")
for col in NUM_COLS:
    train_df[col] = train_df[col].fillna(0)
    test_df[col] = test_df[col].fillna(0)

# Объединим тексты в один
train_texts = (train_df[TEXT_COLS[0]] + " " + train_df[TEXT_COLS[1]]).values
test_texts = (test_df[TEXT_COLS[0]] + " " + test_df[TEXT_COLS[1]]).values

# ===============================
# 3. TF-IDF Features
# ===============================
print("🔧 Building TF-IDF features...")

tfidf = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))
train_tfidf = tfidf.fit_transform(train_texts)
test_tfidf = tfidf.transform(test_texts)

# Можно добавить SVD для ускорения (уменьшает размерность TF-IDF)
svd = TruncatedSVD(n_components=256, random_state=42)
train_tfidf = svd.fit_transform(train_tfidf)
test_tfidf = svd.transform(test_tfidf)

# ===============================
# 4. Numeric Features
# ===============================
scaler = StandardScaler()
train_num = scaler.fit_transform(train_df[NUM_COLS])
test_num = scaler.transform(test_df[NUM_COLS])

# ===============================
# 5. Train/Validation Split
# ===============================
X_train_tfidf, X_val_tfidf, X_train_num, X_val_num, y_train, y_val = train_test_split(
    train_tfidf, train_num, train_df["target"], test_size=0.1, random_state=42
)

# ===============================
# 6. Dataset and Dataloader
# ===============================

class RecDataset(Dataset):
    def __init__(self, X_tfidf, X_num, y=None):
        self.tfidf = torch.tensor(X_tfidf, dtype=torch.float32)
        self.num = torch.tensor(X_num, dtype=torch.float32)
        self.y = torch.tensor(y.values, dtype=torch.float32) if y is not None else None

    def __len__(self):
        return len(self.tfidf)

    def __getitem__(self, idx):
        if self.y is not None:
            return self.tfidf[idx], self.num[idx], self.y[idx]
        return self.tfidf[idx], self.num[idx]

train_ds = RecDataset(X_train_tfidf, X_train_num, y_train)
val_ds = RecDataset(X_val_tfidf, X_val_num, y_val)
train_loader = DataLoader(train_ds, batch_size=512, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=1024, shuffle=False)

# ===============================
# 7. MLP Model
# ===============================

class MLPModel(nn.Module):
    def __init__(self, tfidf_dim, num_dim, hidden_dim=256):
        super().__init__()
        self.fc_tfidf = nn.Sequential(
            nn.Linear(tfidf_dim, hidden_dim),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_dim),
            nn.Dropout(0.3)
        )
        self.fc_num = nn.Sequential(
            nn.Linear(num_dim, hidden_dim // 2),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_dim // 2),
            nn.Dropout(0.2)
        )
        self.classifier = nn.Sequential(
            nn.Linear(hidden_dim + hidden_dim // 2, hidden_dim),
            nn.ReLU(),
            nn.BatchNorm1d(hidden_dim),
            nn.Dropout(0.3),
            nn.Linear(hidden_dim, 1),
            nn.Sigmoid()
        )

    def forward(self, x_tfidf, x_num):
        t = self.fc_tfidf(x_tfidf)
        n = self.fc_num(x_num)
        x = torch.cat([t, n], dim=1)
        return self.classifier(x)

# ===============================
# 8. Training
# ===============================

device = "cuda" if torch.cuda.is_available() else "cpu"
model = MLPModel(tfidf_dim=X_train_tfidf.shape[1], num_dim=X_train_num.shape[1]).to(device)
criterion = nn.BCELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)

def train_epoch(model, loader):
    model.train()
    total_loss = 0
    for x_tfidf, x_num, y in loader:
        x_tfidf, x_num, y = x_tfidf.to(device), x_num.to(device), y.to(device)
        optimizer.zero_grad()
        y_pred = model(x_tfidf, x_num).squeeze()
        loss = criterion(y_pred, y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * len(y)
    return total_loss / len(loader.dataset)

def eval_epoch(model, loader):
    model.eval()
    preds, targets = [], []
    with torch.no_grad():
        for x_tfidf, x_num, y in loader:
            x_tfidf, x_num = x_tfidf.to(device), x_num.to(device)
            y_pred = model(x_tfidf, x_num).squeeze().cpu().numpy()
            preds.extend(y_pred)
            targets.extend(y.numpy())
    return roc_auc_score(targets, preds)

# ===============================
# 9. Run Training
# ===============================
best_auc = 0
for epoch in range(10):
    train_loss = train_epoch(model, train_loader)
    val_auc = eval_epoch(model, val_loader)
    best_auc = max(best_auc, val_auc)
    print(f"Epoch {epoch+1}: train_loss={train_loss:.4f}, val_auc={val_auc:.4f}")

print(f"✅ Best ROC-AUC: {best_auc:.4f}")

# ===============================
# 10. Predict on Test
# ===============================
test_ds = RecDataset(test_tfidf, test_num)
test_loader = DataLoader(test_ds, batch_size=1024, shuffle=False)

model.eval()
preds = []
with torch.no_grad():
    for x_tfidf, x_num in test_loader:
        x_tfidf, x_num = x_tfidf.to(device), x_num.to(device)
        y_pred = model(x_tfidf, x_num).squeeze().cpu().numpy()
        preds.extend(y_pred)

submission = pd.DataFrame({"id": test_df["id"], "target": preds})
submission.to_csv("submission_mlp.csv", index=False)
print("💾 Saved submission_mlp.csv")


ParserError: Error tokenizing data. C error: Expected 6 fields in line 60, saw 7


In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_absolute_error

# ========== 1. Загрузка данных ==========

train = pd.read_csv('/kaggle/input/hshussu/train.tsv', sep='\t')
test = pd.read_csv('/kaggle/input/hshussu/test.tsv', sep='\t')
reviews = pd.read_csv('/kaggle/input/hshussu/reviews.txv/reviews.tsv', sep='\t')

# ========== 2. Объединение отзывов ==========
reviews_grouped = reviews.groupby('id')['text'].apply(lambda x: ' '.join(x)).reset_index()
train = train.merge(reviews_grouped, on='id', how='left')
test = test.merge(reviews_grouped, on='id', how='left')
train['text'] = train['text'].fillna('')
test['text'] = test['text'].fillna('')

# ========== 3. Кодирование категориальных признаков ==========
cat_cols = ['category']
for c in cat_cols:
    le = LabelEncoder()
    train[c] = le.fit_transform(train[c].astype(str))
    test[c] = le.transform(test[c].astype(str))

# ========== 4. Формирование фич ==========
target = train['target'].clip(1, 5)  # диапазон 1–5
drop_cols = ['id', 'name', 'address', 'target', 'coordinates']
num_cols = [c for c in train.columns if c not in drop_cols + ['text']]

# стандартизация
scaler = StandardScaler()
train[num_cols] = scaler.fit_transform(train[num_cols])
test[num_cols] = scaler.transform(test[num_cols])

# TF-IDF текстов
tfidf = TfidfVectorizer(max_features=8000)
svd = TruncatedSVD(n_components=128, random_state=42)
text_features_train = svd.fit_transform(tfidf.fit_transform(train['text']))
text_features_test = svd.transform(tfidf.transform(test['text']))

# Объединяем фичи
train_tab = torch.tensor(train[num_cols].values, dtype=torch.float32)
test_tab = torch.tensor(test[num_cols].values, dtype=torch.float32)
train_text = torch.tensor(text_features_train, dtype=torch.float32)
test_text = torch.tensor(text_features_test, dtype=torch.float32)
train_target = torch.tensor(target.values, dtype=torch.float32)



In [None]:
# ========== 5. Dataset ==========
class POIDataset(Dataset):
    def __init__(self, tab, text, y=None):
        self.tab = tab
        self.text = text
        self.y = y
    def __len__(self):
        return len(self.tab)
    def __getitem__(self, idx):
        if self.y is None:
            return self.tab[idx], self.text[idx]
        return self.tab[idx], self.text[idx], self.y[idx]

X_train, X_val, txt_train, txt_val, y_train, y_val = train_test_split(
    train_tab, train_text, train_target, test_size=0.1, random_state=42
)

train_ds = POIDataset(X_train, txt_train, y_train)
val_ds = POIDataset(X_val, txt_val, y_val)
train_dl = DataLoader(train_ds, batch_size=256, shuffle=True)
val_dl = DataLoader(val_ds, batch_size=256)

# ========== 6. Модель ==========
from transformers import AutoModel

class POIModelHF(nn.Module):
    def __init__(self, num_tab_features, model_name="bert-base-uncased", hidden_dim=256):
        super().__init__()
        self.text_encoder = AutoModel.from_pretrained(model_name)
        text_dim = self.text_encoder.config.hidden_size

        self.mlp_tab = nn.Sequential(
            nn.Linear(num_tab_features, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim)
        )
        self.fc_out = nn.Sequential(
            nn.ReLU(),
            nn.Linear(hidden_dim + text_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 1)
        )

    def forward(self, tab, text_inputs):
        # text_inputs — dict с input_ids и attention_mask
        text_out = self.text_encoder(**text_inputs).last_hidden_state[:, 0, :]  # CLS
        tab_out = self.mlp_tab(tab)
        x = torch.cat([tab_out, text_out], dim=1)
        return self.fc_out(x).squeeze(1)


device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = POIModelHF(num_tab_features=train_tab.shape[1]).to(device)
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.L1Loss()

# ========== 7. Обучение ==========
for epoch in range(15):
    model.train()
    total_loss = 0
    for tab, text, y in train_dl:
        tab, text, y = tab.to(device), text.to(device), y.to(device)
        opt.zero_grad()
        preds = model(tab, text)
        loss = criterion(preds, y)
        loss.backward()
        opt.step()
        total_loss += loss.item()
    # валидация
    model.eval()
    val_losses = []
    with torch.no_grad():
        for tab, text, y in val_dl:
            tab, text, y = tab.to(device), text.to(device), y.to(device)
            preds = model(tab, text)
            val_losses.append(criterion(preds, y).item())
    print(f"Epoch {epoch+1}: train={total_loss/len(train_dl):.4f}, val={np.mean(val_losses):.4f}")

# ========== 8. Предсказания ==========
test_ds = POIDataset(test_tab, test_text)
test_dl = DataLoader(test_ds, batch_size=256)
model.eval()
preds = []
with torch.no_grad():
    for tab, text in tqdm(test_dl):
        tab, text = tab.to(device), text.to(device)
        out = model(tab, text)
        preds.append(out.cpu().numpy())
preds = np.concatenate(preds)
preds = np.clip(preds, 1, 5)  # диапазон [1,5]

# ========== 9. Сохранение ==========
submission = pd.DataFrame({
    'id': test['id'],
    'target': preds
})
submission.to_csv('submission_poi_transformer.csv', index=False)
print("✅ submission_poi_transformer.csv saved")


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

TypeError: BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (intermediate): BertIntermediate(
          (dense): Linear(in_features=768, out_features=3072, bias=True)
          (intermediate_act_fn): GELUActivation()
        )
        (output): BertOutput(
          (dense): Linear(in_features=3072, out_features=768, bias=True)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
  )
  (pooler): BertPooler(
    (dense): Linear(in_features=768, out_features=768, bias=True)
    (activation): Tanh()
  )
) argument after ** must be a mapping, not Tensor

In [None]:
!pip install -q sentence-transformers
import torch
from sentence_transformers import SentenceTransformer
import pandas as pd
from tqdm import tqdm

# === 1. Загружаем данные ===
train = pd.read_parquet('/kaggle/input/hshussu/train.tsv')
test = pd.read_parquet('/kaggle/input/hshussu/test.tsv')  # если есть

# === 2. Модель (можно поменять название в model_name) ===
model_name = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
model = SentenceTransformer(model_name)
model = model.to('cuda' if torch.cuda.is_available() else 'cpu')

# === 3. Получаем эмбеддинги ===
def get_embeddings(texts, batch_size=256):
    all_embs = []
    for i in tqdm(range(0, len(texts), batch_size)):
        batch = texts[i:i+batch_size].fillna("").tolist()
        emb = model.encode(batch, show_progress_bar=False, convert_to_numpy=True, batch_size=batch_size)
        all_embs.append(emb)
    return np.vstack(all_embs)

train_embs = get_embeddings(train['text'])
test_embs = get_embeddings(test['text'])

print(train_embs.shape, test_embs.shape)  # например (1_000_000, 384)


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m89.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m76.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m39.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

2025-10-17 14:19:57.451237: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1760710797.718844      37 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1760710797.786366      37 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


ArrowInvalid: Could not open Parquet input source '<Buffer>': Parquet magic bytes not found in footer. Either the file is corrupted or this is not a parquet file.

In [None]:
!pip install -q transformers accelerate
import torch
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
import numpy as np

model_name = "DeepPavlov/rubert-base-cased"  # или "intfloat/multilingual-e5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model = model.to('cuda' if torch.cuda.is_available() else 'cpu')
model.eval()

def get_embeddings_hf(texts, batch_size=64):
    embs = []
    for i in tqdm(range(0, len(texts), batch_size)):
        batch = [t if isinstance(t, str) else "" for t in texts[i:i+batch_size]]
        inputs = tokenizer(batch, padding=True, truncation=True, max_length=128, return_tensors='pt').to(model.device)
        with torch.no_grad():
            outputs = model(**inputs)
            # Берем [CLS] токен (или среднее по токенам)
            cls_emb = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        embs.append(cls_emb)
    return np.vstack(embs)

train_embs = get_embeddings_hf(train['text'])
test_embs = get_embeddings_hf(test['text'])
print(train_embs.shape)


tokenizer_config.json:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/714M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).

  0%|          | 0/643 [00:00<?, ?it/s][A
  0%|          | 1/643 [00:22<3:57:59, 22.24s/it][A
  0%|          

(41105, 768)





In [None]:
np.save('train_text_embs.npy', train_embs)
np.save('test_text_embs.npy', test_embs)


In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from tqdm import tqdm

# ========== 1. Load data ==========
train = pd.read_parquet('/kaggle/input/hshussu/train.tsv')
sample = pd.read_csv('/kaggle/input/asddbfd/sample_submission (1).csv')
test_users = sample['user_id'].unique()

# ========== 2. Item embeddings ==========
item_embs = np.load('/kaggle/input/train-test-embedings-text-ru/train_text_embs.npy')  # твой файл с эмбеддингами
print("Item embeddings shape:", item_embs.shape)

# Создадим словарь item_id -> индекс
item_vocab = {it: idx for idx, it in enumerate(train['item_id'].unique())}
id2item = {v: k for k, v in item_vocab.items()}

# Добавим один вектор PAD (нулевой)
pad_vec = np.zeros((1, item_embs.shape[1]), dtype=np.float32)
item_embs = np.vstack([pad_vec, item_embs])
n_items = item_embs.shape[0]
d_emb = item_embs.shape[1]

# ========== 3. User sequences ==========
max_len = 20
train = train.sort_values(['user_id', 'date'])
user_sequences = (
    train.groupby('user_id')['item_id']
    .apply(lambda x: [item_vocab[i] + 1 for i in x if i in item_vocab][-max_len:])  # +1, т.к. 0 — PAD
    .to_dict()
)

# ========== 4. Dataset ==========
class SeqDataset(Dataset):
    def __init__(self, user_seq):
        self.users = list(user_seq.keys())
        self.seqs = list(user_seq.values())

    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        seq = self.seqs[idx]
        pad_len = max_len - len(seq)
        seq = [0]*pad_len + seq
        target = seq[-1]
        return torch.tensor(seq[:-1]), torch.tensor(target)

train_ds = SeqDataset(user_sequences)
train_dl = DataLoader(train_ds, batch_size=256, shuffle=True)

# ========== 5. TransformerRec с внешними эмбеддингами ==========
class TransformerRec(nn.Module):
    def __init__(self, pretrained_embs, n_heads=4, n_layers=2, max_len=20, dropout=0.1):
        super().__init__()
        n_items, d_model = pretrained_embs.shape
        self.item_emb = nn.Embedding.from_pretrained(torch.tensor(pretrained_embs, dtype=torch.float32), freeze=False)
        self.pos_emb = nn.Embedding(max_len, d_model)
        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=n_heads, dropout=dropout, batch_first=True)
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=n_layers)
        self.fc = nn.Linear(d_model, n_items)

    def forward(self, seq):
        pos = torch.arange(seq.size(1), device=seq.device).unsqueeze(0).expand(seq.size(0), -1)
        x = self.item_emb(seq) + self.pos_emb(pos)
        out = self.encoder(x)
        out = out[:, -1, :]  # берем последний токен
        logits = self.fc(out)
        return logits

# ========== 6. Train ==========
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = TransformerRec(pretrained_embs=item_embs).to(device)
opt = torch.optim.AdamW(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss(ignore_index=0)

for epoch in range(1):
    model.train()
    total_loss = 0
    for seq, tgt in tqdm(train_dl, desc=f"Epoch {epoch+1}"):
        seq, tgt = seq.to(device), tgt.to(device)
        opt.zero_grad()
        logits = model(seq)
        loss = criterion(logits, tgt)
        loss.backward()
        opt.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1} | loss={total_loss/len(train_dl):.4f}")

# ========== 7. Prediction ==========
model.eval()
TOPK = 20
submission_rows = []

with torch.no_grad():
    for user in tqdm(test_users):
        seq = user_sequences.get(user, [])
        seq = [0]*(max_len - len(seq)) + seq
        seq_t = torch.tensor(seq[:-1]).unsqueeze(0).to(device)
        logits = model(seq_t)
        topk_idx = torch.topk(logits, TOPK, dim=1).indices[0].cpu().numpy()
        top_items = [id2item[i-1] for i in topk_idx if i-1 in id2item and i != 0]
        for it in top_items:
            submission_rows.append((user, it))

sub = pd.DataFrame(submission_rows, columns=['user_id', 'item_id'])
sub.to_csv('submission_transformer_textemb.csv', index=False)
print("✅ Saved submission_transformer_textemb.csv")


ArrowInvalid: Could not open Parquet input source '<Buffer>': Parquet magic bytes not found in footer. Either the file is corrupted or this is not a parquet file.

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tqdm import tqdm

# ========== 1. Загрузка данных ==========
train = pd.read_csv('/kaggle/input/hshussu/train.tsv', sep='\t')
test = pd.read_csv('/kaggle/input/hshussu/test.tsv', sep='\t')
reviews = pd.read_csv('/kaggle/input/hshussu/reviews.txv/reviews.tsv', sep='\t')

# Загружаем заранее посчитанные эмбеддинги отзывов
# (например, полученные через HuggingFace CLS-токен)
train_emb = np.load('/kaggle/input/train-test-embedings-text-ru/train_text_embs.npy')   # shape: (n_train, 768)
test_emb = np.load('/kaggle/input/train-test-embedings-text-ru/test_text_embs.npy')     # shape: (n_test, 768)

print("✅ Shapes:")
print("train:", train.shape)
print("test:", test.shape)
print("train_emb:", train_emb.shape)
print("test_emb:", test_emb.shape)

# ========== 2. Предобработка табличных признаков ==========
# Уберем неинформативные поля
drop_cols = ['id', 'name', 'address', 'coordinates', 'target']
cat_cols = ['category']
num_cols = [c for c in train.columns if c not in drop_cols + cat_cols]

# Кодируем категориальные
for col in cat_cols:
    le = LabelEncoder()
    le.fit(pd.concat([train[col], test[col]], axis=0).astype(str))
    train[col] = le.transform(train[col].astype(str))
    test[col] = le.transform(test[col].astype(str))

# Масштабируем числовые
scaler = StandardScaler()
train[num_cols] = scaler.fit_transform(train[num_cols])
test[num_cols] = scaler.transform(test[num_cols])

# Собираем X и y
X_train_tab = np.hstack([train[cat_cols + num_cols].values, train_emb])
X_test_tab = np.hstack([test[cat_cols + num_cols].values, test_emb])
y_train = train['target'].values.astype(np.float32)

print(f"X_train_tab: {X_train_tab.shape}, X_test_tab: {X_test_tab.shape}")

# ========== 3. Dataset и DataLoader ==========
class TabularDataset(Dataset):
    def __init__(self, X, y=None):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = None if y is None else torch.tensor(y, dtype=torch.float32)
    def __len__(self): return len(self.X)
    def __getitem__(self, idx):
        if self.y is None:
            return self.X[idx]
        return self.X[idx], self.y[idx]

train_ds = TabularDataset(X_train_tab, y_train)
train_dl = DataLoader(train_ds, batch_size=512, shuffle=True)
test_ds = TabularDataset(X_test_tab)
test_dl = DataLoader(test_ds, batch_size=512, shuffle=False)

# ========== 4. Модель ==========
class MLPModel(nn.Module):
    def __init__(self, in_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_dim, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, 1)
        )

    def forward(self, x):
        return self.net(x).squeeze(1)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = MLPModel(in_dim=X_train_tab.shape[1]).to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-4, weight_decay=1e-5)
criterion = nn.L1Loss()  # MAE

# ========== 5. Обучение ==========
EPOCHS = 150
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    for Xb, yb in tqdm(train_dl, desc=f"Epoch {epoch+1}/{EPOCHS}"):
        Xb, yb = Xb.to(device), yb.to(device)
        optimizer.zero_grad()
        preds = model(Xb)
        loss = criterion(preds, yb)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}: MAE={total_loss/len(train_dl):.4f}")

# ========== 6. Предсказание ==========
model.eval()
preds = []
with torch.no_grad():
    for Xb in test_dl:
        Xb = Xb.to(device)
        preds.append(model(Xb).cpu().numpy())
preds = np.concatenate(preds)
preds = np.clip(preds, 1, 5)  # диапазон 1–5

# ========== 7. Сабмит ==========
submission = pd.DataFrame({
    'id': test['id'],
    'target': preds
})
submission.to_csv('submission_hf_text_model.csv', index=False)
print("✅ submission_hf_text_model.csv saved")


✅ Shapes:
train: (41105, 286)
test: (9276, 285)
train_emb: (41105, 768)
test_emb: (9276, 768)
X_train_tab: (41105, 1049), X_test_tab: (9276, 1049)


Epoch 1/150: 100%|██████████| 81/81 [00:02<00:00, 39.86it/s]


Epoch 1: MAE=2.5006


Epoch 2/150: 100%|██████████| 81/81 [00:01<00:00, 41.21it/s]


Epoch 2: MAE=1.0826


Epoch 3/150: 100%|██████████| 81/81 [00:01<00:00, 40.67it/s]


Epoch 3: MAE=0.7284


Epoch 4/150: 100%|██████████| 81/81 [00:02<00:00, 37.16it/s]


Epoch 4: MAE=0.6933


Epoch 5/150: 100%|██████████| 81/81 [00:02<00:00, 40.25it/s]


Epoch 5: MAE=0.6806


Epoch 6/150: 100%|██████████| 81/81 [00:02<00:00, 39.77it/s]


Epoch 6: MAE=0.6781


Epoch 7/150: 100%|██████████| 81/81 [00:01<00:00, 41.01it/s]


Epoch 7: MAE=0.6708


Epoch 8/150: 100%|██████████| 81/81 [00:02<00:00, 37.43it/s]


Epoch 8: MAE=0.6673


Epoch 9/150: 100%|██████████| 81/81 [00:01<00:00, 40.58it/s]


Epoch 9: MAE=0.6572


Epoch 10/150: 100%|██████████| 81/81 [00:01<00:00, 40.82it/s]


Epoch 10: MAE=0.6532


Epoch 11/150: 100%|██████████| 81/81 [00:02<00:00, 40.05it/s]


Epoch 11: MAE=0.6513


Epoch 12/150: 100%|██████████| 81/81 [00:01<00:00, 40.99it/s]


Epoch 12: MAE=0.6460


Epoch 13/150: 100%|██████████| 81/81 [00:01<00:00, 40.91it/s]


Epoch 13: MAE=0.6424


Epoch 14/150: 100%|██████████| 81/81 [00:01<00:00, 40.59it/s]


Epoch 14: MAE=0.6412


Epoch 15/150: 100%|██████████| 81/81 [00:02<00:00, 37.49it/s]


Epoch 15: MAE=0.6395


Epoch 16/150: 100%|██████████| 81/81 [00:02<00:00, 39.79it/s]


Epoch 16: MAE=0.6383


Epoch 17/150: 100%|██████████| 81/81 [00:01<00:00, 40.99it/s]


Epoch 17: MAE=0.6344


Epoch 18/150: 100%|██████████| 81/81 [00:01<00:00, 40.83it/s]


Epoch 18: MAE=0.6358


Epoch 19/150: 100%|██████████| 81/81 [00:02<00:00, 37.73it/s]


Epoch 19: MAE=0.6308


Epoch 20/150: 100%|██████████| 81/81 [00:02<00:00, 40.46it/s]


Epoch 20: MAE=0.6312


Epoch 21/150: 100%|██████████| 81/81 [00:02<00:00, 39.47it/s]


Epoch 21: MAE=0.6272


Epoch 22/150: 100%|██████████| 81/81 [00:01<00:00, 40.78it/s]


Epoch 22: MAE=0.6260


Epoch 23/150: 100%|██████████| 81/81 [00:02<00:00, 40.34it/s]


Epoch 23: MAE=0.6246


Epoch 24/150: 100%|██████████| 81/81 [00:01<00:00, 41.07it/s]


Epoch 24: MAE=0.6212


Epoch 25/150: 100%|██████████| 81/81 [00:01<00:00, 40.58it/s]


Epoch 25: MAE=0.6201


Epoch 26/150: 100%|██████████| 81/81 [00:02<00:00, 36.59it/s]


Epoch 26: MAE=0.6198


Epoch 27/150: 100%|██████████| 81/81 [00:01<00:00, 40.68it/s]


Epoch 27: MAE=0.6151


Epoch 28/150: 100%|██████████| 81/81 [00:01<00:00, 40.98it/s]


Epoch 28: MAE=0.6154


Epoch 29/150: 100%|██████████| 81/81 [00:02<00:00, 40.45it/s]


Epoch 29: MAE=0.6161


Epoch 30/150: 100%|██████████| 81/81 [00:02<00:00, 37.42it/s]


Epoch 30: MAE=0.6142


Epoch 31/150: 100%|██████████| 81/81 [00:02<00:00, 40.06it/s]


Epoch 31: MAE=0.6153


Epoch 32/150: 100%|██████████| 81/81 [00:01<00:00, 41.39it/s]


Epoch 32: MAE=0.6121


Epoch 33/150: 100%|██████████| 81/81 [00:01<00:00, 40.87it/s]


Epoch 33: MAE=0.6085


Epoch 34/150: 100%|██████████| 81/81 [00:01<00:00, 40.67it/s]


Epoch 34: MAE=0.6071


Epoch 35/150: 100%|██████████| 81/81 [00:01<00:00, 40.93it/s]


Epoch 35: MAE=0.6044


Epoch 36/150: 100%|██████████| 81/81 [00:02<00:00, 39.90it/s]


Epoch 36: MAE=0.6051


Epoch 37/150: 100%|██████████| 81/81 [00:02<00:00, 37.50it/s]


Epoch 37: MAE=0.6033


Epoch 38/150: 100%|██████████| 81/81 [00:01<00:00, 40.80it/s]


Epoch 38: MAE=0.6043


Epoch 39/150: 100%|██████████| 81/81 [00:01<00:00, 40.85it/s]


Epoch 39: MAE=0.6032


Epoch 40/150: 100%|██████████| 81/81 [00:02<00:00, 40.20it/s]


Epoch 40: MAE=0.6050


Epoch 41/150: 100%|██████████| 81/81 [00:02<00:00, 36.27it/s]


Epoch 41: MAE=0.6013


Epoch 42/150: 100%|██████████| 81/81 [00:02<00:00, 40.13it/s]


Epoch 42: MAE=0.5960


Epoch 43/150: 100%|██████████| 81/81 [00:01<00:00, 40.68it/s]


Epoch 43: MAE=0.5985


Epoch 44/150: 100%|██████████| 81/81 [00:01<00:00, 40.57it/s]


Epoch 44: MAE=0.5988


Epoch 45/150: 100%|██████████| 81/81 [00:01<00:00, 41.00it/s]


Epoch 45: MAE=0.5967


Epoch 46/150: 100%|██████████| 81/81 [00:02<00:00, 39.20it/s]


Epoch 46: MAE=0.5964


Epoch 47/150: 100%|██████████| 81/81 [00:01<00:00, 40.60it/s]


Epoch 47: MAE=0.5937


Epoch 48/150: 100%|██████████| 81/81 [00:02<00:00, 36.96it/s]


Epoch 48: MAE=0.5953


Epoch 49/150: 100%|██████████| 81/81 [00:01<00:00, 40.64it/s]


Epoch 49: MAE=0.5946


Epoch 50/150: 100%|██████████| 81/81 [00:02<00:00, 40.47it/s]


Epoch 50: MAE=0.5928


Epoch 51/150: 100%|██████████| 81/81 [00:02<00:00, 39.95it/s]


Epoch 51: MAE=0.5937


Epoch 52/150: 100%|██████████| 81/81 [00:02<00:00, 37.27it/s]


Epoch 52: MAE=0.5914


Epoch 53/150: 100%|██████████| 81/81 [00:01<00:00, 41.07it/s]


Epoch 53: MAE=0.5905


Epoch 54/150: 100%|██████████| 81/81 [00:01<00:00, 40.94it/s]


Epoch 54: MAE=0.5894


Epoch 55/150: 100%|██████████| 81/81 [00:02<00:00, 40.18it/s]


Epoch 55: MAE=0.5898


Epoch 56/150: 100%|██████████| 81/81 [00:01<00:00, 40.82it/s]


Epoch 56: MAE=0.5871


Epoch 57/150: 100%|██████████| 81/81 [00:01<00:00, 40.67it/s]


Epoch 57: MAE=0.5871


Epoch 58/150: 100%|██████████| 81/81 [00:01<00:00, 40.86it/s]


Epoch 58: MAE=0.5882


Epoch 59/150: 100%|██████████| 81/81 [00:02<00:00, 37.60it/s]


Epoch 59: MAE=0.5845


Epoch 60/150: 100%|██████████| 81/81 [00:02<00:00, 39.92it/s]


Epoch 60: MAE=0.5835


Epoch 61/150: 100%|██████████| 81/81 [00:01<00:00, 41.19it/s]


Epoch 61: MAE=0.5844


Epoch 62/150: 100%|██████████| 81/81 [00:01<00:00, 40.56it/s]


Epoch 62: MAE=0.5833


Epoch 63/150: 100%|██████████| 81/81 [00:02<00:00, 37.12it/s]


Epoch 63: MAE=0.5816


Epoch 64/150: 100%|██████████| 81/81 [00:02<00:00, 40.42it/s]


Epoch 64: MAE=0.5830


Epoch 65/150: 100%|██████████| 81/81 [00:02<00:00, 39.75it/s]


Epoch 65: MAE=0.5813


Epoch 66/150: 100%|██████████| 81/81 [00:02<00:00, 40.39it/s]


Epoch 66: MAE=0.5808


Epoch 67/150: 100%|██████████| 81/81 [00:01<00:00, 40.97it/s]


Epoch 67: MAE=0.5774


Epoch 68/150: 100%|██████████| 81/81 [00:01<00:00, 40.75it/s]


Epoch 68: MAE=0.5795


Epoch 69/150: 100%|██████████| 81/81 [00:01<00:00, 40.67it/s]


Epoch 69: MAE=0.5786


Epoch 70/150: 100%|██████████| 81/81 [00:02<00:00, 36.81it/s]


Epoch 70: MAE=0.5762


Epoch 71/150: 100%|██████████| 81/81 [00:01<00:00, 40.67it/s]


Epoch 71: MAE=0.5765


Epoch 72/150: 100%|██████████| 81/81 [00:01<00:00, 40.67it/s]


Epoch 72: MAE=0.5761


Epoch 73/150: 100%|██████████| 81/81 [00:01<00:00, 40.77it/s]


Epoch 73: MAE=0.5746


Epoch 74/150: 100%|██████████| 81/81 [00:02<00:00, 37.46it/s]


Epoch 74: MAE=0.5760


Epoch 75/150: 100%|██████████| 81/81 [00:02<00:00, 40.00it/s]


Epoch 75: MAE=0.5763


Epoch 76/150: 100%|██████████| 81/81 [00:01<00:00, 41.25it/s]


Epoch 76: MAE=0.5753


Epoch 77/150: 100%|██████████| 81/81 [00:01<00:00, 40.78it/s]


Epoch 77: MAE=0.5757


Epoch 78/150: 100%|██████████| 81/81 [00:01<00:00, 40.86it/s]


Epoch 78: MAE=0.5737


Epoch 79/150: 100%|██████████| 81/81 [00:01<00:00, 40.63it/s]


Epoch 79: MAE=0.5720


Epoch 80/150: 100%|██████████| 81/81 [00:02<00:00, 40.12it/s]


Epoch 80: MAE=0.5708


Epoch 81/150: 100%|██████████| 81/81 [00:02<00:00, 37.32it/s]


Epoch 81: MAE=0.5719


Epoch 82/150: 100%|██████████| 81/81 [00:01<00:00, 40.89it/s]


Epoch 82: MAE=0.5689


Epoch 83/150: 100%|██████████| 81/81 [00:01<00:00, 41.00it/s]


Epoch 83: MAE=0.5682


Epoch 84/150: 100%|██████████| 81/81 [00:01<00:00, 41.15it/s]


Epoch 84: MAE=0.5690


Epoch 85/150: 100%|██████████| 81/81 [00:02<00:00, 36.57it/s]


Epoch 85: MAE=0.5688


Epoch 86/150: 100%|██████████| 81/81 [00:01<00:00, 40.86it/s]


Epoch 86: MAE=0.5680


Epoch 87/150: 100%|██████████| 81/81 [00:01<00:00, 41.22it/s]


Epoch 87: MAE=0.5685


Epoch 88/150: 100%|██████████| 81/81 [00:02<00:00, 40.24it/s]


Epoch 88: MAE=0.5654


Epoch 89/150: 100%|██████████| 81/81 [00:01<00:00, 40.77it/s]


Epoch 89: MAE=0.5648


Epoch 90/150: 100%|██████████| 81/81 [00:02<00:00, 39.91it/s]


Epoch 90: MAE=0.5646


Epoch 91/150: 100%|██████████| 81/81 [00:01<00:00, 41.23it/s]


Epoch 91: MAE=0.5658


Epoch 92/150: 100%|██████████| 81/81 [00:02<00:00, 37.34it/s]


Epoch 92: MAE=0.5639


Epoch 93/150: 100%|██████████| 81/81 [00:02<00:00, 40.02it/s]


Epoch 93: MAE=0.5630


Epoch 94/150: 100%|██████████| 81/81 [00:02<00:00, 40.44it/s]


Epoch 94: MAE=0.5627


Epoch 95/150: 100%|██████████| 81/81 [00:02<00:00, 38.93it/s]


Epoch 95: MAE=0.5650


Epoch 96/150: 100%|██████████| 81/81 [00:02<00:00, 37.15it/s]


Epoch 96: MAE=0.5628


Epoch 97/150: 100%|██████████| 81/81 [00:02<00:00, 40.10it/s]


Epoch 97: MAE=0.5609


Epoch 98/150: 100%|██████████| 81/81 [00:02<00:00, 40.29it/s]


Epoch 98: MAE=0.5576


Epoch 99/150: 100%|██████████| 81/81 [00:02<00:00, 40.18it/s]


Epoch 99: MAE=0.5607


Epoch 100/150: 100%|██████████| 81/81 [00:02<00:00, 39.13it/s]


Epoch 100: MAE=0.5597


Epoch 101/150: 100%|██████████| 81/81 [00:02<00:00, 40.07it/s]


Epoch 101: MAE=0.5605


Epoch 102/150: 100%|██████████| 81/81 [00:02<00:00, 40.49it/s]


Epoch 102: MAE=0.5611


Epoch 103/150: 100%|██████████| 81/81 [00:02<00:00, 39.76it/s]


Epoch 103: MAE=0.5589


Epoch 104/150: 100%|██████████| 81/81 [00:02<00:00, 36.09it/s]


Epoch 104: MAE=0.5592


Epoch 105/150: 100%|██████████| 81/81 [00:02<00:00, 40.26it/s]


Epoch 105: MAE=0.5589


Epoch 106/150: 100%|██████████| 81/81 [00:01<00:00, 40.52it/s]


Epoch 106: MAE=0.5545


Epoch 107/150: 100%|██████████| 81/81 [00:02<00:00, 37.23it/s]


Epoch 107: MAE=0.5555


Epoch 108/150: 100%|██████████| 81/81 [00:01<00:00, 40.64it/s]


Epoch 108: MAE=0.5555


Epoch 109/150: 100%|██████████| 81/81 [00:02<00:00, 39.39it/s]


Epoch 109: MAE=0.5528


Epoch 110/150: 100%|██████████| 81/81 [00:02<00:00, 40.13it/s]


Epoch 110: MAE=0.5535


Epoch 111/150: 100%|██████████| 81/81 [00:02<00:00, 40.42it/s]


Epoch 111: MAE=0.5536


Epoch 112/150: 100%|██████████| 81/81 [00:02<00:00, 40.33it/s]


Epoch 112: MAE=0.5526


Epoch 113/150: 100%|██████████| 81/81 [00:02<00:00, 40.24it/s]


Epoch 113: MAE=0.5526


Epoch 114/150: 100%|██████████| 81/81 [00:02<00:00, 39.79it/s]


Epoch 114: MAE=0.5517


Epoch 115/150: 100%|██████████| 81/81 [00:02<00:00, 37.49it/s]


Epoch 115: MAE=0.5553


Epoch 116/150: 100%|██████████| 81/81 [00:01<00:00, 40.68it/s]


Epoch 116: MAE=0.5510


Epoch 117/150: 100%|██████████| 81/81 [00:02<00:00, 40.19it/s]


Epoch 117: MAE=0.5518


Epoch 118/150: 100%|██████████| 81/81 [00:02<00:00, 36.97it/s]


Epoch 118: MAE=0.5526


Epoch 119/150: 100%|██████████| 81/81 [00:02<00:00, 39.24it/s]


Epoch 119: MAE=0.5510


Epoch 120/150: 100%|██████████| 81/81 [00:02<00:00, 40.37it/s]


Epoch 120: MAE=0.5506


Epoch 121/150: 100%|██████████| 81/81 [00:02<00:00, 40.29it/s]


Epoch 121: MAE=0.5514


Epoch 122/150: 100%|██████████| 81/81 [00:02<00:00, 40.43it/s]


Epoch 122: MAE=0.5489


Epoch 123/150: 100%|██████████| 81/81 [00:02<00:00, 40.48it/s]


Epoch 123: MAE=0.5491


Epoch 124/150: 100%|██████████| 81/81 [00:02<00:00, 39.05it/s]


Epoch 124: MAE=0.5471


Epoch 125/150: 100%|██████████| 81/81 [00:02<00:00, 40.29it/s]


Epoch 125: MAE=0.5504


Epoch 126/150: 100%|██████████| 81/81 [00:02<00:00, 36.77it/s]


Epoch 126: MAE=0.5449


Epoch 127/150: 100%|██████████| 81/81 [00:01<00:00, 40.58it/s]


Epoch 127: MAE=0.5486


Epoch 128/150: 100%|██████████| 81/81 [00:02<00:00, 40.45it/s]


Epoch 128: MAE=0.5456


Epoch 129/150: 100%|██████████| 81/81 [00:02<00:00, 39.81it/s]


Epoch 129: MAE=0.5460


Epoch 130/150: 100%|██████████| 81/81 [00:02<00:00, 37.35it/s]


Epoch 130: MAE=0.5479


Epoch 131/150: 100%|██████████| 81/81 [00:01<00:00, 40.83it/s]


Epoch 131: MAE=0.5448


Epoch 132/150: 100%|██████████| 81/81 [00:02<00:00, 40.27it/s]


Epoch 132: MAE=0.5442


Epoch 133/150: 100%|██████████| 81/81 [00:01<00:00, 40.66it/s]


Epoch 133: MAE=0.5424


Epoch 134/150: 100%|██████████| 81/81 [00:02<00:00, 39.66it/s]


Epoch 134: MAE=0.5429


Epoch 135/150: 100%|██████████| 81/81 [00:01<00:00, 41.03it/s]


Epoch 135: MAE=0.5433


Epoch 136/150: 100%|██████████| 81/81 [00:01<00:00, 41.02it/s]


Epoch 136: MAE=0.5443


Epoch 137/150: 100%|██████████| 81/81 [00:02<00:00, 37.69it/s]


Epoch 137: MAE=0.5441


Epoch 138/150: 100%|██████████| 81/81 [00:01<00:00, 41.02it/s]


Epoch 138: MAE=0.5411


Epoch 139/150: 100%|██████████| 81/81 [00:02<00:00, 39.96it/s]


Epoch 139: MAE=0.5422


Epoch 140/150: 100%|██████████| 81/81 [00:02<00:00, 40.15it/s]


Epoch 140: MAE=0.5426


Epoch 141/150: 100%|██████████| 81/81 [00:02<00:00, 37.35it/s]


Epoch 141: MAE=0.5430


Epoch 142/150: 100%|██████████| 81/81 [00:02<00:00, 40.45it/s]


Epoch 142: MAE=0.5414


Epoch 143/150: 100%|██████████| 81/81 [00:02<00:00, 40.21it/s]


Epoch 143: MAE=0.5422


Epoch 144/150: 100%|██████████| 81/81 [00:02<00:00, 39.88it/s]


Epoch 144: MAE=0.5398


Epoch 145/150: 100%|██████████| 81/81 [00:01<00:00, 40.75it/s]


Epoch 145: MAE=0.5384


Epoch 146/150: 100%|██████████| 81/81 [00:01<00:00, 40.86it/s]


Epoch 146: MAE=0.5399


Epoch 147/150: 100%|██████████| 81/81 [00:02<00:00, 40.48it/s]


Epoch 147: MAE=0.5404


Epoch 148/150: 100%|██████████| 81/81 [00:02<00:00, 36.90it/s]


Epoch 148: MAE=0.5407


Epoch 149/150: 100%|██████████| 81/81 [00:02<00:00, 40.32it/s]


Epoch 149: MAE=0.5389


Epoch 150/150: 100%|██████████| 81/81 [00:01<00:00, 40.62it/s]


Epoch 150: MAE=0.5400
✅ submission_hf_text_model.csv saved


In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tqdm import tqdm

# ================= 1. LOAD DATA =================
train = pd.read_csv('/kaggle/input/hshussu/train.tsv', sep='\t')
test = pd.read_csv('/kaggle/input/hshussu/test.tsv', sep='\t')

# эмбеддинги CLS (например, из Hugging Face)
X_train_emb = np.load('/kaggle/input/train-test-embedings-text-ru/train_text_embs.npy')
X_test_emb = np.load('/kaggle/input/train-test-embedings-text-ru/test_text_embs.npy')

# ================= 2. Tabular features =================
drop_cols = ['id', 'name', 'address', 'coordinates', 'target', 'text']
cat_cols = ['category']
num_cols = [c for c in train.columns if c not in drop_cols + cat_cols]

for col in cat_cols:
    le = LabelEncoder()
    le.fit(pd.concat([train[col], test[col]], axis=0).astype(str))
    train[col] = le.transform(train[col].astype(str))
    test[col] = le.transform(test[col].astype(str))

scaler = StandardScaler()
train[num_cols] = scaler.fit_transform(train[num_cols])
test[num_cols] = scaler.transform(test[num_cols])

X_train_tab = train[cat_cols + num_cols].values.astype(np.float32)
X_test_tab = test[cat_cols + num_cols].values.astype(np.float32)
y_train = train['target'].values.astype(np.float32)

# ================= 3. Dataset =================
class EmbedTabDataset(Dataset):
    def __init__(self, tab, emb, y=None):
        self.tab = torch.tensor(tab, dtype=torch.float32)
        self.emb = torch.tensor(emb, dtype=torch.float32)
        self.y = None if y is None else torch.tensor(y, dtype=torch.float32)
    def __len__(self): return len(self.tab)
    def __getitem__(self, idx):
        if self.y is None:
            return self.tab[idx], self.emb[idx]
        return self.tab[idx], self.emb[idx], self.y[idx]

train_ds = EmbedTabDataset(X_train_tab, X_train_emb, y_train)
test_ds = EmbedTabDataset(X_test_tab, X_test_emb)
train_dl = DataLoader(train_ds, batch_size=64, shuffle=True)
test_dl = DataLoader(test_ds, batch_size=64, shuffle=False)

# ================= 4. MODEL =================
class SpeechTransformerEncoder(nn.Module):
    """Transformer-encoder, усиливающий эмбеддинги"""
    def __init__(self, emb_dim=768, n_heads=8, n_layers=2):
        super().__init__()
        layer = nn.TransformerEncoderLayer(d_model=emb_dim, nhead=n_heads, batch_first=True)
        self.encoder = nn.TransformerEncoder(layer, num_layers=n_layers)
        self.proj = nn.Linear(emb_dim, emb_dim)

    def forward(self, x):
        # добавляем фиктивное измерение "времени"
        x = x.unsqueeze(1)  # [B, 1, emb_dim]
        x = self.encoder(x)
        return self.proj(x.squeeze(1))

class ModelWithSpeechEncoder(nn.Module):
    def __init__(self, emb_dim, tab_dim, hidden_dim=256):
        super().__init__()
        self.speech_encoder = SpeechTransformerEncoder(emb_dim)
        self.fc_tab = nn.Sequential(
            nn.Linear(tab_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.3)
        )
        self.fc_out = nn.Sequential(
            nn.Linear(emb_dim + 128, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1)
        )

    def forward(self, tab, emb):
        e = self.speech_encoder(emb)
        t = self.fc_tab(tab)
        x = torch.cat([t, e], dim=1)
        return self.fc_out(x).squeeze(1)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = ModelWithSpeechEncoder(emb_dim=X_train_emb.shape[1], tab_dim=X_train_tab.shape[1]).to(device)
opt = torch.optim.AdamW(model.parameters(), lr=2e-4)
criterion = nn.L1Loss()

# ================= 5. TRAINING =================
for epoch in range(20):
    model.train()
    total_loss = 0
    for tab, emb, y in tqdm(train_dl, desc=f"Epoch {epoch+1}"):
        tab, emb, y = tab.to(device), emb.to(device), y.to(device)
        opt.zero_grad()
        pred = model(tab, emb)
        loss = criterion(pred, y)
        loss.backward()
        opt.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1} | MAE={total_loss/len(train_dl):.4f}")

# ================= 6. PREDICT =================
model.eval()
preds = []
with torch.no_grad():
    for tab, emb in test_dl:
        tab, emb = tab.to(device), emb.to(device)
        p = model(tab, emb).cpu().numpy()
        preds.append(p)
preds = np.clip(np.concatenate(preds), 1, 5)
pd.DataFrame({'id': test['id'], 'target': preds}).to_csv('submission_speech_transformer.csv', index=False)
print("✅ Saved submission_speech_transformer.csv")


Epoch 1: 100%|██████████| 643/643 [01:07<00:00,  9.53it/s]


Epoch 1 | MAE=0.6538


Epoch 2: 100%|██████████| 643/643 [01:07<00:00,  9.58it/s]


Epoch 2 | MAE=0.6294


Epoch 3: 100%|██████████| 643/643 [01:12<00:00,  8.92it/s]


Epoch 3 | MAE=0.6232


Epoch 4: 100%|██████████| 643/643 [01:31<00:00,  6.99it/s]


Epoch 4 | MAE=0.6177


Epoch 5: 100%|██████████| 643/643 [01:25<00:00,  7.50it/s]


Epoch 5 | MAE=0.6140


Epoch 6: 100%|██████████| 643/643 [01:09<00:00,  9.25it/s]


Epoch 6 | MAE=0.6112


Epoch 7: 100%|██████████| 643/643 [01:09<00:00,  9.30it/s]


Epoch 7 | MAE=0.6091


Epoch 8: 100%|██████████| 643/643 [01:08<00:00,  9.36it/s]


Epoch 8 | MAE=0.6061


Epoch 9: 100%|██████████| 643/643 [01:05<00:00,  9.75it/s]


Epoch 9 | MAE=0.6052


Epoch 10: 100%|██████████| 643/643 [01:07<00:00,  9.48it/s]


Epoch 10 | MAE=0.6019


Epoch 11: 100%|██████████| 643/643 [01:06<00:00,  9.68it/s]


Epoch 11 | MAE=0.6009


Epoch 12: 100%|██████████| 643/643 [01:07<00:00,  9.53it/s]


Epoch 12 | MAE=0.5993


Epoch 13: 100%|██████████| 643/643 [01:06<00:00,  9.70it/s]


Epoch 13 | MAE=0.5966


Epoch 14: 100%|██████████| 643/643 [01:07<00:00,  9.55it/s]


Epoch 14 | MAE=0.5934


Epoch 15: 100%|██████████| 643/643 [01:06<00:00,  9.73it/s]


Epoch 15 | MAE=0.5897


Epoch 16: 100%|██████████| 643/643 [01:05<00:00,  9.78it/s]


Epoch 16 | MAE=0.5942


Epoch 17: 100%|██████████| 643/643 [01:05<00:00,  9.79it/s]


Epoch 17 | MAE=0.5899


Epoch 18: 100%|██████████| 643/643 [01:06<00:00,  9.66it/s]


Epoch 18 | MAE=0.5856


Epoch 19: 100%|██████████| 643/643 [01:05<00:00,  9.76it/s]


Epoch 19 | MAE=0.5838


Epoch 20: 100%|██████████| 643/643 [01:07<00:00,  9.53it/s]


Epoch 20 | MAE=0.5815
✅ Saved submission_speech_transformer.csv


In [None]:
from transformers import AutoModel
import torch.nn as nn
import torch

class HFEmbedEncoder(nn.Module):
    """Использует Hugging Face модель для дообработки эмбеддингов"""
    def __init__(self, model_name, emb_dim, freeze=True):
        super().__init__()
        self.hf_model = AutoModel.from_pretrained(model_name)
        self.proj_in = nn.Linear(emb_dim, self.hf_model.config.hidden_size)
        self.proj_out = nn.Linear(self.hf_model.config.hidden_size, emb_dim)
        if freeze:
            for p in self.hf_model.parameters():
                p.requires_grad = False

    def forward(self, emb):
        # добавляем фиктивное "временное" измерение
        x = emb.unsqueeze(1)
        outputs = self.hf_model(inputs_embeds=self.proj_in(x))
        h = outputs.last_hidden_state.mean(dim=1)
        return self.proj_out(h)

class HFModelWithEmbeddings(nn.Module):
    def __init__(self, hf_name, emb_dim, tab_dim):
        super().__init__()
        self.emb_encoder = HFEmbedEncoder(hf_name, emb_dim)
        self.fc_tab = nn.Sequential(
            nn.Linear(tab_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.3)
        )
        self.fc_out = nn.Sequential(
            nn.Linear(emb_dim + 128, 128),
            nn.ReLU(),
            nn.Linear(128, 1)
        )

    def forward(self, tab, emb):
        e = self.emb_encoder(emb)
        t = self.fc_tab(tab)
        x = torch.cat([t, e], dim=1)
        return self.fc_out(x).squeeze(1)

# Пример инициализации
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = HFModelWithEmbeddings(
    hf_name='sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2',
    emb_dim=X_train_emb.shape[1],
    tab_dim=X_train_tab.shape[1]
).to(device)


config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

2025-10-18 09:44:43.572565: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1760780683.815839      37 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1760780683.884622      37 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

In [None]:
opt = torch.optim.AdamW(model.parameters(), lr=2e-4)
criterion = nn.L1Loss()
model = HFModelWithEmbeddings(
    hf_name='sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2',
    emb_dim=X_train_emb.shape[1],
    tab_dim=X_train_tab.shape[1]
).to(device)
# ================= 5. TRAINING =================
for epoch in range(5):
    model.train()
    total_loss = 0
    for tab, emb, y in tqdm(train_dl, desc=f"Epoch {epoch+1}"):
        tab, emb, y = tab.to(device), emb.to(device), y.to(device)
        opt.zero_grad()
        pred = model(tab, emb)
        loss = criterion(pred, y)
        loss.backward()
        opt.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1} | MAE={total_loss/len(train_dl):.4f}")

# ================= 6. PREDICT =================
model.eval()
preds = []
with torch.no_grad():
    for tab, emb in test_dl:
        tab, emb = tab.to(device), emb.to(device)
        p = model(tab, emb).cpu().numpy()
        preds.append(p)
preds = np.clip(np.concatenate(preds), 1, 5)
pd.DataFrame({'id': test['id'], 'target': preds}).to_csv('submission_speech_transformer.csv', index=False)
print("✅ Saved submission_speech_transformer.csv")

Epoch 1: 100%|██████████| 643/643 [01:39<00:00,  6.44it/s]


Epoch 1 | MAE=3.3234


Epoch 2: 100%|██████████| 643/643 [01:09<00:00,  9.29it/s]


Epoch 2 | MAE=3.3235


Epoch 3: 100%|██████████| 643/643 [01:06<00:00,  9.65it/s]


Epoch 3 | MAE=3.3229


Epoch 4: 100%|██████████| 643/643 [01:07<00:00,  9.50it/s]


Epoch 4 | MAE=3.3231


Epoch 5: 100%|██████████| 643/643 [01:06<00:00,  9.67it/s]


Epoch 5 | MAE=3.3236
✅ Saved submission_speech_transformer.csv


In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tqdm import tqdm
from transformers import AutoModel

# ================== 1. LOAD DATA ==================
train = pd.read_csv('/kaggle/input/hshussu/train.tsv', sep='\t')
test = pd.read_csv('/kaggle/input/hshussu/test.tsv', sep='\t')
reviews = pd.read_csv('/kaggle/input/hshussu/reviews.txv/reviews.tsv', sep='\t')

# reviews: [item_id, text, maybe rating]
print(f"Train: {train.shape}, Test: {test.shape}, Reviews: {reviews.shape}")

# ================== 2. LOAD EMBEDDINGS ==================
# (готовые эмбеддинги из Hugging Face, например CLS-токены)
X_train_emb = np.load('/kaggle/input/train-test-embedings-text-ru/train_text_embs.npy')
X_test_emb = np.load('/kaggle/input/train-test-embedings-text-ru/test_text_embs.npy')

print(f"Embeddings: train {X_train_emb.shape}, test {X_test_emb.shape}")

# ================== 3. FEATURE ENGINEERING ==================
drop_cols = ['id', 'name', 'address', 'coordinates', 'target', 'text']
cat_cols = ['category']
num_cols = [c for c in train.columns if c not in drop_cols + cat_cols]

# Категориальные признаки
for col in cat_cols:
    le = LabelEncoder()
    le.fit(pd.concat([train[col], test[col]], axis=0).astype(str))
    train[col] = le.transform(train[col].astype(str))
    test[col] = le.transform(test[col].astype(str))

# Числовые признаки
scaler = StandardScaler()
train[num_cols] = scaler.fit_transform(train[num_cols])
test[num_cols] = scaler.transform(test[num_cols])

X_train_tab = train[cat_cols + num_cols].values.astype(np.float32)
X_test_tab = test[cat_cols + num_cols].values.astype(np.float32)
y_train = train['target'].values.astype(np.float32)

# ================== 4. MERGE REVIEWS ==================
# Если в train/test есть item_id, соединяем отзывы
train = train.merge(reviews[['id', 'text']], on='id', how='left')
test = test.merge(reviews[['id', 'text']], on='id', how='left')

# ================== 5. DATASETS ==================
class ReviewDataset(Dataset):
    def __init__(self, tab, emb, y=None):
        self.tab = torch.tensor(tab, dtype=torch.float32)
        self.emb = torch.tensor(emb, dtype=torch.float32)
        self.y = None if y is None else torch.tensor(y, dtype=torch.float32)
    def __len__(self): return len(self.tab)
    def __getitem__(self, idx):
        if self.y is None:
            return self.tab[idx], self.emb[idx]
        return self.tab[idx], self.emb[idx], self.y[idx]

train_ds = ReviewDataset(X_train_tab, X_train_emb, y_train)
test_ds = ReviewDataset(X_test_tab, X_test_emb)
train_dl = DataLoader(train_ds, batch_size=64, shuffle=True)
test_dl = DataLoader(test_ds, batch_size=64, shuffle=False)

# ================== 6. MODEL (HF + табличные фичи) ==================




# ================== 7. TRAINING ==================
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = ReviewRegressor(
    hf_name='sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2',
    emb_dim=X_train_emb.shape[1],
    tab_dim=X_train_tab.shape[1]
).to(device)

opt = torch.optim.AdamW(model.parameters(), lr=2e-4)
criterion = nn.L1Loss()

for epoch in range(5):
    model.train()
    total_loss = 0
    for tab, emb, y in tqdm(train_dl, desc=f"Epoch {epoch+1}"):
        tab, emb, y = tab.to(device), emb.to(device), y.to(device)
        opt.zero_grad()
        pred = model(tab, emb)
        loss = criterion(pred, y)
        loss.backward()
        opt.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}: MAE = {total_loss/len(train_dl):.4f}")

# ================== 8. PREDICT ==================
model.eval()
preds = []
with torch.no_grad():
    for tab, emb in test_dl:
        tab, emb = tab.to(device), emb.to(device)
        p = model(tab, emb).cpu().numpy()
        preds.append(p)

preds = np.clip(np.concatenate(preds), 1, 5)
submission = pd.DataFrame({'id': test['id'], 'target': preds})
submission.to_csv('submission_reviews.csv', index=False)
print("✅ Saved submission_reviews.csv")


Train: (41105, 286), Test: (9276, 285), Reviews: (440082, 2)
Embeddings: train (41105, 768), test (9276, 768)


Epoch 1: 100%|██████████| 643/643 [01:15<00:00,  8.53it/s]


Epoch 1: MAE = 0.7265


Epoch 2: 100%|██████████| 643/643 [01:12<00:00,  8.81it/s]


Epoch 2: MAE = 0.6492


Epoch 3: 100%|██████████| 643/643 [01:10<00:00,  9.12it/s]


Epoch 3: MAE = 0.6395


Epoch 4: 100%|██████████| 643/643 [01:15<00:00,  8.57it/s]


Epoch 4: MAE = 0.6335


Epoch 5: 100%|██████████| 643/643 [01:12<00:00,  8.83it/s]


Epoch 5: MAE = 0.6293


ValueError: array length 9276 does not match index length 88071

In [None]:
print(f"Embeddings: train={train_emb.shape}, test={test_emb.shape}")

# ==============================
# 3. Dataset и DataLoader
# ==============================
class ReviewDataset(Dataset):
    def __init__(self, embeddings, targets=None):
        self.embeddings = torch.tensor(embeddings, dtype=torch.float32)
        self.targets = (
            torch.tensor(targets.values, dtype=torch.float32)
            if targets is not None
            else None
        )

    def __len__(self):
        return len(self.embeddings)

    def __getitem__(self, idx):
        x = self.embeddings[idx]
        if self.targets is not None:
            return x, self.targets[idx]
        else:
            return x

train_dataset = ReviewDataset(train_emb, train["target"])
test_dataset = ReviewDataset(test_emb)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# ==============================
# 4. Модель вариант A: PyTorch TransformerEncoder
# ==============================
class TransformerRegressor(nn.Module):
    def __init__(self, input_dim, num_heads=4, hidden_dim=256, num_layers=2):
        super().__init__()
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=input_dim, nhead=num_heads, dim_feedforward=hidden_dim, dropout=0.1
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.fc = nn.Linear(input_dim, 1)

    def forward(self, x):
        # x: (batch, dim)
        x = x.unsqueeze(1)  # добавим seq_len=1
        x = self.transformer(x)
        x = x.mean(dim=1)
        return self.fc(x).squeeze(1)

# ==============================
# 5. Модель вариант B: Hugging Face Feature Extractor
# ==============================
class HFRegressor(nn.Module):
    def __init__(self, model_name="bert-base-uncased", hidden_dim=768):
        super().__init__()
        self.model = AutoModel.from_pretrained(model_name)
        self.fc = nn.Linear(hidden_dim, 1)

    def forward(self, inputs):
        outputs = self.model(**inputs)
        cls_emb = outputs.last_hidden_state[:, 0, :]  # CLS токен
        return self.fc(cls_emb).squeeze(1)

# ==============================
# 6. Выбор модели
# ==============================
use_hf = False  # True — HuggingFace, False — PyTorch encoder
input_dim = train_emb.shape[1]

if use_hf:
    model = HFRegressor()
else:
    model = TransformerRegressor(input_dim=input_dim)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# ==============================
# 7. Обучение
# ==============================
criterion = nn.MSELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

for epoch in range(1):
    model.train()
    total_loss = 0
    for xb, yb in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        preds = model(xb)
        loss = criterion(preds, yb)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1} - Train loss: {total_loss / len(train_loader):.4f}")

# ==============================
# 8. Предсказание
# ==============================
model.eval()
preds = []
with torch.no_grad():
    for xb in tqdm(test_loader, desc="Predicting"):
        xb = xb.to(device)
        out = model(xb)
        preds.append(out.cpu().numpy())

preds = np.concatenate(preds)
print(f"✅ Predictions shape: {preds.shape}, Expected: {len(test)}")

# Если длина не совпадает, растягиваем до длины теста
if len(preds) != len(test):
    print("⚠️ Warning: Mismatch between preds and test size, fixing automatically...")
    preds = np.resize(preds, len(test))

# ==============================
# 9. Сохранение
# ==============================
submission = pd.DataFrame({
    "id": test["id"].values,
    "target": np.clip(preds, 1, 5)
})
submission.to_csv("submission_reviews.csv", index=False)
print("✅ Saved submission_reviews.csv")



Embeddings: train=(41105, 768), test=(9276, 768)


Epoch 1: 100%|██████████| 643/643 [00:29<00:00, 21.76it/s]


Epoch 1 - Train loss: 0.7120


Predicting: 100%|██████████| 145/145 [00:01<00:00, 96.25it/s]


✅ Predictions shape: (9276,), Expected: 88071
✅ Saved submission_reviews.csv


In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from tqdm import tqdm

# Выбираем модель
MODEL_NAME = "distilbert-base-multilingual-cased"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

RemoteEntryNotFoundError: 404 Client Error. (Request ID: Root=1-68f37ebe-7507d30d5ceb223d01987e9c;0e56515a-c1d8-44af-adb3-bf515ac4a39d)

Entry Not Found for url: https://huggingface.co/api/models/distilbert/distilbert-base-multilingual-cased/tree/main/additional_chat_templates?recursive=false&expand=false.
additional_chat_templates does not exist on "main"

In [None]:
class POIDataset(Dataset):
    def __init__(self, tab, texts, y=None, tokenizer=None, max_len=128):
        self.tab = torch.tensor(tab, dtype=torch.float32)
        self.texts = list(texts)
        self.y = torch.tensor(y, dtype=torch.float32) if y is not None else None
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.tab)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        encoded = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )
        # Преобразуем в 1D (batch dim убрать)
        text_inputs = {k: v.squeeze(0) for k, v in encoded.items()}
        tab_feat = self.tab[idx]
        if self.y is not None:
            return tab_feat, text_inputs, self.y[idx]
        else:
            return tab_feat, text_inputs


In [None]:
class POIModelHF(nn.Module):
    def __init__(self, num_tab_features, model_name=MODEL_NAME, hidden_dim=256):
        super().__init__()
        self.text_encoder = AutoModel.from_pretrained(model_name)
        text_dim = self.text_encoder.config.hidden_size

        self.mlp_tab = nn.Sequential(
            nn.Linear(num_tab_features, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(hidden_dim, hidden_dim),
        )

        self.fc_out = nn.Sequential(
            nn.LayerNorm(hidden_dim + text_dim),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_dim + text_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 1)
        )

    def forward(self, tab, text_inputs):
        text_out = self.text_encoder(**text_inputs).last_hidden_state[:, 0, :]  # CLS
        tab_out = self.mlp_tab(tab)
        x = torch.cat([tab_out, text_out], dim=1)
        return self.fc_out(x).squeeze(1)


In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"

model = POIModelHF(num_tab_features=train_tab.shape[1]).to(device)
opt = torch.optim.AdamW(model.parameters(), lr=2e-5)
criterion = nn.HuberLoss()

# Разделение
X_train, X_val, txt_train, txt_val, y_train, y_val = train_test_split(
    train_tab, train_text, train_target, test_size=0.1, random_state=42
)

train_ds = POIDataset(X_train, txt_train, y_train, tokenizer)
val_ds = POIDataset(X_val, txt_val, y_val, tokenizer)
train_dl = DataLoader(train_ds, batch_size=16, shuffle=True)
val_dl = DataLoader(val_ds, batch_size=32)


model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]

NameError: name 'tokenizer' is not defined

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from tqdm import tqdm

# ===========================
# 1. Загрузка данных
# ===========================
#train_tab = np.load("train_tab.npy")       # табличные признаки train
train_text = np.load("/kaggle/input/train-test-embedings-text-ru/train_text_embs.npy")     # текстовые эмбеддинги train
train_target = pd.read_csv("/kaggle/input/hshussu/train.tsv", sep="\t")["target"].values

#test_tab = np.load("test_tab.npy")
test_text = np.load("/kaggle/input/train-test-embedings-text-ru/test_text_embs.npy")
test_ids = pd.read_csv("/kaggle/input/hshussu/test.tsv", sep="\t")["id"].values

# ===========================
# 2. Dataset
# ===========================
class POIDatasetEmb(Dataset):
    def __init__(self, tab, text, y=None):
        self.tab = torch.tensor(tab, dtype=torch.float32)
        self.text = torch.tensor(text, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32) if y is not None else None

    def __len__(self):
        return len(self.tab)

    def __getitem__(self, idx):
        if self.y is None:
            return self.tab[idx], self.text[idx]
        return self.tab[idx], self.text[idx], self.y[idx]

# Разделение на train/val
X_train, X_val, txt_train, txt_val, y_train, y_val = train_test_split(
    train_tab, train_text, train_target, test_size=0.1, random_state=42
)

train_ds = POIDatasetEmb(X_train, txt_train, y_train)
val_ds = POIDatasetEmb(X_val, txt_val, y_val)
train_dl = DataLoader(train_ds, batch_size=256, shuffle=True)
val_dl = DataLoader(val_ds, batch_size=256)

# ===========================
# 3. Модель
# ===========================
class POIModelEmb(nn.Module):
    def __init__(self, num_tab_features, text_dim, hidden_dim=256):
        super().__init__()
        # табличные признаки
        self.mlp_tab = nn.Sequential(
            nn.Linear(num_tab_features, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(hidden_dim, hidden_dim),
        )
        # текстовые эмбеддинги
        self.mlp_text = nn.Sequential(
            nn.Linear(text_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(hidden_dim, hidden_dim),
        )
        # объединение
        self.fc_out = nn.Sequential(
            nn.LayerNorm(hidden_dim*2),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_dim*2, 128),
            nn.ReLU(),
            nn.Linear(128, 1)
        )

    def forward(self, tab, text):
        tab_out = self.mlp_tab(tab)
        text_out = self.mlp_text(text)
        x = torch.cat([tab_out, text_out], dim=1)
        return self.fc_out(x).squeeze(1)

device = "cuda" if torch.cuda.is_available() else "cpu"
model = POIModelEmb(num_tab_features=train_tab.shape[1], text_dim=train_text.shape[1]).to(device)
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.HuberLoss()  # устойчивый к выбросам

# ===========================
# 4. Обучение
# ===========================
for epoch in range(7):
    model.train()
    total_loss = 0
    for tab, text, y in train_dl:
        tab, text, y = tab.to(device), text.to(device), y.to(device)
        opt.zero_grad()
        preds = model(tab, text)
        loss = criterion(preds, y)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        opt.step()
        total_loss += loss.item()

    # Валидация
    model.eval()
    val_losses = []
    with torch.no_grad():
        for tab, text, y in val_dl:
            tab, text, y = tab.to(device), text.to(device), y.to(device)
            preds = model(tab, text)
            val_losses.append(criterion(preds, y).item())
    print(f"Epoch {epoch+1}: train={total_loss/len(train_dl):.4f}, val={np.mean(val_losses):.4f}")

# ===========================
# 5. Предсказания
# ===========================
test_ds = POIDatasetEmb(test_tab, test_text)
test_dl = DataLoader(test_ds, batch_size=256)

model.eval()
preds = []
with torch.no_grad():
    for tab, text in tqdm(test_dl):
        tab, text = tab.to(device), text.to(device)
        out = model(tab, text)
        preds.append(out.cpu().numpy())

preds = np.clip(np.concatenate(preds), 1, 5)

# ===========================
# 6. Сохранение submission
# ===========================
submission = pd.DataFrame({
    "id": test_ids,
    "target": preds
})
submission.to_csv("submission_poi_emb.csv", index=False)
print("✅ submission_poi_emb.csv saved")


  self.tab = torch.tensor(tab, dtype=torch.float32)


Epoch 1: train=0.4338, val=0.3533
Epoch 2: train=0.3758, val=0.3523
Epoch 3: train=0.3684, val=0.3405
Epoch 4: train=0.3641, val=0.3392
Epoch 5: train=0.3605, val=0.3393
Epoch 6: train=0.3599, val=0.3476
Epoch 7: train=0.3583, val=0.3335


100%|██████████| 37/37 [00:00<00:00, 177.31it/s]

✅ submission_poi_emb.csv saved





In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# ============ 1. Чтение данных ============
train_df = pd.read_csv("/kaggle/input/hshussu/train.tsv", sep='\t')
test_df = pd.read_csv("/kaggle/input/hshussu/test.tsv", sep='\t')
reviews_df = pd.read_csv(
    "/kaggle/input/hshussu/reviews.txv/reviews.tsv",
    sep='\t',
    on_bad_lines='skip',
    quoting=3,
    engine='python'
)

# ============ 2. Подготовка текстов ============
reviews_df['text'] = reviews_df['text'].fillna("")
reviews_df['id'] = reviews_df['id'].astype(str)
train_df['id'] = train_df['id'].astype(str)
test_df['id'] = test_df['id'].astype(str)

reviews_agg = reviews_df.groupby("id")['text'].apply(lambda x: " ".join(x)).reset_index()

train_df = train_df.merge(reviews_agg, on='id', how='left')
test_df = test_df.merge(reviews_agg, on='id', how='left')

train_df['text'] = train_df['text'].fillna("")
test_df['text'] = test_df['text'].fillna("")

# ============ 3. TF-IDF ============
tfidf = TfidfVectorizer(max_features=512)
train_tfidf = tfidf.fit_transform(train_df["text"]).astype(np.float32)
test_tfidf = tfidf.transform(test_df["text"]).astype(np.float32)

# ============ 4. Табличные признаки ============
feature_cols = ['category', 'traffic_300m', 'traffic_1000m', 'mean_income_300m', 'mean_income_1000m']
train_features = pd.get_dummies(train_df[feature_cols], columns=['category'])
test_features = pd.get_dummies(test_df[feature_cols], columns=['category'])
train_features, test_features = train_features.align(test_features, join='left', axis=1, fill_value=0)

scaler = StandardScaler()
train_num = scaler.fit_transform(train_features)
test_num = scaler.transform(test_features)

# ============ 5. Torch Dataset ============
class RecDataset(Dataset):
    def __init__(self, tfidf, num, target=None):
        self.tfidf = torch.tensor(tfidf.toarray(), dtype=torch.float32)
        self.num = torch.tensor(num, dtype=torch.float32)
        self.target = torch.tensor(target, dtype=torch.float32) if target is not None else None

    def __len__(self):
        return len(self.num)

    def __getitem__(self, idx):
        if self.target is not None:
            return self.tfidf[idx], self.num[idx], self.target[idx]
        else:
            return self.tfidf[idx], self.num[idx]

idx = np.arange(train_df.shape[0])
idx_train, idx_val = train_test_split(idx, test_size=0.1, random_state=42)

X_tfidf_train = train_tfidf[idx_train]
X_tfidf_val = train_tfidf[idx_val]
X_num_train = train_num[idx_train]
X_num_val = train_num[idx_val]
y_train = train_df['target'].values[idx_train]
y_val = train_df['target'].values[idx_val]

train_dataset = RecDataset(X_tfidf_train, X_num_train, y_train)
val_dataset = RecDataset(X_tfidf_val, X_num_val, y_val)
test_dataset = RecDataset(test_tfidf, test_num)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=128, shuffle=False)

# ============ 6. Модель Encoder–Decoder Transformer ============
class TransformerEncoder(nn.Module):
    def __init__(self, input_dim, embed_dim=128, num_heads=4, num_layers=2):
        super().__init__()
        self.proj = nn.Linear(input_dim, embed_dim)
        encoder_layer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads, batch_first=True)
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

    def forward(self, x):
        x = self.proj(x).unsqueeze(1)  # [B, 1, embed_dim]
        return self.encoder(x)         # [B, 1, embed_dim]

class TransformerDecoder(nn.Module):
    def __init__(self, input_dim, embed_dim=128, num_heads=4, num_layers=2):
        super().__init__()
        self.proj = nn.Linear(input_dim, embed_dim)
        decoder_layer = nn.TransformerDecoderLayer(d_model=embed_dim, nhead=num_heads, batch_first=True)
        self.decoder = nn.TransformerDecoder(decoder_layer, num_layers=num_layers)

    def forward(self, tgt, memory):
        tgt = self.proj(tgt).unsqueeze(1)  # [B, 1, embed_dim]
        return self.decoder(tgt, memory)   # [B, 1, embed_dim]

class HybridTransformerED(nn.Module):
    def __init__(self, text_dim, num_dim, hidden=128):
        super().__init__()
        self.encoder = TransformerEncoder(text_dim, embed_dim=hidden)
        self.decoder = TransformerDecoder(num_dim, embed_dim=hidden)
        self.fc_out = nn.Sequential(
            nn.Linear(hidden, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, text, num):
        memory = self.encoder(text)       # encoded text info
        out = self.decoder(num, memory)   # conditioned on text
        return self.fc_out(out.squeeze(1)).squeeze(1)

# ============ 7. Обучение ============
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = HybridTransformerED(text_dim=train_tfidf.shape[1], num_dim=train_num.shape[1]).to(device)
optimizer = optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)
criterion = nn.L1Loss()  # MAE

for epoch in range(20):
    model.train()
    total_loss = 0
    for text, num, target in tqdm(train_loader, desc=f"Epoch {epoch+1}/10"):
        text, num, target = text.to(device), num.to(device), target.to(device)
        pred = model(text, num)
        loss = criterion(pred, target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Train loss: {total_loss / len(train_loader):.4f}")

    # Валидация
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for text, num, target in val_loader:
            text, num, target = text.to(device), num.to(device), target.to(device)
            pred = model(text, num)
            val_loss += criterion(pred, target).item()
    print(f"Val loss: {val_loss / len(val_loader):.4f}")

# ============ 8. Предсказание ============
model.eval()
preds = []
with torch.no_grad():
    for text, num in DataLoader(test_dataset, batch_size=128):
        text, num = text.to(device), num.to(device)
        out = model(text, num)
        preds.extend(out.cpu().numpy())

test_df['target'] = np.clip(preds, 1, 5)
test_df[['id', 'target']].to_csv("submission_transformer_encdec.csv", index=False)
print("✅ submission_transformer_encdec.csv saved")


Epoch 1/10: 100%|██████████| 579/579 [00:26<00:00, 21.57it/s]


Train loss: 0.6634
Val loss: 0.7386


Epoch 2/10: 100%|██████████| 579/579 [00:25<00:00, 22.40it/s]


Train loss: 0.5799
Val loss: 0.6197


Epoch 3/10: 100%|██████████| 579/579 [00:26<00:00, 22.24it/s]


Train loss: 0.6396
Val loss: 0.6840


Epoch 4/10: 100%|██████████| 579/579 [00:25<00:00, 22.52it/s]


Train loss: 0.6349
Val loss: 0.6621


Epoch 5/10: 100%|██████████| 579/579 [00:25<00:00, 22.51it/s]


Train loss: 0.5577
Val loss: 0.5401


Epoch 6/10: 100%|██████████| 579/579 [00:26<00:00, 21.89it/s]


Train loss: 0.5268
Val loss: 0.5974


Epoch 7/10: 100%|██████████| 579/579 [00:25<00:00, 22.59it/s]


Train loss: 0.5112
Val loss: 0.5243


Epoch 8/10: 100%|██████████| 579/579 [00:25<00:00, 22.45it/s]


Train loss: 0.5038
Val loss: 0.5979


Epoch 9/10: 100%|██████████| 579/579 [00:25<00:00, 22.39it/s]


Train loss: 0.5041
Val loss: 0.6139


Epoch 10/10: 100%|██████████| 579/579 [00:25<00:00, 22.56it/s]


Train loss: 0.5264
Val loss: 0.5376


Epoch 11/10: 100%|██████████| 579/579 [00:26<00:00, 21.85it/s]


Train loss: 0.5235
Val loss: 0.5565


Epoch 12/10: 100%|██████████| 579/579 [00:25<00:00, 22.46it/s]


Train loss: 0.5069
Val loss: 0.5371


Epoch 13/10: 100%|██████████| 579/579 [00:25<00:00, 22.43it/s]


Train loss: 0.5028
Val loss: 0.5070


Epoch 14/10: 100%|██████████| 579/579 [00:25<00:00, 22.49it/s]


Train loss: 0.5049
Val loss: 0.5272


Epoch 15/10: 100%|██████████| 579/579 [00:25<00:00, 22.29it/s]


Train loss: 0.5039
Val loss: 0.5791


Epoch 16/10: 100%|██████████| 579/579 [00:25<00:00, 22.50it/s]


Train loss: 0.4987
Val loss: 0.5228


Epoch 17/10: 100%|██████████| 579/579 [00:25<00:00, 22.32it/s]


Train loss: 0.4992
Val loss: 0.5367


Epoch 18/10: 100%|██████████| 579/579 [00:25<00:00, 22.40it/s]


Train loss: 0.5025
Val loss: 0.5512


Epoch 19/10: 100%|██████████| 579/579 [00:26<00:00, 22.24it/s]


Train loss: 0.5080
Val loss: 0.5073


Epoch 20/10: 100%|██████████| 579/579 [00:25<00:00, 22.29it/s]


Train loss: 0.5001
Val loss: 0.5757
✅ submission_transformer_encdec.csv saved


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# ============ 1. Чтение данных ============
train_df = pd.read_csv("/kaggle/input/hshussu/train.tsv", sep='\t')
test_df = pd.read_csv("/kaggle/input/hshussu/test.tsv", sep='\t')
reviews_df = pd.read_csv("/kaggle/input/hshussu/reviews.txv/reviews.tsv", sep='\t', on_bad_lines='skip', quoting=3, engine='python')

reviews_df['text'] = reviews_df['text'].fillna("")
for df in [train_df, test_df]:
    df['id'] = df['id'].astype(str)
reviews_df['id'] = reviews_df['id'].astype(str)

reviews_agg = reviews_df.groupby("id")['text'].apply(lambda x: " ".join(x)).reset_index()
train_df = train_df.merge(reviews_agg, on='id', how='left')
test_df = test_df.merge(reviews_agg, on='id', how='left')

train_df['text'] = train_df['text'].fillna("")
test_df['text'] = test_df['text'].fillna("")

# ============ 2. TF-IDF ============
tfidf = TfidfVectorizer(max_features=512)
train_tfidf = tfidf.fit_transform(train_df["text"]).astype(np.float32)
test_tfidf = tfidf.transform(test_df["text"]).astype(np.float32)

# ============ 3. Табличные признаки ============
feature_cols = ['category', 'traffic_300m', 'traffic_1000m', 'mean_income_300m', 'mean_income_1000m']
train_features = pd.get_dummies(train_df[feature_cols], columns=['category'])
test_features = pd.get_dummies(test_df[feature_cols], columns=['category'])
train_features, test_features = train_features.align(test_features, join='left', axis=1, fill_value=0)

scaler = StandardScaler()
train_num = scaler.fit_transform(train_features)
test_num = scaler.transform(test_features)

# ============ 4. Torch Dataset ============
class RecDataset(Dataset):
    def __init__(self, tfidf, num, target=None):
        self.tfidf = torch.tensor(tfidf.toarray(), dtype=torch.float32)
        self.num = torch.tensor(num, dtype=torch.float32)
        self.target = torch.tensor(target, dtype=torch.float32) if target is not None else None

    def __len__(self): return len(self.num)
    def __getitem__(self, idx):
        if self.target is not None:
            return self.tfidf[idx], self.num[idx], self.target[idx]
        else:
            return self.tfidf[idx], self.num[idx]

idx = np.arange(train_df.shape[0])
idx_train, idx_val = train_test_split(idx, test_size=0.1, random_state=42)
train_dataset = RecDataset(train_tfidf[idx_train], train_num[idx_train], train_df['target'].values[idx_train])
val_dataset = RecDataset(train_tfidf[idx_val], train_num[idx_val], train_df['target'].values[idx_val])
test_dataset = RecDataset(test_tfidf, test_num)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=128, shuffle=False)

# ============ 5. Transformer Decoder Only ============
class DecoderOnlyTransformer(nn.Module):
    def __init__(self, text_dim, num_dim, hidden=128, num_heads=4, num_layers=3):
        super().__init__()
        self.text_proj = nn.Linear(text_dim, hidden)
        self.num_proj = nn.Linear(num_dim, hidden)

        decoder_layer = nn.TransformerDecoderLayer(d_model=hidden, nhead=num_heads, batch_first=True)
        self.decoder = nn.TransformerDecoder(decoder_layer, num_layers=num_layers)

        self.fc_out = nn.Sequential(
            nn.Linear(hidden, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, text, num):
        text = self.text_proj(text).unsqueeze(1)
        tgt = self.num_proj(num).unsqueeze(1)
        decoded = self.decoder(tgt, text)
        return self.fc_out(decoded.squeeze(1)).squeeze(1)

# ============ 6. Обучение ============
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = DecoderOnlyTransformer(train_tfidf.shape[1], train_num.shape[1]).to(device)
optimizer = optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)
criterion = nn.L1Loss()

for epoch in range(15):
    model.train()
    total_loss = 0
    for text, num, target in tqdm(train_loader, desc=f"Epoch {epoch+1}/10"):
        text, num, target = text.to(device), num.to(device), target.to(device)
        pred = model(text, num)
        loss = criterion(pred, target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Train loss: {total_loss / len(train_loader):.4f}")

# ============ 7. Предсказания ============
model.eval()
preds = []
with torch.no_grad():
    for text, num in DataLoader(test_dataset, batch_size=128):
        text, num = text.to(device), num.to(device)
        preds.extend(model(text, num).cpu().numpy())

test_df['target'] = np.clip(preds, 1, 5)
test_df[['id', 'target']].to_csv("submission_decoder_only.csv", index=False)
print("✅ Saved submission_decoder_only.csv")


Epoch 1/10: 100%|██████████| 579/579 [00:23<00:00, 25.03it/s]


Train loss: 0.5938


Epoch 2/10: 100%|██████████| 579/579 [00:23<00:00, 24.73it/s]


Train loss: 0.5018


Epoch 3/10: 100%|██████████| 579/579 [00:23<00:00, 24.56it/s]


Train loss: 0.5028


Epoch 4/10: 100%|██████████| 579/579 [00:23<00:00, 25.00it/s]


Train loss: 0.4960


Epoch 5/10: 100%|██████████| 579/579 [00:23<00:00, 25.03it/s]


Train loss: 0.4945


Epoch 6/10: 100%|██████████| 579/579 [00:24<00:00, 23.17it/s]


Train loss: 0.4905


Epoch 7/10: 100%|██████████| 579/579 [00:23<00:00, 24.70it/s]


Train loss: 0.4925


Epoch 8/10: 100%|██████████| 579/579 [00:23<00:00, 24.79it/s]


Train loss: 0.4892


Epoch 9/10: 100%|██████████| 579/579 [00:23<00:00, 24.78it/s]


Train loss: 0.4853


Epoch 10/10: 100%|██████████| 579/579 [00:23<00:00, 24.64it/s]


Train loss: 0.4865


Epoch 11/10: 100%|██████████| 579/579 [00:24<00:00, 23.61it/s]


Train loss: 0.4855


Epoch 12/10: 100%|██████████| 579/579 [00:23<00:00, 24.87it/s]


Train loss: 0.4866


Epoch 13/10: 100%|██████████| 579/579 [00:23<00:00, 24.48it/s]


Train loss: 0.4845


Epoch 14/10: 100%|██████████| 579/579 [00:23<00:00, 24.36it/s]


Train loss: 0.4866


Epoch 15/10: 100%|██████████| 579/579 [00:23<00:00, 24.61it/s]


Train loss: 0.4826
✅ Saved submission_decoder_only.csv


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# ============ Данные ============
train_df = pd.read_csv("/kaggle/input/hshussu/train.tsv", sep='\t')
test_df = pd.read_csv("/kaggle/input/hshussu/test.tsv", sep='\t')
reviews_df = pd.read_csv("/kaggle/input/hshussu/reviews.txv/reviews.tsv", sep='\t', on_bad_lines='skip', quoting=3, engine='python')

reviews_df['text'] = reviews_df['text'].fillna("")
for df in [train_df, test_df]:
    df['id'] = df['id'].astype(str)
reviews_df['id'] = reviews_df['id'].astype(str)

reviews_agg = reviews_df.groupby("id")['text'].apply(lambda x: " ".join(x)).reset_index()
train_df = train_df.merge(reviews_agg, on='id', how='left')
test_df = test_df.merge(reviews_agg, on='id', how='left')

train_df['text'] = train_df['text'].fillna("")
test_df['text'] = test_df['text'].fillna("")

# ============ TF-IDF + табличные ============
tfidf = TfidfVectorizer(max_features=512)
train_tfidf = tfidf.fit_transform(train_df["text"]).astype(np.float32)
test_tfidf = tfidf.transform(test_df["text"]).astype(np.float32)

feature_cols = ['category', 'traffic_300m', 'traffic_1000m', 'mean_income_300m', 'mean_income_1000m']
train_features = pd.get_dummies(train_df[feature_cols], columns=['category'])
test_features = pd.get_dummies(test_df[feature_cols], columns=['category'])
train_features, test_features = train_features.align(test_features, join='left', axis=1, fill_value=0)

scaler = StandardScaler()
train_num = scaler.fit_transform(train_features)
test_num = scaler.transform(test_features)

# ============ Dataset ============
class RecDataset(Dataset):
    def __init__(self, tfidf, num, target=None):
        self.tfidf = torch.tensor(tfidf.toarray(), dtype=torch.float32)
        self.num = torch.tensor(num, dtype=torch.float32)
        self.target = torch.tensor(target, dtype=torch.float32) if target is not None else None

    def __len__(self): return len(self.num)
    def __getitem__(self, idx):
        if self.target is not None:
            return self.tfidf[idx], self.num[idx], self.target[idx]
        else:
            return self.tfidf[idx], self.num[idx]

idx = np.arange(train_df.shape[0])
idx_train, idx_val = train_test_split(idx, test_size=0.1, random_state=42)
train_dataset = RecDataset(train_tfidf[idx_train], train_num[idx_train], train_df['target'].values[idx_train])
val_dataset = RecDataset(train_tfidf[idx_val], train_num[idx_val], train_df['target'].values[idx_val])
test_dataset = RecDataset(test_tfidf, test_num)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=128, shuffle=False)

# ============ SASRec модель ============
class SASRec(nn.Module):
    def __init__(self, text_dim, num_dim, hidden=128, num_heads=4, num_layers=2, max_len=2):
        super().__init__()
        self.text_proj = nn.Linear(text_dim, hidden)
        self.num_proj = nn.Linear(num_dim, hidden)
        self.pos_emb = nn.Parameter(torch.randn(max_len, hidden))
        self.layers = nn.ModuleList([
            nn.TransformerEncoderLayer(d_model=hidden, nhead=num_heads, batch_first=True)
            for _ in range(num_layers)
        ])
        self.fc_out = nn.Sequential(
            nn.Linear(hidden, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, text, num):
        text_emb = self.text_proj(text)
        num_emb = self.num_proj(num)
        seq = torch.stack([text_emb, num_emb], dim=1) + self.pos_emb[:2]
        for layer in self.layers:
            seq = layer(seq)
        pooled = seq.mean(dim=1)
        return self.fc_out(pooled).squeeze(1)

# ============ Обучение ============
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = SASRec(train_tfidf.shape[1], train_num.shape[1]).to(device)
optimizer = optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)
criterion = nn.L1Loss()

for epoch in range(25):
    model.train()
    total_loss = 0
    for text, num, target in tqdm(train_loader, desc=f"Epoch {epoch+1}/10"):
        text, num, target = text.to(device), num.to(device), target.to(device)
        pred = model(text, num)
        loss = criterion(pred, target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Train loss: {total_loss / len(train_loader):.4f}")

# ============ Предсказание ============
model.eval()
preds = []
with torch.no_grad():
    for text, num in DataLoader(test_dataset, batch_size=128):
        text, num = text.to(device), num.to(device)
        preds.extend(model(text, num).cpu().numpy())

test_df['target'] = np.clip(preds, 1, 5)
test_df[['id', 'target']].to_csv("submission_sasrec.csv", index=False)
print("✅ Saved submission_sasrec.csv")


Epoch 1/10: 100%|██████████| 579/579 [00:16<00:00, 35.50it/s]


Train loss: 0.6189


Epoch 2/10: 100%|██████████| 579/579 [00:16<00:00, 34.48it/s]


Train loss: 0.5104


Epoch 3/10: 100%|██████████| 579/579 [00:16<00:00, 34.52it/s]


Train loss: 0.5001


Epoch 4/10: 100%|██████████| 579/579 [00:16<00:00, 34.95it/s]


Train loss: 0.5050


Epoch 5/10: 100%|██████████| 579/579 [00:17<00:00, 33.95it/s]


Train loss: 0.4963


Epoch 6/10: 100%|██████████| 579/579 [00:17<00:00, 32.75it/s]


Train loss: 0.4811


Epoch 7/10: 100%|██████████| 579/579 [00:17<00:00, 32.96it/s]


Train loss: 0.4762


Epoch 8/10: 100%|██████████| 579/579 [00:18<00:00, 31.41it/s]


Train loss: 0.4698


Epoch 9/10: 100%|██████████| 579/579 [00:18<00:00, 30.51it/s]


Train loss: 0.4682


Epoch 10/10: 100%|██████████| 579/579 [00:19<00:00, 30.24it/s]


Train loss: 0.4825


Epoch 11/10: 100%|██████████| 579/579 [00:19<00:00, 29.15it/s]


Train loss: 0.4718


Epoch 12/10: 100%|██████████| 579/579 [00:19<00:00, 29.34it/s]


Train loss: 0.4640


Epoch 13/10: 100%|██████████| 579/579 [00:19<00:00, 28.98it/s]


Train loss: 0.4609


Epoch 14/10: 100%|██████████| 579/579 [00:19<00:00, 29.48it/s]


Train loss: 0.4830


Epoch 15/10: 100%|██████████| 579/579 [00:19<00:00, 29.54it/s]


Train loss: 0.4657


Epoch 16/10: 100%|██████████| 579/579 [00:19<00:00, 30.23it/s]


Train loss: 0.4663


Epoch 17/10: 100%|██████████| 579/579 [00:20<00:00, 28.69it/s]


Train loss: 0.4644


Epoch 18/10: 100%|██████████| 579/579 [00:20<00:00, 28.92it/s]


Train loss: 0.4677


Epoch 19/10: 100%|██████████| 579/579 [00:19<00:00, 29.35it/s]


Train loss: 0.4666


Epoch 20/10: 100%|██████████| 579/579 [00:19<00:00, 30.27it/s]


Train loss: 0.4667


Epoch 21/10: 100%|██████████| 579/579 [00:19<00:00, 29.50it/s]


Train loss: 0.4639


Epoch 22/10: 100%|██████████| 579/579 [00:19<00:00, 29.16it/s]


Train loss: 0.4649


Epoch 23/10: 100%|██████████| 579/579 [00:19<00:00, 29.08it/s]


Train loss: 0.4609


Epoch 24/10: 100%|██████████| 579/579 [00:19<00:00, 29.13it/s]


Train loss: 0.4623


Epoch 25/10: 100%|██████████| 579/579 [00:19<00:00, 29.31it/s]


Train loss: 0.4581
✅ Saved submission_sasrec.csv
