In [None]:
import pandas as pd
import numpy as np
from lightgbm import LGBMRanker
from tqdm import tqdm


# End-to-end LightGBM ranking baseline: load clicks, build (user, item)
# features, sample negatives, train an LGBMRanker and write a submission.

FEATURE_COLS = ['recency', 'item_popularity', 'user_clicks']
N_NEG_PER_USER = 20   # random negatives drawn per user (before de-duplication)
TOP_K = 20            # recommendations per user
rng = np.random.default_rng(42)  # seeded so negative sampling is reproducible

# ========== 1. LOAD DATA ==========
df = pd.read_parquet('/kaggle/input/reccomend/train_data.pq')
sample = pd.read_csv('/kaggle/input/asddbfd/sample_submission (1).csv')
print(f"Train shape: {df.shape}, Sample: {sample.shape}")

# ========== 2. FEATURE ENGINEERING ==========
# Raw interaction counts are kept as lookup tables so the very same values can
# be attached to negatives and to prediction-time candidates.
item_counts = df['item_id'].value_counts()
user_counts = df['user_id'].value_counts()

# Log-scaled activity / popularity
df['user_clicks'] = np.log1p(df['user_id'].map(user_counts))
df['item_popularity'] = np.log1p(df['item_id'].map(item_counts))

# Temporal recency feature (distance from the latest date in the data)
df['recency'] = df['date'].max() - df['date']

# Aggregate to one row per (user_id, item_id)
features = df.groupby(['user_id', 'item_id'], as_index=False).agg({
    'recency': 'min',
    'item_popularity': 'mean',
    'user_clicks': 'mean',
    'date': 'max'
})

# Label = 1 (the user clicked the item)
features['label'] = 1

# ========== 3. NEGATIVE SAMPLING ==========
# Vectorised: draw N_NEG_PER_USER random items for every user in one shot and
# anti-join against the observed clicks. The previous per-user
# `set(unique_items) - pos_items` rebuilt the whole item set for each of the
# ~2.7M users. A user may end up with slightly fewer than N_NEG_PER_USER
# negatives when draws collide with each other or with a positive.
print("Generating negatives...")
unique_items = df['item_id'].unique()
user_ids = features['user_id'].unique()

neg_df = pd.DataFrame({
    'user_id': np.repeat(user_ids, N_NEG_PER_USER),
    'item_id': rng.choice(unique_items, size=len(user_ids) * N_NEG_PER_USER),
}).drop_duplicates()
neg_df = neg_df.merge(
    features[['user_id', 'item_id']],
    on=['user_id', 'item_id'], how='left', indicator=True,
)
neg_df = neg_df.loc[neg_df['_merge'] == 'left_only', ['user_id', 'item_id']].copy()
neg_df['label'] = 0

# Negatives get the same popularity / activity features a candidate receives at
# prediction time; leaving them to fillna(0) would leak the label.
neg_df['item_popularity'] = np.log1p(neg_df['item_id'].map(item_counts))
neg_df['user_clicks'] = np.log1p(neg_df['user_id'].map(user_counts))

# Merge positives and negatives.
# NOTE(review): `recency` only exists for positives; negatives (and all
# prediction candidates) get 0, so this feature still separates the classes by
# construction — consider dropping or redefining it.
train_df = pd.concat(
    [features[['user_id', 'item_id'] + FEATURE_COLS + ['label']], neg_df],
    ignore_index=True,
)
train_df = train_df.fillna(0)

# LGBMRanker's `group` describes sizes of *consecutive* row blocks, so rows
# must be contiguous per user and in the same order as the group sizes.
train_df = train_df.sort_values('user_id', kind='stable').reset_index(drop=True)

# ========== 4. PREPARE DATA FOR LIGHTGBM ==========
X = train_df[FEATURE_COLS]
y = train_df['label']
group = train_df.groupby('user_id', sort=True).size().to_numpy()
assert group.sum() == len(train_df)

# ========== 5. TRAIN MODEL ==========
print("Training LGBMRanker...")
model = LGBMRanker(
    objective='lambdarank',
    metric='map',
    boosting_type='gbdt',
    num_leaves=31,
    learning_rate=0.05,
    n_estimators=200,
    n_jobs=-1
)

model.fit(X, y, group=group)

# ========== 6. GENERATE RECOMMENDATIONS ==========
print("Generating recommendations...")
# Candidate set: every known item (could be limited to the top-500 for speed).
candidates = df[['item_id']].drop_duplicates().reset_index(drop=True)
candidates['recency'] = 0
candidates['item_popularity'] = np.log1p(candidates['item_id'].map(item_counts))

# The only user-dependent feature is the click count, so two users with the
# same count get exactly the same ranking: score once per distinct count
# instead of once per sample row.
sample_clicks = sample['user_id'].map(user_counts).fillna(0).astype('int64')
recs_by_count = {}
for n_clicks in tqdm(np.unique(sample_clicks.to_numpy())):
    candidates['user_clicks'] = np.log1p(n_clicks)
    scores = model.predict(candidates[FEATURE_COLS])
    top_items = (
        candidates.assign(score=scores)
        .nlargest(TOP_K, 'score')['item_id']
        .tolist()
    )
    recs_by_count[int(n_clicks)] = ' '.join(map(str, top_items))

# One output row per row of `sample`, as before.
sub = pd.DataFrame({
    'user_id': sample['user_id'].to_numpy(),
    'item_id': sample_clicks.map(recs_by_count).to_numpy(),
})
sub.to_csv('submission.csv', index=False)
print("‚úÖ Saved submission.csv")


Loading data...
Train shape: (8777975, 3), Sample: (5864600, 2)
Generating negatives...


  0%|          | 3446/2682603 [12:48<165:53:25,  4.49it/s]


KeyboardInterrupt: 

In [None]:
import pandas as pd
import numpy as np
from lightgbm import LGBMRanker

# Faster LightGBM baseline restricted to the most active users.

FEATURE_COLS = ['recency', 'item_popularity', 'user_clicks']
rng = np.random.default_rng(42)  # seeded so negative sampling is reproducible

# ========== 1. LOAD DATA ==========
print("Loading data...")
df = pd.read_parquet('/kaggle/input/reccomend/train_data.pq')
sample = pd.read_csv('/kaggle/input/asddbfd/sample_submission (1).csv')
print(f"Train shape: {df.shape}, Sample: {sample.shape}")

# ========== 2. FILTER ACTIVE USERS ==========
# Keep only the most active users (speeds training up considerably).
user_activity = df['user_id'].value_counts()
active_users = user_activity.head(250_000).index  # tunable
# .copy() so the feature columns below are written to an owned frame rather
# than to a slice of the original (avoids SettingWithCopyWarning).
df = df[df['user_id'].isin(active_users)].copy()

print(f"Using {len(active_users)} active users, {len(df)} interactions")

# ========== 3. FEATURE ENGINEERING ==========
item_pop = df['item_id'].value_counts()
df['item_popularity'] = np.log1p(df['item_id'].map(item_pop))
user_clicks = df['user_id'].value_counts()
df['user_clicks'] = np.log1p(df['user_id'].map(user_clicks))
df['recency'] = df['date'].max() - df['date']

features = df.groupby(['user_id', 'item_id'], as_index=False).agg({
    'recency': 'min',
    'item_popularity': 'mean',
    'user_clicks': 'mean',
    'date': 'max'
})
features['label'] = 1

# ========== 4. FAST NEGATIVE SAMPLING ==========
# Draw random items from the 20,000 most popular ones as negatives.
top_items = item_pop.head(20_000).index
n_neg = 2  # negatives drawn per *user* (not per positive)

user_ids = features['user_id'].unique()
neg_samples = pd.DataFrame({
    'user_id': np.repeat(user_ids, n_neg),
    'item_id': rng.choice(top_items.to_numpy(), size=len(user_ids) * n_neg)
})
neg_samples['label'] = 0
# Give negatives the same popularity / activity features that candidates get
# at prediction time; otherwise "feature is NaN" simply encodes the label.
neg_samples['item_popularity'] = np.log1p(neg_samples['item_id'].map(item_pop))
neg_samples['user_clicks'] = np.log1p(neg_samples['user_id'].map(user_clicks))
# NOTE(review): `recency` stays NaN for negatives and is 0 for every candidate
# at prediction time — it still separates classes by construction.

# Combine; positives come first so keep='first' drops negatives that collide
# with a real click.
train_df = pd.concat(
    [features[['user_id', 'item_id'] + FEATURE_COLS + ['label']], neg_samples],
    ignore_index=True,
)
train_df = train_df.drop_duplicates(subset=['user_id', 'item_id'], keep='first')

# `group` describes consecutive row blocks, so each user's rows must be
# contiguous and ordered like the group sizes.
train_df = train_df.sort_values('user_id', kind='stable').reset_index(drop=True)

# ========== 5. PREPARE DATA ==========
X = train_df[FEATURE_COLS]
y = train_df['label']
group = train_df.groupby('user_id', sort=True).size().to_numpy()
assert group.sum() == len(train_df)

# ========== 6. TRAIN LIGHTGBM RANKER ==========
print("Training model...")
model = LGBMRanker(
    objective='lambdarank',
    metric='map',
    num_leaves=31,
    learning_rate=0.05,
    n_estimators=100,
    n_jobs=-1
)
model.fit(X, y, group=group)

# ========== 7. FAST RECOMMENDATION GENERATION ==========
print("Generating predictions...")
popular_items = item_pop.head(500).index.tolist()  # candidate pool: top-500 items
fallback_items = ' '.join(map(str, popular_items[:20]))  # for non-active users

# Candidate frame is identical for every user except `user_clicks`.
cand = pd.DataFrame({
    'item_id': popular_items,
    'recency': 0,
    'user_clicks': 0.0,
    'item_popularity': np.log1p(item_pop.loc[popular_items].values)
})

is_active = sample['user_id'].isin(active_users)
sample_clicks = sample['user_id'].map(user_clicks).fillna(0).astype('int64')

# Users with the same click count receive the same ranking, so score once per
# distinct count instead of once per sample row.
recs_by_count = {}
for n_clicks in np.unique(sample_clicks[is_active].to_numpy()):
    cand['user_clicks'] = np.log1p(n_clicks)
    scores = model.predict(cand[FEATURE_COLS])
    top20 = cand.assign(score=scores).nlargest(20, 'score')['item_id'].tolist()
    recs_by_count[int(n_clicks)] = ' '.join(map(str, top20))

submission = pd.DataFrame({
    'user_id': sample['user_id'].to_numpy(),
    'item_id': sample_clicks.map(recs_by_count).where(is_active, fallback_items).to_numpy(),
})
submission.to_csv('submission_fast.csv', index=False)
print("‚úÖ Done! Saved as submission_fast.csv")


In [None]:
import pandas as pd
import numpy as np
import time
import gc
from lightgbm import LGBMRanker
from xgboost import XGBRanker
from catboost import CatBoostRanker
from tqdm import tqdm

# Same reduced-user pipeline, structured to train several rankers and write
# one submission per model (currently only XGBoost is enabled).

FEATURE_COLS = ['recency', 'item_popularity', 'user_clicks']
rng = np.random.default_rng(42)  # seeded so negative sampling is reproducible

# ========== 1. LOAD DATA ==========
print("Loading data...")
df = pd.read_parquet('/kaggle/input/reccomend/train_data.pq')
sample = pd.read_csv('/kaggle/input/asddbfd/sample_submission (1).csv')
print(f"Train shape: {df.shape}, Sample: {sample.shape}")

# ========== 2. REDUCE USERS FOR SPEED ==========
user_activity = df['user_id'].value_counts()
active_users = user_activity.head(250_000).index
# .copy() so new columns are written to an owned frame, not a slice.
df = df[df['user_id'].isin(active_users)].copy()
print(f"Using {len(active_users)} active users, {len(df)} interactions")

# ========== 3. FEATURE ENGINEERING ==========
item_pop = df['item_id'].value_counts()
user_clicks = df['user_id'].value_counts()

df['item_popularity'] = np.log1p(df['item_id'].map(item_pop))
df['user_clicks'] = np.log1p(df['user_id'].map(user_clicks))
df['recency'] = df['date'].max() - df['date']

features = df.groupby(['user_id', 'item_id'], as_index=False).agg({
    'recency': 'min',
    'item_popularity': 'mean',
    'user_clicks': 'mean',
    'date': 'max'
})
features['label'] = 1

# ========== 4. FAST NEGATIVE SAMPLING ==========
top_items = item_pop.head(20_000).index
n_neg = 2  # negatives drawn per *user* (not per positive)

user_ids = features['user_id'].unique()
neg_samples = pd.DataFrame({
    'user_id': np.repeat(user_ids, n_neg),
    'item_id': rng.choice(top_items.to_numpy(), size=len(user_ids) * n_neg)
})
neg_samples['label'] = 0
# Negatives get real popularity / activity values (as candidates do at
# prediction time) instead of the label-revealing 0 from fillna.
neg_samples['item_popularity'] = np.log1p(neg_samples['item_id'].map(item_pop))
neg_samples['user_clicks'] = np.log1p(neg_samples['user_id'].map(user_clicks))

train_df = pd.concat([
    features[['user_id', 'item_id'] + FEATURE_COLS + ['label']],
    neg_samples
], ignore_index=True)

# Positives come first, so keep='first' removes negatives that hit a real click.
train_df = train_df.drop_duplicates(subset=['user_id', 'item_id'], keep='first')
# NOTE(review): only `recency` is still missing here (negatives) and becomes 0.
train_df = train_df.fillna(0)
# Ranker `group` sizes refer to consecutive rows: make each user's rows
# contiguous and ordered like the group array.
train_df = train_df.sort_values('user_id', kind='stable').reset_index(drop=True)
gc.collect()

# ========== 5. PREPARE DATA ==========
X = train_df[FEATURE_COLS].values
y = train_df['label'].values
group = train_df.groupby('user_id', sort=True).size().to_numpy()
assert group.sum() == len(train_df)

# ========== 6. TRAIN MODELS ==========
models = {}

# --- LightGBM ---


# --- XGBoost ---
print("\nüîµ Training XGBoost Ranker...")
start = time.time()
xgb = XGBRanker(
    objective='rank:pairwise',
    learning_rate=0.05,
    n_estimators=100,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    tree_method='hist',
    n_jobs=-1,
)
xgb.fit(X, y, group=group)
models['XGBoost'] = xgb
print(f"‚úÖ XGBoost trained in {(time.time()-start)/60:.1f} min")

# --- CatBoost ---


gc.collect()

# ========== 7. GENERATE PREDICTIONS ==========
popular_items = item_pop.head(500).index.tolist()
fallback_items = ' '.join(map(str, popular_items[:20]))  # for non-active users

# Candidate frame shared by all users; only `user_clicks` changes.
cand = pd.DataFrame({
    'item_id': popular_items,
    'recency': 0,
    'user_clicks': 0.0,
    'item_popularity': np.log1p(item_pop.loc[popular_items].values)
})
is_active = sample['user_id'].isin(active_users)
sample_clicks = sample['user_id'].map(user_clicks).fillna(0).astype('int64')
distinct_counts = np.unique(sample_clicks[is_active].to_numpy())

for model_name, model in models.items():
    print(f"\nüöÄ Generating predictions for {model_name}...")
    t0 = time.time()

    # The ranking depends on the user only through the click count, so score
    # each distinct count once rather than every sample row.
    recs_by_count = {}
    for n_clicks in distinct_counts:
        cand['user_clicks'] = np.log1p(n_clicks)
        scores = model.predict(cand[FEATURE_COLS].values)
        top20 = cand.assign(score=scores).nlargest(20, 'score')['item_id'].tolist()
        recs_by_count[int(n_clicks)] = ' '.join(map(str, top20))

    submission = pd.DataFrame({
        'user_id': sample['user_id'].to_numpy(),
        'item_id': sample_clicks.map(recs_by_count).where(is_active, fallback_items).to_numpy(),
    })
    filename = f'submission_{model_name.lower()}.csv'
    submission.to_csv(filename, index=False)
    print(f"‚úÖ Saved {filename} | Total time: {(time.time()-t0)/60:.1f} min")

print("\nüéØ All models finished and submissions saved.")


In [None]:
# Re-generate predictions with a much smaller candidate pool.
# Relies on `item_pop`, `active_users`, `user_clicks`, `sample` and `model`
# left in the kernel by the previous cell.
# NOTE(review): the original comment said "top-200 instead of 500", but the
# code takes the top 20 — with 20 candidates the model only reorders them.
popular_items = item_pop.head(20).index.tolist()
preds = []

for uid in tqdm(sample['user_id'], desc="Generating predictions"):
    if uid in active_users:
        # Score the candidate pool for a known user.
        cand = pd.DataFrame({
            'item_id': popular_items,
            'user_id': uid,
            'recency': 0,
            'user_clicks': np.log1p(user_clicks.get(uid, 1)),
            'item_popularity': np.log1p(item_pop.loc[popular_items].values)
        })
        feature_matrix = cand[['recency', 'item_popularity', 'user_clicks']].values
        cand['score'] = model.predict(feature_matrix)
        chosen = cand.nlargest(20, 'score')['item_id'].tolist()
    else:
        # Fallback: just the top-20 popular items.
        chosen = popular_items[:20]
    preds.append({'user_id': uid, 'item_id': ' '.join(map(str, chosen))})

submission = pd.DataFrame(preds)
submission.to_csv('submission_small.csv', index=False)


In [None]:
# Quick look at the `submission` frame built by the previous cell.
print(submission)

In [None]:
import pandas as pd

# –ó–∞–≥—Ä—É–∂–∞–µ–º –±–æ–ª—å—à–æ–π —Å–∞–±–º–∏—Ç
# Reload the submission written earlier so it can be trimmed and compressed.
submission = pd.read_csv('/kaggle/working/submission_small.csv')

# Trim each user's list to its first 20 items.
def top20_items(item_str):
    """Return the first 20 whitespace-separated tokens of ``item_str``, joined by single spaces."""
    return ' '.join(item_str.split()[:20])

# Keep at most 20 items per row.
submission['item_id'] = submission['item_id'].astype(str).apply(top20_items)

# Normalise every item id to a plain integer string (no stray characters).
submission['item_id'] = submission['item_id'].apply(lambda x: ' '.join(map(str, map(int, x.split()))))

# Save gzip-compressed, without the index column.
submission.to_csv('submission_small.csv.gz', index=False, compression='gzip')

# The message previously named 'submission_small.csv', which is not the file
# written above; report the actual output path.
print("‚úÖ –§–∞–π–ª —É–º–µ–Ω—å—à–µ–Ω –∏ —Å–æ—Ö—Ä–∞–Ω—ë–Ω –∫–∞–∫ submission_small.csv.gz")


In [None]:
import pandas as pd
import numpy as np

# –ó–∞–≥—Ä—É–∂–∞–µ–º –¥–∞–Ω–Ω—ã–µ
train = pd.read_parquet('/kaggle/input/reccomend/train_data.pq')
sample = pd.read_csv('/kaggle/input/asddbfd/sample_submission (1).csv')

# Peek at both frames.
for frame in (train, sample):
    print(frame.head())

# Most popular items by interaction count.
# NOTE(review): the original comment said "top-1000" but the code keeps 500.
item_counts = train['item_id'].value_counts()
top_items = item_counts.index[:500].tolist()

# Last (up to) 20 rows per user, in the frame's current row order.
# NOTE(review): `train` is not sorted by date here — confirm the file order is chronological.
last_clicks = train.groupby('user_id')['item_id'].apply(lambda s: s.tolist()[-20:])


In [None]:
import random

# Build (user, positive item, negative item) triples for pairwise training.
user_pos_items = train.groupby('user_id')['item_id'].apply(set).to_dict()

pairs = []

for user, pos_items in user_pos_items.items():
    # The pool of admissible negatives depends only on the user, so build it
    # once per user instead of once per positive item.
    neg_pool = [i for i in top_items if i not in pos_items]
    if not neg_pool:
        # User has clicked every popular item: no negative can be drawn.
        continue
    for item in pos_items:
        pairs.append([user, item, random.choice(neg_pool)])
        # (The former `print(pairs)` here dumped the whole growing list on
        # every iteration and has been removed.)

pairs = pd.DataFrame(pairs, columns=['user_id', 'pos_item', 'neg_item'])
print(pairs.head())


In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

class PairDataset(Dataset):
    """Dataset of (user, positive item, negative item) index triples.

    Reads the ``user_id``, ``pos_item`` and ``neg_item`` columns of ``df`` and
    exposes them as parallel arrays.
    """

    def __init__(self, df):
        self.users, self.pos_items, self.neg_items = (
            df[column].values for column in ('user_id', 'pos_item', 'neg_item')
        )

    def __len__(self):
        """Number of triples."""
        return len(self.users)

    def __getitem__(self, idx):
        """Return the ``idx``-th (user, positive, negative) triple."""
        return self.users[idx], self.pos_items[idx], self.neg_items[idx]

class RankNetModel(nn.Module):
    """Pairwise matrix-factorisation ranker.

    Holds one embedding table for users and one for items; ``forward`` returns
    the score margin between a positive and a negative item for each user.
    """

    def __init__(self, n_users, n_items, emb_size=32):
        super().__init__()
        self.user_emb = nn.Embedding(n_users, emb_size)
        self.item_emb = nn.Embedding(n_items, emb_size)

    def forward(self, u, i, j):
        """Return ``sum((item_i - item_j) * user, dim=1)`` for index tensors ``u``, ``i``, ``j``."""
        user_vec = self.user_emb(u)
        # Difference of the positive and negative item embeddings
        item_gap = self.item_emb(i) - self.item_emb(j)
        return torch.sum(item_gap * user_vec, dim=1)

# –ü—Å–µ–≤–¥–æ-–ø—Ä–æ–Ω—É–º–µ—Ä—É–µ–º –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª–µ–π –∏ —Ç–æ–≤–∞—Ä—ã
# Map raw user / item ids to contiguous 0-based indices for the embedding tables.
user2id = {u: idx for idx, u in enumerate(train['user_id'].unique())}
# The former `{i:i for i,i in enumerate(...)}` bound both loop targets to the
# same name, producing an identity map item -> item; raw item ids were then
# used as embedding indices.
item2id = {item: idx for idx, item in enumerate(train['item_id'].unique())}

# NOTE: this overwrites the id columns in place, so the cell is not idempotent —
# rebuild `pairs` before re-running it.
pairs['user_id'] = pairs['user_id'].map(user2id)
pairs['pos_item'] = pairs['pos_item'].map(item2id)
pairs['neg_item'] = pairs['neg_item'].map(item2id)

dataset = PairDataset(pairs)
loader = DataLoader(dataset, batch_size=1024, shuffle=True)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = RankNetModel(len(user2id), len(item2id), emb_size=32).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
# Pairwise objective: the margin logit is trained towards label 1 (pos > neg).
criterion = nn.BCEWithLogitsLoss()


In [None]:
# Baseline training: a single pass over the pair loader.
for epoch in range(1):
    model.train()
    total_loss = 0
    for batch in loader:
        users_b, pos_b, neg_b = (part.to(device) for part in batch)
        optimizer.zero_grad()
        margins = model(users_b, pos_b, neg_b)
        # Target is 1 for every pair: the positive should outrank the negative.
        targets = torch.ones_like(margins)
        loss = criterion(margins, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss/len(loader):.4f}")


In [None]:
# Score every item for every training user and keep the top-k.
model.eval()
topk = 20
preds = []

# Position -> raw item id, built once (it used to be rebuilt for every
# recommended item via list(item2id.keys())[i]).
position_to_item = list(item2id.keys())
n_candidates = len(position_to_item)
k = min(topk, n_candidates)

with torch.no_grad():
    # Item embeddings are the same for all users: compute them once.
    item_matrix = model.item_emb(torch.arange(n_candidates, device=device))
    for user in train['user_id'].unique():
        user_vec = model.user_emb(torch.tensor([user2id[user]], device=device))
        scores = (user_vec @ item_matrix.t()).squeeze(0)
        top_items_idx = torch.topk(scores, k).indices.cpu().tolist()
        # Long format: one (user, item) row per recommendation. The former
        # `[user] + top_items_ids` produced 21 values per row, which cannot be
        # labelled with the two columns of `sample`.
        preds.extend((user, position_to_item[i]) for i in top_items_idx)

submission = pd.DataFrame(preds, columns=sample.columns)
submission.to_csv('submission.csv', index=False)


In [None]:
# Full/updated example (replaces the corresponding parts of the previous script)

import pandas as pd
import numpy as np
import random
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# ========== LOAD ==========
train = pd.read_parquet('/kaggle/input/reccomend/train_data.pq')  # training interactions
sample = pd.read_csv('/kaggle/input/asddbfd/sample_submission (1).csv')  # sample submission (long format)
# Users we have to produce predictions for (those listed in sample)
test_users = sample['user_id'].unique()
print("users in sample:", len(test_users))

# ========== CANDIDATE RESTRICTION ==========
# Restrict to the TOP_N most popular items for speed (N can be increased).
TOP_N = 1000
top_items = train['item_id'].value_counts().head(TOP_N).index.tolist()

# Index maps for the restricted item set
item_list = top_items
item2id = {item: idx for idx, item in enumerate(item_list)}
id2item = {idx: item for item, idx in item2id.items()}

# Positive items per user, restricted to top_items
user_pos_items_full = train.groupby('user_id')['item_id'].apply(list).to_dict()
user_pos_items = {u: set([i for i in lst if i in item2id]) for u, lst in user_pos_items_full.items()}

# ========== BUILD PAIRS FOR PAIRWISE TRAINING (RankNet) ==========
pairs = []
for user, pos_set in user_pos_items.items():
    if not pos_set:
        continue
    # Negative pool = popular items the user has not interacted with. It is
    # the same for every positive of this user, so build it once per user
    # rather than once per positive.
    neg_candidates = [it for it in item_list if it not in pos_set]
    if not neg_candidates:
        continue
    # One random negative for each positive
    for pos in pos_set:
        pairs.append((user, pos, random.choice(neg_candidates)))

pairs_df = pd.DataFrame(pairs, columns=['user_id', 'pos_item', 'neg_item'])
# Re-number users into compact ids. Test users are included so that they have
# an embedding row even if they produced no training pair.
user_list = list({u for u in pairs_df['user_id'].unique()}.union(set(test_users)))
user2id = {u: idx for idx, u in enumerate(user_list)}
id2user = {idx: u for u, idx in user2id.items()}

pairs_df['u_id'] = pairs_df['user_id'].map(user2id)
pairs_df['pos_id'] = pairs_df['pos_item'].map(item2id)
pairs_df['neg_id'] = pairs_df['neg_item'].map(item2id)

# Drop pairs whose pos or neg is not in item2id (defensive)
pairs_df = pairs_df.dropna(subset=['pos_id', 'neg_id']).astype({'u_id':int,'pos_id':int,'neg_id':int})

# ========== DATASET / DATALOADER ==========
class PairDataset(Dataset):
    """Dataset of compact (user, positive, negative) index triples.

    Reads the ``u_id``, ``pos_id`` and ``neg_id`` columns of ``df``.
    """

    def __init__(self, df):
        self.u, self.i, self.j = (df[column].values for column in ('u_id', 'pos_id', 'neg_id'))

    def __len__(self):
        """Number of triples."""
        return len(self.u)

    def __getitem__(self, idx):
        """Return the ``idx``-th (user, positive, negative) triple."""
        return self.u[idx], self.i[idx], self.j[idx]

# Wrap the pair table and serve it in shuffled batches of 2048 using 2 worker processes.
dataset = PairDataset(pairs_df)
loader = DataLoader(dataset, batch_size=2048, shuffle=True, num_workers=2)

# ========== –ú–û–î–ï–õ–¨ (RankNet-–ø–æ–¥–æ–±–Ω–∞—è) ==========
class RankNetModel(nn.Module):
    """Pairwise ranker built on user / item embedding dot products.

    ``forward`` returns the score margin ``dot(user, pos) - dot(user, neg)``.
    """

    def __init__(self, n_users, n_items, emb_size=64):
        super().__init__()
        self.user_emb = nn.Embedding(n_users, emb_size)
        self.item_emb = nn.Embedding(n_items, emb_size)
        # Linear head kept as an optional extension point; `forward` does not use it.
        self.out = nn.Linear(emb_size, 1, bias=False)

    def forward(self, u, i, j):
        """Return a (B,) tensor of positive-minus-negative scores for index tensors ``u``, ``i``, ``j``."""
        user_vec = self.user_emb(u)                              # (B, E)
        pos_score = (user_vec * self.item_emb(i)).sum(dim=1)     # (B,)
        neg_score = (user_vec * self.item_emb(j)).sum(dim=1)     # (B,)
        return pos_score - neg_score

# Use the GPU when one is available.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# One embedding row per entry of user2id / item2id.
model = RankNetModel(n_users=len(user2id), n_items=len(item2id), emb_size=64).to(device)
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
# Binary cross-entropy on logits; the training loop supplies the targets.
criterion = nn.BCEWithLogitsLoss()




users in sample: 293230
Epoch 1/3  avg loss: 3.942721
Epoch 2/3  avg loss: 2.414918
Epoch 3/3  avg loss: 1.482728
Saved submission.csv, rows: 5864600


In [None]:
# ========== TRAINING (a few epochs for a baseline) ==========
EPOCHS = 16
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0.0
    for u, i, j in loader:
        u = u.to(device).long()
        i = i.to(device).long()
        j = j.to(device).long()
        opt.zero_grad()
        logits = model(u, i, j)
        # Every pair is (positive, negative): the margin is trained towards 1.
        loss = criterion(logits, torch.ones_like(logits, device=device))
        loss.backward()
        opt.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}/{EPOCHS}  avg loss: {total_loss/len(loader):.6f}")

# ========== PREDICTIONS FOR users IN sample (top-20) ==========
model.eval()
TOPK = 20

# user2id contains *every* test user, so `u in user2id` cannot tell trained
# users from cold ones; cold users would be scored with randomly initialised
# embeddings. Only users that actually produced training pairs are scored by
# the model, everyone else gets the popularity fallback.
trained_users = set(pairs_df['user_id'].unique())

submission_rows = []
with torch.no_grad():
    # Item embeddings are shared by all users: compute once, without autograd.
    all_item_ids = torch.arange(len(item2id), device=device).long()
    all_item_emb = model.item_emb(all_item_ids)  # (N_items, E)
    k = min(TOPK, all_item_emb.size(0))

    for u in test_users:
        if u in trained_users:
            u_id = torch.tensor([user2id[u]], device=device).long()
            u_emb = model.user_emb(u_id)  # (1, E)
            # Score of every candidate item: dot(u_emb, item_emb)
            scores = (u_emb @ all_item_emb.t()).squeeze(0)  # (N_items,)
            topk_idx = torch.topk(scores, k).indices.cpu().numpy().tolist()
            top_items_pred = [id2item[idx] for idx in topk_idx]
        else:
            # Fallback: most popular items, in popularity order
            top_items_pred = item_list[:TOPK]

        # Items the user already interacted with are not filtered out here;
        # user_pos_items_full could be used for that.

        # Long format: one row per predicted item
        submission_rows.extend((u, it) for it in top_items_pred)

# Build the DataFrame and save it in the same layout as sample
sub_df = pd.DataFrame(submission_rows, columns=['user_id', 'item_id'])
sub_df.to_csv('submission.csv', index=False)
print("Saved submission.csv, rows:", len(sub_df))

Epoch 1/16  avg loss: 0.003568
Epoch 2/16  avg loss: 0.001962
Epoch 3/16  avg loss: 0.001055
Epoch 4/16  avg loss: 0.000557
Epoch 5/16  avg loss: 0.000293
Epoch 6/16  avg loss: 0.000153
Epoch 7/16  avg loss: 0.000080
Epoch 8/16  avg loss: 0.000042
Epoch 9/16  avg loss: 0.000022
Epoch 10/16  avg loss: 0.000012
Epoch 11/16  avg loss: 0.000006


KeyboardInterrupt: 

In [None]:
# Persist whatever rows were collected so far (e.g. after an interrupted run).
submission_columns = ['user_id', 'item_id']
sub_df = pd.DataFrame(data=submission_rows, columns=submission_columns)
sub_df.to_csv('submission.csv', index=False)
print("Saved submission.csv, rows:", sub_df.shape[0])

Saved submission.csv, rows: 5864600


In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from tqdm import tqdm

# XGBoost ranking baseline on simple popularity / activity / recency features.
# NOTE(review): the code below treats `date` as numeric — confirm the dtype.

FEATURES = ['item_pop', 'item_last', 'user_freq', 'user_last', 'recency']
N_RECENT = 10    # recent clicks per user used as positives / candidates
N_TOP = 1000     # size of the popular-item pool
TOP_K = 20       # recommendations per user
rng = np.random.default_rng(42)  # seeded so negative sampling is reproducible

# ========== 1. Load data ==========
train = pd.read_parquet('/kaggle/input/reccomend/train_data.pq')
sample = pd.read_csv('/kaggle/input/asddbfd/sample_submission (1).csv')
test_users = sample['user_id'].unique()

# "Recent" clicks are taken with tail(), so rows must be chronological per user.
train = train.sort_values(['user_id', 'date'], kind='stable')

# ========== 2. Feature generation ==========
print("Building simple features...")

# Item popularity
item_pop = train['item_id'].value_counts().rename('item_pop')

# User activity
user_freq = train['user_id'].value_counts().rename('user_freq')

# Latest date per user / per item
user_last = train.groupby('user_id')['date'].max().rename('user_last')
item_last = train.groupby('item_id')['date'].max().rename('item_last')
# (The per-interaction joins onto `train` were never read afterwards and have
# been dropped.)

# ========== 3. Build training pairs (recent clicks vs. random popular items) ==========
top_items = item_pop.head(N_TOP).index.tolist()
recent = train.groupby('user_id').tail(N_RECENT)[['user_id', 'item_id']]

positives = recent.assign(label=1)
# One random popular item per positive, drawn in a single vectorised call.
negatives = pd.DataFrame({
    'user_id': recent['user_id'].to_numpy(),
    'item_id': rng.choice(top_items, size=len(recent)),
    'label': 0,
})
pairs = (
    pd.concat([positives, negatives], ignore_index=True)
    # positives come first: a negative equal to one of the user's positives is dropped
    .drop_duplicates(subset=['user_id', 'item_id'], keep='first')
    # ranker groups are consecutive row blocks -> keep each user contiguous
    .sort_values('user_id', kind='stable')
    .reset_index(drop=True)
)
pairs = pairs.join(item_pop, on='item_id')
pairs = pairs.join(item_last, on='item_id')
pairs = pairs.join(user_freq, on='user_id')
pairs = pairs.join(user_last, on='user_id')
pairs['recency'] = pairs['user_last'] - pairs['item_last']
pairs = pairs.fillna(0)

# ========== 4. Prepare for XGBoost ==========
X = pairs[FEATURES].values
y = pairs['label'].values

# group sizes (rows per user), in the same sorted-user order as the rows
group = pairs.groupby('user_id', sort=True).size().values
assert group.sum() == len(pairs)

ranker = xgb.XGBRanker(
    objective='rank:pairwise',
    learning_rate=0.1,
    max_depth=6,
    n_estimators=100,
    tree_method='hist',
    random_state=42
)

print("Training XGBoost Ranker...")
ranker.fit(X, y, group=group)

# ========== 5. Prediction ==========
print("Generating recommendations...")
# Everything that does not depend on the user is prepared once. The former
# loop ran four DataFrame joins against multi-million-row Series per user.
top_set = set(top_items)
top_pop = item_pop.reindex(top_items).to_numpy(dtype=float)
top_last = item_last.reindex(top_items).to_numpy(dtype=float)

recent_test = recent[recent['user_id'].isin(test_users)]
recent_by_user = recent_test.groupby('user_id')['item_id'].apply(list).to_dict()

# Per-user scalars; NaN marks users absent from train.
test_freq = user_freq.reindex(test_users).fillna(0).to_numpy(dtype=float)
test_last = user_last.reindex(test_users).to_numpy(dtype=float)

sub_rows = []
for user, u_freq, u_last in tqdm(zip(test_users, test_freq, test_last), total=len(test_users)):
    # Candidates: popular pool + the user's recent clicks not already in it
    extra = [it for it in dict.fromkeys(recent_by_user.get(user, ())) if it not in top_set]
    if extra:
        cand_items = top_items + extra
        cand_pop = np.concatenate([top_pop, item_pop.reindex(extra).to_numpy(dtype=float)])
        cand_last = np.concatenate([top_last, item_last.reindex(extra).to_numpy(dtype=float)])
    else:
        cand_items, cand_pop, cand_last = top_items, top_pop, top_last

    n_cand = len(cand_items)
    if np.isnan(u_last):
        # Unknown user: same values the fillna(0) path used to produce
        u_last_val = 0.0
        recency = np.zeros(n_cand)
    else:
        u_last_val = u_last
        recency = u_last - cand_last

    # Column order must match FEATURES
    X_test = np.column_stack([
        cand_pop,
        cand_last,
        np.full(n_cand, u_freq),
        np.full(n_cand, u_last_val),
        recency,
    ])
    preds = ranker.predict(X_test)
    best = np.argsort(-preds, kind='stable')[:TOP_K]
    sub_rows.extend((user, cand_items[idx]) for idx in best)

sub = pd.DataFrame(sub_rows, columns=['user_id', 'item_id'])
sub.to_csv('submission_xgboost.csv', index=False)
print("‚úÖ submission_xgboost.csv saved")


Building simple features...


2682603it [09:39, 4628.76it/s] 


Training XGBoost Ranker...
Generating recommendations...


  1%|          | 2722/293230 [29:18<52:08:34,  1.55it/s]


KeyboardInterrupt: 

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from tqdm import tqdm

# ========== 1. Load and sort data ==========
train = pd.read_parquet('/kaggle/input/reccomend/train_data.pq')
sample = pd.read_csv('/kaggle/input/asddbfd/sample_submission (1).csv')
test_users = sample['user_id'].unique()

# Order interactions chronologically within each user
train = train.sort_values(['user_id', 'date'])

# ========== 2. Build per-user sequences ==========
max_len = 20  # history length


def _last_items(item_series):
    """Return the last ``max_len`` item ids of one user's history as a list."""
    return item_series.tolist()[-max_len:]


user_sequences = train.groupby('user_id')['item_id'].apply(_last_items).to_dict()

# item_id -> index; indices start at 1 because 0 is reserved for padding
item_vocab = {it: idx for idx, it in enumerate(train['item_id'].unique(), start=1)}
id2item = dict(zip(item_vocab.values(), item_vocab.keys()))
n_items = 1 + len(item_vocab)

# ========== 3. Dataset ==========
class SeqDataset(Dataset):
    """Next-item dataset built from a ``{user: [item ids]}`` mapping.

    Each sample is a left-padded sequence (via the module-level ``item_vocab``
    and ``max_len``) split into its first ``max_len - 1`` positions as input
    and its last position as target.
    """

    def __init__(self, user_seq):
        self.users = [user for user in user_seq.keys()]
        self.seqs = [history for history in user_seq.values()]

    def __len__(self):
        """Number of users."""
        return len(self.users)

    def __getitem__(self, idx):
        """Return ``(input_tensor, target_tensor)`` for the ``idx``-th user."""
        encoded = []
        for raw_item in self.seqs[idx]:
            if raw_item in item_vocab:
                encoded.append(item_vocab[raw_item])
        # Left-pad with the PAD index 0 up to max_len
        padded = [0] * (max_len - len(encoded)) + encoded
        inputs, target = padded[:-1], padded[-1]
        return torch.tensor(inputs), torch.tensor(target)

# One sample per user; served in shuffled batches of 256.
train_ds = SeqDataset(user_sequences)
train_dl = DataLoader(train_ds, batch_size=256, shuffle=True)

# ========== 4. SASRec-–ø–æ–¥–æ–±–Ω–∞—è –º–æ–¥–µ–ª—å ==========
class TransformerRec(nn.Module):
    """Transformer-encoder sequence recommender.

    Embeds item indices (index 0 is the padding index) plus learned position
    embeddings, runs a ``nn.TransformerEncoder`` and projects every position
    to ``num_items + 1`` logits.

    NOTE(review): no attention or padding mask is passed to the encoder, so
    every position attends to all positions, including padding.
    """

    def __init__(self, num_items, d_model=512, nhead=8, num_layers=6, max_len=5000):
        super().__init__()
        self.num_items = num_items
        self.d_model = d_model
        self.max_len = max_len

        vocab_size = num_items + 1
        self.item_emb = nn.Embedding(vocab_size, d_model, padding_idx=0)
        self.pos_emb = nn.Embedding(max_len, d_model)

        # Encoder works in the sequence-first layout (L, B, D)
        self.encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=d_model,
                nhead=nhead,
                batch_first=False,
                dropout=0.1
            ),
            num_layers=num_layers,
        )

        self.proj = nn.Linear(d_model, vocab_size)

        self._init_weights()

    def _init_weights(self):
        """Small-normal init for both embeddings, Xavier for the projection, zero bias."""
        for table in (self.item_emb, self.pos_emb):
            nn.init.normal_(table.weight, mean=0, std=0.01)
        nn.init.xavier_normal_(self.proj.weight)
        nn.init.constant_(self.proj.bias, 0)

    def forward(self, seq):
        """Map a (B, L) tensor of item indices to (B, L, num_items + 1) logits."""
        n_rows, n_steps = seq.shape

        positions = torch.arange(n_steps, device=seq.device).unsqueeze(0).expand(n_rows, -1)

        hidden = self.item_emb(seq) + self.pos_emb(positions)  # (B, L, D)
        # (B, L, D) -> (L, B, D) for the encoder, then back again
        hidden = self.encoder(hidden.transpose(0, 1)).transpose(0, 1)

        return self.proj(hidden)  # (B, L, num_items + 1)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
# The constructor's parameter is `num_items` (passing `n_items=` raised a
# TypeError). The class adds the PAD slot itself, so it is given the vocabulary
# size without the padding index.
model = TransformerRec(num_items=n_items - 1).to(device)
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss(ignore_index=0)

# ========== 5. Training ==========
for epoch in range(3):
    model.train()
    total_loss = 0
    for seq, tgt in train_dl:
        seq, tgt = seq.to(device), tgt.to(device)
        opt.zero_grad()
        # The model returns logits for every position, (B, L, V); the target is
        # the single next item, so only the last position is scored.
        logits = model(seq)[:, -1, :]
        loss = criterion(logits, tgt)
        loss.backward()
        opt.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}: loss={total_loss/len(train_dl):.4f}")

# ========== 6. Predictions ==========
model.eval()
TOPK = 20
# NOTE(review): the model materialises (B, L, V) logits; keep this small when
# the item vocabulary is large.
INFER_BATCH = 64
input_len = max_len - 1  # same input length as during training
submission_rows = []


def _encode_history(user):
    """Left-padded vocabulary indices of the user's most recent ``input_len`` items."""
    history = [item_vocab[i] for i in user_sequences.get(user, []) if i in item_vocab]
    # Keep the most recent items, including the very last one: at inference
    # time the next item is unknown, so nothing is held out.
    history = history[-input_len:]
    return [0] * (input_len - len(history)) + history


with torch.no_grad():
    for start in tqdm(range(0, len(test_users), INFER_BATCH)):
        batch_users = test_users[start:start + INFER_BATCH]
        batch = torch.tensor([_encode_history(u) for u in batch_users], device=device)
        logits = model(batch)[:, -1, :]      # (B, V): scores for the next item
        logits[:, 0] = float('-inf')         # never recommend the PAD index
        top_idx = torch.topk(logits, min(TOPK, logits.size(1)), dim=1).indices.cpu().tolist()
        for user, item_indices in zip(batch_users, top_idx):
            # NOTE(review): users without any history are scored from an
            # all-padding input; consider a popularity fallback for them.
            submission_rows.extend((user, id2item[i]) for i in item_indices if i in id2item)

sub = pd.DataFrame(submission_rows, columns=['user_id', 'item_id'])
sub.to_csv('submission_transformer.csv', index=False)
print("‚úÖ submission_transformer.csv saved")


TypeError: TransformerRec.__init__() got an unexpected keyword argument 'n_items'

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from tqdm import tqdm

# ========== 1. Load and sort data ==========
train = pd.read_parquet('/kaggle/input/reccomend/train_data.pq')
sample = pd.read_csv('/kaggle/input/asddbfd/sample_submission (1).csv')
test_users = sample['user_id'].unique()

# Order interactions by date within each user.
train = train.sort_values(by=['user_id', 'date'])

# ========== 2. Build per-user sequences ==========
max_len = 20  # history length kept per user


def _last_items(items):
    """Return the final ``max_len`` entries of one user's item series as a list."""
    return items.tolist()[-max_len:]


user_sequences = train.groupby('user_id')['item_id'].apply(_last_items).to_dict()

# item_id -> index vocabulary; indices start at 1 because 0 is reserved for PAD.
item_vocab = {
    raw_id: position
    for position, raw_id in enumerate(train['item_id'].unique(), start=1)
}
id2item = dict(zip(item_vocab.values(), item_vocab.keys()))
n_items = 1 + len(item_vocab)

# ========== 3. Dataset ==========
class SeqDataset(Dataset):
    """Per-user dataset yielding (left-padded input indices, last item index).

    Raw item ids are translated through the module-level ``item_vocab`` and
    padded on the left with 0 up to the module-level ``max_len``.
    """

    def __init__(self, user_seq):
        # user_seq maps a user to that user's list of raw item ids.
        self.users = [*user_seq.keys()]
        self.seqs = [*user_seq.values()]

    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        encoded = [item_vocab[raw] for raw in self.seqs[idx] if raw in item_vocab]
        padded = [0] * (max_len - len(encoded)) + encoded  # left padding
        # Everything but the last entry is the input; the last entry is the target.
        history, target = padded[:-1], padded[-1]
        return torch.tensor(history), torch.tensor(target)

# Wrap the per-user sequences and iterate over them in shuffled batches of 256.
train_ds = SeqDataset(user_seq=user_sequences)
train_dl = DataLoader(dataset=train_ds, batch_size=256, shuffle=True)

# ========== 4. SASRec-like model ==========
import torch
import torch.nn as nn
import torch.nn.functional as F

class SASRec(nn.Module):
    """SASRec-like next-item model.

    Item and position embeddings are summed, passed through dropout and
    LayerNorm, encoded by a causally masked Transformer encoder and projected
    onto the item vocabulary.

    Args:
        n_items: number of rows in the item embedding table and width of the
            output layer (index 0 is the padding id).
        d_model: embedding / hidden width.
        n_heads: attention heads per encoder layer.
        n_layers: number of stacked encoder layers.
        max_len: number of rows in the position-embedding table, i.e. the
            longest input sequence the model accepts.
        dropout: dropout probability used on the embeddings and inside the encoder.
    """

    def __init__(self, n_items, d_model=64, n_heads=4, n_layers=2, max_len=20, dropout=0.2):
        super().__init__()
        self.item_emb = nn.Embedding(n_items, d_model, padding_idx=0)
        self.pos_emb = nn.Embedding(max_len, d_model)
        self.dropout = nn.Dropout(dropout)
        self.layernorm = nn.LayerNorm(d_model)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model, nhead=n_heads, dim_feedforward=d_model*4,
            batch_first=True, dropout=dropout, activation='gelu'
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=n_layers)
        self.fc = nn.Linear(d_model, n_items)
        self.max_len = max_len

    def forward(self, seq, last_only=False):
        """Return item logits for a batch of item-index sequences.

        Args:
            seq: (B, L) integer tensor of item indices, L <= ``max_len``.
            last_only: when True, project only the final position and return
                (B, n_items). Projecting every position materialises a
                (B, L, n_items) tensor, which exhausts GPU memory for large
                vocabularies; next-item training and inference need only the
                last position.

        Returns:
            (B, L, n_items) logits, or (B, n_items) when ``last_only`` is True.
        """
        B, L = seq.shape
        pos = torch.arange(L, device=seq.device).unsqueeze(0).expand(B, -1)
        x = self.item_emb(seq) + self.pos_emb(pos)
        x = self.layernorm(self.dropout(x))

        # Causal mask: a position cannot attend to later positions.
        mask = torch.triu(torch.ones(L, L, device=seq.device), diagonal=1).bool()
        out = self.encoder(x, mask)  # (B, L, D)
        if last_only:
            out = out[:, -1, :]  # (B, D): slice BEFORE the vocabulary projection
        logits = self.fc(out)
        return logits


def compute_loss(logits, seq):
    """Shifted next-item cross-entropy over every position of ``seq``.

    Args:
        logits: (B, L, n_items) scores produced from ``seq``.
        seq: (B, L) item indices; position t+1 is the target for position t.

    Returns:
        Scalar loss; targets equal to 0 (padding) are ignored.
    """
    # Targets: the next item for each position.
    targets = seq[:, 1:]
    inputs = logits[:, :-1, :]
    loss = F.cross_entropy(
        inputs.reshape(-1, inputs.size(-1)),
        targets.reshape(-1),
        ignore_index=0
    )
    return loss


device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = SASRec(n_items=n_items).to(device)
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss(ignore_index=0)

# ========== 5. Training ==========
for epoch in range(1):
    model.train()
    total_loss = 0
    for seq, tgt in train_dl:
        seq, tgt = seq.to(device), tgt.to(device)
        opt.zero_grad()
        # Each sample has a single target (the item after the input window),
        # so only the last position is projected: (B, n_items). This matches
        # the (B,) target shape and avoids the (B, L, n_items) allocation that
        # caused the CUDA out-of-memory error recorded in this cell's output.
        logits = model(seq, last_only=True)
        loss = criterion(logits, tgt)
        loss.backward()
        opt.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}: loss={total_loss/len(train_dl):.4f}")

# ========== 6. Predictions ==========
model.eval()
TOPK = 20
submission_rows = []
# Training inputs are max_len - 1 tokens long (the last token is the target).
hist_len = max_len - 1

with torch.no_grad():
    for user in tqdm(test_users):
        seq = user_sequences.get(user, [])
        seq = [item_vocab[i] for i in seq if i in item_vocab]
        # Feed the MOST RECENT interactions. Dropping the last element (as
        # before) hides the newest item, so the model would re-predict an item
        # it was trained on instead of the next, unseen one.
        seq = seq[max(len(seq) - hist_len, 0):]
        seq = [0] * (hist_len - len(seq)) + seq  # left padding with PAD=0
        seq_t = torch.tensor(seq, dtype=torch.long).unsqueeze(0).to(device)
        scores = model(seq_t, last_only=True)[0]  # (n_items,)
        scores[0] = float('-inf')  # never recommend the PAD id
        k = min(TOPK, scores.numel() - 1)
        topk_idx = torch.topk(scores, k).indices.cpu().tolist()
        top_items = [id2item[i] for i in topk_idx if i in id2item]
        for it in top_items:
            submission_rows.append((user, it))

sub = pd.DataFrame(submission_rows, columns=['user_id', 'item_id'])
sub.to_csv('submission_transformer.csv', index=False)
print("‚úÖ submission_transformer.csv saved")


OutOfMemoryError: CUDA out of memory. Tried to allocate 13.42 GiB. GPU 0 has a total capacity of 15.89 GiB of which 1.74 GiB is free. Process 2500 has 14.15 GiB memory in use. Of the allocated memory 13.83 GiB is allocated by PyTorch, and 26.62 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# Persist only the model's parameters (state dict) to my_model.pth in the working directory.
torch.save(model.state_dict(), 'my_model.pth')



In [None]:
# two_tower_recommender.py
import os
import random
from collections import defaultdict
import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# ----------------- CONFIG -----------------
TRAIN_PATH = '/kaggle/input/reccomend/train_data.pq'
SAMPLE_PATH = '/kaggle/input/asddbfd/sample_submission (1).csv'  # replace if necessary
OUTPUT_PATH = 'submission_two_tower.csv'

EMB_DIM = 64        # embedding size
HIDDEN = 128        # hidden width of the MLP inside each tower
BATCH_SIZE = 4096
EPOCHS = 3
LR = 1e-3
TOP_ITEMS = 2000    # number of most popular items kept as candidates; can be increased
TOPK = 20
NUM_NEG = 1         # negatives sampled per positive
SEED = 42

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Seed every RNG used below (torch, random, numpy) with the same value.
torch.manual_seed(SEED)
random.seed(SEED)
np.random.seed(SEED)

# ----------------- DATA LOAD -----------------
print("Loading data...")
train = pd.read_parquet(TRAIN_PATH)
sample = pd.read_csv(SAMPLE_PATH)

# Users to predict. The long-format sample likely repeats users, so keep each
# user once, in order of first appearance.
test_users_ordered = sample['user_id'].values
first_positions = np.unique(test_users_ordered, return_index=True)[1]
test_users = test_users_ordered[np.sort(first_positions)]

# ----------------- CANDIDATE REDUCTION -----------------
# Items ranked by interaction count; the TOP_ITEMS most popular ones form the
# candidate set (speed-up).
item_counts = train['item_id'].value_counts()
popular_items = item_counts.index.tolist()
top_items = popular_items[:TOP_ITEMS]

# ----------------- ID MAPPINGS -----------------
all_users = train['user_id'].unique().tolist()
# Give test users an id even if they never appear in train. Membership is
# checked against a set: testing `u in all_users` on the list rescans every
# user for every test user, which is quadratic.
known_users = set(all_users)
for u in test_users:
    if u not in known_users:
        known_users.add(u)
        all_users.append(u)

# The item universe is exactly the candidate list. top_items is derived from
# train['item_id'], so filtering the train items by it and then re-adding the
# missing candidates always produced the same set. That version also scanned the
# top_items list once per distinct train item and took its order from a set.
# Copying the list keeps a deterministic order (item index == popularity rank).
all_items = list(top_items)
all_items_set = set(all_items)

user2id = {u: idx for idx, u in enumerate(all_users)}
id2user = {v:k for k,v in user2id.items()}
item2id = {i: idx for idx, i in enumerate(all_items)}
id2item = {v:k for k,v in item2id.items()}

n_users = len(user2id)
n_items = len(item2id)
print(f"n_users={n_users}, n_items={n_items}")

# ----------------- BUILD USER HISTORIES -----------------
user_pos = train.groupby('user_id')['item_id'].apply(list).to_dict()
# Keep only history items that belong to the candidate set (keys of item2id).
user_pos_filtered = {u: [i for i in lst if i in item2id] for u, lst in user_pos.items()}

# ----------------- TRAINING PAIR DATASET -----------------
# Negatives are drawn from the candidates that have an index.
candidate_items = [it for it in top_items if it in item2id]

pairs = []
for u, items in user_pos_filtered.items():
    if not items:
        continue
    seen = set(items)  # O(1) membership instead of scanning the history list
    if len(seen) >= len(candidate_items):
        # The user interacted with every candidate: no negative exists.
        continue
    # When most candidates are positives, rejection sampling wastes draws, so
    # sample from the explicit complement instead.
    if 2 * len(seen) > len(candidate_items):
        neg_pool = [it for it in candidate_items if it not in seen]
    else:
        neg_pool = None
    # Each positive item produces exactly NUM_NEG (user, pos, neg) triples.
    # Previously a draw that hit one of the user's positives was skipped, which
    # silently dropped that positive from training.
    for pos in seen:
        for _ in range(NUM_NEG):
            if neg_pool is not None:
                neg = random.choice(neg_pool)
            else:
                neg = random.choice(candidate_items)
                # Terminates: fewer positives than candidates (checked above).
                while neg in seen:
                    neg = random.choice(candidate_items)
            pairs.append((u, pos, neg))

print(f"Pairs for training: {len(pairs)}")

pairs_df = pd.DataFrame(pairs, columns=['user', 'pos', 'neg'])
pairs_df['u_id'] = pairs_df['user'].map(user2id)
pairs_df['pos_id'] = pairs_df['pos'].map(item2id)
pairs_df['neg_id'] = pairs_df['neg'].map(item2id)
# Only the mapped index columns must be integers; casting the whole frame would
# also try to convert the raw user/item ids and fail if they are not numeric.
id_cols = ['u_id', 'pos_id', 'neg_id']
pairs_df = pairs_df.dropna(subset=id_cols).astype({col: 'int64' for col in id_cols})

class PairDataset(Dataset):
    """Map-style dataset of (user index, positive item index, negative item index) triples."""

    def __init__(self, df):
        # Pull the three index columns out of the frame once, as plain arrays.
        self.u, self.pos, self.neg = (df[col].values for col in ('u_id', 'pos_id', 'neg_id'))

    def __len__(self):
        return len(self.u)

    def __getitem__(self, idx):
        return tuple(column[idx] for column in (self.u, self.pos, self.neg))

# Shuffled mini-batches of training triples, loaded by two workers into pinned memory.
train_loader = DataLoader(
    PairDataset(pairs_df),
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=2,
    pin_memory=True,
)

# ----------------- MODEL -----------------
class Tower(nn.Module):
    """Embedding lookup followed by a two-layer ReLU MLP, with L2-normalised output.

    Args:
        n_entities: number of rows in the embedding table.
        emb_dim: embedding width, also the output width.
        hidden: width of the MLP's hidden layer.
    """

    def __init__(self, n_entities, emb_dim, hidden):
        super().__init__()
        self.emb = nn.Embedding(n_entities, emb_dim)
        layers = [nn.Linear(emb_dim, hidden), nn.ReLU(), nn.Linear(hidden, emb_dim)]
        self.mlp = nn.Sequential(*layers)

    def forward(self, idx):
        projected = self.mlp(self.emb(idx))
        # Divide by the row-wise L2 norm (plus a small epsilon) so that dot
        # products between tower outputs behave like cosine similarities.
        lengths = projected.norm(p=2, dim=1, keepdim=True)
        return projected / (lengths + 1e-8)

class TwoTowerModel(nn.Module):
    """Pair of independent towers: one embeds users, the other embeds items."""

    def __init__(self, n_users, n_items, emb_dim=64, hidden=128):
        super().__init__()
        self.user_tower = Tower(n_users, emb_dim, hidden)
        self.item_tower = Tower(n_items, emb_dim, hidden)

    def forward(self, u_idx, i_idx=None, j_idx=None):
        """Embed users and, optionally, two batches of items.

        Returns:
            ``(u_emb, i_emb, j_emb)``; an item slot is ``None`` when the
            corresponding index argument was not given.
        """
        u_emb = self.user_tower(u_idx)  # (B, E)
        i_emb = None if i_idx is None else self.item_tower(i_idx)
        j_emb = None if j_idx is None else self.item_tower(j_idx)
        return u_emb, i_emb, j_emb

# Build the two-tower network on the selected device; Adam updates all of its parameters.
model = TwoTowerModel(
    n_users=n_users,
    n_items=n_items,
    emb_dim=EMB_DIM,
    hidden=HIDDEN,
).to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=LR)

# BPR-style pairwise ranking loss: softplus of the negated score margin.
def bpr_loss(u_emb, pos_emb, neg_emb):
    """Mean of ``softplus(-(score_pos - score_neg))`` over the batch.

    Args:
        u_emb: (B, E) user embeddings.
        pos_emb: (B, E) embeddings of the positive items.
        neg_emb: (B, E) embeddings of the negative items.

    Returns:
        Scalar tensor; it shrinks as positives outscore negatives
        (equivalent to ``-log(sigmoid(margin))``).
    """
    score_pos = torch.sum(u_emb * pos_emb, dim=1)  # (B,)
    score_neg = torch.sum(u_emb * neg_emb, dim=1)  # (B,)
    margin = score_pos - score_neg
    return torch.nn.functional.softplus(margin.neg()).mean()

# ----------------- TRAINING LOOP -----------------
print("Start training...")
for epoch in range(1, EPOCHS+1):
    model.train()
    total_loss = 0.0
    progress = tqdm(train_loader, desc=f"Epoch {epoch}")
    for u_ids, pos_ids, neg_ids in progress:
        # Move each index batch to the training device as int64.
        u_ids, pos_ids, neg_ids = (
            t.to(device).long() for t in (u_ids, pos_ids, neg_ids)
        )

        optimizer.zero_grad()
        # The model returns (user, positive, negative) embeddings in the order
        # bpr_loss expects them.
        loss = bpr_loss(*model(u_ids, pos_ids, neg_ids))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch} avg loss: {avg_loss:.6f}")

# ----------------- GENERATE RECOMMENDATIONS -----------------
# Precompute the embeddings of every candidate item once.
print("Computing item embeddings...")
model.eval()
with torch.no_grad():
    # item indices 0..n_items-1
    all_item_idx = torch.arange(n_items, device=device).long()
    # Tower.forward already applies embedding -> MLP -> L2 normalisation, so
    # call it instead of repeating those three steps by hand.
    item_embs = model.item_tower(all_item_idx)  # (n_items, E)
    item_embs_t = item_embs  # kept on the device so torch.topk runs there

# Recommendations are produced for the unique test users, in order.
print("Generating submission...")
submission_rows = []
unique_test_users = test_users  # ordered unique users from earlier
fallback_recs = top_items[:TOPK]  # most popular items, used for cold-start users

batch_size_predict = 1024
for i in tqdm(range(0, len(unique_test_users), batch_size_predict)):
    batch_users = unique_test_users[i:i+batch_size_predict]
    u_idx = [user2id.get(u) for u in batch_users]
    # Every test user was given an index when the id mappings were built, so
    # "has an index" alone never triggers the fallback. A user embedding is
    # only trained when training pairs exist for that user, i.e. when the user
    # has at least one candidate item in their history; everyone else gets the
    # popular-item fallback instead of scores from an untrained embedding.
    valid_mask = [
        idx is not None and bool(user_pos_filtered.get(u))
        for u, idx in zip(batch_users, u_idx)
    ]
    # Tensor of user indices; unmapped users are filled with 0 and ignored below.
    u_idx_tensor = torch.tensor([x if x is not None else 0 for x in u_idx], device=device).long()
    with torch.no_grad():
        u_emb_batch = model.user_tower(u_idx_tensor)  # (B, E), L2-normalised
        scores = torch.matmul(u_emb_batch, item_embs_t.t())  # (B, n_items)
        topk_vals, topk_idxs = torch.topk(scores, k=min(TOPK, n_items), dim=1)
    # One device-to-host transfer per batch instead of one per user.
    topk_lists = topk_idxs.cpu().tolist()

    for bi, uid in enumerate(batch_users):
        if valid_mask[bi]:
            recs = [id2item[idx] for idx in topk_lists[bi]]
        else:
            recs = fallback_recs
        # Long format: one row per recommendation.
        for it in recs:
            submission_rows.append((uid, it))

sub_df = pd.DataFrame(submission_rows, columns=['user_id', 'item_id'])
sub_df.to_csv(OUTPUT_PATH, index=False)
print(f"Saved {OUTPUT_PATH} with {len(sub_df)} rows")


Loading data...
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipykernel_37/4168239699.py", line None, in <cell line: 0>
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 2099, in showtraceback
    stb = value._render_traceback_()
          ^^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: 'KeyboardInterrupt' object has no attribute '_render_traceback_'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/IPython/core/ultratb.py", line 1101, in get_records
    return _fixed_getinnerframes(etb, number_of_lines_of_context, tb_offset)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^