<a href="https://colab.research.google.com/github/dashatenoff/recsys-vk/blob/main/notebooks/hubrid_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Hybrid model

# Hybrid Recommender System (ALS + LightGBM)

## Project goal
Build a hybrid recommendation system:
- Candidate generation: ALS (implicit)
- Re-ranking: LightGBM Ranker
- Metric: MAP@10

## Data
- User interactions (implicit feedback)
- User metadata
- Item metadata
- Item embeddings

## Pipeline
1. Build user-item matrix
2. Train ALS model
3. Generate top-100 candidates per user
4. Create features
5. Train LightGBM Ranker
6. Evaluate MAP@10


# Imports


In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
from google.colab import drive
import os


# Load Data

In [None]:
# Mount Google Drive so the parquet files under MyDrive/VK are readable.
drive.mount('/content/drive')
# Lists the Drive root; the result is discarded because this is not the
# last expression of the cell.
os.listdir('/content/drive/MyDrive')
# Interaction logs used to fit ALS (train) and to label candidates (test).
train = pd.read_parquet('/content/drive/MyDrive/VK/train.parquet')
test = pd.read_parquet('/content/drive/MyDrive/VK/test.parquet')
# NOTE(review): item_embeddings is loaded but never read by the cells shown
# in this notebook; the feature cell takes `embedding` from item_metadata.
item_embeddings = pd.read_parquet('/content/drive/MyDrive/VK/item_embeddings.parquet')
item_metadata = pd.read_parquet('/content/drive/MyDrive/VK/item_metadata.parquet')
user_metadata = pd.read_parquet('/content/drive/MyDrive/VK/user_metadata.parquet')

## Candidate generation with ALS
We train ALS to generate top-100 candidates per user.


# Подготовка данных

In [None]:
from scipy.sparse import csr_matrix

# Map raw ids to contiguous 0-based row/column indices.
# pd.factorize numbers values in order of first appearance, which is the same
# numbering the previous per-row `setdefault` loop produced, but it runs
# vectorized instead of iterating over every interaction in Python.
# Assumes user_id / item_id contain no missing values (factorize would code
# them as -1, which csr_matrix rejects).
user_codes, user_ids = pd.factorize(train['user_id'])
item_codes, item_ids = pd.factorize(train['item_id'])

user_to_index = {user_id: idx for idx, user_id in enumerate(user_ids.tolist())}
item_to_index = {item_id: idx for idx, item_id in enumerate(item_ids.tolist())}

# One unit of implicit feedback per interaction row; repeated (user, item)
# pairs are summed when the matrix is assembled. The shape is passed
# explicitly so it always matches the two index maps.
user_item_matrix = csr_matrix(
    (np.ones(len(train), dtype=np.float32), (user_codes, item_codes)),
    shape=(len(user_ids), len(item_ids)),
)

# Обучение ALS модели

In [None]:
!pip install implicit

In [None]:
import implicit

# ALS factors are randomly initialised; a fixed seed keeps the learned factors
# (and therefore every candidate list generated from them) reproducible after
# a kernel restart.
als_model = implicit.als.AlternatingLeastSquares(
    factors=32,  # embedding dimensionality
    regularization=10,
    alpha=1000,
    iterations=20,
    random_state=42,
)

# Fit on the user x item interaction matrix built from `train`.
als_model.fit(user_item_matrix)

In [None]:
# Reverse lookup: matrix column index -> original item_id.
index_to_item = {idx: item_id for item_id, idx in item_to_index.items()}

def recommend_for_user_als(user_id, k=100):
  """Return up to `k` ALS recommendations for one user.

  Args:
    user_id: raw user id as it appears in the interaction data.
    k: number of items requested from the ALS model.

  Returns:
    A list of (item_id, score) tuples in the order returned by
    `als_model.recommend`; an empty list when the user is not present in
    `user_to_index` (i.e. was not seen when the matrix was built).
  """
  user_idx = user_to_index.get(user_id)
  if user_idx is None:
    return []

  # Items the user already interacted with in the training matrix are
  # excluded from the result.
  item_index, scores = als_model.recommend(
      user_idx,
      user_item_matrix[user_idx],
      N=k,
      filter_already_liked_items=True,
  )

  recommendations = []
  for idx, score in zip(item_index, scores):
    recommendations.append((index_to_item[idx], score))
  return recommendations
# Smoke check: top-10 ALS recommendations for the first user in the test set.
test_user = test['user_id'].iat[0]
recommend_for_user_als(test_user, k=10)


In [None]:
# Share of test users that ALS can serve, i.e. test users that are present in
# `user_to_index`. The count is derived here from `test` and `user_to_index`
# so the cell does not depend on a variable that is only created by a later
# cell (which fails with NameError on a fresh kernel run).
all_test_users = test['user_id'].nunique()
covered_users = sum(
    user_id in user_to_index for user_id in test['user_id'].unique()
)

print("Coverage:", covered_users / all_test_users)


In [None]:
# Collect the top-100 ALS candidates for every test user as flat records
# (users without recommendations simply contribute no rows).
test_users = test['user_id'].unique()
recs = [
    {'user_id': user_id, 'item_id': item, 'als_score': score}
    for user_id in test_users
    for item, score in recommend_for_user_als(user_id, k=100)
]

# Persist the raw ALS output and preview the first rows.
submission = pd.DataFrame(recs)
submission.to_csv('submission_als', index=False)
submission.head(20)


## Generate candidates
For each user we generate top-100 items using ALS.


In [None]:
# Ground truth: the set of items each user interacted with in `test`.
user_history_test = test.groupby('user_id')['item_id'].apply(set)

# Only users known to the ALS model can receive candidates.
test_users_test = [u for u in test['user_id'].unique() if u in user_to_index]

# Top-100 ALS candidates per user, one row per (user, item) pair.
recs_test = pd.DataFrame([
    {'user_id': user_id, 'item_id': item, 'als_score': score}
    for user_id in test_users_test
    for item, score in recommend_for_user_als(user_id, k=100)
])

# label = 1 when the candidate appears in the user's test interactions.
# The ids are read straight from their own columns: a row-wise
# DataFrame.apply(axis=1) builds one Series per row and, when all columns are
# numeric, upcasts integer ids to float before the membership test, which is
# both slow and lossy for very large ids.
history_by_user = user_history_test.to_dict()
no_history = frozenset()
recs_test['label'] = [
    int(item_id in history_by_user.get(user_id, no_history))
    for user_id, item_id in zip(recs_test['user_id'], recs_test['item_id'])
]


In [None]:
# user_history = train.groupby('user_id')['item_id'].apply(set)

# recs_train = []
# test_users_train = train['user_id'].unique()

# for user_id in test_users_train:
#   recs_items = recommend_for_user_als(user_id, k=100)
#   for item, score in recs_items:
#     recs_train.append({
#       'user_id' : user_id,
#       'item_id' : item,
#       'als_score' : score
#     })
# recs_train = pd.DataFrame(recs_train)

# recs_train['label'] = recs_train.apply(
#     lambda x: int(x['item_id'] in user_history.get(x['user_id'], set())), axis=1
# )


In [None]:
# Fraction of ALS candidates labelled 1 (mean of the binary `label` column).
recs_test['label'].mean()

In [None]:
# Spot check for one covered user: how many of the top-100 ALS candidates
# appear in that user's test interactions.
u = test_users_test[10]

rec = {item_id for item_id, _ in recommend_for_user_als(u, k=100)}
len(rec & user_history_test[u])

In [None]:
# Preview the user metadata table before selecting feature columns.
user_metadata.head()

## Feature engineering

Features:
- ALS score
- User: age, gender, geo
- Item: duration, author_id
- Item embeddings



In [None]:
# Preview the item metadata table before selecting feature columns.
item_metadata.head()
# item_embeddings.head()

In [None]:
# NOTE: this whole cell is disabled. `recs_train` is only built by code that
# is itself commented out, so the final concat line is commented as well;
# left active it raises NameError (`recs_train` / `emb_df` undefined) on a
# fresh kernel run.
# recs_train = recs_train.merge(
#     user_metadata[['user_id', 'age', 'gender', 'geo']],
#     on='user_id',
#     how='left'
# )

# recs_train = recs_train.merge(
#     item_metadata[['item_id', 'author_id', 'duration', 'embedding']],
#     on='item_id',
#     how='left'
# )

# emb_df = pd.DataFrame(list(recs_train['embedding']), index=recs_train.index)
# emb_df.columns = [f"emb_{i}" for i in range(1, emb_df.shape[1]+1)]
# recs_train = recs_train.drop('embedding', axis=1)
# recs_train = pd.concat([recs_train, emb_df], axis=1)

In [None]:
# User features
recs_test = recs_test.merge(
    user_metadata[['user_id', 'age', 'gender', 'geo']],
    on='user_id',
    how='left'
)

# Item features
recs_test = recs_test.merge(
    item_metadata[['item_id', 'author_id', 'duration', 'embedding']],
    on='item_id',
    how='left'
)

# Expand embeddings
emb_df = pd.DataFrame(
    list(recs_test['embedding']),
    index=recs_test.index
)
emb_df.columns = [f"emb_{i}" for i in range(1, emb_df.shape[1] + 1)]

# Drop original embedding column and concatenate
recs_test = recs_test.drop('embedding', axis=1)
recs_test = pd.concat([recs_test, emb_df], axis=1)


In [None]:
# Preview the candidate frame after the feature columns were attached.
recs_test.head(10)



## LightGBM Ranker
We train a LambdaRank model for re-ranking.


### Обучение

In [None]:
# from numpy._core.defchararray import startswith
# y = recs_train['label']

# feature = ['als_score', 'age', 'gender', 'geo', 'author_id', 'duration'] + [col for col in recs_train if col.startswith('emb')]

In [None]:
# from sklearn.model_selection import train_test_split
# unique  = recs_train['user_id'].unique()
# u_train, u_test = train_test_split(unique, test_size=0.2, random_state=42)
# train_df = recs_train[recs_train['user_id'].isin(u_train)]
# test_df = recs_train[recs_train['user_id'].isin(u_test)]
# X_train = train_df[feature]
# X_test = test_df[feature]
# y_train = train_df['label']
# y_test = test_df['label']
# group_train = train_df.groupby('user_id').size().tolist()

In [None]:
import lightgbm as lgb

# NOTE(review): X_train, y_train and group_train are not created by any
# active cell in this notebook (the cells that build them are commented out),
# so this cell only runs if they already exist in the kernel - confirm where
# the training set is supposed to come from.
# LightGBM expects the rows of X_train to be ordered so that each query's rows
# are contiguous and in the same order as the sizes in group_train.
ranker = lgb.LGBMRanker(
    objective='lambdarank',
    n_estimators=200,  # was misspelled `n_esimators`, so the default was used
    learning_rate=0.05,
)

# Query group sizes belong to fit(), not to the estimator constructor.
ranker.fit(X_train, y_train, group=group_train)

In [None]:
# test_df = test_df.sort_values(['user_id', 'score'], ascending=[True, False])
# prediction = test_df.groupby('user_id').head(10)
# prediction = prediction[['user_id', 'item_id']]
# prediction.head()

### Для test

In [None]:
# (Removed an unused `startswith` import from the private `numpy._core`
# module: the code below uses the built-in str.startswith, and the private
# import path does not exist on older NumPy releases.)
y = recs_test['label']

# Ranker inputs: ALS score, user/item metadata and every expanded embedding column.
feature = ['als_score', 'age', 'gender', 'geo', 'author_id', 'duration'] + [col for col in recs_test if col.startswith('emb')]

X_test = recs_test[feature]
recs_test['score'] = ranker.predict(X_test)

# Re-rank each user's candidates by the ranker score and keep the top 10.
recs_test = recs_test.sort_values(['user_id', 'score'], ascending=[True, False])
prediction = recs_test.groupby('user_id').head(10)
prediction = prediction[['user_id', 'item_id']]
prediction.head()

## Evaluation
Metric: MAP@10


In [None]:
def mapk(df, k=10):
    """Mean Average Precision at k over the users present in `df`.

    Args:
        df: DataFrame with columns `user_id`, `score` and a binary `label`
            (1 = relevant). Each row is one scored candidate for a user.
        k: cut-off rank.

    Returns:
        The mean over users of AP@k, where AP@k is the sum of precision values
        at the ranks of relevant items within the top-k, divided by
        min(k, number of relevant candidates for that user). Users with no
        relevant candidate contribute 0.

    Note:
        The relevant count only covers items present in `df`; relevant items
        that never made it into the candidate list are not visible here.
    """
    scores = []
    for _, g in df.groupby('user_id'):
        # Count relevant items BEFORE truncating to the top-k. Counting after
        # the cut makes the denominator equal to the number of hits in the
        # top-k, so min(k, ...) never applies and misses outside the top-k
        # go unpenalised.
        n_relevant = int(g['label'].sum())
        top_k = g.sort_values('score', ascending=False).head(k)

        hits = 0
        ap = 0
        for rank, label in enumerate(top_k['label'].values, start=1):
            if label == 1:
                hits += 1
                ap += hits / rank  # precision at this rank

        if n_relevant > 0:
            ap /= min(k, n_relevant)

        scores.append(ap)

    return np.mean(scores)

# MAP@10 of the re-ranked candidates (uses the `score` and `label` columns).
hybrid_map_at_10 = mapk(recs_test, k=10)
print("Hybrid MAP@10:", hybrid_map_at_10)


## Hybrid Model Results

A two-stage hybrid recommendation system was implemented:

- **Candidate generation:** Implicit ALS (Top-100 items per user)  
- **Re-ranking:** LightGBM Ranker (LambdaRank)

**Features used:**
- ALS score  
- User metadata (age, gender, geo)  
- Item metadata (author_id, duration)  
- Item embeddings  

**Evaluation metric:** MAP@10

**Results:**
- ALS baseline: ~0.016  
- Hybrid model: **MAP@10 ≈ 0.16**

The hybrid approach significantly improves recommendation quality by combining collaborative filtering for candidate retrieval and learning-to-rank for optimal item ordering.
