<a href="https://colab.research.google.com/github/dashatenoff/recsys-vk/blob/main/notenooks/hubrid_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Hubrid_mode

# Hybrid Recommender System (ALS + LightGBM)

## Project goal
Build a hybrid recommendation system:
- Candidate generation: ALS (implicit)
- Re-ranking: LightGBM Ranker
- Metric: MAP@10

## Data
- User interactions (implicit feedback)
- User metadata
- Item metadata
- Item embeddings

## Pipeline
1. Build user-item matrix
2. Train ALS model
3. Generate top-100 candidates per user
4. Create features
5. Train LightGBM Ranker
6. Evaluate MAP@10


#Imports


In [4]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
from google.colab import drive
import os


#Load Data

In [5]:
drive.mount('/content/drive')
os.listdir('/content/drive/MyDrive')
train = pd.read_parquet('/content/drive/MyDrive/VK/train.parquet')
test = pd.read_parquet('/content/drive/MyDrive/VK/test.parquet')
item_embeddings = pd.read_parquet('/content/drive/MyDrive/VK/item_embeddings.parquet')
item_metadata = pd.read_parquet('/content/drive/MyDrive/VK/item_metadata.parquet')
user_metadata = pd.read_parquet('/content/drive/MyDrive/VK/user_metadata.parquet')

Mounted at /content/drive


## Candidate generation with ALS
We train ALS to generate top-100 candidates per user.


#Подготовка данных

In [6]:
from scipy.sparse import csr_matrix
user_to_index = {}
item_to_index = {}
rows, cols, data = [], [], []

for row in train.itertuples():
  u = user_to_index.setdefault(row.user_id, len(user_to_index))
  i = item_to_index.setdefault(row.item_id, len(item_to_index))

  rows.append(u)
  cols.append(i)
  data.append(1)

user_item_matrix = csr_matrix((data, (rows, cols)))

#Обучение ALS модели

In [7]:
!pip install implicit

Collecting implicit
  Downloading implicit-0.7.2.tar.gz (70 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/70.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m70.3/70.3 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: implicit
  Building wheel for implicit (pyproject.toml) ... [?25l[?25hdone
  Created wheel for implicit: filename=implicit-0.7.2-cp312-cp312-linux_x86_64.whl size=933265 sha256=f08a79c59d077639c630704a5f2aba9025459a83c69dc50c41c368cbfb2842e4
  Stored in directory: /root/.cache/pip/wheels/b2/00/4f/9ff8af07a0a53ac6007ea5d739da19cfe147a2df542b6899f8
Successfully built implicit
Installing collected packages: implicit
Successfully installed implicit-0.7.2


In [8]:
import implicit
als_model = implicit.als.AlternatingLeastSquares(
    factors=32, #размерность embeddding
    regularization=10,
    alpha=1000,
    iterations=20,
)

als_model.fit(user_item_matrix)

  check_blas_config()


  0%|          | 0/20 [00:00<?, ?it/s]

In [56]:
index_to_item = { v: k for k, v in item_to_index.items()}

def recommend_for_user_als(user_id, k=100):
  if user_id not in user_to_index:
    return []

  user_idx = user_to_index[user_id]

  item_index, scores  = als_model.recommend(
      user_idx,
      user_item_matrix[user_idx],
      N = k,
      filter_already_liked_items=True
  )

  return  [
      (index_to_item[i], s) for i, s in zip(item_index, scores)
      ]
test_user = test['user_id'].iloc[0]
recommend_for_user_als(test_user, k=10)


[(548293766, np.float32(1.1634432)),
 (255846437, np.float32(1.1529819)),
 (498569894, np.float32(1.1423321)),
 (4816140, np.float32(1.1320298)),
 (589460502, np.float32(1.1304524)),
 (382943759, np.float32(1.1184676)),
 (64187707, np.float32(1.1168696)),
 (238095999, np.float32(1.1104788)),
 (252524701, np.float32(1.1100986)),
 (134162399, np.float32(1.1099293))]

In [72]:
all_test_users = test['user_id'].nunique()
covered_users = len(test_users_test)

print("Coverage:", covered_users / all_test_users)


Coverage: 1.0


In [68]:
recs = []
test_users = test['user_id'].unique()

for user_id in test_users:
  recs_items = recommend_for_user_als(user_id, k=100)
  for item, score in recs_items:
    recs.append({
      'user_id' : user_id,
      'item_id' : item,
      'als_score' : score
    })
submission = pd.DataFrame(recs)
submission.to_csv('submission_als', index=False)
submission.head(20)


Unnamed: 0,user_id,item_id,als_score
0,506947605,548293766,1.163443
1,506947605,255846437,1.152982
2,506947605,498569894,1.142332
3,506947605,4816140,1.13203
4,506947605,589460502,1.130452
5,506947605,382943759,1.118468
6,506947605,64187707,1.11687
7,506947605,238095999,1.110479
8,506947605,252524701,1.110099
9,506947605,134162399,1.109929


## Generate candidates
For each user we generate top-100 items using ALS.


In [69]:
user_history_test = test.groupby('user_id')['item_id'].apply(set)

recs_test = []
test_users_test = [u for u in test['user_id'].unique() if u in user_to_index]

for user_id in test_users_test:
  recs_items = recommend_for_user_als(user_id, k=100)
  for item, score in recs_items:
    recs_test.append({
      'user_id' : user_id,
      'item_id' : item,
      'als_score' : score
    })
recs_test = pd.DataFrame(recs_test)

recs_test['label'] = recs_test.apply(
    lambda x: int(x['item_id'] in user_history_test.get(x['user_id'], set())), axis=1
)


In [11]:
# user_history = train.groupby('user_id')['item_id'].apply(set)

# recs_train = []
# test_users_train = train['user_id'].unique()

# for user_id in test_users_train:
#   recs_items = recommend_for_user_als(user_id, k=100)
#   for item, score in recs_items:
#     recs_train.append({
#       'user_id' : user_id,
#       'item_id' : item,
#       'als_score' : score
#     })
# recs_train = pd.DataFrame(recs_train)

# recs_train['label'] = recs_train.apply(
#     lambda x: int(x['item_id'] in user_history.get(x['user_id'], set())), axis=1
# )


In [66]:
recs_test['label'].mean()

np.float64(0.05142857142857143)

In [61]:
u = test_users_test[10]

rec = recommend_for_user_als(u, k=100)
rec = set([ i for i, _ in rec])
len(rec & user_history_test[u])

4

In [16]:
user_metadata.head()

Unnamed: 0,user_id,age,gender,geo,train_interactions_rank
0,202612548,18,1,1,965
1,189035614,18,1,13,6457
2,22320303,18,1,14,8919
3,194699221,18,1,16,2424
4,392744532,18,1,17,3383


## Feature engineering

Features:
- ALS score
- User: age, gender, geo
- Item: duration, author_id
- Item embeddings



In [17]:
item_metadata.head()
# item_embeddings.head()

Unnamed: 0,item_id,author_id,duration,train_interactions_rank,embedding
0,326091735,116090,89,17367,"[-0.5078125, 0.044647216796875, 0.447021484375..."
1,337826988,120666,37,8281,"[-0.253173828125, 0.128173828125, 0.2349853515..."
2,582660968,125834,167,12133,"[-0.59814453125, -0.1922607421875, 0.017166137..."
3,223344189,127291,90,17943,"[-0.343994140625, 0.053680419921875, 0.1712646..."
4,413392655,127581,99,18253,"[-0.54541015625, 0.035614013671875, 0.05685424..."


In [18]:
# recs_train = recs_train.merge(
#     user_metadata[['user_id', 'age', 'gender', 'geo']],
#     on='user_id',
#     how='left'
# )

# recs_train = recs_train.merge(
#     item_metadata[['item_id', 'author_id', 'duration', 'embedding']],
#     on='item_id',
#     how='left'
# )

# emb_df = pd.DataFrame(list(recs_train['embedding']), index=recs_train.index)
# emb_df.columns = [f"emb_{i}" for i in range(1, emb_df.shape[1]+1)]
# recs_train = recs_train.drop('embedding', axis=1)
recs_train = pd.concat([recs_train, emb_df], axis=1)

In [73]:
# User features
recs_test = recs_test.merge(
    user_metadata[['user_id', 'age', 'gender', 'geo']],
    on='user_id',
    how='left'
)

# Item features
recs_test = recs_test.merge(
    item_metadata[['item_id', 'author_id', 'duration', 'embedding']],
    on='item_id',
    how='left'
)

# Expand embeddings
emb_df = pd.DataFrame(
    list(recs_test['embedding']),
    index=recs_test.index
)
emb_df.columns = [f"emb_{i}" for i in range(1, emb_df.shape[1] + 1)]

# Drop original embedding column and concatenate
recs_test = recs_test.drop('embedding', axis=1)
recs_test = pd.concat([recs_test, emb_df], axis=1)


In [78]:
recs_test.head(10)



Unnamed: 0,user_id,item_id,als_score,label,age,gender,geo,author_id,duration,emb_1,...,emb_23,emb_24,emb_25,emb_26,emb_27,emb_28,emb_29,emb_30,emb_31,emb_32
0,506947605,548293766,1.163443,0,31,1,25,283424,66,-0.541992,...,-0.08136,-0.0159,-0.12793,0.002621,0.004093,0.020142,-0.010445,0.009521,-0.103516,-0.108337
1,506947605,255846437,1.152982,0,31,1,25,283424,60,-0.341309,...,-0.064758,-0.026154,-0.02829,0.017746,-0.037292,0.091431,-0.032532,0.199463,-0.076904,-0.053345
2,506947605,498569894,1.142332,0,31,1,25,727332,108,-0.384033,...,-0.161621,0.010185,-0.062561,-0.037567,-0.083313,0.026947,0.004707,0.059082,-0.007088,-0.024307
3,506947605,4816140,1.13203,0,31,1,25,249556,58,-0.639648,...,0.097534,-0.022034,0.052032,0.087769,0.042175,-0.092407,0.042603,0.077637,0.008057,0.06842
4,506947605,589460502,1.130452,0,31,1,25,1269212,89,-0.451172,...,-0.084106,-0.113159,0.032227,0.104797,-0.020493,-0.115723,-0.05481,-0.07782,-0.130981,-0.026901
5,506947605,382943759,1.118468,0,31,1,25,1235869,46,-0.421875,...,-0.008011,-0.165894,0.068848,-0.08313,0.115051,0.020935,0.110352,-0.121155,-0.170166,-0.161743
6,506947605,64187707,1.11687,1,31,1,25,475773,85,-0.526855,...,0.129761,-0.131714,-0.078918,-0.097778,0.15271,-0.035614,0.102112,-0.104614,0.005066,-0.165161
7,506947605,238095999,1.110479,0,31,1,25,1245575,41,-0.335693,...,0.202393,0.098755,0.068726,0.098083,0.077698,-0.183594,0.025375,-0.141968,-0.101379,-0.008293
8,506947605,252524701,1.110099,0,31,1,25,242661,59,-0.601562,...,0.022369,-0.014809,0.076782,-0.146606,-0.041504,-0.038177,0.207031,-0.0021,0.006836,0.007996
9,506947605,134162399,1.109929,0,31,1,25,432679,48,-0.264893,...,-0.081055,-0.014465,0.114441,-0.00898,0.102356,-0.081116,0.007473,-0.156372,-0.087708,-0.110901


## LightGBM Ranker
We train a LambdaRank model for re-ranking.


###Обучение

In [20]:
# from numpy._core.defchararray import startswith
# y = recs_train['label']

# feature = ['als_score', 'age', 'gender', 'geo', 'author_id', 'duration'] + [col for col in recs_train if col.startswith('emb')]

In [21]:
# from sklearn.model_selection import train_test_split
# unique  = recs_train['user_id'].unique()
# u_train, u_test = train_test_split(unique, test_size=0.2, random_state=42)
# train_df = recs_train[recs_train['user_id'].isin(u_train)]
# test_df = recs_train[recs_train['user_id'].isin(u_test)]
# X_train = train_df[feature]
# X_test = test_df[feature]
# y_train = train_df['label']
# y_test = test_df['label']
# group_train = train_df.groupby('user_id').size().tolist()

In [22]:
import lightgbm as lgb
ranker = lgb.LGBMRanker(
    objective='lambdarank',
    n_esimators=200,
    learning_rate=0.05,
    group=group_train
)

ranker.fit(X_train, y_train, group=group_train )

Please use group argument of the Dataset constructor to pass this parameter.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.940551 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8980
[LightGBM] [Info] Number of data points in the train set: 780900, number of used features: 38


In [30]:
# test_df = test_df.sort_values(['user_id', 'score'], ascending=[True, False])
# prediction = test_df.groupby('user_id').head(10)
# prediction = prediction[['user_id', 'item_id']]
# prediction.head()

Unnamed: 0,user_id,item_id
9394,33549,138789428
9371,33549,218695535
9396,33549,148995241
9386,33549,412044180
9397,33549,84487256


###Для test

In [None]:
from numpy._core.defchararray import startswith
y = recs_test['label']

feature = ['als_score', 'age', 'gender', 'geo', 'author_id', 'duration'] + [col for col in recs_test if col.startswith('emb')]

X_test = recs_test[feature]
recs_test['score'] = ranker.predict(X_test)

recs_test = recs_test.sort_values(['user_id', 'score'], ascending=[True, False])
prediction = recs_test.groupby('user_id').head(10)
prediction = prediction[['user_id', 'item_id']]
prediction.head()

## Evaluation
Metric: MAP@10


In [82]:
def mapk(df, k=10):
    scores = []
    for _, g in df.groupby('user_id'):
        g = g.sort_values('score', ascending=False).head(k)

        hits = 0
        ap = 0

        for i, label in enumerate(g['label'].values):
            if label == 1:
                hits += 1
                ap += hits / (i + 1)

        if g['label'].sum() > 0:
            ap /= min(k, g['label'].sum())

        scores.append(ap)

    return np.mean(scores)

print("Hybrid MAP@10:", mapk(recs_test, 10))


Hybrid MAP@10: 0.16249047760800023


## Hybrid Model Results

A two-stage hybrid recommendation system was implemented:

- **Candidate generation:** Implicit ALS (Top-100 items per user)  
- **Re-ranking:** LightGBM Ranker (LambdaRank)

**Features used:**
- ALS score  
- User metadata (age, gender, geo)  
- Item metadata (author_id, duration)  
- Item embeddings  

**Evaluation metric:** MAP@10

**Results:**
- ALS baseline: ~0.016  
- Hybrid model: **MAP@10 ≈ 0.16**

The hybrid approach significantly improves recommendation quality by combining collaborative filtering for candidate retrieval and learning-to-rank for optimal item ordering.
