<a href="https://colab.research.google.com/github/dashatenoff/recsys-vk/blob/main/nootebooks/item_based_cf_jaccard_v4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Item-based Collaborative Filtering (Jaccard)

This notebook implements a simple item-based collaborative filtering
recommender system using Jaccard similarity.

The model is based on implicit user-item interactions (views) from the VK-LSVD dataset.
The implementation focuses on clarity and conceptual correctness rather than efficiency.


In [7]:
import os

import pandas as pd
from google.colab import drive

# Mount Google Drive so files stored there are reachable from the Colab runtime.
drive.mount('/content/drive')

# Single place to edit if the dataset is stored somewhere else.
DATA_DIR = '/content/drive/MyDrive/VK'

# Fail early with a clear message instead of a less obvious parquet read error.
if not os.path.isdir(DATA_DIR):
    raise FileNotFoundError(f'Dataset directory not found: {DATA_DIR}')

# Interaction logs; later cells read the `user_id` and `item_id` columns of `train`.
train = pd.read_parquet(os.path.join(DATA_DIR, 'train.parquet'))
test = pd.read_parquet(os.path.join(DATA_DIR, 'test.parquet'))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Item similarity function

In [8]:
# Two lookup tables built from the training interactions:
#   user_items: user_id -> set of item_ids that appear with that user
#   item_users: item_id -> set of user_ids that appear with that item
user_items, item_users = (
    train.groupby(key_column)[value_column].apply(set)
    for key_column, value_column in (('user_id', 'item_id'), ('item_id', 'user_id'))
)

def jaccard(item1, item2, users_by_item=None):
  """Return the Jaccard similarity between the user sets of two items.

  The similarity is |U1 & U2| / |U1 | U2|, where U1 and U2 are the sets of
  users associated with `item1` and `item2`.

  Args:
    item1: Identifier of the first item.
    item2: Identifier of the second item.
    users_by_item: Optional mapping (anything with a dict-like
      `.get(key, default)`) from item id to a set of user ids. When omitted,
      the notebook-level `item_users` table is used.

  Returns:
    A float in [0.0, 1.0]; 0.0 when either item has no known users.
  """
  if users_by_item is None:
    users_by_item = item_users

  users1 = users_by_item.get(item1, set())
  users2 = users_by_item.get(item2, set())

  if not users1 or not users2:
    return 0.0

  # |A | B| = |A| + |B| - |A & B|: avoids materialising the union set.
  shared = len(users1 & users2)
  return shared / (len(users1) + len(users2) - shared)


# Number of most frequent items kept for the pairwise similarity computation.
# The pairwise loop over these items is quadratic, so this bounds its cost.
N_POPULAR_ITEMS = 2000

# Item ids ordered by how many rows of `train` reference them, most frequent first.
popular_items = train['item_id'].value_counts().head(N_POPULAR_ITEMS).index


## Precompute item similarities

In [10]:
from collections import defaultdict
# item_sim: item_id -> list of (neighbour_item_id, similarity) pairs,
# restricted to the popular items and to pairs with similarity >= 0.01.
item_sim = defaultdict(list)
items = list(popular_items)

# Visit each unordered pair once; store the similarity in both directions.
for position, left_item in enumerate(items):
  for right_item in items[position + 1:]:
    similarity = jaccard(left_item, right_item)
    if similarity < 0.01:
      # Drop near-zero similarities to keep the neighbour lists short.
      continue
    item_sim[left_item].append((right_item, similarity))
    item_sim[right_item].append((left_item, similarity))


## Recommendation function for a single user

In [11]:

def recommend_for_user(user_id, user_items, item_sim, k=10):
  """Recommend up to `k` unseen items for one user.

  Each candidate item is scored by summing its similarity to every item the
  user has already interacted with; already-seen items are never recommended.

  Args:
    user_id: Identifier of the user to recommend for.
    user_items: Mapping (dict-like `.get`) from user id to the set of item
      ids the user has interacted with.
    item_sim: Mapping (dict-like `.get`) from item id to a list of
      `(other_item_id, similarity)` pairs.
    k: Maximum number of recommendations to return.

  Returns:
    A list of at most `k` item ids, highest accumulated score first. Empty
    when the user is unknown, has no scored candidates, or `k` is not positive.
  """
  # A negative k would make the `[:k]` slice drop items from the end
  # instead of limiting the list, so treat non-positive k as "nothing".
  if k <= 0:
    return []

  seen_items = user_items.get(user_id, set())

  scores = {}
  for item in seen_items:
    for other_item, sim in item_sim.get(item, []):
      if other_item in seen_items:
        continue
      scores[other_item] = scores.get(other_item, 0) + sim

  # sorted() is stable, so equally scored items keep their insertion order.
  ranked_items = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)

  return [item for item, _ in ranked_items[:k]]


In [16]:
# Smoke test: recommend for the first user in `user_items`.
# next(iter(...)) takes the first key without copying every user id into a list.
test_user_id = next(iter(user_items.keys()))
recommend_for_user(test_user_id, user_items, item_sim, k=10)




set()