In [34]:
import pandas as pd
import numpy as np
import torch
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import src.data_loader as data_loader
from src.config import SUBMISSION_PATH
from src.config import TOP_K

In [14]:
# Load dataset
# All three loaders live in the project-local `src.data_loader` module; their
# return types are not visible from this notebook (presumably pandas
# DataFrames, since `.head()` and column indexing are used on them — TODO confirm).
interactions = data_loader.load_interactions()  # later cells read its `u`, `i` and `t` columns
items = data_loader.load_items()  # later cells read `Author`, `Publisher` and `Subjects`
samples = data_loader.load_samples()
# Last expression of the cell: preview the first rows of `samples` as rich output.
samples.head()

Unnamed: 0,user_id,recommendation
0,0,3758 11248 9088 9895 5101 6074 9295 14050 1096...
1,1,3263 726 1589 14911 6432 10897 6484 7961 8249 ...
2,2,13508 9848 12244 2742 11120 2893 2461 5439 116...
3,3,2821 10734 6357 5934 2085 12608 12539 10551 10...
4,4,12425 219 11602 1487 14178 489 13888 2110 4413...


## 1. Baseline - Collaborative Filtering

### 1.1 Top-Popular Recommendation

- **Idea**: Recommend the most popular books to all users.
- **Advantages**: Simple, requires no user history, effective for cold-start users.
- **Disadvantages**: Not personalized, may ignore niche interests.
- **Libraries**: `pandas`, `numpy`.

In [5]:
# --- Compute item popularity ---
# Count how many times each item has been interacted with
item_popularity = interactions['i'].value_counts().reset_index()
item_popularity.columns = ['i', 'popularity']

# Sort items by popularity in descending order
item_popularity = item_popularity.sort_values(by='popularity', ascending=False)

# Materialise the ranking once as a plain list so the per-user loop below can
# slice it cheaply instead of re-iterating the Series for every user.
popular_items = item_popularity['i'].tolist()

# Index every user's history once (user -> set of item ids) instead of
# re-filtering the whole `interactions` frame with a boolean mask per user.
seen_by_user = interactions.groupby('u')['i'].apply(set).to_dict()

# --- Generate Top-Pop recommendations for each user ---
user_ids = interactions['u'].unique()  # Unique user IDs, in order of first appearance

recommendations = []

for user_id in user_ids:
    # Items already interacted by this user (empty set if the key is missing)
    user_items = seen_by_user.get(user_id, set())
    # At most len(user_items) of the most popular items can be filtered out,
    # so the first TOP_K + len(user_items) entries always contain the TOP_K
    # best unseen ones; there is no need to scan the full catalogue.
    candidates = popular_items[:TOP_K + len(user_items)]
    recs = [i for i in candidates if i not in user_items][:TOP_K]
    # Store recommendation as a space-separated string
    recommendations.append({
        'user_id': user_id,
        'recommendation': ' '.join(map(str, recs))
    })

# Convert to DataFrame
rec_df = pd.DataFrame(recommendations)

# --- Save recommendations to CSV ---
rec_df.to_csv(f"{SUBMISSION_PATH}/top_pop_recommendations.csv", index=False)
print("Top-Pop recommendations saved to top_pop_recommendations.csv")

Top-Pop recommendations saved to top_pop_recommendations.csv


### 1.2 User-User CF / Item-Item CF

- **Idea**: Find users with similar interests and recommend the books they like (user-user), or recommend books similar to the ones a user has already interacted with (item-item).
- **Advantages**: Intuitive and easy to understand.
- **Disadvantages**: Cold-start problem (new users/items with no interactions), issues with sparse matrices.
- **Libraries**: `surprise`, `implicit`, `scikit-learn`.

In [1]:
# ============================================================
# 1. Build the user-item matrix
# ============================================================
# Rows are users (`u`), columns are items (`i`); each cell holds the `t` value
# of that (user, item) pair (aggregated with pivot_table's default, the mean)
# and 0 where the pair never occurs in `interactions`.
user_item_matrix = interactions.pivot_table(
    values="t",
    index="u",
    columns="i",
    fill_value=0,
)

# Row / column labels as plain lists, plus the raw array used for scoring.
item_ids = user_item_matrix.columns.tolist()
user_ids = user_item_matrix.index.tolist()
matrix_np = user_item_matrix.values

# ============================================================
# 2. Compute similarity matrices
# ============================================================

# Items are compared column-wise, users row-wise.
item_similarity = cosine_similarity(matrix_np.T)    # item-item similarity
user_similarity = cosine_similarity(matrix_np)      # user-user similarity

# ============================================================
# 3. Recommendation functions (10 items)
# ============================================================

def user_based_recommend(user_index, matrix, similarity, TOP_K):
    """
    User-Based Collaborative Filtering
    Score = weighted sum of similar users' ratings

    Parameters
    ----------
    user_index : int
        Row position of the target user in `matrix` / `similarity`.
    matrix : np.ndarray
        User x item array; a value > 0 marks an item the user interacted with.
    similarity : np.ndarray
        User x user similarity array.
    TOP_K : int
        Maximum number of items to return.

    Returns
    -------
    list
        Item ids (looked up in the module-level `item_ids` list) ordered by
        decreasing score. If fewer than TOP_K unseen items exist, already-seen
        items fill the tail of the list.
    """
    # Weight every user's row by their similarity to the target user.
    # Force a float array so the -inf sentinel below can be stored.
    scores = np.asarray(similarity[user_index] @ matrix, dtype=float)

    # remove items already interacted: -inf (rather than -1) is guaranteed to
    # rank below every real score, even when scores are negative
    scores[np.asarray(matrix[user_index]) > 0] = -np.inf

    # Stable sort keeps tied items in column order, so results are reproducible.
    top_items_idx = np.argsort(-scores, kind="stable")[:TOP_K]
    return [item_ids[i] for i in top_items_idx]


def item_based_recommend(user_index, matrix, similarity, TOP_K):
    """
    Item-Based Collaborative Filtering
    Score = similarity between items the user interacted and all items

    Parameters
    ----------
    user_index : int
        Row position of the target user in `matrix`.
    matrix : np.ndarray
        User x item array; a value > 0 marks an item the user interacted with.
    similarity : np.ndarray
        Item x item similarity array.
    TOP_K : int
        Maximum number of items to return.

    Returns
    -------
    list
        Item ids (looked up in the module-level `item_ids` list) ordered by
        decreasing score. If fewer than TOP_K unseen items exist, already-seen
        items fill the tail of the list.
    """
    user_vector = np.asarray(matrix[user_index])

    # Each item's score is its similarity to the user's items, weighted by the
    # user's interaction values. Force a float array so -inf can be stored.
    scores = np.asarray(similarity @ user_vector, dtype=float)

    # remove items already interacted: -inf (rather than -1) is guaranteed to
    # rank below every real score, even when scores are negative
    scores[user_vector > 0] = -np.inf

    # Stable sort keeps tied items in column order, so results are reproducible.
    top_items_idx = np.argsort(-scores, kind="stable")[:TOP_K]
    return [item_ids[i] for i in top_items_idx]


# ============================================================
# 4. Generate results for each model
# ============================================================

def _submission_row(user, recs):
    """Build one `[user_id, "item item ..."]` row in the submission layout."""
    return [user, " ".join(str(item) for item in recs)]


user_based_rows, item_based_rows = [], []

# `user_ids` is aligned with the rows of `matrix_np`, so the enumeration
# position doubles as the row index expected by the recommenders.
for row_pos, user in enumerate(user_ids):
    # user-based CF
    user_based_rows.append(
        _submission_row(user, user_based_recommend(row_pos, matrix_np, user_similarity, TOP_K))
    )
    # item-based CF
    item_based_rows.append(
        _submission_row(user, item_based_recommend(row_pos, matrix_np, item_similarity, TOP_K))
    )


# ============================================================
# 5. Save CSV files (submission format)
# ============================================================

# One DataFrame per model, in the submission layout (user_id, recommendation).
user_based_df = pd.DataFrame(user_based_rows, columns=["user_id", "recommendation"])
item_based_df = pd.DataFrame(item_based_rows, columns=["user_id", "recommendation"])

# Write both files under SUBMISSION_PATH without the DataFrame index column.
user_based_df.to_csv(f"{SUBMISSION_PATH}/user_based_CF_recommendations.csv", index=False)
item_based_df.to_csv(f"{SUBMISSION_PATH}/item_based_CF_recommendations.csv", index=False)

print("Done! Files saved:")
print(" - user_based_CF_recommendations.csv")
print(" - item_based_CF_recommendations.csv")

NameError: name 'interactions' is not defined

## 2. Enhanced model

#### 2.1. Matrix Factorization / Latent Factor Models
- **Examples**: SVD, ALS, FunkSVD, Probabilistic MF
- **Idea**: Map users and items into a low-dimensional latent space to learn hidden features.
- **Advantages**: Can capture latent user-item relationships even in sparse matrices.
- **Disadvantages**: Cold-start items/users are still challenging.
- **Libraries**: `surprise`, `implicit` (ALS)

#### 2.2. Content-Based Recommendation
- **Idea**: Use item metadata (e.g., book subjects, authors, publishers) to recommend items similar to user preferences.
- **Advantages**: Works well for cold-start items; does not rely on dense interaction data.
- **Disadvantages**: Limited novelty; may over-recommend similar items.
- **Libraries**: `scikit-learn` (TF-IDF, CountVectorizer + cosine similarity)

In [24]:
# -------------------------
# 1. Build one text document per item
# -------------------------
# For content-based, we'll use Author + Publisher + Subjects as text
items['feature_text'] = (
    items['Author'].fillna('') + ' ' +
    items['Publisher'].fillna('') + ' ' +
    items['Subjects'].fillna('')
)

# -------------------------
# 2. TF-IDF on items
# -------------------------
tfidf = TfidfVectorizer(stop_words='english')
item_features_matrix = tfidf.fit_transform(items['feature_text'])

# -------------------------
# 3. Build user profiles
# -------------------------
# For each user, average the TF-IDF vectors of items they interacted with.
# NOTE(review): item ids from `interactions['i']` are used directly as row
# positions of `item_features_matrix`, i.e. this assumes item id == row
# position in `items` — confirm against the data loader.
user_profiles = {}
user_seen_items = {}
for user_id, group in interactions.groupby('u'):
    item_indices = group['i'].values
    user_vector = item_features_matrix[item_indices].mean(axis=0)
    # Sparse `.mean` may return np.matrix; normalise to a (1, n_features) ndarray
    user_profiles[user_id] = np.asarray(user_vector).reshape(1, -1)
    # Keep the history from this same pass so the scoring loop does not have
    # to re-filter the whole `interactions` frame for every user.
    user_seen_items[user_id] = np.unique(item_indices)

# -------------------------
# 4. Compute recommendations
# -------------------------
top_n = TOP_K  # same cut-off as the other models instead of a hard-coded 10
recommendations = []

for user_id, user_vector in user_profiles.items():
    # Compute similarity with all items
    sim = cosine_similarity(user_vector, item_features_matrix).ravel()  # num_items

    # Rank every item by decreasing similarity; the stable sort keeps tied
    # items in ascending row order.
    ranked = np.argsort(-sim, kind='stable')

    # Exclude items already interacted, then take top N
    unseen = ranked[~np.isin(ranked, user_seen_items[user_id])]
    top_item_ids = unseen[:top_n]

    recommendations.append({
        'user_id': user_id,
        'recommendation': ' '.join(map(str, top_item_ids))
    })

# -------------------------
# 5. Save to CSV
# -------------------------
recommendation_df = pd.DataFrame(recommendations)
recommendation_df.to_csv(f"{SUBMISSION_PATH}/content_based_recommendations.csv", index=False)
print("Saved recommendations to 'content_based_recommendations.csv'")

Saved recommendations to 'content_based_recommendations.csv'


#### 2.3. Embedding / Deep Learning Models
- **Idea**: Learn dense user/item embeddings through MF, Neural Collaborative Filtering (NCF), or Transformers.
- **Advantages**: Handles large sparse matrices; captures complex nonlinear relationships.
- **Libraries**: `PyTorch`, `TensorFlow`, `LightFM`

#### 2.4. Graph Neural Networks (Graph-based Models)
- **Idea**: Represent user-item interactions as a bipartite graph and learn node embeddings with GNNs (e.g., PinSAGE, NGCF).
- **Advantages**: Naturally handles sparse interactions and captures higher-order relationships.
- **Disadvantages**: More complex and computationally expensive.

## 3. Advanced Model / Feature Engineering Exploration

## 4. Integrated / Stacked