In [3]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# Load the pre-enriched CSVs from the project repository (avoids re-running the API enrichment code).
filepath = 'https://raw.githubusercontent.com/fedeamedeo/machine-learning-project/main/'
interactions, items = (
    pd.read_csv(filepath + csv_name)
    for csv_name in ('interactions_with_items_improved-2.csv', 'items_improved.csv')
)

In [5]:

# === ALIGN ITEM IDs: Use item metadata as master ===
# Catalogue ids are replaced by dense indices 0..N-1, numbered in order of first appearance.
unique_item_ids = items["i"].unique()
item_id_mapping = dict(zip(unique_item_ids, range(len(unique_item_ids))))
items["i"] = items["i"].map(item_id_mapping)

# Interactions whose item is absent from the catalogue map to NaN and are dropped;
# the surviving indices are cast back to int (the NaNs had forced a float column).
interactions = (
    interactions
    .assign(i=interactions["i"].map(item_id_mapping))
    .dropna(subset=["i"])
    .astype({"i": int})
)


In [6]:

# === MAP USER IDs ===
# Users are numbered 0..n-1 in the order they first appear in the remaining interactions.
unique_user_ids = interactions["u"].unique()
user_id_mapping = dict(zip(unique_user_ids, range(len(unique_user_ids))))
interactions["u"] = interactions["u"].map(user_id_mapping)


In [7]:

# === CREATE TEXT COLUMN FOR CONTENT-BASED ===
# Concatenate the descriptive columns into one space-separated string per item;
# missing values contribute an empty string.
text_columns = ["Title", "Author", "Publisher", "Subjects"]
metadata_text = items[text_columns[0]].fillna("")
for column in text_columns[1:]:
    metadata_text = metadata_text + " " + items[column].fillna("")
items["metadata_text"] = metadata_text


In [8]:

# === CREATE EMBEDDINGS ===
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")

# Later cells index `item_embeddings` by the mapped item id (row k must describe item k).
# Ids were assigned in first-appearance order, so keeping the first catalogue row per id
# yields exactly one row per id, already in id order. When the catalogue has no duplicate
# ids this is the same list the whole frame would produce.
item_texts = items.drop_duplicates(subset="i")["metadata_text"].tolist()

# NOTE: this downloads the model on first use and can take a while on large catalogues.
item_embeddings = model.encode(item_texts, show_progress_bar=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

KeyboardInterrupt: 

In [None]:

# === TRAIN/TEST SPLIT ===
# Per-user split on the dense percentile rank of `t` (presumably the interaction
# timestamp — confirm): ranks below 0.8 go to train, the rest to test.
per_user_t = interactions.groupby("u")["t"]
interactions["pct_rank"] = per_user_t.rank(method="dense", pct=True)
train_data = interactions.loc[interactions["pct_rank"].lt(0.8)]
test_data = interactions.loc[interactions["pct_rank"].ge(0.8)]


In [None]:

# === CREATE TRAIN MATRICES ===
import numpy as np

# User indices were assigned from the interactions themselves, so they are dense 0..n_users-1.
n_users = interactions["u"].nunique()
# Item indices were assigned from the item catalogue, not from the interactions.
# Counting distinct *interacted* items under-sizes the matrix whenever some catalogue
# item has no interaction (IndexError on a larger index, and a column count that
# disagrees with the item embeddings). Size by the catalogue instead.
n_items = items["i"].nunique()

def create_data_matrix(data, n_users, n_items):
    """Build a dense binary user-item matrix from an interactions frame.

    Args:
        data: DataFrame with integer index columns ``u`` (user) and ``i`` (item).
        n_users: number of rows of the resulting matrix.
        n_items: number of columns of the resulting matrix.

    Returns:
        A ``(n_users, n_items)`` float array holding 1 where an interaction
        exists and 0 elsewhere (repeated interactions still yield 1).
    """
    matrix = np.zeros((n_users, n_items))
    user_idx = data["u"].to_numpy()
    item_idx = data["i"].to_numpy()
    # Paired integer indexing sets every (user, item) cell in a single assignment.
    matrix[user_idx, item_idx] = 1
    return matrix

# Same shape for both splits so predictions and ground truth line up cell by cell.
train_data_matrix, test_data_matrix = (
    create_data_matrix(split, n_users, n_items) for split in (train_data, test_data)
)


In [None]:

# === USER EMBEDDINGS (CONTENT-BASED) ===
# A user's profile is the mean embedding of the items in their training history;
# users without any usable training item keep an all-zero profile.
user_embeddings = np.zeros((n_users, item_embeddings.shape[1]))
n_embedded_items = len(item_embeddings)
train_histories = train_data.groupby("u")["i"].apply(list).to_dict()

for user_id, item_ids in train_histories.items():
    # Ignore item indices that have no embedding row.
    valid_items = [item for item in item_ids if item < n_embedded_items]
    if not valid_items:
        continue
    user_embeddings[user_id] = item_embeddings[valid_items].mean(axis=0)


In [None]:

# === CONTENT-BASED SIMILARITY SCORES ===
from sklearn.metrics.pairwise import cosine_similarity

# One row per user profile, one column per item embedding.
cb_scores = cosine_similarity(X=user_embeddings, Y=item_embeddings)


In [None]:

# === COLLABORATIVE FILTERING PREDICTIONS ===
# Transposing puts items on the rows, so the result is an item-by-item similarity matrix.
item_user_matrix = train_data_matrix.T
item_similarity = cosine_similarity(item_user_matrix)

def item_based_predict(interactions, similarity, epsilon=1e-9):
    """Score every (user, item) pair with item-based collaborative filtering.

    Args:
        interactions: ``(n_users, n_items)`` user-item matrix.
        similarity: ``(n_items, n_items)`` item-item similarity matrix.
        epsilon: small constant added to each normaliser to avoid division by zero.

    Returns:
        ``(n_users, n_items)`` array where entry ``[u, i]`` is the
        similarity-weighted sum of user ``u``'s interactions divided by the
        total similarity of item ``i`` (its row sum) plus ``epsilon``.
    """
    weighted = similarity @ interactions.T               # (n_items, n_users)
    normaliser = similarity.sum(axis=1, keepdims=True) + epsilon
    return (weighted / normaliser).T

# Collaborative-filtering score for every (user, item) pair, from training data only.
cf_scores = item_based_predict(
    interactions=train_data_matrix,
    similarity=item_similarity,
)


In [None]:
def mean_average_precision_at_k(predictions, ground_truth, k=10):
    """Compute MAP@k of a score matrix against a binary relevance matrix.

    Args:
        predictions: ``(n_users, n_items)`` array of scores; higher means
            more strongly recommended.
        ground_truth: array with one row per user, holding 1 at the columns
            of that user's relevant items.
        k: number of top-scored items evaluated per user (must be >= 1).

    Returns:
        The mean over *all* users of AP@k as a float. Users without any
        relevant item contribute 0 but still count in the denominator.
        Returns 0.0 when there are no users.

    Raises:
        ValueError: if ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")

    n_users = predictions.shape[0]
    if n_users == 0:
        # Nothing to average; avoid dividing by zero.
        return 0.0

    total_ap = 0.0
    for user in range(n_users):
        # A set gives O(1) membership tests instead of scanning an array per candidate.
        true_items = set(np.where(ground_truth[user] == 1)[0].tolist())
        if not true_items:
            continue

        # Indices of the k highest scores, best first.
        predicted_ranks = np.argsort(predictions[user])[::-1][:k]
        hits = 0
        sum_precisions = 0.0
        for rank, item in enumerate(predicted_ranks.tolist(), start=1):
            if item in true_items:
                hits += 1
                sum_precisions += hits / rank  # precision at this cut-off

        total_ap += sum_precisions / min(len(true_items), k)
    return total_ap / n_users

In [None]:

# === HYBRID MODEL ===
from sklearn.preprocessing import MinMaxScaler

def normalize_scores(matrix, axis=1):
    """Min-max scale a 2-D score matrix to the range [0, 1].

    Scaling is done per user row by default. Recommendations are ranked
    within a row, and a per-row min-max is monotone, so each user's item
    ordering is preserved. Scaling each item column independently (the
    previous behaviour) can reorder items within a user's row.

    Args:
        matrix: 2-D array-like of scores, users on rows and items on columns.
        axis: axis along which min and max are taken. ``1`` (default) scales
            each row; ``0`` scales each column.

    Returns:
        A float array of the same shape. Rows (or columns) whose values are
        all equal are mapped to 0. An empty input is returned as an empty copy.
    """
    matrix = np.asarray(matrix, dtype=float)
    if matrix.size == 0:
        return matrix.copy()

    lowest = matrix.min(axis=axis, keepdims=True)
    span = matrix.max(axis=axis, keepdims=True) - lowest
    # Constant rows/columns have zero range; divide by 1 so they become 0 instead of NaN.
    span[span == 0] = 1.0
    return (matrix - lowest) / span

# Rescale both score matrices so they can be blended on a common scale.
normalized_cb, normalized_cf = (
    normalize_scores(scores) for scores in (cb_scores, cf_scores)
)

# Keep only the leading item columns that both matrices have, so the shapes agree.
min_items = min(normalized_cb.shape[1], normalized_cf.shape[1])
normalized_cb, normalized_cf = (
    scores[:, :min_items] for scores in (normalized_cb, normalized_cf)
)
assert normalized_cb.shape == normalized_cf.shape

# alpha is the weight of the collaborative-filtering scores; (1 - alpha) goes to
# the content-based scores. With alpha = 1 the blend is the CF scores alone.
alpha = 1
# Sweep kept for reference (evaluates MAP@10 for alpha in 0.0, 0.1, ..., 1.0):
#for a in np.linspace(0, 1, 11):
    #hybrid = a * normalized_cf + (1 - a) * normalized_cb
    #score = mean_average_precision_at_k(hybrid, test_data_matrix)
    #print(f"alpha={a:.1f}, MAP@10={score:.4f}")
hybrid_scores = normalized_cf * alpha + normalized_cb * (1 - alpha)


In [None]:
# Evaluate the blended scores against the held-out interactions.
hybrid_map_at_10 = mean_average_precision_at_k(hybrid_scores, test_data_matrix)
print("Hybrid MAP@10:", hybrid_map_at_10)

In [None]:
# Recover the ORIGINAL user and item ids for the submission.
# `interactions["u"]` and `items["i"]` were overwritten with internal 0..n-1 indices
# earlier in the notebook, so reading them back only returns those indices. The
# mapping dicts still hold the original ids as keys, inserted in index order, so
# position k in each list below is the original id of internal index k.
# NOTE: this relies on the mapping cells having been run once on the raw data;
# re-running them on already-mapped columns would rebuild identity mappings.
original_user_ids = list(user_id_mapping)
original_item_ids = list(item_id_mapping)
assert hybrid_scores.shape[0] == len(original_user_ids), "one score row per mapped user expected"
assert hybrid_scores.shape[1] <= len(original_item_ids), "more score columns than catalogue items"

# Collect one record per user: the ten highest-scored items, best first.
recommendations = []

for user_idx, user_id in enumerate(original_user_ids):
    top_item_indices = np.argsort(hybrid_scores[user_idx])[::-1][:10]
    top_item_ids = [str(original_item_ids[i]) for i in top_item_indices]
    recommendation_str = " ".join(top_item_ids)
    recommendations.append({"user_id": user_id, "recommendation": recommendation_str})

# Convert to DataFrame and save as CSV
submission_df = pd.DataFrame(recommendations)
submission_df.to_csv("submission.csv", index=False)

In [None]:
#files.download('submission.csv')  # Colab only (needs `from google.colab import files`); name matches the CSV written by the previous cell