In [3]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# Load the pre-enriched CSVs straight from the project repository so the
# API enrichment code does not have to be re-run on every execution.
filepath = 'https://raw.githubusercontent.com/fedeamedeo/machine-learning-project/main/'
interactions, items = (
    pd.read_csv(filepath + csv_name)
    for csv_name in ('interactions_with_items_improved-2.csv', 'items_improved.csv')
)

In [5]:

# === ALIGN ITEM IDs: Use item metadata as master ===
# Every distinct item id in the metadata table gets a contiguous 0-based index,
# numbered in order of first appearance.
item_id_mapping = {raw_id: position for position, raw_id in enumerate(items["i"].unique())}
items["i"] = items["i"].map(item_id_mapping)

# Apply the same mapping to the interactions. Items missing from the metadata
# map to NaN; those rows are dropped before the column is cast back to int.
interactions = (
    interactions
    .assign(i=interactions["i"].map(item_id_mapping))
    .dropna(subset=["i"])
    .astype({"i": int})
)


In [6]:

# === MAP USER IDs ===
# Number the users 0..n-1 in order of first appearance in the interaction log.
user_id_mapping = {raw_id: position for position, raw_id in enumerate(interactions["u"].unique())}
interactions = interactions.assign(u=interactions["u"].map(user_id_mapping))


In [7]:

# === CREATE TEXT COLUMN FOR CONTENT-BASED ===
# Join title, author, publisher and subjects into one space-separated string
# per item; missing fields contribute an empty string.
items["metadata_text"] = items["Title"].fillna("").str.cat(
    [items[column].fillna("") for column in ("Author", "Publisher", "Subjects")],
    sep=" ",
)


In [9]:

# === CREATE EMBEDDINGS ===
# Encode each item's concatenated metadata text with a pretrained
# sentence-transformers model (the model files are downloaded on first use).
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("all-MiniLM-L6-v2")

# One input string per row of `items`; downstream cells index the result by item
# index and read its second dimension as the embedding size.
item_embeddings = model.encode(items["metadata_text"].tolist(), show_progress_bar=True)


sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/478 [00:00<?, ?it/s]

In [10]:

# === TRAIN/TEST SPLIT ===
# Rank each user's interactions by `t` as a percentile within that user
# (presumably a timestamp - TODO confirm). The lowest-ranked part goes to
# training and everything at or above the 0.8 percentile goes to test.
interactions["pct_rank"] = interactions.groupby("u")["t"].rank(pct=True, method="dense")
train_data = interactions.loc[interactions["pct_rank"].lt(0.8)]
test_data = interactions.loc[interactions["pct_rank"].ge(0.8)]


In [11]:

# === CREATE TRAIN MATRICES ===
import numpy as np

# User indices were assigned from the interaction log itself, so the number of
# distinct users is exactly the number of matrix rows required.
n_users = interactions["u"].nunique()

# Item indices were assigned from the item metadata table, not from the
# interactions. Size the item axis from that mapping so every mapped index is a
# valid column, including items that never appear in an interaction. Counting
# distinct items in `interactions` instead can under-size the matrix.
n_items = len(item_id_mapping)

def create_data_matrix(data, n_users, n_items):
    """Build a dense binary user-item interaction matrix.

    Parameters
    ----------
    data : pandas.DataFrame
        Interaction records with a ``u`` column (matrix row of the user) and an
        ``i`` column (matrix column of the item).
    n_users : int
        Number of rows in the returned matrix.
    n_items : int
        Number of columns in the returned matrix.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n_users, n_items)`` holding 1 for every
        (user, item) pair present in ``data`` and 0 elsewhere. Repeated pairs
        still yield a single 1.

    Raises
    ------
    IndexError
        If an index in ``data`` lies outside the matrix bounds.
    """
    matrix = np.zeros((n_users, n_items))
    # A single vectorised assignment replaces the per-row Python loop. The cast
    # keeps empty (object/float dtype) frames usable as index arrays.
    user_idx = data["u"].to_numpy().astype(np.intp)
    item_idx = data["i"].to_numpy().astype(np.intp)
    matrix[user_idx, item_idx] = 1
    return matrix

# Build the binary user-item matrices for both splits with identical dimensions.
train_data_matrix, test_data_matrix = (
    create_data_matrix(split, n_users, n_items)
    for split in (train_data, test_data)
)


In [12]:

# === USER EMBEDDINGS (CONTENT-BASED) ===
# A user's profile vector is the mean embedding of the items in their training
# history; users with no usable training item keep an all-zero vector.
train_histories = train_data.groupby("u")["i"].apply(list).to_dict()
user_embeddings = np.zeros((n_users, item_embeddings.shape[1]))

for user_id, item_ids in train_histories.items():
    # Ignore item indices that have no corresponding embedding row.
    valid_items = [item for item in item_ids if item < len(item_embeddings)]
    if not valid_items:
        continue
    user_embeddings[user_id] = np.mean(item_embeddings[valid_items], axis=0)


In [13]:

# === CONTENT-BASED SIMILARITY SCORES ===
# Cosine similarity between every user profile vector and every item embedding:
# one row per user, one column per item embedding.
from sklearn.metrics.pairwise import cosine_similarity
cb_scores = cosine_similarity(user_embeddings, item_embeddings)


In [14]:

# === COLLABORATIVE FILTERING PREDICTIONS ===
# Item-item cosine similarity from the training interactions. The transpose
# makes each item (matrix column) one sample, giving an items x items matrix.
item_similarity = cosine_similarity(train_data_matrix.T)

def item_based_predict(interactions, similarity, epsilon=1e-9):
    """Score every (user, item) pair with item-based collaborative filtering.

    Each item's score for a user is the similarity-weighted sum of that user's
    interactions, divided by the item's total similarity.

    Parameters
    ----------
    interactions : numpy.ndarray
        User-item matrix of shape ``(n_users, n_items)``.
    similarity : numpy.ndarray
        Item-item similarity matrix of shape ``(n_items, n_items)``.
    epsilon : float, optional
        Small constant added to the normaliser to avoid division by zero.

    Returns
    -------
    numpy.ndarray
        Predicted scores of shape ``(n_users, n_items)``.
    """
    # Per-item normaliser, kept as a column so it broadcasts across users.
    normaliser = similarity.sum(axis=1)[:, np.newaxis] + epsilon
    weighted = similarity.dot(interactions.T)
    return (weighted / normaliser).T

# Collaborative-filtering score for every (user, item) pair, computed from the
# training interactions only.
cf_scores = item_based_predict(train_data_matrix, item_similarity)


In [15]:
def mean_average_precision_at_k(predictions, ground_truth, k=10):
    """Compute MAP@k of ranked predictions against binary relevance labels.

    For each user the ``k`` highest-scoring items are taken as the ranking.
    The user's average precision is the sum of precision values at every rank
    where a relevant item appears, divided by ``min(number of relevant items, k)``.
    Users without any relevant item contribute 0. The result is the mean over
    all users (rows).

    Parameters
    ----------
    predictions : numpy.ndarray
        Score matrix of shape ``(n_users, n_items)``; higher means better.
    ground_truth : numpy.ndarray
        Matrix of the same shape where relevant items are marked with 1.
    k : int, optional
        Length of the ranking that is evaluated (default 10).

    Returns
    -------
    float
        Mean average precision at ``k``; ``0.0`` when there are no users.
    """
    n_users = predictions.shape[0]
    if n_users == 0:
        # Nothing to average over; avoid dividing by zero.
        return 0.0

    map_at_k = 0.0
    for user in range(n_users):
        # A set gives O(1) membership tests instead of scanning an array.
        true_items = set(np.flatnonzero(ground_truth[user] == 1).tolist())
        if not true_items:
            continue  # this user adds 0 to the sum

        # Same ordering as before: ascending argsort, reversed, first k.
        predicted_ranks = np.argsort(predictions[user])[::-1][:k]
        hits = 0
        sum_precisions = 0.0
        for rank, item in enumerate(predicted_ranks.tolist(), start=1):
            if item in true_items:
                hits += 1
                sum_precisions += hits / rank
        map_at_k += sum_precisions / min(len(true_items), k)
    return map_at_k / n_users

In [16]:

# === HYBRID MODEL ===
from sklearn.preprocessing import MinMaxScaler

def normalize_scores(matrix):
    """Rescale a score matrix with a freshly fitted ``MinMaxScaler``.

    The scaler is fitted on ``matrix`` and applied to it in one step, using the
    scaler's default settings.

    NOTE(review): ``MinMaxScaler`` rescales each column independently, so each
    item's scores are normalised across users rather than each user's scores
    across items - confirm this is the intended normalisation.
    """
    return MinMaxScaler().fit_transform(matrix)

# Bring both score matrices onto a common scale before blending them.
normalized_cb, normalized_cf = normalize_scores(cb_scores), normalize_scores(cf_scores)

# Keep only the columns present in both matrices so they combine element-wise.
min_items = min(normalized_cb.shape[1], normalized_cf.shape[1])
normalized_cb, normalized_cf = normalized_cb[:, :min_items], normalized_cf[:, :min_items]
assert normalized_cb.shape == normalized_cf.shape

# Blend weight: 1 uses collaborative filtering only, 0 uses content-based only.
alpha = 1
# Disabled sweep over the blend weight, kept for reference:
#for a in np.linspace(0, 1, 11):
    #hybrid = a * normalized_cf + (1 - a) * normalized_cb
    #score = mean_average_precision_at_k(hybrid, test_data_matrix)
    #print(f"alpha={a:.1f}, MAP@10={score:.4f}")
hybrid_scores = alpha * normalized_cf + (1 - alpha) * normalized_cb


In [40]:
# Evaluate the hybrid ranking against the held-out test interactions.
# NOTE(review): items a user already has in the training split are not masked
# out of `hybrid_scores` before ranking - confirm that is intended.
print("Hybrid MAP@10:", mean_average_precision_at_k(hybrid_scores, test_data_matrix))

Hybrid MAP@10: 0.17882286262486788


In [33]:
# Superseded draft of create_recommendations_csv, kept for reference only (not
# executed). A later cell defines the version that is actually used.
# NOTE: this draft is broken as written - it reads `top_item_indices`, which is
# never defined (the sorted indices are stored in `top_10_items`), and the
# resulting `top_item_ids` is never used.
#
# def create_recommendations_csv(predictions_matrix,name='recommendations.csv'):
#     user_based_recommendations = []
#     original_item_ids = items["i"].drop_duplicates().values
#     for user_id in range(predictions_matrix.shape[0]):
#         # Indices of the top 10 items for each user (based on predicted interactions)
#         top_10_items = np.argsort(predictions_matrix[user_id])[-10:][::-1]  # Sorting them in descending order
#         top_item_ids = [str(original_item_ids[i]) for i in top_item_indices]
#         top_10_items_str = " ".join(map(str, top_10_items))
#
#         # Append the user_id and the recommendations as a string
#         user_based_recommendations.append([user_id, top_10_items_str])
#
#     recommendations_df = pd.DataFrame(user_based_recommendations, columns=['user_id', 'recommendation'])
#     recommendations_df.to_csv(name, index=False)
#     print("Recommendations CSV created successfully.")
#     return recommendations_df

In [35]:
#nadege
# Earlier draft of create_recommendations_csv that writes the matrix row and
# column indices directly. Kept for reference only (not executed); a later cell
# defines the version that is actually used.
#
# def create_recommendations_csv(predictions_matrix,name='recommendations.csv'):
#     user_based_recommendations = []
#     for user_id in range(predictions_matrix.shape[0]):
#         # Indices of the top 10 items for each user (based on predicted interactions)
#         top_10_items = np.argsort(predictions_matrix[user_id])[-10:][::-1]  # Sorting them in descending order
#
#         top_10_items_str = " ".join(map(str, top_10_items))
#
#         # Append the user_id and the recommendations as a string
#         user_based_recommendations.append([user_id, top_10_items_str])
#
#     recommendations_df = pd.DataFrame(user_based_recommendations, columns=['user_id', 'recommendation'])
#     recommendations_df.to_csv(name, index=False)
#     print("Recommendations CSV created successfully.")
#     return recommendations_df

In [37]:
def create_recommendations_csv(predictions_matrix, interactions_df, items_df, name='recommendations.csv', top_k=10):
    """Write each user's highest-scoring items to a CSV file and return them.

    Parameters
    ----------
    predictions_matrix : numpy.ndarray
        Score matrix of shape ``(n_users, n_items)``; higher means better.
    interactions_df : pandas.DataFrame
        Frame whose ``u`` column supplies the user labels. Row ``r`` of
        ``predictions_matrix`` is labelled with the ``r``-th distinct value of
        ``u`` in order of first appearance.
    items_df : pandas.DataFrame
        Frame whose ``i`` column supplies the item labels. Column ``c`` of
        ``predictions_matrix`` is labelled with the ``c``-th distinct value of
        ``i`` in order of first appearance.
    name : str, optional
        Path of the CSV file to write.
    top_k : int, optional
        Number of recommendations per user (default 10).

    Returns
    -------
    pandas.DataFrame
        Columns ``user_id`` and ``recommendation``; the latter holds the
        space-separated item labels ordered from best to worst score.

    Raises
    ------
    ValueError
        If ``top_k`` is negative.

    Notes
    -----
    The labels written are whatever the ``u`` / ``i`` columns currently hold.
    If those columns were re-indexed upstream, the re-indexed values (not the
    raw ids from the source data) end up in the file.
    """
    if top_k < 0:
        raise ValueError("top_k must be non-negative")

    # Labels in the positional order assumed for the matrix rows and columns.
    user_ids = interactions_df['u'].drop_duplicates().values
    item_ids = items_df['i'].drop_duplicates().values

    user_based_recommendations = []
    for user_idx in range(predictions_matrix.shape[0]):
        # Descending score order. Reversing first and then truncating equals
        # the previous `[-10:][::-1]` for top_k=10 and also handles top_k=0.
        top_item_indices = np.argsort(predictions_matrix[user_idx])[::-1][:top_k]

        # Translate matrix column positions into item labels.
        recommendation_str = " ".join(str(item_ids[col]) for col in top_item_indices)

        user_based_recommendations.append([user_ids[user_idx], recommendation_str])

    # Create the submission DataFrame
    recommendations_df = pd.DataFrame(user_based_recommendations, columns=['user_id', 'recommendation'])
    recommendations_df.to_csv(name, index=False)
    print("Recommendations CSV created successfully.")
    return recommendations_df


In [39]:
# Write the per-user top-10 recommendations from the hybrid scores to disk.
# The returned DataFrame is the cell's last expression, so it is also displayed.
create_recommendations_csv(hybrid_scores, interactions, items, name='hybrid_recs2.csv')

Recommendations CSV created successfully.


Unnamed: 0,user_id,recommendation
0,0,17 18 0 1 2 7 8 6 5 16
1,1,27 28 29 25 32 31 2426 30 33 1970
2,2,72 73 74 70 69 68 64 57 37 39
3,3,126 120 119 122 123 117 129 98 111 112
4,4,146 157 156 145 151 158 149 147 152 148
...,...,...
7833,7833,795 6303 2586 9384 9385 4288 4290 512 1651 1708
7834,7834,15108 13236 10174 13237 12353 14962 2455 11880...
7835,7835,2487 12246 15004 4026 12202 12203 8888 12201 7...
7836,7836,14121 14040 2841 14904 15097 14122 10854 14429...


In [29]:
# Colab-only helper: asks the browser to download the generated CSV.
from google.colab import files
files.download('hybrid_recs2.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>