## Baseline code no improvements

This should give a score around 0.1242 on Kaggle

In [1]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
#load data
filepath = 'https://raw.githubusercontent.com/fedeamedeo/machine-learning-project/main/Data/'
interactions = pd.read_csv(filepath + 'interactions_train.csv')
items = pd.read_csv(filepath + 'items.csv')

In [51]:
interactions = interactions.drop_duplicates()
# === TRAIN/TEST SPLIT ===
interactions = interactions.sort_values(["u", "t"])
interactions["pct_rank"] = interactions.groupby("u")["t"].rank(pct=True, method="dense")
#interactions.reset_index(inplace=True, drop=True)
train_data = interactions[interactions["pct_rank"] < 0.8]
test_data = interactions[interactions["pct_rank"] >= 0.8]

In [71]:

# === CREATE TRAIN MATRICES ===

n_users = interactions["u"].nunique()
n_items = items["i"].nunique()

def create_data_matrix(data, n_users, n_items):
    """
    This function returns a numpy matrix with shape (n_users, n_items).
    Each entry is a binary value indicating positive interaction.
    """
    data_matrix = np.zeros((n_users, n_items))
    data_matrix[data["u"].values, data["i"].values] = 1
    return data_matrix

train_data_matrix = create_data_matrix(train_data, n_users, n_items)
test_data_matrix = create_data_matrix(test_data, n_users, n_items)


In [82]:

# === item-item CF ===
item_similarity=cosine_similarity(train_data_matrix.T)

def item_based_predict(interactions, similarity, epsilon=1e-9):
    pred = similarity.dot(interactions.T) / (similarity.sum(axis=1)[:, np.newaxis] + epsilon)
    return pred.T

item_prediction = item_based_predict(train_data_matrix, item_similarity)

ValueError: shapes (4,4) and (15291,7838) not aligned: 4 (dim 1) != 15291 (dim 0)

In [73]:
# user-user CF
user_similarity=cosine_similarity(train_data_matrix)
def user_based_predict(interactions, similarity, epsilon=1e-9):
   """
    Predicts user-item interactions based on user-user similarity.
    Parameters:
        interactions (numpy array): The user-item interaction matrix.
        similarity (numpy array): The user-user similarity matrix.
        epsilon (float): Small constant added to the denominator to avoid division by zero.
    Returns:
        numpy array: The predicted interaction scores for each user-item pair.
    """
    # Calculate the weighted sum of interactions based on user similarity
   pred = similarity.dot(interactions) / (np.abs(similarity).sum(axis=1)[:, np.newaxis] + epsilon)
   return pred

# Calculate the user-based predictions for positive interactions
user_prediction = user_based_predict(train_data_matrix, user_similarity)
print("Predicted Interaction Matrix (User-Based):")
print(user_prediction)
print(user_prediction.shape)


Predicted Interaction Matrix (User-Based):
[[0.13171649 0.13259591 0.14274867 ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]
(7838, 15291)


In [74]:
#Evaluating Our Recommenders

#Defining the function for precision and recall @K
def precision_recall_at_k(prediction, ground_truth, k=10):
    """
    Calculates Precision@K and Recall@K for top-K recommendations.
    Parameters:
        prediction (numpy array): The predicted interaction matrix with scores.
        ground_truth (numpy array): The ground truth interaction matrix (binary).
        k (int): Number of top recommendations to consider.
    Returns:
        precision_at_k (float): The average precision@K over all users.
        recall_at_k (float): The average recall@K over all users.
    """
    num_users = prediction.shape[0]
    precision_at_k, recall_at_k = 0, 0

    for user in range(num_users):
        # Get the indices of the top-K items for the user based on predicted scores
        top_k_items = np.argsort(prediction[user, :])[-k:]

        # Calculate the number of relevant items in the top-K items for the user
        relevant_items_in_top_k = np.isin(top_k_items, np.where(ground_truth[user, :] == 1)[0]).sum()

        # Calculate the total number of relevant items for the user
        total_relevant_items = ground_truth[user, :].sum()

        # Precision@K and Recall@K for this user
        precision_at_k += relevant_items_in_top_k / k
        recall_at_k += relevant_items_in_top_k / total_relevant_items if total_relevant_items > 0 else 0

    # Average Precision@K and Recall@K over all users
    precision_at_k /= num_users
    recall_at_k /= num_users

    return precision_at_k, recall_at_k


In [75]:

#Precision and recall @k for user- and item-based CF
precision_user_k, recall_user_k = precision_recall_at_k(user_prediction, test_data_matrix, k=10)
precision_item_k, recall_item_k = precision_recall_at_k(item_prediction, test_data_matrix, k=10)

print('User-based CF Precision@K:', precision_user_k)
print('User-based CF Recall@K:', recall_user_k)
print('Item-based CF Precision@K:', precision_item_k)
print('Item-based CF Recall@K:', recall_item_k)

User-based CF Precision@K: 0.0565577953559609
User-based CF Recall@K: 0.2906756989169525
Item-based CF Precision@K: 0.05567746874202883
Item-based CF Recall@K: 0.26399850355796506


In [76]:
def mean_average_precision_at_k(predictions, ground_truth, k=10):
    map_at_k = 0
    n_users = predictions.shape[0]
    for user in range(n_users):
        predicted_ranks = np.argsort(predictions[user])[::-1][:k]
        true_items = np.where(ground_truth[user] == 1)[0]
        hits = 0
        sum_precisions = 0
        for i, item in enumerate(predicted_ranks):
            if item in true_items:
                hits += 1
                sum_precisions += hits / (i + 1)
        map_at_k += sum_precisions / min(len(true_items), k) if len(true_items) > 0 else 0
    return map_at_k / n_users

In [81]:
mean_average_precision_at_k(item_prediction, test_data_matrix)

0.1631089784621003

In [83]:
def create_recommendations_csv(predictions_matrix,name='recommendations.csv'):
    cf_recommendations = []
    for user_id in range(predictions_matrix.shape[0]):
        # Indices of the top 10 items for each user (based on predicted interactions)
        top_10_items = np.argsort(predictions_matrix[user_id])[::-1][:10]  # Sorting them in descending order
        top_10_items_str = " ".join(map(str, top_10_items))

        # Append the user_id and the recommendations as a string
        cf_recommendations.append([user_id, top_10_items_str])

    recommendations_df = pd.DataFrame(cf_recommendations, columns=['user_id', 'recommendation'])
    recommendations_df.to_csv(name, index=False)
    print("Recommendations CSV created successfully.")
    return recommendations_df

In [84]:
create_recommendations_csv(user_prediction)

Recommendations CSV created successfully.


Unnamed: 0,user_id,recommendation
0,0,5 9 10 8 11 14 15 12 4 13
1,1,1573 35 36 33 32 31 37 29 30 34
2,2,67 45 75 64 82 56 49 53 58 46
3,3,139 133 156 143 142 128 155 138 40 149
4,4,195 199 196 197 201 193 203 191 198 202
...,...,...
7833,7833,3151 5118 3167 3245 5117 5116 3910 3121 7322 975
7834,7834,7127 3019 101 7129 13890 8413 8999 15276 7128 ...
7835,7835,8367 2125 8999 3019 45 53 1367 11126 4820 3055
7836,7836,14851 1397 7115 14555 10914 14557 8999 611 347...


In [85]:
from google.colab import files
files.download('recommendations.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>