! If running only combined TF-IDF and CF, run only the cells with indications (otherwise not enough RAM)

TF-IDF+CF: run until next text cell

In [1]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
filepath='https://raw.githubusercontent.com/fedeamedeo/machine-learning-project/main/Data/'
interactions = pd.read_csv(filepath + 'interactions_train.csv')
items=pd.read_csv(filepath + 'items.csv')

In [3]:
n_users = interactions.u.nunique()
n_items = items.i.nunique()

In [4]:
interactions = interactions.sort_values(["u", "t"])

In [5]:
interactions["pct_rank"] = interactions.groupby("u")["t"].rank(pct=True, method='dense')
interactions.reset_index(inplace=True, drop=True)
interactions.head(10)

Unnamed: 0,u,i,t,pct_rank
0,0,0,1680191000.0,0.04
1,0,1,1680783000.0,0.08
2,0,2,1680801000.0,0.12
3,0,3,1683715000.0,0.16
4,0,3,1683715000.0,0.2
5,0,4,1686569000.0,0.24
6,0,5,1687014000.0,0.28
7,0,6,1687014000.0,0.32
8,0,7,1687014000.0,0.36
9,0,8,1687260000.0,0.4


In [6]:
train_data = interactions[interactions["pct_rank"] < 0.8]
test_data = interactions[interactions["pct_rank"] >= 0.8]

In [7]:
import numpy as np

# Define a function to create the data matrix
def create_data_matrix(data, n_users, n_items):
    """
    This function returns a numpy matrix with shape (n_users, n_items).
    Each entry is a binary value indicating positive interaction.
    """
    data_matrix = np.zeros((n_users, n_items))
    data_matrix[data["u"].values, data["i"].values] = 1
    return data_matrix

In [8]:
# Create the training and testing matrices
train_data_matrix = create_data_matrix(train_data, n_users, n_items)
test_data_matrix = create_data_matrix(test_data, n_users, n_items)

# Display the matrices to understand their structure
print('train_data_matrix')
print(train_data_matrix)
print("number of non-zero values: ", np.sum(train_data_matrix))
print('test_data_matrix')
print(test_data_matrix)
print("number of non-zero values: ", np.sum(test_data_matrix))

train_data_matrix
[[1. 1. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
number of non-zero values:  49689.0
test_data_matrix
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
number of non-zero values:  19409.0


# Collaborative Filtering item-based

In [9]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute the item-item similarity matrix
# Check sklearn documentation for cosine similarity to understand how we are
# using this function.
item_similarity = cosine_similarity(train_data_matrix.T)
print("Item-Item Similarity Matrix:")
print(item_similarity)
print(item_similarity.shape)

Item-Item Similarity Matrix:
[[1.         0.40824829 0.33333333 ... 0.         0.         0.        ]
 [0.40824829 1.         0.40824829 ... 0.         0.         0.        ]
 [0.33333333 0.40824829 1.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 1.         0.         0.        ]
 [0.         0.         0.         ... 0.         1.         0.        ]
 [0.         0.         0.         ... 0.         0.         1.        ]]
(15291, 15291)


In [10]:
import numpy as np

# Define the function to predict interactions based on item similarity
def item_based_predict(interactions, similarity, epsilon=1e-9):
    """
    Predicts user-item interactions based on item-item similarity.
    Parameters:
        interactions (numpy array): The user-item interaction matrix.
        similarity (numpy array): The item-item similarity matrix.
        epsilon (float): Small constant added to the denominator to avoid division by zero.
    Returns:
        numpy array: The predicted interaction scores for each user-item pair.
    """
    # np.dot does the matrix multiplication. Here we are calculating the
    # weighted sum of interactions based on item similarity
    pred = similarity.dot(interactions.T) / (similarity.sum(axis=1)[:, np.newaxis] + epsilon)
    return pred.T  # Transpose to get users as rows and items as columns



TF-IDF+CF: stop here

In [None]:
# Calculate the item-based predictions for positive interactions
item_prediction = item_based_predict(train_data_matrix, item_similarity)
print("Predicted Interaction Matrix:")
print(item_prediction)
print(item_prediction.shape)

# User-user CF

In [11]:
# Compute the user-user similarity matrix
user_similarity = cosine_similarity(train_data_matrix)
print("User-User Similarity Matrix:")
print(user_similarity)

# Check the shape as a sanity check
print("Shape of User Similarity Matrix:", user_similarity.shape)

User-User Similarity Matrix:
[[1. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 0. 1.]]
Shape of User Similarity Matrix: (7838, 7838)


In [12]:
# Define the function to predict interactions based on user similarity
def user_based_predict(interactions, similarity, epsilon=1e-9):
    """
    Predicts user-item interactions based on user-user similarity.
    Parameters:
        interactions (numpy array): The user-item interaction matrix.
        similarity (numpy array): The user-user similarity matrix.
        epsilon (float): Small constant added to the denominator to avoid division by zero.
    Returns:
        numpy array: The predicted interaction scores for each user-item pair.
    """
    # Calculate the weighted sum of interactions based on user similarity
    pred = similarity.dot(interactions) / (np.abs(similarity).sum(axis=1)[:, np.newaxis] + epsilon)
    return pred

# Calculate the user-based predictions for positive interactions
user_prediction = user_based_predict(train_data_matrix, user_similarity)
print("Predicted Interaction Matrix (User-Based):")
print(user_prediction)
print(user_prediction.shape)

Predicted Interaction Matrix (User-Based):
[[0.13171649 0.13259591 0.14274867 ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]
(7838, 15291)


In [13]:
def mean_average_precision_at_k(predictions, ground_truth, k=10):
    map_at_k = 0
    n_users = predictions.shape[0]
    for user in range(n_users):
        predicted_ranks = np.argsort(predictions[user])[::-1][:k]
        true_items = np.where(ground_truth[user] == 1)[0]
        hits = 0
        sum_precisions = 0
        for i, item in enumerate(predicted_ranks):
            if item in true_items:
                hits += 1
                sum_precisions += hits / (i + 1)
        map_at_k += sum_precisions / min(len(true_items), k) if len(true_items) > 0 else 0
    return map_at_k / n_users

In [14]:
mean_average_precision_at_k(user_prediction, test_data_matrix)

0.1800190543196756

In [15]:
def create_recommendations_csv(predictions_matrix,name='recommendations.csv'):
    user_based_recommendations = []
    for user_id in range(predictions_matrix.shape[0]):
        # Indices of the top 10 items for each user (based on predicted interactions)
        top_10_items = np.argsort(predictions_matrix[user_id])[-10:][::-1]  # Sorting them in descending order

        top_10_items_str = " ".join(map(str, top_10_items))

        # Append the user_id and the recommendations as a string
        user_based_recommendations.append([user_id, top_10_items_str])

    recommendations_df = pd.DataFrame(user_based_recommendations, columns=['user_id', 'recommendation'])
    recommendations_df.to_csv(name, index=False)
    print("Recommendations CSV created successfully.")
    return recommendations_df

In [16]:
create_recommendations_csv(user_prediction)

Recommendations CSV created successfully.


Unnamed: 0,user_id,recommendation
0,0,13 4 12 15 14 11 8 10 9 5
1,1,34 30 29 37 31 32 33 36 35 1573
2,2,46 58 53 49 56 82 64 75 45 67
3,3,149 40 138 155 128 142 143 156 133 139
4,4,202 198 191 203 193 201 197 196 199 195
...,...,...
7833,7833,975 7322 3121 3910 5116 5117 3245 3167 5118 3151
7834,7834,13891 7128 15276 8999 8413 13890 7129 101 3019...
7835,7835,3055 4820 11126 1367 53 45 3019 8999 2125 8367
7836,7836,14550 3471 611 8999 14557 10914 14555 7115 139...


In [17]:
from google.colab import files
files.download('recommendations.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# TF-IDF

TF-IFD+CF: run from here

In [28]:
filepath = 'https://raw.githubusercontent.com/fedeamedeo/machine-learning-project/main/'
items= pd.read_csv(filepath+'items_improved.csv')

In [34]:
# Load book metadata
items['content'] = items['Title'].fillna('') + ' ' + \
                   items['Author'].fillna('') + ' ' + \
                   items['Subjects'].fillna('') + ' ' + \
                   items['Publisher'].fillna('')

In [76]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

# Download French stopwords if not already
import nltk
nltk.download('stopwords')

# Load French stopwords as a list
french_stopwords = stopwords.words('french')
vectorizer = TfidfVectorizer(stop_words=french_stopwords, max_features=1000000)
tfidf_matrix = vectorizer.fit_transform(items['content'])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [77]:
from scipy.sparse import csr_matrix

# Merge interactions and items to match on 'i'
merged = interactions.merge(items[['i']], on='i')

# Create a user profile: mean of TF-IDF vectors of books they've interacted with
user_profiles = {}
for user_id, group in merged.groupby('u'):
    indices = group['i'].values
    profile = tfidf_matrix[indices].mean(axis=0).A1
    user_profiles[user_id] = profile


In [78]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

recommendations = []

for user_id, profile in user_profiles.items():
    # Reshape profile to ensure compatibility with tfidf_matrix
    sims = cosine_similarity(profile.reshape(1, -1), tfidf_matrix).flatten()
    top_indices = np.argsort(-sims)[:10]  # top 10
    book_ids = items.iloc[top_indices]['i'].tolist()
    recommendations.append((user_id, book_ids))


In [79]:
tfidf_sims=cosine_similarity(tfidf_matrix)

TF-IDF+CF: stop here

In [80]:
import csv

with open("tf_idf.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["user_id", "recommendation"])
    for user_id, book_ids in recommendations:
        writer.writerow([user_id, " ".join(map(str, book_ids))])

In [81]:
from google.colab import files
files.download('tf_idf.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Embeddings


In [26]:
#create text embeddings
!pip install sentence-transformers
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [27]:
item_embeddings = model.encode(items['content'].tolist(), show_progress_bar=True)

Batches:   0%|          | 0/478 [00:00<?, ?it/s]

In [29]:
item_embeddings_df = pd.DataFrame(item_embeddings)
item_embeddings_df.to_csv('item_embeddings.csv', index=False) # Save to CSV
from google.colab import files
files.download('item_embeddings.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
items.head()

In [30]:
embedding_similarity = cosine_similarity(item_embeddings)

In [None]:
embedding_similarity.shape

In [None]:
item_embeddings.shape

In [None]:
sims.shape

In [31]:
# Step 1: Compute user profile embeddings
def compute_user_profiles(train_data_matrix, item_embeddings):
    n_users = train_data_matrix.shape[0]
    embedding_dim = item_embeddings.shape[1]
    user_profiles = np.zeros((n_users, embedding_dim))

    for user_id in range(n_users):
        interacted_items = np.where(train_data_matrix[user_id] > 0)[0]
        if len(interacted_items) > 0:
            user_profiles[user_id] = item_embeddings[interacted_items].mean(axis=0)
        else:
            # Optional: handle cold-start users by using the mean embedding or zeros
            user_profiles[user_id] = np.zeros(embedding_dim)

    return user_profiles

In [32]:
# Step 2: Compute similarity between user profiles and all item embeddings
def predict_scores(user_profiles, item_embeddings):
    return cosine_similarity(user_profiles, item_embeddings)

In [25]:
# Step 3: Generate top-N recommendations
def generate_top_n_recommendations(score_matrix, top_n=10):
    recommendations = []
    for user_id in range(score_matrix.shape[0]):
        top_items = np.argsort(score_matrix[user_id])[::-1][:top_n]
        recommendations.append([user_id, " ".join(map(str, top_items))])
    return recommendations

In [34]:
user_profiles = compute_user_profiles(train_data_matrix, item_embeddings)
score_matrix = predict_scores(user_profiles, item_embeddings)
recommendations = generate_top_n_recommendations(score_matrix, top_n=10)

# To convert to a DataFrame for submission or display:
import pandas as pd
recommendations_df = pd.DataFrame(recommendations, columns=["user_id", "recommendation"])
recommendations_df.to_csv("embedding_based_recommendations.csv", index=False)


In [35]:
files.download('embedding_based_recommendations.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Combining TF-IDF and CF

In [19]:

def map10(predictions, true_positives, k=10):
    """
    Calculates the Mean Average Precision@K (MAP@K) metric.

    Args:
        predictions (numpy.ndarray): A matrix of predicted relevance scores
                                      (users x items).
        true_positives (numpy.ndarray): A binary matrix of true positive interactions
                                        (users x items).
        k (int): The cutoff value for the top-k recommendations.

    Returns:
        float: The MAP@K score.
    """
    map_at_k = 0
    n_users = predictions.shape[0]

    for user in range(n_users):
        predicted_ranks = np.argsort(predictions[user])[::-1][:k]  # Top k predictions
        true_positives_in_rank = np.isin(predicted_ranks, np.where(true_positives[user] == 1)[0])

        precision_at_i = []
        for i in range(k):
            if true_positives_in_rank[i]:  # Check if item at rank i is a true positive
                precision_at_i.append(np.sum(true_positives_in_rank[:i+1]) / (i+1))

        if len(precision_at_i) > 0:  # To avoid division by zero if no relevant items are predicted
            average_precision = np.sum(precision_at_i) / np.sum(true_positives[user])
        else:
            average_precision = 0

        map_at_k += average_precision
        map_at_k /= n_users  # Average over all users
        return map_at_k

In [21]:
alpha=0.8

In [20]:
alphas = np.linspace(0, 1, 11)  # Try alpha from 0.0 to 1.0 in steps of 0.1
results = []

for alpha in alphas:
    # Blend similarity matrices
    combined_similarity = alpha * item_similarity + (1 - alpha) * tfidf_sims

    # Generate predictions using the hybrid similarity
    hybrid_prediction = item_based_predict(train_data_matrix, combined_similarity)

    # Evaluate using your pre-defined MAP@10 function
    score = map10(test_data_matrix, hybrid_prediction)  # or whatever your function is named
    results.append((alpha, score))
    print(f"Alpha: {alpha:.2f}, MAP@10: {score:.5f}")

Alpha: 0.00, MAP@10: 0.00000


KeyboardInterrupt: 

In [22]:
combined_similarity = alpha * item_similarity + (1 - alpha) * tfidf_sims

In [23]:
hybrid_prediction = item_based_predict(train_data_matrix, combined_similarity)

In [26]:
hybrid_recs=generate_top_n_recommendations(hybrid_prediction, top_n=10)

In [27]:
hybrid_recs_df=pd.DataFrame(hybrid_recs, columns=['user_id','recommendation'])
hybrid_recs_df.to_csv('TF_IDF_and_CF.csv', index=False)
from google.colab import files
files.download('TF_IDF_and_CF.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Last time i tried, alpha = 0.8 had a score of 0.1258 (still lower than TF_IDF alone)