# Importing the libraries

In [1]:
import pandas as pd
import numpy as np
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
import time
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from sklearn.decomposition import TruncatedSVD
from nltk.corpus import stopwords
import re
import csv
import warnings
warnings.filterwarnings('ignore')

# Collaborative Filtering (first attempts)


In [2]:
# Read the raw interaction log and the item catalogue.
interactions = pd.read_csv('kaggle_data/interactions_train.csv')
items = pd.read_csv('kaggle_data/items.csv')

# Order events by user then `t`, and attach each event's dense percentile
# rank of `t` within its user (pct_rank).
interactions = interactions.sort_values(["u", "t"])
interactions = interactions.assign(
    pct_rank=interactions.groupby("u")["t"].rank(pct=True, method='dense')
).reset_index(drop=True)

# Per-user temporal split: rows with pct_rank below 0.8 are used for
# training, the remaining rows for testing.
train_interactions = interactions.loc[interactions["pct_rank"] < 0.8]
test_interactions = interactions.loc[interactions["pct_rank"] >= 0.8]

print("Training set size:", len(train_interactions))
print("Testing set size:", len(test_interactions))

Training set size: 65419
Testing set size: 21628


In [3]:
# User x item matrix: 1 where the user has at least one training interaction
# with the item, 0 otherwise. The per-cell count is only used to detect presence.
binary_interaction_matrix = (
    train_interactions
    .pivot_table(index='u', columns='i', values='t', aggfunc='count')
    .notnull()
    .astype(int)
)

binary_interaction_matrix

i,0,1,2,3,4,5,6,7,8,9,...,15279,15280,15282,15283,15284,15285,15287,15288,15289,15290
u,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7833,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7834,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7835,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7836,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## User To User

In [None]:
# Pairwise cosine similarity between the user rows of the interaction matrix,
# labelled on both axes with the user ids.
user_similarity = cosine_similarity(binary_interaction_matrix)
user_ids = binary_interaction_matrix.index
user_similarity_df = pd.DataFrame(user_similarity, index=user_ids, columns=user_ids)
user_similarity_df

u,0,1,2,3,4,5,6,7,8,9,...,7828,7829,7830,7831,7832,7833,7834,7835,7836,7837
u,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7833,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
7834,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
7835,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
7836,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [5]:
k = 10

# Score every (user, item) pair at once: similarity-weighted sum of the
# interaction rows of all users.                                     -> (U, I)
scores = np.matmul(user_similarity_df.to_numpy(), binary_interaction_matrix.to_numpy())

# Column positions of each user's k highest scores, in arbitrary order. -> (U, k)
top_idx_unsorted = np.argpartition(-scores, k - 1, axis=1)[:, :k]

# Order those k candidates by descending score so that column 0 is rank 0.
top_scores = np.take_along_axis(scores, top_idx_unsorted, axis=1)
order = np.argsort(-top_scores, axis=1)
top_idx = np.take_along_axis(top_idx_unsorted, order, axis=1)

# Translate column positions into item labels and wrap them in a frame with
# one row per user and one column per rank (0 ... k-1).
item_labels = binary_interaction_matrix.columns.to_numpy()
top_labels = item_labels[top_idx]
recommendations = pd.DataFrame(
    top_labels,
    index=binary_interaction_matrix.index,
    columns=range(k),
)

# quick peek
print(recommendations.head())

     0    1    2    3    4    5    6    7    8     9
u                                                   
0   13    4   12   15   14   11    8   10    9     5
1   34   30   29   37   31   32   33   36   35  1573
2   46   58   53   49   56   82   64   75   45    67
3  149   40  138  155  128  142  143  156  133   139
4  202  198  191  203  193  201  197  196  199   195


In [6]:
def evaluate_precision_recall(recommendations, test_interactions, k=10):
    """Print and return mean precision@k and recall@k over users.

    Parameters
    ----------
    recommendations : pandas.DataFrame
        One row per user (the index holds the user id); the columns hold the
        recommended item ids ordered by rank, best first.
    test_interactions : pandas.DataFrame
        Held-out interactions with at least the columns 'u' (user id) and
        'i' (item id).
    k : int, default 10
        Cut-off: only the first ``k`` columns of each row are scored.

    Returns
    -------
    tuple of (float, float)
        ``(avg_precision, avg_recall)`` averaged over the users that appear
        both in ``recommendations`` and in ``test_interactions``; ``(0, 0)``
        when there is no such user.

    Raises
    ------
    ValueError
        If ``k`` is not a positive number.
    """
    if k <= 0:
        raise ValueError("k must be a positive integer")

    # Create a mapping from user to the set of items they interacted with in the test set
    test_user_items = test_interactions.groupby('u')['i'].apply(set)

    precisions = []
    recalls = []

    # Calculate precision and recall for each user with test interactions
    for user, rec_items in recommendations.iterrows():
        if user not in test_user_items.index:
            continue

        relevant_items = test_user_items[user]
        # Honour the cut-off: only the k best-ranked recommendations count.
        rec_set = set(rec_items.values[:k])
        true_positives = len(rec_set.intersection(relevant_items))

        # precision@k divides by k, not by the number of distinct
        # recommendations, so a list that repeats one item cannot score 1.0.
        precisions.append(true_positives / k)
        recalls.append(true_positives / len(relevant_items) if len(relevant_items) > 0 else 0)

    # Compute average precision and recall over all evaluated users
    avg_precision = sum(precisions) / len(precisions) if precisions else 0
    avg_recall = sum(recalls) / len(recalls) if recalls else 0

    print("Average Precision at {}: {}".format(k, avg_precision))
    print("Average Recall at {}: {}".format(k, avg_recall))

    return avg_precision, avg_recall

In [7]:
# Score the user-to-user top-10 lists against the held-out test interactions.
evaluate_precision_recall(recommendations, test_interactions, k=10)

Average Precision at 10: 0.05646848685889258
Average Recall at 10: 0.290439819786585


(0.05646848685889258, 0.290439819786585)

## Item to Item

In [8]:
# Pairwise cosine similarity between items: transpose the user x item matrix
# so that each row is one item, then label both axes with the item ids.
item_similarity = cosine_similarity(binary_interaction_matrix.T)
item_ids = binary_interaction_matrix.columns
item_similarity_df = pd.DataFrame(item_similarity, index=item_ids, columns=item_ids)
item_similarity_df

i,0,1,2,3,4,5,6,7,8,9,...,15279,15280,15282,15283,15284,15285,15287,15288,15289,15290
i,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.000000,0.408248,0.333333,0.288675,0.109109,0.235702,0.288675,0.288675,0.218218,0.218218,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.408248,1.000000,0.408248,0.353553,0.133631,0.288675,0.353553,0.353553,0.267261,0.267261,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.333333,0.408248,1.000000,0.288675,0.109109,0.235702,0.288675,0.288675,0.218218,0.218218,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.288675,0.353553,0.288675,1.000000,0.094491,0.204124,0.250000,0.250000,0.188982,0.188982,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.109109,0.133631,0.109109,0.094491,1.000000,0.077152,0.094491,0.094491,0.071429,0.142857,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15285,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
15287,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
15288,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
15289,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [9]:
k = 10

# Score every (user, item) pair: each user's interaction row (U, I) times the
# item-item similarity matrix (I, I).                                  -> (U, I)
scores = np.matmul(binary_interaction_matrix.to_numpy(), item_similarity_df.to_numpy())

# Column positions of each user's k highest scores, in arbitrary order. -> (U, k)
top_idx_unsorted = np.argpartition(-scores, k - 1, axis=1)[:, :k]

# Order those k candidates by descending score so that column 0 is rank 0.
top_scores = np.take_along_axis(scores, top_idx_unsorted, axis=1)
order = np.argsort(-top_scores, axis=1)
top_idx = np.take_along_axis(top_idx_unsorted, order, axis=1)

# Translate column positions into item labels (the matrix column labels).
item_labels = binary_interaction_matrix.columns.to_numpy()
top_labels = item_labels[top_idx]

# One row per user, one column per rank (0 ... k-1).
recommendations = pd.DataFrame(
    top_labels,
    index=binary_interaction_matrix.index,
    columns=range(k),
)

print(recommendations.head())

     0    1    2    3    4    5    6    7    8     9
u                                                   
0   18   11   14   15    1   12    0    2    6     3
1   33   36   31   35   32   29   30   37   34  2988
2   80   76   84   54   86   73   60   77   47    50
3  123  132  157  134  130  150  151  124  152   117
4  192  200  204  194  195  196  199  203  197   191


In [10]:
# Score the item-to-item top-10 lists against the held-out test interactions
# (k is left at the function's default of 10).
evaluate_precision_recall(recommendations, test_interactions)

Average Precision at 10: 0.05931360040826742
Average Recall at 10: 0.2816780249109841


(0.05931360040826742, 0.2816780249109841)

In [None]:
# Convert each row of recommendations to a space-separated string
recommendations_str = recommendations.apply(lambda row: ' '.join(row.astype(str)), axis=1)

# Export to CSV with the same two-column layout ('user_id', 'recommendation')
# used by the other exports in this notebook; without index_label the index
# column would be written under the frame's own index name instead.
recommendations_str.to_csv(
    'recommendation_CF_item_item.csv',
    index=True,
    header=['recommendation'],
    index_label='user_id'
)

# Collaborative Filtering (the best scores)
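Recommendations based on user-item interactions, where each interaction is weighted by its recency (the percentile rank of its timestamp within the user's history).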

In [12]:
# Enable or disable the upscaling of individual book weights for users with low book counts.
# When True, the cells below set the interaction weight to 1 for those users.
upscale_low_book = True

# Define the threshold for the number of books below which upscaling will be applied.
# The comparison used below is inclusive: n_items <= threshold_low_book.
threshold_low_book = 2

In [None]:
# Load the data
interactions = pd.read_csv('kaggle_data/interactions_train.csv')
items = pd.read_csv('kaggle_data/items.csv')

# Sort by user then `t`, and add the dense percentile rank of `t` within
# each user as `pct_rank`.
interactions = interactions.sort_values(["u", "t"])
interactions = interactions.assign(
    pct_rank=interactions.groupby("u")["t"].rank(pct=True, method='dense')
).reset_index(drop=True)

# Split the data into training and testing sets: pct_rank below 0.8 goes to
# training, everything else to testing.
train_interactions = interactions.loc[interactions["pct_rank"] < 0.8]
test_interactions = interactions.loc[interactions["pct_rank"] >= 0.8]

print("Training set size:", len(train_interactions))
print("Testing set size:", len(test_interactions))

Training set size: 65419
Testing set size: 21628


In [None]:
# Check the loaded data BEFORE it is used below.
# Ensure the train_interactions dataset contains the required columns and is not empty
assert 'u' in train_interactions.columns and 'i' in train_interactions.columns and 't' in train_interactions.columns, "train_interactions file missing required columns"
assert not train_interactions.empty, "train_interactions dataset is empty"

# Ensure the items dataset contains the required columns and is not empty
assert 'Title' in items.columns and 'i' in items.columns, "Items file missing required columns"
assert not items.empty, "Items dataset is empty"

# Aggregate train_interactions to calculate the number of unique items each user has interacted with
train_interactions_agg = train_interactions.groupby('u').agg(
    {
        'i': pd.Series.nunique,
    }
).reset_index().rename(
    columns={
        'i': 'n_items'
    }
)

# Following adjustments
train_interactions = train_interactions.sort_values(["u", "t"], ascending=[True, False])

# Calculate the percentile rank of each interaction within a user's train_interactions.
# The rank is taken on the value of `t`, so larger `t` gets a larger rank.
train_interactions["pct_rank"] = train_interactions.groupby("u")["t"].rank(pct=True, method='dense')
train_interactions.reset_index(inplace=True, drop=True)

# Merge the train_interactions dataset with the aggregated data to include the number of unique items per user.
# A stale 'n_items' column (left by a previous run of this cell) is dropped first:
# otherwise the merge would produce n_items_x / n_items_y and the lookup below would fail.
train_interactions = (
    train_interactions
    .drop(columns=['n_items'], errors='ignore')
    .merge(train_interactions_agg, on='u', how='left')
)
train_interactions['weight'] = train_interactions["pct_rank"]

# If the upscale_low_book flag is True, assign a weight of 1 to train_interactions for users with a number of unique items less than or equal to the threshold
if upscale_low_book:
    train_interactions.loc[train_interactions['n_items'] <= threshold_low_book, 'weight'] = 1

inter_data = train_interactions.copy()

# Create the user x item matrix of summed weights. The name says "binary" for
# continuity with the earlier cells, but the entries here are weights.
binary_interaction_matrix = inter_data.pivot_table(index='u', columns='i', values='weight', aggfunc='sum')
binary_interaction_matrix = binary_interaction_matrix.fillna(0)  # Fill NaN with 0

# Compute cosine similarity and test symmetry
user_similarity = cosine_similarity(binary_interaction_matrix)
user_similarity_df = pd.DataFrame(user_similarity, index=binary_interaction_matrix.index, columns=binary_interaction_matrix.index)

assert np.allclose(user_similarity_df, user_similarity_df.T), "User similarity matrix is not symmetric"

# Top-10 Recommendations from this approach
k = 10
# Similarity-weighted sum of all users' rows -> one score per (user, item).
scores = user_similarity_df.values @ binary_interaction_matrix.values
# Positions of the k best scores per user (unordered), then ordered by descending score.
top_idx_unsorted = np.argpartition(-scores, k-1, axis=1)[:, :k]
rows = np.arange(scores.shape[0])[:, None]
order = np.argsort(-scores[rows, top_idx_unsorted], axis=1)
top_idx = top_idx_unsorted[rows, order]
# Map column positions back to item labels.
item_labels = binary_interaction_matrix.columns.to_numpy()
top_labels = item_labels[top_idx]
recommendations = pd.DataFrame(top_labels, index=binary_interaction_matrix.index, columns=range(k))
assert recommendations.shape[0] == binary_interaction_matrix.shape[0], "Number of recommendation rows differs from number of users"
assert recommendations.shape[1] == k, "Each user should have 10 recommendations"

In [15]:
# Score the weighted user-to-user top-10 lists against the held-out test
# interactions (k is left at the function's default of 10).
evaluate_precision_recall(recommendations, test_interactions)

Average Precision at 10: 0.06033426894615974
Average Recall at 10: 0.28928699997495383


(0.06033426894615974, 0.28928699997495383)

In [None]:
# Refit on full dataset

# Check the loaded data BEFORE it is used below.
# Ensure the interactions dataset contains the required columns and is not empty
assert 'u' in interactions.columns and 'i' in interactions.columns and 't' in interactions.columns, "Interactions file missing required columns"
assert not interactions.empty, "Interactions dataset is empty"

# Ensure the items dataset contains the required columns and is not empty
assert 'Title' in items.columns and 'i' in items.columns, "Items file missing required columns"
assert not items.empty, "Items dataset is empty"

# interactions agg to get unique books
interactions_agg = interactions.groupby('u').agg(
    {
        'i': pd.Series.nunique,
    }
).reset_index().rename(
    columns={
        'i': 'n_items'
    }
)

# Percentile rank of each interaction within its user, over ALL interactions.
# The rank is taken on the value of `t`, so larger `t` gets a larger rank.
interactions = interactions.sort_values(["u", "t"], ascending=[True, False])
interactions["pct_rank"] = interactions.groupby("u")["t"].rank(pct=True, method='dense')
interactions.reset_index(inplace=True, drop=True)

# Merge the interactions dataset with the aggregated data to include the number of unique items per user.
# A stale 'n_items' column (left by a previous run of this cell) is dropped first:
# otherwise the merge would produce n_items_x / n_items_y and the lookup below would fail.
interactions = (
    interactions
    .drop(columns=['n_items'], errors='ignore')
    .merge(interactions_agg, on='u', how='left')
)
interactions['weight'] = interactions["pct_rank"]

# If the upscale_low_book flag is True, assign a weight of 1 to interactions of users with a number of unique items less than or equal to the threshold
if upscale_low_book:
    interactions.loc[interactions['n_items'] <= threshold_low_book, 'weight'] = 1
train_data = interactions.copy()

# Create the user x item matrix of summed weights. The name says "binary" for
# continuity with the earlier cells, but the entries here are weights.
binary_interaction_matrix = train_data.pivot_table(index='u', columns='i', values='weight', aggfunc='sum')
binary_interaction_matrix = binary_interaction_matrix.fillna(0)  # Fill NaN with 0

# Compute cosine similarity and test symmetry
user_similarity = cosine_similarity(binary_interaction_matrix)
user_similarity_df = pd.DataFrame(user_similarity, index=binary_interaction_matrix.index, columns=binary_interaction_matrix.index)
# The similarity matrix should be symmetric
assert np.allclose(user_similarity_df, user_similarity_df.T), "User similarity matrix is not symmetric"

# Top-10 Recommendations from this approach
k = 10
# Similarity-weighted sum of all users' rows -> one score per (user, item).
scores = user_similarity_df.values @ binary_interaction_matrix.values
# Positions of the k best scores per user (unordered), then ordered by descending score.
top_idx_unsorted = np.argpartition(-scores, k-1, axis=1)[:, :k]
rows = np.arange(scores.shape[0])[:, None]
order = np.argsort(-scores[rows, top_idx_unsorted], axis=1)
top_idx = top_idx_unsorted[rows, order]
# Map column positions back to item labels.
item_labels = binary_interaction_matrix.columns.to_numpy()
top_labels = item_labels[top_idx]
recommendations = pd.DataFrame(top_labels, index=binary_interaction_matrix.index, columns=range(k))
assert recommendations.shape[0] == binary_interaction_matrix.shape[0], "Number of recommendation rows differs from number of users"
assert recommendations.shape[1] == k, "Each user should have 10 recommendations"

In [16]:
# Flatten each user's ranked item ids into one space-separated string.
recommendations_str = recommendations.astype(str).apply(' '.join, axis=1)

# Pick the output file according to whether low-book-count upscaling was used,
# then write a two-column CSV: 'user_id', 'recommendation'.
if upscale_low_book:
    output_path = f'recommendations_collab_weight_pct_upscale_1_nitems_{threshold_low_book}.csv'
else:
    output_path = 'recommendations_collab_weight_pct.csv'

recommendations_str.to_csv(
    output_path,
    index=True,
    header=['recommendation'],
    index_label='user_id'
)

# recommendations_collab_weight_pct_upscale_1_nitems_2.csv = 0.1642 (the best score)
# recommendations_collab_weight_pct.csv = 0.1637 (the second best score): without upscaling

# TFIDF-Based Book Recommendations (not cleaned dataset)
This section demonstrates how to generate book recommendations for each user with TF-IDF: every book in the catalogue is scored by its similarity to the books the user has already read.

In [17]:
# Load book complete data (the enriched book metadata used to build the TF-IDF features)
books = pd.read_csv('kaggle_data/books_complete.csv')

# Download the NLTK stopwords dataset (a no-op when the package is already up to date)
nltk.download('stopwords')

# Load the list of French stop words from the NLTK stopwords corpus;
# it is passed to the TF-IDF vectorizer further down
french_stop_words = stopwords.words('french')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/leonardogreco/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
# Better embeddings + better input data
# Drop unnecessary columns from the books dataframe
df_relevant_cols = books.drop(
    columns = ["dewey_decimal","image",'image_original','dimensions_structured',
               'msrp','binding','edition','related','dimensions',]
)

# Each "combine" step below takes the first column of a pair when it is
# present and falls back to the second one otherwise (row-wise back-fill).
# `.bfill(axis=1)` replaces the deprecated `fillna(method='bfill', axis=1)`.
# All `str.replace` calls pass regex=False: the patterns are literal text, and
# "[" on its own is not a valid regular expression.

# Combine 'authors' and 'Author' columns, clean up the data
df_relevant_cols["prep_author"] = (
    df_relevant_cols[['authors', 'Author']].bfill(axis=1).iloc[:, 0]
    .str.replace("[", '', regex=False)
    .str.replace("]", '', regex=False)
    .str.replace("'", '', regex=False)
    .str.replace("unknown author", '', regex=False)
)

# Clean up the 'synopsis' column (strip HTML line breaks and newlines)
df_relevant_cols["prep_synopsis"] = (
    df_relevant_cols['synopsis']
    .str.replace("<br/>", " ", regex=False)
    .str.replace("\n", " ", regex=False)
    .str.replace("<br>", "", regex=False)
)

# Combine 'publisher' and 'Publisher' columns
df_relevant_cols["prep_publisher"] = df_relevant_cols[['publisher', 'Publisher']].bfill(axis=1).iloc[:, 0]

# Combine 'title_long' and 'Title' columns
df_relevant_cols["prep_title"] = df_relevant_cols[['title_long', 'Title']].bfill(axis=1).iloc[:, 0]

# Copy the 'language' column to 'prep_language'
df_relevant_cols["prep_language"] = df_relevant_cols['language']

# Clean up the 'subjects' column and store in 'prep_subjects'.
# The literal "['Subjects']" is treated as a missing value.
df_relevant_cols.loc[df_relevant_cols['subjects'] == "['Subjects']", 'subjects'] = pd.NA
df_relevant_cols['subjects'] = (
    df_relevant_cols['subjects']
    .str.replace("[", '', regex=False)
    .str.replace("]", '', regex=False)
    .str.replace("'", '', regex=False)
)
df_relevant_cols["prep_subjects"] = df_relevant_cols[['subjects', 'Subjects']].bfill(axis=1).iloc[:, 0]

# Convert 'pages' column to string; missing values become the text "<NA>"
df_relevant_cols['prep_pages'] = df_relevant_cols['pages'].astype('Int64').astype(str)

# Copy the 'ISBN Valid' column to 'prep_isbn'
df_relevant_cols['prep_isbn'] = df_relevant_cols['ISBN Valid']

# Copy the 'date_published' column to 'prep_published'
df_relevant_cols["prep_published"] = df_relevant_cols['date_published']

In [19]:
# Build the text that TF-IDF will see for each book: the selected prepared
# fields joined with single spaces, in this order.
# Not included (other features lead to a lower score):
#   prep_subjects, prep_synopsis, prep_language, prep_published
feature_parts = [
    df_relevant_cols['prep_title'].fillna(''),
    df_relevant_cols['prep_author'].fillna(''),
    df_relevant_cols['prep_isbn'].fillna(''),
    df_relevant_cols['prep_publisher'].fillna(''),
    # prep_pages is already text; blank out the "<NA>" placeholder
    df_relevant_cols['prep_pages'].str.replace("<NA>",""),
]

combined = feature_parts[0]
for part in feature_parts[1:]:
    combined = combined + ' ' + part

books['combined_features'] = combined

# Any remaining missing value becomes an empty document.
books['combined_features'] = books['combined_features'].fillna('')


In [20]:
# Re-derive the train/test split from the current `interactions` frame:
# rows with pct_rank below 0.8 are training rows, the rest are test rows.
train_interactions = interactions[interactions["pct_rank"] < 0.8]
test_interactions = interactions[interactions["pct_rank"] >= 0.8]

In [21]:

# Create a TF-IDF vectorizer using French stop words and fit-transform the 'combined_features' column of the books.
# The result has one row per row of `books`, in the same order.
tfidf_vectorizer = TfidfVectorizer(stop_words=french_stop_words)
tfidf_matrix = tfidf_vectorizer.fit_transform(books['combined_features'].fillna(''))

In [22]:
# naive_limit_bool = True, if you want to limit the recommendations, otherwise set it to False
naive_limit_bool = True

# Interactions used to build each user's reading profile.
# NOTE(review): this is the FULL `interactions` frame (as in the original code),
# which includes the rows of `test_interactions`. That suits a final submission,
# but any offline metric computed on these recommendations has seen the test
# items. Bind this to `train_interactions` for an unbiased evaluation.
profile_interactions = interactions

# Smallest and largest book id each user touched in the training split.
interactions_agg = train_interactions.groupby('u').agg(
    min_book_id = ('i', 'min'),
    max_book_id = ('i', 'max'),
).reset_index()

# Loop-invariant lookups, built once instead of scanning the frames per user/book.
bounds_by_user = interactions_agg.set_index('u')
book_row_by_id = {}
for row_label, catalogue_book_id in zip(books.index, books['i']):
    # keep the first row for a book id, as `books[books['i'] == id].index[0]` did
    book_row_by_id.setdefault(catalogue_book_id, row_label)
book_ids_by_row = books['i'].to_numpy()


# Compute recommendations for each user
user_recommendations = {}
for user_id, group in profile_interactions.groupby('u'):
    read_books = group['i'].values
    read_books_indices = [book_row_by_id[book_id] for book_id in read_books]
    read_books_tfidf = tfidf_matrix[read_books_indices]

    # Mean cosine similarity between every catalogue book and the user's books.
    similarity_scores = cosine_similarity(read_books_tfidf, tfidf_matrix)
    avg_similarity = np.mean(similarity_scores, axis=0)

    # Most similar first. NOTE(review): 15288 caps the candidate list; it looks
    # like (roughly) the catalogue size, confirm or replace with len(books).
    recommended_indices = avg_similarity.argsort()[-15288:][::-1]

    recommended_books = book_ids_by_row[recommended_indices]

    if naive_limit_bool:
        lower_bound = bounds_by_user.at[user_id, 'min_book_id']
        upper_bound = bounds_by_user.at[user_id, 'max_book_id']

        # Keep only candidates whose id lies in the user's own id range, with
        # the upper end widened by 11 ids and clamped at 15290.
        # NOTE(review): 15290 is presumably the largest book id, confirm.
        if upper_bound > lower_bound:
            upper_bound = min(upper_bound+11, 15290)
            recommended_books = recommended_books[(recommended_books >= lower_bound) & (recommended_books <= upper_bound)]

        if len(recommended_books) < 10:
            print(f"User {user_id} has less than 10 recommendations. Found: {len(recommended_books)}")
            print("upper_bound:", upper_bound)
            print("lower_bound:", lower_bound)

    user_recommendations[user_id] = recommended_books[:10]

In [23]:
# Turn the {user_id: recommended ids} dict into a frame with one row per user and score it.
# NOTE(review): user_recommendations was built by iterating over `interactions`
# (all rows, including those in test_interactions), so these metrics are
# likely optimistic compared with the train-only collaborative filtering runs.
evaluate_precision_recall(pd.DataFrame(user_recommendations).T, test_interactions, k=10)

Average Precision at 10: 0.13916815514161776
Average Recall at 10: 0.6400121269555963


(0.13916815514161776, 0.6400121269555963)

In [24]:
# Display recommendations for a sample user: the first key of the
# user_recommendations dict (insertion order).
sample_user_id = list(user_recommendations.keys())[0]
print(f"Recommendations for User {sample_user_id}:")
print(user_recommendations[sample_user_id])

Recommendations for User 0:
[ 3 18 17 15 14 13 20 19  7  6]


In [25]:
# Write one CSV row per user: the user id and the recommended book ids joined
# by single spaces, under the header 'user_id', 'recommendation'.
with open('recommendations_tfidf_naive_proper_input_less_features.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['user_id', 'recommendation'])
    writer.writerows(
        [uid, " ".join(str(book) for book in book_list)]
        for uid, book_list in user_recommendations.items()
    )

# Score = 0.1560
print("Recommendations exported to recommendations_tfidf_naive_proper_input_less_features.csv")

Recommendations exported to recommendations_tfidf_naive_proper_input_less_features.csv


In [26]:
# For each user, who has read only one unique book (does not matter once or several times), recommend 10 times the same book

# Training interactions ordered by user, then by `t` descending.
user_embeddings_df = (
    train_interactions
    .sort_values(by=["u", "t"], ascending=[True, False])
    .reset_index(drop=True)
)

# Running count, within each user, of distinct books seen so far in that order.
is_first_occurrence = ~user_embeddings_df.duplicated(subset=['u', 'i'])
user_embeddings_df['last_unique_books'] = is_first_occurrence.groupby(user_embeddings_df['u']).cumsum()

# Get users who have read exactly 1 unique book
unique_book_counts = user_embeddings_df.groupby('u')['i'].nunique()
users_with_1_unique_book = unique_book_counts[unique_book_counts.eq(1)].index

# One row per such user (repeat reads of the same book collapsed).
belongs_to_single_book_user = user_embeddings_df['u'].isin(users_with_1_unique_book)
u_1 = user_embeddings_df.loc[belongs_to_single_book_user].drop_duplicates(subset=['u', 'i'])
u_1

Unnamed: 0,u,i,t,pct_rank,last_unique_books
636,36,669,1.717176e+09,0.750000,1
1244,76,1345,1.685960e+09,0.666667,1
2322,178,2406,1.694169e+09,0.785714,1
2472,188,2577,1.701194e+09,0.600000,1
2767,218,611,1.689785e+09,0.666667,1
...,...,...,...,...,...
65295,7810,4426,1.726226e+09,0.727273,1
65314,7816,14826,1.727356e+09,0.666667,1
65320,7818,14555,1.726739e+09,0.714286,1
65354,7823,53,1.726938e+09,0.761905,1


In [27]:
# Override the TF-IDF list for users with a single unique book: recommend that
# book ten times. u_1 holds one (u, i) row per such user, so a single pass
# over its rows replaces the per-user re-filtering of the whole frame.
for user_id, book_id in zip(u_1['u'].to_numpy(), u_1['i'].to_numpy()):
    # Create a recommendation list with 10 copies of the book ID
    user_recommendations[user_id] = [book_id] * 10

In [28]:
# Write one CSV row per user: the user id and the recommended book ids joined
# by single spaces, under the header 'user_id', 'recommendation'.
with open('recommendations_tfidf_naive_proper_input_less_features_1.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['user_id', 'recommendation'])
    writer.writerows(
        [uid, " ".join(str(book) for book in book_list)]
        for uid, book_list in user_recommendations.items()
    )

# Score = 0.1558
print("Recommendations exported to recommendations_tfidf_naive_proper_input_less_features_1.csv")

Recommendations exported to recommendations_tfidf_naive_proper_input_less_features_1.csv
