# Importing the libraries

In [10]:
import pandas as pd
import numpy as np
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
import time
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from sklearn.decomposition import TruncatedSVD
from nltk.corpus import stopwords
import re
import csv

# Collaborative Filtering (first attempts)
Recommendations based on user-item interactions. 


### Train Test Split

In [11]:
# Load the data
interactions = pd.read_csv('kaggle_data/interactions_train.csv')
items = pd.read_csv('kaggle_data/items.csv')

interactions = interactions.sort_values(["u", "t"])

interactions["pct_rank"] = interactions.groupby("u")["t"].rank(pct=True, method='dense')
interactions.reset_index(inplace=True, drop=True)

train_data = interactions[interactions["pct_rank"] < 0.9]
test_data = interactions[interactions["pct_rank"] >= 0.9]


print("Training set size:", train_data.shape[0])
print("Testing set size:", test_data.shape[0])

Training set size: 73892
Testing set size: 13155


### Collaborative Filtering (simple approach)
Using the initial raw data

In [15]:
# Create a user-item interaction matrix with binary values (1 if read, 0 otherwise)
binary_interaction_matrix = train_data.pivot_table(index='u', columns='i', values='t', aggfunc='count')
binary_interaction_matrix = binary_interaction_matrix.notnull().astype(int)

binary_interaction_matrix

i,0,1,2,3,4,5,6,7,8,9,...,15279,15280,15282,15283,15284,15285,15287,15288,15289,15290
u,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7833,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7834,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7835,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7836,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Compute Cosine Similarity Between Users
user_similarity = cosine_similarity(binary_interaction_matrix)
user_similarity_df = pd.DataFrame(user_similarity, index=binary_interaction_matrix.index, columns=binary_interaction_matrix.index)
user_similarity_df

u,0,1,2,3,4,5,6,7,8,9,...,7828,7829,7830,7831,7832,7833,7834,7835,7836,7837
u,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
1,0.0,1.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
2,0.0,0.0,1.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.101015
3,0.0,0.0,0.000000,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
4,0.0,0.0,0.000000,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7833,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.000000
7834,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.000000
7835,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.000000
7836,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.000000


In [None]:
k = 10

# User-item scores for every user in one shot
scores = user_similarity_df.values @ binary_interaction_matrix.values        # (U, I)

# Indices of each user’s k best-scoring items (unsorted)
top_idx_unsorted = np.argpartition(-scores, k-1, axis=1)[:, :k]              # (U, k)

# Sort those k items per user so they’re really rank-ordered
rows   = np.arange(scores.shape[0])[:, None]                                  # (U, 1)
order  = np.argsort(-scores[rows, top_idx_unsorted], axis=1)
top_idx = top_idx_unsorted[rows, order]                                       # (U, k) sorted

# Look up the *labels* with NumPy → 2-D array → DataFrame
item_labels = binary_interaction_matrix.columns.to_numpy()                    # (I,)
top_labels  = item_labels[top_idx]                                            # (U, k)

recommendations = pd.DataFrame(
    top_labels,                      
    index=binary_interaction_matrix.index, 
    columns=range(k)                        
)

# quick peek
print(recommendations.head())

     0    1    2    3    4    5    6    7    8    9
u                                                  
0   13    4   12   15    8   11   14   10    5   17
1   38   34   31   30   37   29   32   33   36   35
2   46   58   53   49   56   64   91   82   71   45
3  149  163  128  143  133  138   40  155  142  156
4  202  203  198  191  193  201  195  197  205  196


In [18]:
# Convert each row of recommendations to a space-separated string
recommendations_str = recommendations.apply(lambda row: ' '.join(row.astype(str)), axis=1)

# Export to CSV with a single-column header
recommendations_str.to_csv('recommendations.csv', index=True, header=['recommendation'])

### Collaborative Filtering (better predictions)

In [19]:
# Apply TruncatedSVD to reduce dimensionality
svd = TruncatedSVD(n_components=50, random_state=42)
user_factors = svd.fit_transform(binary_interaction_matrix.values)
item_factors = svd.components_

# Reconstruct approximate interaction scores
approx_scores = user_factors @ item_factors

# Convert reconstructed scores into a DataFrame (same indexes and columns as original)
approx_scores_df = pd.DataFrame(approx_scores, index=binary_interaction_matrix.index, columns=binary_interaction_matrix.columns)

# Get top 10 items for each user
k = 10
recommendations_svd = approx_scores_df.apply(lambda row: row.nlargest(k).index.tolist(), axis=1)
recommendations_svd.head()

u
0    [611, 46, 4, 8999, 794, 3407, 3811, 685, 13, 2...
1    [611, 789, 4220, 5140, 2959, 769, 796, 176, 33...
2    [46, 323, 56, 2130, 5748, 3055, 66, 8999, 70, 64]
3    [163, 149, 618, 611, 128, 466, 119, 4, 2614, 143]
4    [424, 323, 201, 2225, 428, 423, 976, 422, 324,...
dtype: object

# Collaborative Filtering (the best scores)
Recommendations based on user-item interactions. 

In [None]:
# Enable or disable the upscaling of individual book weights for users with low book counts
upscale_low_book = True

# Define the threshold for the number of books below which upscaling will be applied
threshold_low_book = 2

In [None]:
# Load the data
interactions = pd.read_csv('kaggle_data/interactions_train.csv')
items = pd.read_csv('kaggle_data/items.csv')

# Aggregate interactions to calculate the number of unique items each user has interacted with
interactions_agg = interactions.groupby('u').agg(
    {
        'i': pd.Series.nunique,
    }
).reset_index().rename(
    columns={
        'i': 'n_items'
    }
)

# Check the loaded data
# Ensure the interactions dataset contains the required columns and is not empty
assert 'u' in interactions.columns and 'i' in interactions.columns and 't' in interactions.columns, "Interactions file missing required columns"
assert not interactions.empty, "Interactions dataset is empty"

# Ensure the items dataset contains the required columns and is not empty
assert 'Title' in items.columns and 'i' in items.columns, "Items file missing required columns"
assert not items.empty, "Items dataset is empty"

# Following adjustments
interactions = interactions.sort_values(["u", "t"], ascending=[True, False])

# Calculate the percentile rank of each interaction within a user's interactions
interactions["pct_rank"] = interactions.groupby("u")["t"].rank(pct=True, method='dense')
interactions.reset_index(inplace=True, drop=True)

# Merge the interactions dataset with the aggregated data to include the number of unique items per user
interactions = interactions.merge(interactions_agg, on='u', how='left')
interactions['weight'] = interactions["pct_rank"]

# If the upscale_low_book flag is True, assign a weight of 1 to interactions for users with a number of unique items less than or equal to the threshold
if upscale_low_book:
    interactions.loc[interactions['n_items'] <= threshold_low_book, 'weight'] = 1

inter_data = interactions.copy()

# Create binary interaction matrix
binary_interaction_matrix = inter_data.pivot_table(index='u', columns='i', values='weight', aggfunc='sum')
binary_interaction_matrix = binary_interaction_matrix.fillna(0)  # Fill NaN with 0

# Compute cosine similarity and test symmetry
user_similarity = cosine_similarity(binary_interaction_matrix)
user_similarity_df = pd.DataFrame(user_similarity, index=binary_interaction_matrix.index, columns=binary_interaction_matrix.index)

assert np.allclose(user_similarity_df, user_similarity_df.T), "User similarity matrix is not symmetric"

# Top-10 Recommendations from this approach
k = 10
scores = user_similarity_df.values @ binary_interaction_matrix.values
top_idx_unsorted = np.argpartition(-scores, k-1, axis=1)[:, :k]              
rows = np.arange(scores.shape[0])[:, None]
order = np.argsort(-scores[rows, top_idx_unsorted], axis=1)
top_idx = top_idx_unsorted[rows, order]
item_labels = binary_interaction_matrix.columns.to_numpy()                    
top_labels = item_labels[top_idx]                                            
recommendations = pd.DataFrame(top_labels, index=binary_interaction_matrix.index, columns=range(k))
assert recommendations.shape[0] == binary_interaction_matrix.shape[0], "Number of recommendation rows differs from number of users"
assert recommendations.shape[1] == k, "Each user should have 10 recommendations"

In [None]:
# Convert each row of recommendations to a space-separated string
recommendations_str = recommendations.apply(lambda row: ' '.join(row.astype(str)), axis=1)

# Export recommendations to a CSV file
if upscale_low_book:
    recommendations_str.to_csv(
        f'recommendations_collab_weight_pct_upscale_1_nitems_{threshold_low_book}.csv',
        index=True,
        header=['recommendation'],
        index_label='user_id'
    )
else:
    recommendations_str.to_csv(
        'recommendations_collab_weight_pct.csv',
        index=True,
        header=['recommendation'],
        index_label='user_id'
    )

# recommendations_collab_weight_pct_upscale_1_nitems_2.csv = 0.1642 (the best score)
# recommendations_collab_weight_pct.csv = 0.1637 (the second best score): without upscaling

# TFIDF-Based Book Recommendations (not cleaned dataset)
This section demonstrates how to generate book recommendations for each user based on the similarity of books they have already read using the TFIDF approach

In [None]:
# Create a TF-IDF vectorizer using French stop words and fit-transform the 'combined_features' column of the books
tfidf_vectorizer = TfidfVectorizer(stop_words=french_stop_words)
tfidf_matrix = tfidf_vectorizer.fit_transform(books['combined_features'])

In [None]:
# naive_limit_bool = True, if you want to limit the recommendations, otherwise set it to False
naive_limit_bool = True

interactions_agg = interactions.groupby('u').agg(
    min_book_id = ('i', min),
    max_book_id = ('i', max),
).reset_index()


# Compute recommendations for each user
user_recommendations = {}
for user_id, group in interactions.groupby('u'):
    read_books = group['i'].values
    read_books_indices = [books[books['i'] == book_id].index[0] for book_id in read_books]
    read_books_tfidf = tfidf_matrix[read_books_indices]
    similarity_scores = cosine_similarity(read_books_tfidf, tfidf_matrix)
    avg_similarity = np.mean(similarity_scores, axis=0)
    recommended_indices = avg_similarity.argsort()[-15288:][::-1]

    recommended_books = books.iloc[recommended_indices]['i'].values

    if naive_limit_bool:
        lower_bound = interactions_agg[interactions_agg['u'] == user_id]['min_book_id'].values[0]
        upper_bound = interactions_agg[interactions_agg['u'] == user_id]['max_book_id'].values[0]
        
        if upper_bound > lower_bound:
            upper_bound = min(upper_bound+11, 15290)
            recommended_books = recommended_books[(recommended_books >= lower_bound) & (recommended_books <= upper_bound)]

        if len(recommended_books) < 10:
            print(f"User {user_id} has less than 10 recommendations. Found: {len(recommended_books)}")
            print("upper_bound:", upper_bound)
            print("lower_bound:", lower_bound)
    
    user_recommendations[user_id] = recommended_books[:10]

In [None]:
# Display recommendations for a sample user
sample_user_id = list(user_recommendations.keys())[0]
print(f"Recommendations for User {sample_user_id}:")
print(user_recommendations[sample_user_id])

In [None]:
# Export recommendations to a CSV file
with open('recommendations_tfidf_naive_proper_input_less_features.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['user_id', 'recommendation'])
    for user_id, recommended_books in user_recommendations.items():
        writer.writerow([user_id, " ".join(map(str, recommended_books))])

# Score = 0.1560
print("Recommendations exported to recommendations_tfidf_naive_proper_input_less_features.csv")

In [None]:
# For each user, who has read only one unique book (does not matter once or several times), recommend 10 times the same book
user_embeddings_df = interactions.copy()

user_embeddings_df = user_embeddings_df.sort_values(by=["u", "t"],ascending=[True, False]).reset_index(drop=True)
user_embeddings_df['last_unique_books'] = user_embeddings_df\
        .groupby('u')['i']\
        .apply(lambda x: (~pd.Series(x).duplicated()).cumsum()).reset_index(drop=True)

# Get users who have read exactly 1 unique book
users_with_1_unique_book = user_embeddings_df.groupby('u')['i'].nunique()
users_with_1_unique_book = users_with_1_unique_book[users_with_1_unique_book == 1].index
u_1 = user_embeddings_df[user_embeddings_df['u'].isin(users_with_1_unique_book)].drop_duplicates(subset=['u', 'i'])
u_1

In [None]:
for user_id in u_1['u'].unique():
    book_id = u_1[u_1['u'] == user_id]['i'].values[0]
    # Create a recommendation list with 10 copies of the book ID
    user_recommendations[user_id] = [book_id] * 10

In [None]:
# Export recommendations to a CSV file
with open('recommendations_tfidf_naive_proper_input_less_features_1.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['user_id', 'recommendation'])
    for user_id, recommended_books in user_recommendations.items():
        writer.writerow([user_id, " ".join(map(str, recommended_books))])

# Score = 0.1558
print("Recommendations exported to recommendations_tfidf_naive_proper_input_less_features_1.csv")

# TFIDF-Based Book Recommendations (cleaned dataset)
This section demonstrates how to generate book recommendations for each user based on the similarity of books they have already read using the TFIDF approach

### Cleaning the data (same as in the EPFL_Apple_EDA file)

In [None]:
# Load book complete data
books = pd.read_csv('kaggle_data/books_complete.csv')

# Download the NLTK stopwords dataset
nltk.download('stopwords')

# Load the list of French stop words from the NLTK stopwords corpus
french_stop_words = stopwords.words('french')

# Function to get the index of the first digit in a given sequence
def get_first_digit_index(sequence):
    match = re.search(r"\d", str(sequence))
    if match:
        return match.start()
    else:
        return -1
    
# Function to extract a substring from a given sequence starting from a specified index
def substring_from_index(sequence, index):
    if index != -1:
        # If a valid index is provided, return the substring starting from the index, stripped of leading/trailing spaces
        return sequence[:index].strip()
    elif type(sequence) == str:
        return sequence.strip()
    else:
        return sequence

# Function to remove specific words from a given sequence
def remove_words(sequence, list_words):
    if type(sequence) == str:
        for word in list_words:
            sequence = sequence.replace(word, "")
        return sequence.strip()
    else:
        return sequence

# Function to remove the last comma from a given sequence
def remove_last_comma(sequence):
    if type(sequence) == str:
        return sequence.strip().rstrip(",")
    else:
        return sequence

# Drop unnecessary columns from the books dataframe
df_relevant_cols = books.drop(
    columns=["dewey_decimal", "image", "image_original", "dimensions_structured",
             "msrp", "binding", "edition", "related", "dimensions",]
)

# Fix 'Author' column
stop_words = [
        'artiste','actuaire,','avocat,','illustrateur,','juriste,',
        'dr. en droit,','dr en droit,','actuaire,','saint','Juge cantonal,','juge cantonal,',
        "auteur d'un ouvrage sur l'art publicitaire",
        "Docteur ès sciences politiques",
        "rédactrice en chef d'un magazine sur la nature",
        'archéologue','auteur de BD','dit Benedictus de', 'économiste',
        'enseignant','sociologue',  'psychiatre', 'juriste', 'docteur',
        "Historien de l'art",'photographe','politicien', 'agrégé de lettres',
        'historien du canton de Vaud','écrivaine','illustratrice', 'cancérologue',
        'journaliste.','romancier','Dr en droit','ca.','dr en droit','pédagogue',
        'Dr. en droit','dr en droit. Berne','journaliste de loisirs',
        'illustrateur.','historien','historienne','historien de l\'art',
        'Études cinématographiques et Audiovisuelles', 'Morges',
        'mathématicien',
        " collaboratrice de recherche à l'université de Lausanne",
        ' Physicien', '\u200f', '\u200f \u200e', ' travailleuse sociale',
        ' psychologue', ' journaliste scientifique', ' musicologue',
        ' évêque de Césarée', ' auteur culinaire',
        ' agrégé de philosophie.', ' aut', ' Esthétique', ' avocat',
        ' animatrice et auteure de bande dessinée', ' Dr en histoire',
        ' guérisseur', ' médecin', ' rédacteur',
        ' professeure à l’Université Côte d’Azur.', ' journaliste',
        ' auteur jeunesse', ' comtesse de', '  et professeur',
        ' Dr en sc. pol.', ' architecte', ' ne', ' cartographe',
        " professeur d'histoire", ' Pédagogue', ' ingénieur agronome',
        ' e en droit', ' dr en lettres', ' juge cantonal',
        ' professeur de littérature française',
        " directeur de la Chambre vaudoise d'agriculture", ' politologue',
        ' latiniste', ' dr en philosophie', " formateur d'adultes",
        ' égyptologue', ' écrivain', " l'Ancien",
        ' Dr ès lettres', ' Dr en lettres', ' Dr en histoire de l\’art',
        'animatrice eteure de bande dessinée'
]

# Remove specific stop words from the 'Author' column
df_relevant_cols['Author'] = df_relevant_cols['Author'].apply(remove_words, list_words=stop_words)
df_relevant_cols['Author'] = df_relevant_cols['Author'].apply(remove_last_comma)


df_relevant_cols['index_first_digit'] = df_relevant_cols['Author'].apply(get_first_digit_index)
# Extract the substring from the 'Author' column starting from the first digit index
df_relevant_cols['Author'] = df_relevant_cols.apply(
    lambda row: substring_from_index(row['Author'], row['index_first_digit']), axis=1
)

# Remove any trailing commas from the 'Author' column
df_relevant_cols['Author'] = df_relevant_cols['Author'].apply(remove_last_comma)
df_relevant_cols = df_relevant_cols.drop(columns=['index_first_digit'])

# Remove specific phrases or titles from the 'Author' column
df_relevant_cols['Author'] = df_relevant_cols['Author'].str.replace("gastro-entérologue","")\
    .str.replace("professeur de chimie","").str.replace("-chercheur en histoire de l'art","")\
    .str.replace("professeur de littérature","").str.replace("professeur de littérature française","")\
    .str.replace("professeur de littérature française et de culture générale","").str.replace("professeur de littérature française et de culture générale à l'Université de Lausanne","")\
    .str.replace("professeur d'histoire de l'art","").str.replace("professeur d'histoire de l'art à l'Université de Lausanne","")\
    .str.replace("professeur d'histoire de l'art à l'Université de Lausanne et directeur du Musée cantonal des beaux-arts","")\
    .str.replace("professeur en histoire moderne","").str.replace("blogueur spécialiste des questions climatiques","")\
    .str.replace("chercheur en biologie végétale","").str.replace("ingénieure de formation","").str.replace("en géopolitique","")\
    .str.replace("professeur de lettres modernes","").str.replace("illustrateur et","").str.replace("directrice de recherche CNRS","")\
    .str.replace("e-chercheuse en arabe","").str.replace("experte en intelligence artificielle","").str.replace("en sciences de gestion","")\
    .str.replace("homme politique","").str.replace("expert en stratégie","").str.replace("psychothérapeute","")\
    .str.replace("professeure de yoga","").str.replace("maître de conférence","").str.replace("styliste culinaire","")\
    .str.replace("professeur de didactique du français langue étrangère","").str.replace("anatomiste-","").str.replace("professeur de linguistique française","")\
    .str.replace("e-chercheuse en lettres","").str.replace("en chirurgie dentaire","").str.replace("réalisateur et mangaka","").str.replace("ingénieur forestier et chercheur","")\
    .str.replace("spécialisé dans l'alimentation","").str.replace("en sciences du langage","").str.replace("d'un mémoire de master en histoire","")\
    .str.replace("professeur d'anglais","").str.replace("e en sciences de l'éducation","").str.replace("inspecteur de l'Education Nationale honoraire","")\
    .str.replace("professeur des Universités - Praticien hospitalier en pharmacologie","").str.replace("professeur émérite en Arts plastiques","").str.replace("professeur en Histoire de l'art","")\
    .str.replace("astrophysicienne","").str.replace("psychanalyste","").str.replace("helléniste","").str.replace("professeur","")\
    .str.replace("consultant en écoconception","").str.replace("directrice artistique indépendante","").str.replace("en psychologie cognitive","").str.replace("diplômée en histoire","")\
    .str.replace("de cardiologue","").str.replace("professeure-formatrice","").str.replace("en traitement du signal","").str.replace("réflexologue","")\
    .str.replace("et  en archéologie","").str.replace("designer de logos","").str.replace("chercheur en psychologie cognitive","").str.replace("pour la jeunesse","")\
    .str.replace("philosophe","").str.replace("helléniste","").str.replace("en sciences et techniques des activités physiques et sportives","")\
    .str.replace('e-formatrice',"").str.replace("chercheur","").str.strip()

# Fix 'authors' columns
df_relevant_cols.loc[df_relevant_cols['authors'] == "[]", 'authors'] = "<NA>"
df_relevant_cols.loc[df_relevant_cols['authors'] == "['unknown author']", 'authors'] = "<NA>"
df_relevant_cols.loc[df_relevant_cols['authors'] == "['No author stated']", 'authors'] = "<NA>"
df_relevant_cols['authors'] = df_relevant_cols['authors'].fillna("<NA>")

# Clean up the 'authors' column by removing specific phrases related to directors
df_relevant_cols.loc[df_relevant_cols['authors'].str.contains("dir"),'authors'] = (
    df_relevant_cols.loc[df_relevant_cols['authors'].str.contains("dir"),'authors']\
    .str.replace("(dir.)","").str.replace("(dir)","")\
        .str.replace("(sous la direction de)","").str.replace("(sous la direction)","")\
            .str.replace("Sous la direction de","").str.replace("()","")
)

# Clean up the 'authors' column by removing specific phrases related to authors
df_relevant_cols.loc[df_relevant_cols['authors'].str.contains("author"), 'authors'] = (
    df_relevant_cols.loc[df_relevant_cols['authors'].str.contains("author"), 'authors']\
    .str.replace("(author.)", "").str.replace("(author)", "")
)

# Remove commas from the 'Author' column
df_relevant_cols['Author'] = df_relevant_cols['Author'].str.replace(",", "")
df_relevant_cols.loc[df_relevant_cols['authors'] == "<NA>", 'authors'] = pd.NA

# Combine the 'authors' and 'Author' columns, using backfill to fill missing values, then clean the resulting column
df_relevant_cols["prep_author"] = df_relevant_cols[['authors', 'Author']].fillna(method='bfill', axis=1).iloc[:, 0]\
    .str.replace("[", '').str.replace("]", '').str.replace("'", '').str.replace("unknown author", '<NA>')\
    .str.replace("<NA>", '').fillna('')

# If the 'authors' column contains numbers and the 'Author' column is not null, replace the 'prep_author' column with the value from the 'Author' column
cond_fix_numbers_authors = df_relevant_cols['authors'].astype(str).str.contains(r"[1|2|3|4|5|6|7|8|9|0]")
cond_notna_Author = ~df_relevant_cols['Author'].isna()
df_relevant_cols.loc[
    cond_fix_numbers_authors & cond_notna_Author, 'prep_author'
] = df_relevant_cols.loc[
    cond_fix_numbers_authors & cond_notna_Author, 'Author'
]

# Manually fix specific cases in the 'prep_author' column where incorrect formatting or data exists
cond_manual_fix_1 = df_relevant_cols['prep_author'].astype(str).str.contains(r"Camille Perrier, 1983, Joëlle Vuille")
cond_manual_fix_2 = df_relevant_cols['prep_author'].astype(str).str.contains(r"Michel Hottelier, Maya Hertig, 1973, Alexandre Flückiger")
# Correct the 'prep_author' column for the identified cases
df_relevant_cols.loc[cond_manual_fix_1, 'prep_author'] = "Camille Perrier, Joëlle Vuille"
df_relevant_cols.loc[cond_manual_fix_2, 'prep_author'] = "Michel Hottelier, Maya Hertig, Alexandre Flückiger"

df_relevant_cols["prep_synopsis"] = df_relevant_cols['synopsis'].str.replace("<br/>"," ")\
    .str.replace("\n"," ").str.replace("<br>","").fillna("<NA>")

df_relevant_cols["prep_publisher"]  = df_relevant_cols[['publisher','Publisher']].fillna(method='bfill', axis=1).iloc[:, 0]\
    .fillna("<NA>").str.replace("<NA>",'')

df_relevant_cols['Title'] =  df_relevant_cols['Title'].str.replace("/","").str.strip()
df_relevant_cols["prep_title"] = df_relevant_cols[['title_long','Title']].fillna(method='bfill', axis=1).iloc[:, 0]\
    .fillna('<NA>').str.replace("<NA>",'')

df_relevant_cols["prep_language"] = df_relevant_cols['language'].fillna("<NA>").str.replace("<NA>",'')

df_relevant_cols.loc[df_relevant_cols['subjects'] == "['Subjects']",'subjects'] = pd.NA
df_relevant_cols['subjects'] = df_relevant_cols['subjects'].str.replace("[",'').str.replace("]",'').str.replace("'",'')
df_relevant_cols["prep_subjects"] = df_relevant_cols[['subjects','Subjects']].fillna(method='bfill', axis=1).iloc[:, 0]\
    .fillna('<NA>').str.replace("<NA>",'')

df_relevant_cols['prep_pages'] = df_relevant_cols['pages'].astype('Int64').astype(str).str.replace("<NA>",'')

df_relevant_cols['prep_isbn'] = "[" + df_relevant_cols['ISBN Valid'].fillna("<NA>").str.replace("<NA>",'').str.replace(";",',') + "]"

df_relevant_cols["prep_published"] = df_relevant_cols['date_published'].fillna("<NA>").str.replace("<NA>",'')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ushakov/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  df_relevant_cols["prep_author"] = df_relevant_cols[['authors', 'Author']].fillna(method='bfill', axis=1).iloc[:, 0]\
  df_relevant_cols["prep_author"] = df_relevant_cols[['authors', 'Author']].fillna(method='bfill', axis=1).iloc[:, 0]\
  df_relevant_cols["prep_publisher"]  = df_relevant_cols[['publisher','Publisher']].fillna(method='bfill', axis=1).iloc[:, 0]\
  df_relevant_cols["prep_title"] = df_relevant_cols[['title_long','Title']].fillna(method='bfill', axis=1).iloc[:, 0]\
  df_relevant_cols["prep_subjects"] = df_relevant_cols[['subjects','Subjects']].fillna(method='bfill', axis=1).iloc[:, 0]\
  df_relevant_cols["prep_subjects"] = df_relevant_cols[['subjects','Subjects']].fillna(method='bfill', axis=1).iloc[:, 0]\


In [None]:
# Preprocess book data for TFIDF
books['combined_features'] = (
    df_relevant_cols['prep_title'].fillna('') + ' ' +
    df_relevant_cols['prep_author'].fillna('') + ' ' +
    df_relevant_cols['prep_isbn'].fillna('') + ' ' +
    df_relevant_cols['prep_publisher'].fillna('') + ' ' +
    # df_relevant_cols['prep_subjects'].fillna('') + ' ' +
    # df_relevant_cols['prep_synopsis'].fillna('') + ' ' +
    # df_relevant_cols['prep_language'].fillna('') + ' ' +
    df_relevant_cols['prep_pages']
    # df_relevant_cols['prep_published'].fillna('')
)

### TFIDF

In [25]:
# Create a TF-IDF vectorizer using French stop words and fit-transform the 'combined_features' column of the books
tfidf_vectorizer = TfidfVectorizer(stop_words=french_stop_words)
tfidf_matrix = tfidf_vectorizer.fit_transform(books['combined_features'])

In [26]:
# naive_limit_bool = True, if you want to limit the recommendations, otherwise set it to False
naive_limit_bool = True

interactions_agg = interactions.groupby('u').agg(
    min_book_id = ('i', min),
    max_book_id = ('i', max),
).reset_index()


# Compute recommendations for each user
user_recommendations = {}
for user_id, group in interactions.groupby('u'):
    read_books = group['i'].values
    read_books_indices = [books[books['i'] == book_id].index[0] for book_id in read_books]
    read_books_tfidf = tfidf_matrix[read_books_indices]
    similarity_scores = cosine_similarity(read_books_tfidf, tfidf_matrix)
    avg_similarity = np.mean(similarity_scores, axis=0)
    recommended_indices = avg_similarity.argsort()[-15288:][::-1]

    recommended_books = books.iloc[recommended_indices]['i'].values

    if naive_limit_bool:
        lower_bound = interactions_agg[interactions_agg['u'] == user_id]['min_book_id'].values[0]
        upper_bound = interactions_agg[interactions_agg['u'] == user_id]['max_book_id'].values[0]
        
        if upper_bound > lower_bound:
            upper_bound = min(upper_bound+11, 15290)
            recommended_books = recommended_books[(recommended_books >= lower_bound) & (recommended_books <= upper_bound)]

        if len(recommended_books) < 10:
            print(f"User {user_id} has less than 10 recommendations. Found: {len(recommended_books)}")
            print("upper_bound:", upper_bound)
            print("lower_bound:", lower_bound)
    
    user_recommendations[user_id] = recommended_books[:10]

  interactions_agg = interactions.groupby('u').agg(
  interactions_agg = interactions.groupby('u').agg(


In [27]:
# Display recommendations for a sample user
sample_user_id = list(user_recommendations.keys())[0]
print(f"Recommendations for User {sample_user_id}:")
print(user_recommendations[sample_user_id])

Recommendations for User 0:
[ 3 18 17 15 14 13 20  1 19  7]


In [None]:
# Export recommendations to a CSV file
with open('recommendations_tfidf_naive_proper_input_less_features_cleaned.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['user_id', 'recommendation'])
    for user_id, recommended_books in user_recommendations.items():
        writer.writerow([user_id, " ".join(map(str, recommended_books))])

# Score = 0.1554
print("Recommendations exported to recommendations_tfidf_naive_proper_input_less_features_cleaned.csv")
# Score = 0.1547
print("Recommendations exported to recommendations_tfidf_proper_input_less_features_cleaned.csv")

Recommendations exported to recommendations_tfidf_naive_proper_input_less_features_cleaned.csv


# Not successful attempts at all
There were more different attempts, for the display purposes we left couple of them

### Generate BERT Embeddings Locally

In [14]:
# # Load the pre-trained model and tokenizer
# from transformers import BertTokenizer, BertModel
# import torch

# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# model = BertModel.from_pretrained('bert-base-uncased')

  from .autonotebook import tqdm as notebook_tqdm


In [15]:
# # Generate embeddings for a given text
# def generate_embeddings(text):
#     if text is None or text == "" or pd.isna(text):
#         return np.zeros((768,))
#     inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=128)
#     outputs = model(**inputs)
#     embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().detach().numpy()
#     return embeddings

In [16]:
# Apply this function to the Title and Subjects columns
# items['Title_Embeddings'] = items['Title'].apply(generate_embeddings)
# items['Subjects_Embeddings'] = items['Subjects'].apply(generate_embeddings)

In [None]:
# Save the df with embeddings to a CSV file
# items.to_csv('items_with_embeddings.csv', index=False)

In [34]:
# books_with_embeddings_df['Title_Embeddings'] = books_with_embeddings_df['Title_Embeddings'].apply(
#     lambda x: np.array(x.replace('[','').replace(']','').replace('\n', '').replace('  ', ' ').split(' '), dtype=float)
# )

# books_with_embeddings_df['Subjects_Embeddings'] = books_with_embeddings_df['Subjects_Embeddings'].apply(
#     lambda x: np.array(x.replace('[','').replace(']','').replace('\n', '').replace('  ', ' ').split(' '), dtype=float)
# )

In [35]:
# # Load the embeddings from the saved CSV file
# books_with_embeddings_df = pd.read_csv('items_with_embeddings.csv')

# books_with_embeddings_df['Title_Embeddings'] = books_with_embeddings_df['Title'].apply(generate_embeddings)
# books_with_embeddings_df['Subjects_Embeddings'] = books_with_embeddings_df['Subjects'].apply(generate_embeddings)

# # Combine the embeddings for a comprehensive representation
# books_with_embeddings_df['Combined_Embeddings'] = books_with_embeddings_df.apply(
#     lambda row: np.array(row['Title_Embeddings']) + np.array(row['Subjects_Embeddings']), axis=1
# )


In [36]:
# # Convert the combined embeddings to a matrix
# embeddings_matrix = np.stack(books_with_embeddings_df['Combined_Embeddings'].values)

# # Calculate cosine similarity between book embeddings
# similarity_matrix = cosine_similarity(embeddings_matrix)

# # Function to generate top-10 recommendations for each user
# def generate_recommendations(user_interactions, similarity_matrix, top_k=10):
#     recommendations = {}
#     for user_id, interacted_books in user_interactions.items():
#         # Calculate the mean similarity score for each book not interacted by the user
#         scores = np.mean(similarity_matrix[interacted_books], axis=0)
        
#         # Exclude books the user has already interacted with
#         scores[interacted_books] = -1
        
#         # Get the indices of the top-k books
#         recommended_books = np.argsort(scores)[-top_k:][::-1]
#         recommendations[user_id] = recommended_books
#     return recommendations

In [42]:
# # User interactions in the list format
# user_interactions = interactions.groupby('u').agg(
#     {'i': lambda x: list(x)}
# ).to_dict()

# user_interactions = user_interactions['i']

In [43]:
# # Generate recommendations for all users
# recommendations = generate_recommendations(user_interactions, similarity_matrix, top_k=10)

# # Prepare submission format
# submission_data = []
# for user_id, recommended_books in recommendations.items():
#     submission_data.append({'user_id': user_id, 'recommendation': ' '.join(map(str, recommended_books))})

# # Create a DataFrame for submission
# submission_df = pd.DataFrame(submission_data)

# # Save to CSV
# submission_df.to_csv('submission.csv', index=False)
# print("Submission file created: submission.csv")

Submission file created: submission.csv


### Collaborative Filtering with GridSearchCV

In [28]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import precision_score

In [232]:
interactions_df = pd.read_csv('kaggle_data/interactions_train.csv')

In [235]:
# Remove duplicates by taking the latest interaction timestamp for each user-item pair
interactions_train_df = interactions_df.sort_values('t').drop_duplicates(subset=['u', 'i'], keep='last')

# Prepare the interaction matrix
interaction_matrix = interactions_train_df.pivot(index='u', columns='i', values='t').fillna(0)

# Split the data into training and test sets
train_data, test_data = train_test_split(interactions_train_df, test_size=0.2, random_state=42)

# Create the training interaction matrix
train_interaction_matrix = train_data.pivot(index='u', columns='i', values='t').fillna(0)

# Create the test interaction matrix
test_interaction_matrix = test_data.pivot(index='u', columns='i', values='t').fillna(0)

# Ensure the matrices have the same columns
train_interaction_matrix = train_interaction_matrix.reindex(columns=interaction_matrix.columns, fill_value=0)
test_interaction_matrix = test_interaction_matrix.reindex(columns=interaction_matrix.columns, fill_value=0)


In [236]:
# Model Selection and Hyperparameter Tuning
param_grid = {
    'n_components': [10, 20, 50],
    'n_iter': [10, 20, 30]
}

svd = TruncatedSVD()
grid_search = GridSearchCV(svd, param_grid, cv=3, scoring='neg_mean_squared_error')
grid_search.fit(train_interaction_matrix)

Traceback (most recent call last):
  File "/Users/ushakov/Library/Python/3.9/lib/python/site-packages/sklearn/model_selection/_validation.py", line 947, in _score
    scores = scorer(estimator, X_test, **score_params)
TypeError: __call__() missing 1 required positional argument: 'y_true'

Traceback (most recent call last):
  File "/Users/ushakov/Library/Python/3.9/lib/python/site-packages/sklearn/model_selection/_validation.py", line 947, in _score
    scores = scorer(estimator, X_test, **score_params)
TypeError: __call__() missing 1 required positional argument: 'y_true'

Traceback (most recent call last):
  File "/Users/ushakov/Library/Python/3.9/lib/python/site-packages/sklearn/model_selection/_validation.py", line 947, in _score
    scores = scorer(estimator, X_test, **score_params)
TypeError: __call__() missing 1 required positional argument: 'y_true'

Traceback (most recent call last):
  File "/Users/ushakov/Library/Python/3.9/lib/python/site-packages/sklearn/model_selection/_val

In [245]:
# Best model
best_svd = grid_search.best_estimator_

# Model Training
best_svd.fit(train_interaction_matrix)

In [246]:
# Function to get top N recommendations
def get_top_n_recommendations(user_id, model, interaction_matrix, n=10):
    """
    Generate top-N item recommendations for a given user.
    """
    if user_id not in interaction_matrix.index:
        return []
    
    user_interactions = interaction_matrix.loc[user_id].values.reshape(1, -1)
    scores = model.transform(user_interactions).dot(model.components_)
    top_n_items = np.argsort(scores[0])[::-1][:n]
    return top_n_items

In [251]:
# Function to calculate Precision@10
def precision_at_k(test_matrix, train_matrix, model, k=10):
    """
    Computes the average Precision@K for all users in the test set.
    """
    precisions = []

    for user_id in test_matrix.index:
        # Get the top-N recommendations
        top_n = get_top_n_recommendations(user_id, model, train_matrix, n=k)
        
        # Get the actual items the user interacted with in the test set
        actual_items = test_matrix.loc[user_id]
        actual_items = actual_items[actual_items > 0].index.tolist()
        
        # Calculate the number of relevant items in top N
        relevant_items = set(top_n).intersection(set(actual_items))
        
        # Precision is the number of relevant items found divided by N
        precision = len(relevant_items) / k
        precisions.append(precision)
    
    # Return the average Precision@K across all users
    return np.mean(precisions)

In [252]:
precision_10 = precision_at_k(test_interaction_matrix, train_interaction_matrix, best_svd, k=10)
print(f"Precision@10: {precision_10:.4f}")

Precision@10: 0.0007
