In [None]:
import pandas as pd

In [None]:
# Load the two input tables. Later cells read `uid`, `isbn` and `rating` from the
# ratings table and `uid` and `cluster` from the clustered-users table.
ratings_df, clusters_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/ratings.csv', '../data/clustered_users.csv')
)

In [None]:
# Tag every rating with its user's cluster (pd.merge defaults to an inner join, so
# ratings whose uid has no row in clusters_df are dropped), order the rows by user
# and then book, and renumber the index from zero.
clustered_ratings_df = (
    pd.merge(ratings_df, clusters_df[['uid', 'cluster']], on='uid')
    .sort_values(by=['uid', 'isbn'])
    .reset_index(drop=True)
)

# Persist the joined table so it can be reused without repeating the merge.
clustered_ratings_df.to_csv('../data/clustered_ratings.csv', index=False)

clustered_ratings_df

In [None]:
# Nested lookup: isbn -> cluster -> list of the ratings that cluster's users gave the book.
#
# The three needed columns are iterated directly rather than through iterrows():
# iterrows() builds a Series for every row, which is slow on a large frame and does
# not preserve each column's dtype across the row.
book_cluster_ratings = {}
for isbn, cluster, rating in zip(
    clustered_ratings_df['isbn'],
    clustered_ratings_df['cluster'],
    clustered_ratings_df['rating'],
):
    # setdefault creates the per-book dict and the per-cluster list on first sight.
    book_cluster_ratings.setdefault(isbn, {}).setdefault(cluster, []).append(rating)

book_cluster_ratings

In [None]:
# Collapse each (book, cluster) list of ratings into its mean, rounded to one decimal.
# Result shape: isbn -> cluster -> average rating.
book_cluster_avg_ratings = {}
for isbn, ratings_by_cluster in book_cluster_ratings.items():
    book_cluster_avg_ratings[isbn] = {
        cluster: round(sum(ratings) / len(ratings), 1)
        for cluster, ratings in ratings_by_cluster.items()
    }

book_cluster_avg_ratings

In [None]:
# Each user's own ratings: uid -> isbn -> rating. If the same (uid, isbn) pair occurs
# more than once, the last row wins.
#
# The columns are iterated directly rather than through iterrows(): iterrows() builds
# a Series for every row, which is slow on a large frame and does not preserve each
# column's dtype across the row.
user_book_ratings = {}
for uid, isbn, rating in zip(
    clustered_ratings_df['uid'],
    clustered_ratings_df['isbn'],
    clustered_ratings_df['rating'],
):
    user_book_ratings.setdefault(uid, {})[isbn] = rating

user_book_ratings

In [None]:
# Transpose the averages from isbn -> cluster -> avg into cluster -> isbn -> avg,
# so that all of a cluster's book averages can be fetched with a single lookup.
cluster_book_avg_ratings = {}
for isbn, avg_by_cluster in book_cluster_avg_ratings.items():
    for cluster, avg_rating in avg_by_cluster.items():
        cluster_book_avg_ratings.setdefault(cluster, {})[isbn] = avg_rating

cluster_book_avg_ratings

In [None]:
# Map every user to their cluster: uid -> cluster. If a uid appears more than once
# in clusters_df, the last row wins.
#
# Built from the two columns directly instead of iterrows(): iterrows() turns each row
# into a single Series, so integer uid/cluster values can be upcast to float when the
# frame also holds float columns, and those uids end up in the exported CSV.
# Pairing the columns keeps each column's own dtype and avoids the per-row Series cost.
user_clusters = dict(zip(clusters_df['uid'], clusters_df['cluster']))

user_clusters

In [None]:
# Give every user their cluster's book averages: uid -> isbn -> cluster average.
# NOTE: the values are references, not copies. All users of one cluster point at the
# same inner dict held in cluster_book_avg_ratings, so mutating it through one user
# changes it for the whole cluster.
# A cluster with no entry in cluster_book_avg_ratings raises KeyError here.
user_cluster_ratings = {
    uid: cluster_book_avg_ratings[cluster]
    for uid, cluster in user_clusters.items()
}

user_cluster_ratings

In [None]:
# Start every user from an independent copy of their cluster's averages.
#
# A plain user_cluster_ratings.copy() is shallow: the outer dict is new, but each
# value is still the single per-cluster dict shared by all users of that cluster
# (and by cluster_book_avg_ratings). Writing one user's rating through it would
# replace the cluster average for every other user in the cluster. Copying the
# inner dict per user keeps the overrides private to that user, at the cost of
# holding one dict per user in memory.
user_combined_ratings = {
    uid: dict(cluster_avgs)
    for uid, cluster_avgs in user_cluster_ratings.items()
}

# Overlay each user's own ratings; they take precedence over the cluster average
for uid, own_ratings in user_book_ratings.items():
    user_combined_ratings[uid].update(own_ratings)

user_combined_ratings

In [None]:
def generate_df_from_dict(user_combined_ratings, chunk_size=10000):
    """Yield the nested ratings dict as a sequence of flat DataFrames.

    Args:
        user_combined_ratings: mapping of uid -> {isbn: rating}.
        chunk_size: number of rows per yielded DataFrame.

    Yields:
        DataFrames with columns ``uid``, ``isbn`` and ``rating``. Every chunk
        holds exactly ``chunk_size`` rows except possibly the last one, which
        carries the remainder. Nothing is yielded for an empty mapping.
    """
    # Lazily flatten the nested mapping into one record per (user, book) pair.
    flat_records = (
        {'uid': uid, 'isbn': isbn, 'rating': rating}
        for uid, books in user_combined_ratings.items()
        for isbn, rating in books.items()
    )

    pending = []
    for record in flat_records:
        pending.append(record)
        if len(pending) == chunk_size:
            yield pd.DataFrame(pending)
            pending = []

    # Flush the final, partially filled chunk.
    if pending:
        yield pd.DataFrame(pending)

In [None]:
# Destination for the flattened (uid, isbn, rating) table
output_path = '../data/combined_ratings.csv'

# Stream the chunks to disk: the first chunk overwrites any existing file and writes
# the header row; every later chunk is appended without a header.
for chunk_number, df_chunk in enumerate(generate_df_from_dict(user_combined_ratings)):
    is_first_chunk = chunk_number == 0
    df_chunk.to_csv(
        output_path,
        index=False,
        mode='w' if is_first_chunk else 'a',
        header=is_first_chunk,
    )