In [2]:
import numpy as np
import pandas as pd
import json
import math
import pickle
from collections import defaultdict
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
import os
from datetime import datetime

In [3]:
original_data = pd.read_csv('goodreads_interactions.csv')

#CSV columns:
#user_id | book_id | is_read | rating | is_reviewed

In [4]:
original_data.head()

Unnamed: 0,user_id,book_id,is_read,rating,is_reviewed
0,0,948,1.0,5.0,0.0
1,0,947,1.0,5.0,1.0
2,0,946,1.0,5.0,0.0
3,0,945,1.0,5.0,0.0
4,0,944,1.0,5.0,0.0


In [5]:
# Count how many times each book was interacted with
book_interactions = original_data.groupby('book_id').size().reset_index(name='n_interactions')

print(f"\nBasic Statistics:")
print(f"  Min interactions: {book_interactions['n_interactions'].min()}")
print(f"  Max interactions: {book_interactions['n_interactions'].max():,}")
print(f"  Mean interactions: {book_interactions['n_interactions'].mean():.2f}")
print(f"  Median interactions: {book_interactions['n_interactions'].median():.1f}")
print(f"  Std deviation: {book_interactions['n_interactions'].std():.2f}")


Basic Statistics:
  Min interactions: 1
  Max interactions: 129,935
  Mean interactions: 57.69
  Median interactions: 6.0
  Std deviation: 614.02


In [6]:
print(f"\nPercentiles:")
percentiles = [10, 25, 50, 75, 90, 95, 99]
for p in percentiles:
    val = book_interactions['n_interactions'].quantile(p/100)
    print(f"  {p}th percentile: {val:.0f} interactions")


Percentiles:
  10th percentile: 1 interactions
  25th percentile: 2 interactions
  50th percentile: 6 interactions
  75th percentile: 22 interactions
  90th percentile: 76 interactions
  95th percentile: 162 interactions
  99th percentile: 780 interactions


In [7]:
print(f"Original: {len(original_data):,} interactions")


Original: 123,731,803 interactions


In [8]:
user_counts = original_data['user_id'].value_counts()
book_counts = original_data['book_id'].value_counts()
print(f"Users: {len(user_counts):,}")
print(f"Books: {len(book_counts):,}")

Users: 254,205
Books: 2,144,808


In [9]:
valid_users = set(user_counts[user_counts >= 10].index)
valid_books = set(book_counts[book_counts >= 50].index)

print(f"Valid users (>=10 interactions): {len(valid_users):,}")
print(f"Valid books (>=50 interactions): {len(valid_books):,}")

Valid users (>=10 interactions): 254,196
Valid books (>=50 interactions): 306,020


In [11]:
mask = original_data['user_id'].isin(valid_users) & original_data['book_id'].isin(valid_books)
data = original_data[mask].copy()
print(f"Filtered: {len(data):,} interactions")
print(f"Books: {data['book_id'].nunique():,}")
print(f"Users: {data['user_id'].nunique():,}")

Filtered: 106,826,044 interactions
Books: 306,020
Users: 254,178


In [12]:
print(f"Current size: {len(data):,} interactions")
# sampling it down more

Current size: 106,826,044 interactions


In [13]:
# Sample to 1 million interactions
target_size = 1_000_000

if len(data) > target_size:
    print(f"\nSampling to {target_size:,} interactions")
    data = data.sample(n=target_size, random_state=42)
    
    # Re-filter to ensure quality after sampling
    
    # Keep books with at least 20 interactions in sampled data
    book_counts = data['book_id'].value_counts()
    valid_books = book_counts[book_counts >= 20].index
    data = data[data['book_id'].isin(valid_books)].copy()
    
    # Keep users with at least 5 interactions in sampled data
    user_counts = data['user_id'].value_counts()
    valid_users = user_counts[user_counts >= 5].index
    data = data[data['user_id'].isin(valid_users)].copy()


Sampling to 1,000,000 interactions


In [16]:
print("FINAL SAMPLED DATASET:")
print(f"Interactions: {len(data):,}")
print(f"Books: {data['book_id'].nunique():,}")
print(f"Users: {data['user_id'].nunique():,}")
print(f"Sparsity: {100 * (1 - len(data) / (data['user_id'].nunique() * data['book_id'].nunique())):.2f}%")
print(f"Avg interactions/book: {len(data) / data['book_id'].nunique():.1f}")
print(f"Avg interactions/user: {len(data) / data['user_id'].nunique():.1f}")

FINAL SAMPLED DATASET:
Interactions: 144,343
Books: 7,208
Users: 20,780
Sparsity: 99.90%
Avg interactions/book: 20.0
Avg interactions/user: 6.9


In [17]:
print("\nSaving sampled dataset...")
folder = r"D:\GoodreadsData_NEW"
os.makedirs(folder, exist_ok=True)
data.to_csv(os.path.join(folder, 'sampled_1M_data.csv'), index=False)
print("Saved to: D:\\GoodreadsData_NEW\\sampled_1M_data.csv")


Saving sampled dataset...
Saved to: D:\GoodreadsData_NEW\sampled_1M_data.csv


In [20]:
# Create index mappings
users = sorted(data['user_id'].unique())
books = sorted(data['book_id'].unique())

user_to_idx = {user: idx for idx, user in enumerate(users)}
book_to_idx = {book: idx for idx, book in enumerate(books)}
idx_to_user = {idx: user for user, idx in user_to_idx.items()}
idx_to_book = {idx: book for book, idx in book_to_idx.items()}

print(f"Users: {len(users):,}")
print(f"Books: {len(books):,}")

Users: 20,780
Books: 7,208


In [22]:
row_indices = [user_to_idx[uid] for uid in data['user_id']]
col_indices = [book_to_idx[bid] for bid in data['book_id']]
ratings = data['rating'].values

rating_matrix = csr_matrix(
    (ratings, (row_indices, col_indices)),
    shape=(len(users), len(books)),
    dtype=np.float32
)

print(f"Matrix shape: {rating_matrix.shape}")
print(f"Non-zero entries: {rating_matrix.nnz:,}")
print(f"Memory: ~{rating_matrix.data.nbytes / (1024**3):.2f} GB")

print("Sparse matrix created!")

Matrix shape: (20780, 7208)
Non-zero entries: 144,343
Memory: ~0.00 GB
Sparse matrix created!


In [23]:
# Center Ratings

# Compute user means
user_rating_counts = np.diff(rating_matrix.indptr)
user_rating_sums = np.array(rating_matrix.sum(axis=1)).flatten()

# Avoid division by zero
user_means = np.divide(
    user_rating_sums, 
    user_rating_counts, 
    out=np.zeros_like(user_rating_sums), 
    where=user_rating_counts > 0
)

print(f"Mean user rating: {user_means[user_means > 0].mean():.3f}")

# Center ratings (subtract user means)
rating_matrix_centered = rating_matrix.copy().astype(np.float32)

for user_idx in range(rating_matrix_centered.shape[0]):
    start = rating_matrix_centered.indptr[user_idx]
    end = rating_matrix_centered.indptr[user_idx + 1]
    
    if end > start and user_means[user_idx] > 0:
        rating_matrix_centered.data[start:end] -= user_means[user_idx]

# Clean any NaN values
nan_count = np.isnan(rating_matrix_centered.data).sum()
if nan_count > 0:
    print(f"Cleaning {nan_count} NaN values...")
    rating_matrix_centered.data = np.nan_to_num(rating_matrix_centered.data, nan=0.0)

rating_matrix_centered.eliminate_zeros()

print(f"Ratings centered!")
print(f"Non-zero entries after centering: {rating_matrix_centered.nnz:,}")

Mean user rating: 2.109
Ratings centered!
Non-zero entries after centering: 126,435


In [29]:
# Check rating distribution
print("Rating distribution:")
print(data['rating'].value_counts().sort_index())
print(f"Zero ratings: {(data['rating'] == 0).sum()}")
print(f"Non-zero ratings: {(data['rating'] > 0).sum()}")

Rating distribution:
rating
0.0    78557
1.0     1336
2.0     3837
3.0    14723
4.0    24423
5.0    21467
Name: count, dtype: int64
Zero ratings: 78557
Non-zero ratings: 65786


In [24]:
# STEP 7: Compute Item-Item Similarity

start_time = datetime.now()

# Compute adjusted cosine similarity
item_similarity_sparse = cosine_similarity(
    rating_matrix_centered.T, 
    dense_output=False
)

elapsed = datetime.now() - start_time
print(f"Similarity computed in {elapsed}")
print(f"Shape: {item_similarity_sparse.shape}")
print(f"Non-zero similarities: {item_similarity_sparse.nnz:,}")
print(f"Memory: ~{item_similarity_sparse.data.nbytes / (1024**3):.2f} GB")

Similarity computed in 0:00:00.053827
Shape: (7208, 7208)
Non-zero similarities: 880,324
Memory: ~0.00 GB


In [25]:
# Filter by Minimum Common Users

# Compute co-occurrence matrix
print("  Computing co-occurrence...")
rating_binary = rating_matrix.copy()
rating_binary.data = np.ones_like(rating_binary.data)
cooccurrence = (rating_binary.T @ rating_binary).toarray()

# Convert to dense for filtering
item_similarity = item_similarity_sparse.toarray()

# Apply minimum common users filter
min_common = 5
print(f"  Applying filter (min_common={min_common})...")
item_similarity[cooccurrence < min_common] = 0
np.fill_diagonal(item_similarity, 1.0)

# Convert back to sparse
item_similarity_sparse = csr_matrix(item_similarity)
del item_similarity  # Free memory

print(f"Filtered!")
print(f"Non-zero similarities: {item_similarity_sparse.nnz:,}")

  Computing co-occurrence...
  Applying filter (min_common=5)...
Filtered!
Non-zero similarities: 7,232


In [26]:
# STEP 9: Compute Book Means

book_rating_counts = np.diff(rating_matrix.tocsc().indptr)
book_rating_sums = np.array(rating_matrix.sum(axis=0)).flatten()
book_means = np.divide(
    book_rating_sums, 
    book_rating_counts,
    out=np.zeros_like(book_rating_sums),
    where=book_rating_counts > 0
)

print(f"Mean book rating: {book_means[book_means > 0].mean():.3f}")
print("Book means computed!")

Mean book rating: 1.702
Book means computed!


In [27]:
# Save the Model

folder = r"D:\GoodreadsData_NEW"
os.makedirs(folder, exist_ok=True)

save_data = {
    'rating_matrix': rating_matrix,
    'item_similarity': item_similarity_sparse,
    'user_means': user_means,
    'book_means': book_means,
    'user_to_idx': user_to_idx,
    'book_to_idx': book_to_idx,
    'idx_to_user': idx_to_user,
    'idx_to_book': idx_to_book,
    'min_common': min_common
}

model_file = os.path.join(folder, 'sparse_itemcf_model.pkl')
print(f"Saving to: {model_file}")

with open(model_file, 'wb') as f:
    pickle.dump(save_data, f, protocol=pickle.HIGHEST_PROTOCOL)

file_size = os.path.getsize(model_file) / (1024**2)
print(f"Model saved! Size: {file_size:.1f} MB")

Saving to: D:\GoodreadsData_NEW\sparse_itemcf_model.pkl
Model saved! Size: 2.2 MB


In [28]:
# STEP 11: Define Prediction Function & Test

def predict_rating(user_id, book_id, k=25):
    """Predict rating for a user-book pair"""
    if user_id not in user_to_idx or book_id not in book_to_idx:
        return book_means.mean() if len(book_means) > 0 else 3.0
    
    user_idx = user_to_idx[user_id]
    book_idx = book_to_idx[book_id]
    
    # Get user's rated books
    user_ratings = rating_matrix[user_idx].toarray().flatten()
    rated_mask = user_ratings > 0
    
    if not rated_mask.any():
        return book_means[book_idx]
    
    # Get similarities to target book
    similarities = item_similarity_sparse[book_idx].toarray().flatten()
    similarities = similarities * rated_mask
    
    # Get top-k neighbors
    neighbor_indices = np.argsort(np.abs(similarities))[-k:]
    neighbor_indices = neighbor_indices[similarities[neighbor_indices] != 0]
    
    if len(neighbor_indices) == 0:
        return book_means[book_idx]
    
    # Weighted average prediction
    numerator = 0.0
    denominator = 0.0
    
    for idx in neighbor_indices:
        sim = similarities[idx]
        rating = user_ratings[idx]
        baseline = book_means[idx]
        numerator += sim * (rating - baseline)
        denominator += abs(sim)
    
    if denominator == 0:
        return book_means[book_idx]
    
    return np.clip(book_means[book_idx] + numerator / denominator, 0, 5)


def get_recommendations(user_id, n=10, k=25):
    """Get top N recommendations for a user"""
    if user_id not in user_to_idx:
        # Cold start: return most popular books
        top_books = np.argsort(book_means)[-n:][::-1]
        return [(idx_to_book[idx], book_means[idx]) for idx in top_books]
    
    user_idx = user_to_idx[user_id]
    
    # Get rated books
    user_ratings = rating_matrix[user_idx].toarray().flatten()
    rated_books = set(np.where(user_ratings > 0)[0])
    
    # Get unrated books
    all_books = set(range(len(book_means)))
    unrated_books = list(all_books - rated_books)
    
    # Predict ratings
    predictions = []
    for book_idx in unrated_books:
        book_id = idx_to_book[book_idx]
        pred_rating = predict_rating(user_id, book_id, k=k)
        predictions.append((book_id, pred_rating))
    
    # Sort and return top N
    predictions.sort(key=lambda x: x[1], reverse=True)
    return predictions[:n]


# Test with a random user
test_user = list(user_to_idx.keys())[0]
print(f"\nTest User: {test_user}")

# Get user's actual ratings
user_idx = user_to_idx[test_user]
user_ratings = rating_matrix[user_idx].toarray().flatten()
rated_indices = np.where(user_ratings > 0)[0]
rated_books = [(idx_to_book[idx], user_ratings[idx]) for idx in rated_indices]

print(f"\nUser has rated {len(rated_books)} books")
print("Sample ratings:")
for book_id, rating in rated_books[:5]:
    print(f"  Book {book_id}: {rating:.1f} stars")

# Test prediction on a rated book
if len(rated_books) > 0:
    test_book = rated_books[0][0]
    actual_rating = rated_books[0][1]
    predicted = predict_rating(test_user, test_book, k=25)
    print(f"\nPrediction test:")
    print(f"  Book {test_book}")
    print(f"  Actual rating: {actual_rating:.1f}")
    print(f"  Predicted rating: {predicted:.2f}")

# Get recommendations
print(f"\nTop 10 Recommendations for User {test_user}:")
recs = get_recommendations(test_user, n=10, k=25)
for i, (book_id, pred_rating) in enumerate(recs, 1):
    print(f"  {i:2d}. Book {book_id:6d} - Predicted: {pred_rating:.2f} stars")

print("Model Summary:")
print(f"Dataset: {len(data):,} interactions")
print(f"Users: {len(users):,}")
print(f"Books: {len(books):,}")
print(f"Model size: {file_size:.1f} MB")
print(f"Prediction function: predict_rating(user_id, book_id, k=25)")
print(f"Recommendation function: get_recommendations(user_id, n=10, k=25)")


Test User: 5

User has rated 2 books
Sample ratings:
  Book 6955: 4.0 stars
  Book 7057: 5.0 stars

Prediction test:
  Book 6955
  Actual rating: 4.0
  Predicted rating: 4.00

Top 10 Recommendations for User 5:
   1. Book  23153 - Predicted: 5.00 stars
   2. Book  23368 - Predicted: 5.00 stars
   3. Book  59387 - Predicted: 5.00 stars
   4. Book  15362 - Predicted: 4.86 stars
   5. Book  19410 - Predicted: 4.86 stars
   6. Book    270 - Predicted: 4.75 stars
   7. Book  20658 - Predicted: 4.75 stars
   8. Book    665 - Predicted: 4.50 stars
   9. Book  14479 - Predicted: 4.50 stars
  10. Book  19488 - Predicted: 4.50 stars
Model Summary:
Dataset: 144,343 interactions
Users: 20,780
Books: 7,208
Model size: 2.2 MB
Prediction function: predict_rating(user_id, book_id, k=25)
Recommendation function: get_recommendations(user_id, n=10, k=25)


==========================================================================

In [15]:
# print("Filtering books with minimum 20 interactions...")
# data = original_data.groupby('book_id').filter(lambda x: len(x) >= 20)

Filtering books with minimum 20 interactions...


In [17]:
# print(f"Filtered dataset:")
# print(f"Total interactions: {len(data):,}")
# print(f"Unique users: {data['user_id'].nunique():,}")
# print(f"Unique books: {data['book_id'].nunique():,}")
# print(f"Sparsity: {100 * (1 - len(data) / (data['user_id'].nunique() * data['book_id'].nunique())):.2f}%")


Filtered dataset:
Total interactions: 115,346,986
Unique users: 254,194
Unique books: 578,590
Sparsity: 99.92%


In [19]:
#creating sparse rating matrix

# Create index mappings
users = sorted(data['user_id'].unique())
books = sorted(data['book_id'].unique())

user_to_idx = {user: idx for idx, user in enumerate(users)}
book_to_idx = {book: idx for idx, book in enumerate(books)}
idx_to_user = {idx: user for user, idx in user_to_idx.items()}
idx_to_book = {idx: book for book, idx in book_to_idx.items()}

print(f"Number of users: {len(users):,}")
print(f"Number of books: {len(books):,}")

# Create sparse matrix
row_indices = [user_to_idx[uid] for uid in data['user_id']]
col_indices = [book_to_idx[bid] for bid in data['book_id']]
ratings = data['rating'].values

rating_matrix = csr_matrix(
    (ratings, (row_indices, col_indices)),
    shape=(len(users), len(books)),
    dtype=np.float32
)

print(f"Rating matrix shape: {rating_matrix.shape}")
print(f"Non-zero entries: {rating_matrix.nnz:,}")
print(f"Matrix memory: ~{rating_matrix.data.nbytes / (1024**3):.2f} GB")

Number of users: 254,194
Number of books: 578,590
Rating matrix shape: (254194, 578590)
Non-zero entries: 115,346,986
Matrix memory: ~0.43 GB


In [20]:
# Computing user means and centering ratings

# Compute user means
user_rating_counts = np.diff(rating_matrix.indptr)
user_rating_sums = np.array(rating_matrix.sum(axis=1)).flatten()
user_means = np.divide(user_rating_sums, user_rating_counts, 
                      out=np.zeros_like(user_rating_sums), 
                      where=user_rating_counts!=0)

print(f"Mean user rating: {user_means.mean():.3f}")
print(f"Starting to center ratings...")

# Center ratings
rating_matrix_centered = rating_matrix.copy().astype(np.float32)

for user_idx in range(rating_matrix_centered.shape[0]):
    start = rating_matrix_centered.indptr[user_idx]
    end = rating_matrix_centered.indptr[user_idx + 1]
    
    if end > start:
        rating_matrix_centered.data[start:end] -= user_means[user_idx]
    
    # Progress indicator every 50K users
    if (user_idx + 1) % 50000 == 0:
        print(f"  Centered {user_idx + 1:,} / {len(users):,} users ({100*(user_idx+1)/len(users):.1f}%)...")

print("Ratings centered successfully ---> DONE")

Mean user rating: nan
Starting to center ratings...
  Centered 50,000 / 254,194 users (19.7%)...
  Centered 100,000 / 254,194 users (39.3%)...
  Centered 150,000 / 254,194 users (59.0%)...
  Centered 200,000 / 254,194 users (78.7%)...
  Centered 250,000 / 254,194 users (98.4%)...
Ratings centered successfully ---> DONE


In [21]:
#SAVING CHECKPOINT!

folder = r"D:\GoodreadsData_SPARSE"
os.makedirs(folder, exist_ok=True)

checkpoint_data = {
    'rating_matrix': rating_matrix,
    'rating_matrix_centered': rating_matrix_centered,
    'user_means': user_means,
    'user_to_idx': user_to_idx,
    'book_to_idx': book_to_idx,
    'idx_to_user': idx_to_user,
    'idx_to_book': idx_to_book,
    'users': users,
    'books': books
}

checkpoint_file = os.path.join(folder, 'checkpoint_before_similarity.pkl')

print(f"Saving checkpoint to: {checkpoint_file}")
with open(checkpoint_file, 'wb') as f:
    pickle.dump(checkpoint_data, f, protocol=pickle.HIGHEST_PROTOCOL)

print("Checkpoint saved successfully!")
print(f"File size: {os.path.getsize(checkpoint_file) / (1024**3):.2f} GB")

Saving checkpoint to: D:\GoodreadsData_SPARSE\checkpoint_before_similarity.pkl
Checkpoint saved successfully!
File size: 1.75 GB


In [None]:
# print("Loading checkpoint...")

# folder = r"D:\GoodreadsData"
# checkpoint_file = os.path.join(folder, 'checkpoint_before_similarity.pkl')

# with open(checkpoint_file, 'rb') as f:
#     checkpoint_data = pickle.load(f)

# # Restore all variables
# rating_matrix = checkpoint_data['rating_matrix']
# rating_matrix_centered = checkpoint_data['rating_matrix_centered']
# user_means = checkpoint_data['user_means']
# user_to_idx = checkpoint_data['user_to_idx']
# book_to_idx = checkpoint_data['book_to_idx']
# idx_to_user = checkpoint_data['idx_to_user']
# idx_to_book = checkpoint_data['idx_to_book']
# users = checkpoint_data['users']
# books = checkpoint_data['books']

# print("âœ“ Checkpoint loaded successfully!")
# print(f"  Rating matrix: {rating_matrix.shape}")
# print(f"  Users: {len(users):,}")
# print(f"  Books: {len(books):,}")
# print("\nYou can now continue with Step 4!")

In [23]:
# Checking for NaN values
print(f"NaN values in rating_matrix_centered: {np.isnan(rating_matrix_centered.data).sum():,}")

if np.isnan(rating_matrix_centered.data).any():
    print("Cleaning NaN values (replacing with 0)...")
    rating_matrix_centered.data = np.nan_to_num(rating_matrix_centered.data, nan=0.0)
    print("NaN values cleaned!")
else:
    print("No NaN values found!")

print(f"NaN count: {np.isnan(rating_matrix_centered.data).sum()}")

NaN values in rating_matrix_centered: 220
Cleaning NaN values (replacing with 0)...
NaN values cleaned!
NaN count: 0


In [24]:
rating_matrix_centered.eliminate_zeros()
print(f"Non-zero entries: {rating_matrix_centered.nnz:,}")

Non-zero entries: 114,903,391


In [25]:
#Computing adjusted cosine similarity

start_time = datetime.now()

try:
    item_similarity_sparse = cosine_similarity(
        rating_matrix_centered.T, 
        dense_output=False
    )
    
    elapsed = datetime.now() - start_time
    print(f"DONEE! Similarity computed in {elapsed}")
    print(f"Shape: {item_similarity_sparse.shape}")
    print(f"Non-zero similarities: {item_similarity_sparse.nnz:,}")
    print(f"Memory: ~{item_similarity_sparse.data.nbytes / (1024**3):.2f} GB")
    
    # Save immediately
    folder = r"D:\GoodreadsData_SPARSE"
    with open(os.path.join(folder, 'similarity_matrix_raw.pkl'), 'wb') as f:
        pickle.dump(item_similarity_sparse, f, protocol=pickle.HIGHEST_PROTOCOL)
    print("Similarity matrix saved")
    
except MemoryError:
    print("OUT OF MEMORY!")
    
except Exception as e:
    print(f"Error: {e}")
    import traceback
    traceback.print_exc()

OUT OF MEMORY!
Suggestion: Increase threshold to 30 and restart


In [29]:
# SAMPLING STRATEGY: Hybrid (Popular Books + Active Users)

# Keep books with at least 50 interactions (stricter filter)
data = data.groupby('book_id').filter(lambda x: len(x) >= 50)
print(f"After filtering: {len(data):,} interactions, {data['book_id'].nunique():,} books")

# Keep users with at least 10 interactions (active users)
data = data.groupby('user_id').filter(lambda x: len(x) >= 10)
print(f"After filtering: {len(data):,} interactions, {data['user_id'].nunique():,} users")

# If still too large, sample to target size
target_max = 1_000_000  # 1M interactions
if len(data) > target_max:
    print(f"Dataset still large ({len(data):,}), sampling to {target_max:,}...")
    sample_fraction = target_max / len(data)
    data = data.sample(frac=sample_fraction, random_state=42)
    print(f"Sampled to: {len(data):,} interactions")

print("FINAL DATASET:")
print(f"Total interactions: {len(data):,}")
print(f"Unique books: {data['movie_id'].nunique():,}")
print(f"Unique users: {data['user_id'].nunique():,}")
print(f"Sparsity: {100 * (1 - len(data) / (data['user_id'].nunique() * data['movie_id'].nunique())):.2f}%")
print(f"Avg interactions per book: {len(data) / data['movie_id'].nunique():.1f}")
print(f"Avg interactions per user: {len(data) / data['user_id'].nunique():.1f}")

After filtering: 106,826,056 interactions, 306,020 books


MemoryError: Unable to allocate 2.39 GiB for an array with shape (3, 106825306) and data type float64

-----------------------------------------

In [6]:
# avoids calculating similarity for millions of books
data = original_data.groupby('movie_id').filter(lambda x: len(x) >= 30)

In [8]:
data.head()

Unnamed: 0,user_id,movie_id,is_read,rating,is_reviewed
0,0,948,1.0,5.0,0.0
1,0,947,1.0,5.0,1.0
2,0,946,1.0,5.0,0.0
3,0,945,1.0,5.0,0.0
4,0,944,1.0,5.0,0.0


In [9]:
len(data.movie_id.unique())

444259

In [10]:
len(data)

112120510

In [12]:
print("Before filtering:", len(original_data))
print("After filtering:", len(data))

Before filtering: 123731803
After filtering: 112120510


In [13]:
print("Users:", data.user_id.nunique())
print("Books:", data.movie_id.nunique())
print("Interactions:", len(data))

Users: 254191
Books: 444259
Interactions: 112120510


In [14]:
user_to_movie = data.groupby('user_id')['movie_id'].apply(list).to_dict()
movie_to_user = data.groupby('movie_id')['user_id'].apply(list).to_dict()

In [15]:
user_to_movie[0][0]

948

In [16]:
user_to_ratings = data.groupby('user_id')['rating'].apply(list).to_dict()

In [23]:
user_to_ratings[0][0]

5.0

In [29]:
import pickle

# Replace this path with your folder on D: drive
folder = r"D:\GoodreadsData"

# Make sure the folder exists
import os
os.makedirs(folder, exist_ok=True)

# Save dictionaries
with open(os.path.join(folder, "user_to_movie.pkl"), "wb") as f:
    pickle.dump(user_to_movie, f, protocol=pickle.HIGHEST_PROTOCOL)

with open(os.path.join(folder, "user_to_ratings.pkl"), "wb") as f:
    pickle.dump(user_to_ratings, f, protocol=pickle.HIGHEST_PROTOCOL)

with open(os.path.join(folder, "movie_to_user.pkl"), "wb") as f:
    pickle.dump(movie_to_user, f, protocol=pickle.HIGHEST_PROTOCOL)

In [30]:
user_to_movie_rating = (
    data.groupby('user_id')
        .apply(lambda x: list(zip(x['movie_id'], x['rating'])))
        .to_dict()
)

In [38]:
user_to_movie_rating[0]

[(948, 5.0),
 (947, 5.0),
 (946, 5.0),
 (945, 5.0),
 (944, 5.0),
 (943, 5.0),
 (942, 5.0),
 (941, 5.0),
 (940, 5.0),
 (939, 5.0),
 (938, 5.0),
 (937, 4.0),
 (936, 4.0),
 (935, 4.0),
 (934, 5.0),
 (933, 4.0),
 (932, 4.0),
 (931, 5.0),
 (930, 2.0),
 (929, 4.0),
 (928, 4.0),
 (927, 5.0),
 (924, 5.0),
 (923, 5.0),
 (922, 5.0),
 (921, 4.0),
 (920, 5.0),
 (919, 5.0),
 (918, 5.0),
 (917, 3.0),
 (916, 5.0),
 (915, 5.0),
 (914, 4.0),
 (912, 5.0),
 (910, 0.0),
 (909, 4.0),
 (908, 4.0),
 (907, 0.0),
 (906, 0.0),
 (905, 4.0),
 (904, 0.0),
 (903, 5.0),
 (901, 4.0),
 (900, 4.0),
 (899, 4.0),
 (898, 3.0),
 (897, 5.0),
 (896, 0.0),
 (895, 5.0),
 (893, 4.0),
 (892, 5.0),
 (891, 4.0),
 (890, 5.0),
 (889, 5.0),
 (888, 4.0),
 (887, 0.0),
 (886, 3.0),
 (885, 0.0),
 (884, 0.0),
 (883, 4.0),
 (882, 5.0),
 (881, 0.0),
 (880, 5.0),
 (879, 5.0),
 (878, 4.0),
 (877, 5.0),
 (876, 4.0),
 (875, 4.0),
 (874, 3.0),
 (873, 4.0),
 (872, 3.0),
 (871, 2.0),
 (870, 3.0),
 (869, 3.0),
 (868, 0.0),
 (867, 3.0),
 (866, 4.0),

In [32]:
with open(os.path.join(folder, "user_to_movie_rating.pkl"), "wb") as f:
    pickle.dump(user_to_movie_rating, f, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# with open(r"D:\GoodreadsData\user_to_movie.pkl", "rb") as f:
#     user_to_movie = pickle.load(f)

# with open(r"D:\GoodreadsData\user_to_ratings.pkl", "rb") as f:
#     user_to_ratings = pickle.load(f)

# with open(r"D:\GoodreadsData\movie_to_user.pkl", "rb") as f:
#     movie_to_user = pickle.load(f)

# with open(r"D:\GoodreadsData\user_to_movie_rating.pkl", "rb") as f:
#     user_to_movie_rating = pickle.load(f)

In [34]:
user_mean = {}
for user, movie_ratings in user_to_movie_rating.items():
    total = 0
    count = 0
    for movie, rating in movie_ratings:
        total += rating
        count += 1
    user_mean[user] = total / count

In [36]:
user_mean[0]

2.1421911421911424

In [40]:
movie_mean = data.groupby('movie_id')['rating'].mean().to_dict()

In [41]:
movie_mean[0]

0.029017857142857144

In [42]:
import pickle

# Save user_mean
with open("user_mean.pkl", "wb") as f:
    pickle.dump(user_mean, f, protocol=pickle.HIGHEST_PROTOCOL)

# Save movie_mean
with open("movie_mean.pkl", "wb") as f:
    pickle.dump(movie_mean, f, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# import pickle

# with open("user_mean.pkl", "rb") as f:
#     user_mean = pickle.load(f)

# with open("movie_mean.pkl", "rb") as f:
#     movie_mean = pickle.load(f)

In [43]:
def adjusted_cosine_similarity(movie_i, movie_j, min_common=5):
    users_i = set(movie_to_user[movie_i])
    users_j = set(movie_to_user[movie_j])
    common_users = users_i.intersection(users_j)

    if len(common_users) < min_common:
        return 0

    numerator = 0
    denominator_i = 0
    denominator_j = 0

    for user in common_users:
        rating_i = user_movie_to_rating[(user, movie_i)]
        rating_j = user_movie_to_rating[(user, movie_j)]
        diff_i = rating_i - user_mean[user]
        diff_j = rating_j - user_mean[user]

        numerator += diff_i * diff_j
        denominator_i += diff_i ** 2
        denominator_j += diff_j ** 2

    return numerator / (math.sqrt(denominator_i) * math.sqrt(denominator_j))


In [44]:
def compute_similarity_matrix(min_common=5):
    # 1. Precompute co-rated pairs (scales well)
    co_rated = defaultdict(set)

    for user, movies in user_to_movie.items():
        for i in range(len(movies)):
            for j in range(i+1, len(movies)):
                a, b = movies[i], movies[j]
                co_rated[a].add(b)
                co_rated[b].add(a)

    # 2. Compute similarity only for those pairs
    movies = list(movie_to_user.keys())
    similarity_matrix = {}

    for idx, movie_i in enumerate(movies):
        similarity_matrix[movie_i] = {}

        for movie_j in co_rated[movie_i]:
            sim = adjusted_cosine_similarity(movie_i, movie_j, min_common)
            similarity_matrix[movie_i][movie_j] = sim

        # Save incrementally to avoid memory blowup
        with open(f'sim_{movie_i}.pkl', 'wb') as f:
            pickle.dump(similarity_matrix[movie_i], f)

        if idx % 200 == 0:
            print(f"{idx} movies processed...")

    return similarity_matrix


In [45]:
import pickle
import os

folder = r"D:\GoodreadsData"

similarity_matrix = {}

for movie in movie_to_user.keys():
    filepath = os.path.join(folder, f"sim_{movie}.pkl")
    
    try:
        with open(filepath, "rb") as f:
            similarity_matrix[movie] = pickle.load(f)
    except FileNotFoundError:
        similarity_matrix[movie] = {}

In [49]:
similarity_matrix

{0: {},
 1: {},
 2: {},
 3: {},
 5: {},
 6: {},
 7: {},
 8: {},
 9: {},
 10: {},
 11: {},
 12: {},
 13: {},
 14: {},
 15: {},
 16: {},
 17: {},
 18: {},
 19: {},
 20: {},
 21: {},
 22: {},
 23: {},
 25: {},
 26: {},
 27: {},
 28: {},
 29: {},
 30: {},
 31: {},
 32: {},
 33: {},
 34: {},
 35: {},
 36: {},
 37: {},
 38: {},
 39: {},
 40: {},
 41: {},
 42: {},
 43: {},
 44: {},
 46: {},
 47: {},
 48: {},
 49: {},
 50: {},
 51: {},
 52: {},
 53: {},
 55: {},
 56: {},
 57: {},
 58: {},
 59: {},
 60: {},
 61: {},
 62: {},
 63: {},
 64: {},
 65: {},
 66: {},
 67: {},
 68: {},
 69: {},
 70: {},
 71: {},
 72: {},
 74: {},
 75: {},
 76: {},
 77: {},
 78: {},
 79: {},
 80: {},
 81: {},
 82: {},
 83: {},
 84: {},
 85: {},
 86: {},
 87: {},
 88: {},
 89: {},
 90: {},
 92: {},
 93: {},
 95: {},
 96: {},
 97: {},
 98: {},
 99: {},
 100: {},
 101: {},
 103: {},
 104: {},
 105: {},
 106: {},
 108: {},
 109: {},
 110: {},
 111: {},
 112: {},
 113: {},
 115: {},
 116: {},
 117: {},
 118: {},
 119: {},
 1

In [46]:
def predict_rating(user, target_movie, k=25):
    rated_movies = user_to_movie[user]
    similarities = []

    for movie in rated_movies:
        if (target_movie in similarity_matrix) and (movie in similarity_matrix[target_movie]):
            similarity = similarity_matrix[target_movie][movie]
        elif (movie in similarity_matrix) and (target_movie in similarity_matrix[movie]):
            similarity = similarity_matrix[movie][target_movie]
        else:
            similarity = 0

        similarities.append((movie, similarity))

    similarities.sort(key=lambda x: abs(x[1]), reverse=True)
    neighbors = similarities[:k]

    numerator = 0
    denominator = 0
    for movie, sim in neighbors:
        rating = user_movie_to_rating[(user, movie)]
        base = movie_mean[movie]
        numerator += sim * (rating - base)
        denominator += abs(sim)

    return movie_mean[target_movie] + numerator / (denominator + 1e-9)


In [47]:
target_user = 948   # example from your CSV

all_movies = set(movie_to_user.keys())
rated_movies = set(user_to_movie[target_user])
candidate_movies = all_movies - rated_movies

predictions = {}

for idx, movie in enumerate(candidate_movies):
    if idx % 200 == 0:
        print(f"{idx} movies processed...")

    predictions[movie] = predict_rating(target_user, movie)


0 movies processed...


NameError: name 'user_movie_to_rating' is not defined

In [None]:
top_5 = sorted(predictions.items(), key=lambda x: x[1], reverse=True)[:5]

print("\nTop 5 Recommendations:")
for movie, pred in top_5:
    print(f"Book {movie}, Predicted Rating: {round(pred, 3)}")