In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix
import pickle


In [2]:
# Mount Google Drive at /content/drive (Colab-specific module); the data
# paths used later in this notebook live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Building the Item Embeddings (Truncated SVD)

In [3]:
def build_item_embeddings(reviews_df, n_factors=50):
    """Learn dense item embeddings from user ratings via truncated SVD.

    A sparse user x item rating matrix is built from ``reviews_df``,
    transposed so that rows are items, and reduced to ``n_factors``
    dimensions with ``TruncatedSVD``.

    Parameters
    ----------
    reviews_df : pandas.DataFrame
        Must contain the columns ``reviewerID`` (user id), ``asin`` (item id)
        and ``overall`` (rating). Rows with a missing value in any of these
        columns are ignored.
    n_factors : int, default 50
        Number of SVD components, i.e. the width of each item embedding.

    Returns
    -------
    dict
        ``'embeddings'``  -- array with one row per item, as returned by
        ``TruncatedSVD.fit_transform``;
        ``'item_to_idx'`` -- maps item id to its row index in the embeddings;
        ``'idx_to_item'`` -- the inverse mapping.

    Raises
    ------
    KeyError
        If a required column is missing from ``reviews_df``.
    ValueError
        If no complete review rows are available.
    """
    required_columns = ['reviewerID', 'asin', 'overall']
    missing_columns = [col for col in required_columns if col not in reviews_df.columns]
    if missing_columns:
        raise KeyError(f"reviews_df is missing required column(s): {missing_columns}")

    # Rows without a user, an item or a rating cannot be placed in the matrix.
    interactions = reviews_df[required_columns].dropna()
    n_dropped = len(reviews_df) - len(interactions)
    if n_dropped:
        print(f"Dropped {n_dropped} rows with missing user, item or rating")
    if interactions.empty:
        raise ValueError(
            "reviews_df contains no complete (reviewerID, asin, overall) rows"
        )

    # factorize assigns integer codes in order of first appearance, which is
    # the same numbering enumerate(unique()) gives, without a per-row Python
    # dictionary lookup.
    user_codes, unique_users = pd.factorize(interactions['reviewerID'].to_numpy())
    item_codes, unique_items = pd.factorize(interactions['asin'].to_numpy())
    n_users = len(unique_users)
    n_items = len(unique_items)

    print(f"Found {n_users} users and {n_items} items")

    # Create mapping dictionaries
    item_to_idx = {item: i for i, item in enumerate(unique_items)}
    idx_to_item = {i: item for item, i in item_to_idx.items()}

    # csr_matrix sums values that share a (row, col) coordinate, so repeated
    # reviews of one item by one user would yield an inflated rating.
    # Average them so that every cell stays on the original rating scale.
    pair_ratings = (
        pd.DataFrame({
            'user': user_codes,
            'item': item_codes,
            'rating': interactions['overall'].to_numpy(),
        })
        .groupby(['user', 'item'], sort=False, as_index=False)['rating']
        .mean()
    )

    print(f"Creating sparse matrix of shape ({n_users}, {n_items})")
    user_item_sparse = csr_matrix(
        (
            pair_ratings['rating'].to_numpy(),
            (pair_ratings['user'].to_numpy(), pair_ratings['item'].to_numpy()),
        ),
        shape=(n_users, n_items),
    )

    print("Applying SVD...")
    svd = TruncatedSVD(n_components=n_factors, random_state=42)
    # Transpose so that rows are items: fit_transform then returns one
    # reduced vector per item.
    item_embeddings = svd.fit_transform(user_item_sparse.T)

    print("Item embeddings created successfully")
    return {
        'embeddings': item_embeddings,
        'item_to_idx': item_to_idx,
        'idx_to_item': idx_to_item,
    }

Saving to Drive

In [4]:
def save_embeddings_to_drive(embeddings_data, file_path):
    """Pickle ``embeddings_data`` to ``file_path``, creating parent folders.

    Parameters
    ----------
    embeddings_data : object
        Any picklable object (e.g. the dict returned by
        ``build_item_embeddings``).
    file_path : str
        Destination file. Missing parent directories are created; a bare
        filename is written to the current working directory.
    """
    import os

    directory = os.path.dirname(file_path)
    # dirname is '' for a bare filename and os.makedirs('') raises, so only
    # create folders when there is a directory component. exist_ok avoids a
    # failure if the folder appears between a check and the creation.
    if directory:
        os.makedirs(directory, exist_ok=True)

    print(f"Saving embeddings to {file_path}...")
    with open(file_path, 'wb') as f:
        pickle.dump(embeddings_data, f, protocol=4)

    print("Embeddings saved successfully to Google Drive")


In [5]:
if __name__ == "__main__":
    # Input reviews and output location, both on the mounted Google Drive.
    reviews_path = "/content/drive/MyDrive/bt4222data/Reviews Data Cleaned/cleaned_reviews.csv"
    embeddings_path = "/content/drive/MyDrive/bt4222data/embeddings/item_embeddings.pkl"

    # Load reviews data
    reviews_df = pd.read_csv(reviews_path)

    # Build item embeddings instead of similarity matrix
    item_embeddings = build_item_embeddings(reviews_df, n_factors=50)

    # Persist through the shared helper, which creates the target directory
    # when needed and pickles the result, rather than repeating that logic.
    save_embeddings_to_drive(item_embeddings, embeddings_path)

Found 192403 users and 63001 items
Creating sparse matrix of shape (192403, 63001)
Applying SVD...
Item embeddings created successfully
Saving item embeddings to /content/drive/MyDrive/bt4222data/embeddings/item_embeddings.pkl...
Item embeddings saved successfully to Google Drive
