# Train Matrix Factorization Model (Retrieval Stage)

This notebook trains the Matrix Factorization model for the retrieval stage of the two-stage recommendation system.


In [None]:
import sys
import os
import pandas as pd
import pickle
import io
import numpy as np
from typing import Dict, Tuple, List
from datetime import datetime

# Install surprise library if not already installed
try:
    import surprise
except ImportError:
    print("Installing surprise library...")
    import subprocess
    subprocess.check_call(["pip", "install", "scikit-surprise"])
    import surprise

import boto3
import sagemaker
from sklearn.neighbors import NearestNeighbors
from surprise import SVD, Dataset, Reader

# Configuration
def get_config():
    """Return the static settings used throughout this notebook.

    Returns:
        dict: A freshly built dictionary holding the AWS region, the IAM role
        ARN, the default S3 bucket, and the user/item feature group names.
    """
    settings = dict(
        region="ap-south-1",
        role_arn="arn:aws:iam::487512486150:role/recommendationsystem-sagemaker-role",
        bucket="amazon-sagemaker-local-dev-store",
        user_feature_group_name="all-beauty-users-1766218384",
        item_feature_group_name="all-beauty-items-1766218384",
    )
    return settings

def initialize_sessions(config: dict = None):
    """Create the boto3/SageMaker sessions and service clients for the notebook.

    Args:
        config: Settings dictionary providing at least the 'region' and
            'bucket' keys. When omitted, get_config() supplies it.

    Returns:
        dict with the keys 'boto_session', 'sagemaker_session',
        'featurestore_runtime', 'sagemaker_client' and 'config'.
    """
    config = get_config() if config is None else config
    region = config['region']

    boto_session = boto3.Session(region_name=region)
    handles = {
        'boto_session': boto_session,
        'sagemaker_session': sagemaker.Session(
            boto_session=boto_session,
            default_bucket=config['bucket'],
        ),
        'featurestore_runtime': boto_session.client(
            service_name='sagemaker-featurestore-runtime',
            region_name=region,
        ),
        'sagemaker_client': boto_session.client(
            service_name='sagemaker',
            region_name=region,
        ),
        'config': config,
    }

    print(f"Initialized SageMaker session in {config['region']}")
    print(f"Default bucket: {config['bucket']}")

    return handles

# Matrix Factorization Functions
def prepare_mf_training_data(interactions_df, output_s3_path, bucket, boto_session):
    """Upload the MF training CSV and the ID/index mappings to S3.

    The CSV has no header and three columns (user_id, parent_asin, rating).
    The mappings assign every distinct user and item a contiguous index in
    sorted-ID order and are pickled next to the CSV under
    ``<key without .csv>_mappings.pkl``.

    Args:
        interactions_df: DataFrame with 'user_id', 'parent_asin' and 'rating'
            columns.
        output_s3_path: Destination of the CSV, either a full
            ``s3://<bucket>/<key>`` URI or a bare key inside ``bucket``.
        bucket: Name of the S3 bucket to upload to.
        boto_session: Object whose ``client('s3')`` returns an S3 client.

    Returns:
        tuple: (S3 URI of the uploaded CSV, mappings dict). The dict has the
        keys 'user_to_idx', 'item_to_idx', 'idx_to_user', 'idx_to_item',
        'num_users' and 'num_items'.
    """
    unique_users = sorted(interactions_df['user_id'].unique())
    unique_items = sorted(interactions_df['parent_asin'].unique())

    user_to_idx = {user: idx for idx, user in enumerate(unique_users)}
    item_to_idx = {item: idx for idx, item in enumerate(unique_items)}

    num_users = len(unique_users)
    num_items = len(unique_items)

    print(f"Number of unique users: {num_users}")
    print(f"Number of unique items: {num_items}")

    # Strip only a *leading* "s3://<bucket>/" so a key that happens to contain
    # that text further in is not mangled.
    bucket_prefix = f's3://{bucket}/'
    if output_s3_path.startswith(bucket_prefix):
        s3_path = output_s3_path[len(bucket_prefix):]
    else:
        s3_path = output_s3_path

    s3_client = boto_session.client('s3')

    # Serialize in memory instead of going through a fixed /tmp file, which
    # was never removed and could collide between concurrent runs.
    training_data = interactions_df[['user_id', 'parent_asin', 'rating']]
    csv_bytes = training_data.to_csv(index=False, header=False).encode('utf-8')
    s3_client.upload_fileobj(io.BytesIO(csv_bytes), bucket, s3_path)

    s3_uri = f"s3://{bucket}/{s3_path}"
    print(f"Training data saved to: {s3_uri}")

    mappings = {
        'user_to_idx': user_to_idx,
        'item_to_idx': item_to_idx,
        'idx_to_user': {idx: user for user, idx in user_to_idx.items()},
        'idx_to_item': {idx: item for item, idx in item_to_idx.items()},
        'num_users': num_users,
        'num_items': num_items
    }

    # Always derive a key that differs from the CSV key: a plain
    # str.replace('.csv', ...) returned the CSV key unchanged when the path had
    # no '.csv', so the pickle overwrote the training data.
    key_stem = s3_path[:-len('.csv')] if s3_path.endswith('.csv') else s3_path
    mappings_key = f"{key_stem}_mappings.pkl"

    mappings_buffer = io.BytesIO()
    pickle.dump(mappings, mappings_buffer)
    mappings_buffer.seek(0)
    s3_client.upload_fileobj(mappings_buffer, bucket, mappings_key)

    return s3_uri, mappings

def _copy_factors(n_inner, to_raw_id, raw_to_idx, factors, target):
    """Copy each Surprise factor row into `target` at its mapped index; return how many rows were skipped."""
    skipped = 0
    for inner_id in range(n_inner):
        try:
            target[raw_to_idx[to_raw_id(inner_id)]] = factors[inner_id]
        except (KeyError, IndexError):
            # Raw ID missing from the mappings, or mapped index outside the
            # target matrix: leave that row at zero, but count it.
            skipped += 1
    return skipped


def train_matrix_factorization(training_data_s3_uri, mappings, bucket, sagemaker_session, boto_session, 
                               n_factors=64, n_epochs=20, lr_all=0.005, reg_all=0.02):
    """Train a Surprise SVD model and export user/item embedding matrices.

    Downloads the header-less ``user,item,rating`` CSV from S3, fits SVD on the
    full training set, re-indexes the learned factors (``model.pu`` and
    ``model.qi``) with the supplied mappings, and uploads both matrices to
    ``mf-embeddings/user_embeddings.npy`` and
    ``mf-embeddings/item_embeddings.npy`` in ``bucket``.

    Args:
        training_data_s3_uri: ``s3://<bucket>/<key>`` URI of the training CSV.
        mappings: Dict with 'user_to_idx', 'item_to_idx', 'num_users' and
            'num_items' (as produced by prepare_mf_training_data).
        bucket: S3 bucket holding the training data and receiving the
            embeddings.
        sagemaker_session: Not used by this function; kept so existing callers
            continue to work.
        boto_session: Object whose ``client('s3')`` returns an S3 client.
        n_factors: Number of latent factors (embedding width).
        n_epochs: Number of SVD training epochs.
        lr_all: Learning rate passed to SVD.
        reg_all: Regularization term passed to SVD.

    Returns:
        tuple: (user_embeddings, item_embeddings) NumPy arrays of shape
        (num_users, n_factors) and (num_items, n_factors). Rows whose ID could
        not be matched stay all-zero.
    """
    import tempfile

    s3_client = boto_session.client('s3')

    # Strip only the leading "s3://<bucket>/"; removing every "<bucket>/"
    # occurrence would corrupt keys that contain the bucket name.
    bucket_prefix = f's3://{bucket}/'
    if training_data_s3_uri.startswith(bucket_prefix):
        s3_path = training_data_s3_uri[len(bucket_prefix):]
    else:
        # Non-canonical input: keep the original lenient parsing.
        s3_path = training_data_s3_uri.replace('s3://', '').replace(f'{bucket}/', '')

    reader = Reader(line_format='user item rating', sep=',', rating_scale=(1, 5))

    # A private temporary directory avoids clobbering a shared /tmp file and
    # is removed automatically once the ratings have been read.
    with tempfile.TemporaryDirectory() as work_dir:
        local_file = os.path.join(work_dir, 'mf_training_data.csv')
        print(f"Downloading training data from {training_data_s3_uri}...")
        s3_client.download_file(bucket, s3_path, local_file)
        data = Dataset.load_from_file(local_file, reader=reader)
        trainset = data.build_full_trainset()

    model = SVD(n_factors=n_factors, n_epochs=n_epochs, lr_all=lr_all, reg_all=reg_all, random_state=42)

    print("Starting Matrix Factorization training with Surprise SVD...")
    model.fit(trainset)
    print("Training complete!")

    num_users = mappings['num_users']
    num_items = mappings['num_items']

    user_embeddings = np.zeros((num_users, n_factors))
    item_embeddings = np.zeros((num_items, n_factors))

    skipped_users = _copy_factors(
        trainset.n_users, trainset.to_raw_uid, mappings['user_to_idx'], model.pu, user_embeddings
    )
    skipped_items = _copy_factors(
        trainset.n_items, trainset.to_raw_iid, mappings['item_to_idx'], model.qi, item_embeddings
    )
    if skipped_users or skipped_items:
        # NOTE(review): Surprise documents raw IDs read from a file as strings,
        # so mappings keyed by non-string IDs would skip every row -- confirm
        # the ID types if this warning appears.
        print(
            f"Warning: {skipped_users} users and {skipped_items} items could not be "
            "matched to the mappings; their embeddings remain zero"
        )

    print(f"User embeddings shape: {user_embeddings.shape}")
    print(f"Item embeddings shape: {item_embeddings.shape}")

    embeddings_s3_key = "mf-embeddings/user_embeddings.npy"
    items_embeddings_s3_key = "mf-embeddings/item_embeddings.npy"

    user_buffer = io.BytesIO()
    np.save(user_buffer, user_embeddings)
    user_buffer.seek(0)
    s3_client.upload_fileobj(user_buffer, bucket, embeddings_s3_key)

    item_buffer = io.BytesIO()
    np.save(item_buffer, item_embeddings)
    item_buffer.seek(0)
    s3_client.upload_fileobj(item_buffer, bucket, items_embeddings_s3_key)

    print("Embeddings saved to S3")
    return user_embeddings, item_embeddings

def build_knn_index(item_embeddings, item_mappings, n_neighbors=100):
    """Fit a brute-force cosine nearest-neighbour index over item embeddings.

    Args:
        item_embeddings: Item vectors, one row per item index.
        item_mappings: Mapping of item ID -> row index.
        n_neighbors: Requested neighbours per query. One extra neighbour is
            requested, and the total is capped at the number of items.

    Returns:
        tuple: (fitted NearestNeighbors instance, dict of row index -> item ID).
    """
    item_count = len(item_embeddings)
    neighbor_budget = min(n_neighbors + 1, item_count)

    index = NearestNeighbors(n_neighbors=neighbor_budget, metric='cosine', algorithm='brute')
    index.fit(item_embeddings)

    reverse_lookup = {}
    for item_id, row in item_mappings.items():
        reverse_lookup[row] = item_id

    print(f"K-NN index built with {item_count} items")
    return index, reverse_lookup

def retrieve_top_k_candidates(user_id, user_embeddings, item_embeddings, user_mappings, idx_to_item, 
                              k=100, exclude_interacted=True, user_interactions=None):
    """Return up to ``k`` item IDs ranked by dot-product score for one user.

    Items are scored as ``item_embeddings @ user_embedding`` and walked from
    the highest score down. Items the user already interacted with are skipped
    *during* the walk, so the result still contains ``k`` items whenever at
    least ``k`` eligible items exist. (Previously only the top ``k + 1`` items
    were ranked before filtering, which returned short lists for users with
    several interacted items near the top.)

    Args:
        user_id: ID of the user to retrieve candidates for.
        user_embeddings: 2-D array indexed by user index.
        item_embeddings: 2-D array indexed by item index.
        user_mappings: Mapping of user ID -> row in ``user_embeddings``.
        idx_to_item: Mapping of item row index -> item ID; rows without an
            entry are ignored.
        k: Maximum number of candidates to return; values <= 0 yield ``[]``.
        exclude_interacted: Whether to drop items the user interacted with.
        user_interactions: DataFrame with 'user_id' and 'parent_asin' columns,
            or None to disable the exclusion.

    Returns:
        list: Candidate item IDs, best score first, at most ``k`` long.

    Raises:
        ValueError: If ``user_id`` is not present in ``user_mappings``.
    """
    if user_id not in user_mappings:
        raise ValueError(f"User {user_id} not found in embeddings")
    if k <= 0:
        return []

    user_embedding = user_embeddings[user_mappings[user_id]]
    similarity_scores = np.dot(item_embeddings, user_embedding)

    seen_items = set()
    if exclude_interacted and user_interactions is not None:
        user_rows = user_interactions[user_interactions['user_id'] == user_id]
        seen_items = set(user_rows['parent_asin'].unique())

    candidate_items = []
    for idx in np.argsort(similarity_scores)[::-1]:
        row = int(idx)
        if row not in idx_to_item:
            continue
        item = idx_to_item[row]
        if item in seen_items:
            continue
        candidate_items.append(item)
        # Stop as soon as the quota is met; no need to walk the remaining items.
        if len(candidate_items) == k:
            break

    return candidate_items

# Model Registry and DynamoDB Functions
def save_mf_model_to_registry(model_artifacts_s3_uri, model_name, model_package_group_name, 
                              role_arn, sagemaker_session, boto_session, model_description):
    """Register the MF artifacts as a model package in SageMaker Model Registry.

    Ensures the model package group exists (creating it when missing), then
    creates a model package in 'PendingManualApproval' status whose customer
    metadata records the S3 URI of the artifacts.

    Args:
        model_artifacts_s3_uri: S3 URI of the pickled model artifacts.
        model_name: Not used by this function; kept for caller compatibility.
        model_package_group_name: Name of the model package group.
        role_arn: Not used by this function; kept for caller compatibility.
        sagemaker_session: Not used by this function; kept for caller
            compatibility.
        boto_session: Object providing ``client('sagemaker')`` and
            ``region_name``.
        model_description: Description stored on the model package.

    Returns:
        str: ARN of the created model package. If package creation fails, the
        error is printed and a *placeholder* string ending in ``/manual`` is
        returned instead -- it does not identify a real resource.

    Raises:
        Exception: Any error from looking up the package group other than
            "group does not exist" is re-raised.
    """
    sagemaker_client = boto_session.client('sagemaker')

    try:
        sagemaker_client.describe_model_package_group(ModelPackageGroupName=model_package_group_name)
    except sagemaker_client.exceptions.ClientError as err:
        error = err.response.get('Error', {})
        code = error.get('Code', '')
        message = error.get('Message', '')
        # NOTE(review): SageMaker is believed to report a missing group as a
        # ValidationException ("... does not exist") rather than
        # ResourceNotFound -- confirm against the service. Both are accepted.
        group_missing = (
            code == 'ResourceNotFound'
            or (code == 'ValidationException' and 'does not exist' in message.lower())
        )
        if not group_missing:
            raise
        sagemaker_client.create_model_package_group(
            ModelPackageGroupName=model_package_group_name,
            ModelPackageGroupDescription="Matrix Factorization retrieval models"
        )
        print(f"Created model package group: {model_package_group_name}")

    model_package_input_dict = {
        "ModelPackageGroupName": model_package_group_name,
        "ModelPackageDescription": model_description,
        "ModelApprovalStatus": "PendingManualApproval",
        "MetadataProperties": {
            "GeneratedBy": "SageMaker Pipeline",
            "ProjectId": "two-stage-recommendation-system"
        },
        "CustomerMetadataProperties": {
            "ModelArtifactsS3Uri": model_artifacts_s3_uri,
            "ModelType": "MatrixFactorization",
            "TrainingFramework": "Surprise"
        }
    }

    try:
        model_package_response = sagemaker_client.create_model_package(**model_package_input_dict)
        model_package_arn = model_package_response['ModelPackageArn']
    except Exception as e:
        # Deliberate best-effort: registration failure must not abort the
        # notebook. The returned value is a placeholder, not a real ARN.
        print(f"Error creating model package: {e}")
        return f"arn:aws:sagemaker:{boto_session.region_name}:model-package/{model_package_group_name}/manual"

    print(f"Model package created: {model_package_arn}")
    return model_package_arn

def store_candidates_in_dynamodb(user_embeddings, item_embeddings, user_mappings, idx_to_item,
                                 interactions_df, dynamodb_table_name, boto_session, k=100, batch_size=25):
    """Generate top-K candidates for every mapped user and write them to DynamoDB.

    For each user in ``user_mappings`` one item is written with the attributes
    'user_id', 'candidates', 'num_candidates' and 'timestamp'. A user whose
    processing raises is reported and skipped; the remaining users are still
    processed.

    Args:
        user_embeddings: 2-D array indexed by user index.
        item_embeddings: 2-D array indexed by item index.
        user_mappings: Mapping of user ID -> row in ``user_embeddings``.
        idx_to_item: Mapping of item row index -> item ID.
        interactions_df: DataFrame with 'user_id' and 'parent_asin' columns
            used to exclude already-interacted items, or None for no exclusion.
        dynamodb_table_name: Name of the target DynamoDB table.
        boto_session: Object whose ``resource('dynamodb')`` returns a DynamoDB
            resource.
        k: Number of candidates per user.
        batch_size: Number of users handled per batch-writer context.
    """
    dynamodb = boto_session.resource('dynamodb')
    table = dynamodb.Table(dynamodb_table_name)

    all_user_ids = list(user_mappings.keys())
    num_users = len(all_user_ids)

    print(f"Generating candidates for {num_users} users...")
    print(f"Storing in DynamoDB table: {dynamodb_table_name}")

    # Index the interactions by user once. Passing the full DataFrame for every
    # user made each retrieval scan all rows (O(users x interactions)).
    interaction_pairs = None
    rows_by_user = {}
    if interactions_df is not None:
        interaction_pairs = interactions_df[['user_id', 'parent_asin']]
        rows_by_user = interaction_pairs.groupby('user_id', sort=False).indices

    stored = 0
    failed = 0

    for batch_start in range(0, num_users, batch_size):
        batch_end = min(batch_start + batch_size, num_users)
        batch_users = all_user_ids[batch_start:batch_end]

        with table.batch_writer() as batch:
            for user_id in batch_users:
                try:
                    positions = rows_by_user.get(user_id)
                    # None means "no known interactions", i.e. nothing to exclude.
                    user_rows = interaction_pairs.iloc[positions] if positions is not None else None

                    candidate_items = retrieve_top_k_candidates(
                        user_id=user_id,
                        user_embeddings=user_embeddings,
                        item_embeddings=item_embeddings,
                        user_mappings=user_mappings,
                        idx_to_item=idx_to_item,
                        k=k,
                        exclude_interacted=True,
                        user_interactions=user_rows
                    )

                    item = {
                        'user_id': user_id,
                        'candidates': candidate_items,
                        'num_candidates': len(candidate_items),
                        'timestamp': datetime.utcnow().isoformat()
                    }
                    batch.put_item(Item=item)
                    stored += 1
                except Exception as e:
                    # Best-effort per user: report and carry on with the rest.
                    failed += 1
                    print(f"Error processing user {user_id}: {e}")
                    continue

        if batch_end % 100 == 0 or batch_end == num_users:
            print(f"Processed {batch_end}/{num_users} users ({100*batch_end/num_users:.1f}%)...")

    # Report what was actually written rather than the number attempted.
    print(f"\nCompleted! Stored candidates for {stored} users in DynamoDB table: {dynamodb_table_name}")
    if failed:
        print(f"Skipped {failed} users because of errors")


In [None]:
# Build the AWS/SageMaker sessions from the notebook configuration
sessions = initialize_sessions(get_config())

# Pull out the handles used by the later cells
config = sessions['config']
sagemaker_session = sessions['sagemaker_session']
boto_session = sessions['boto_session']


## Step 1: Load Interaction Data


In [None]:
# Read the processed interaction dataset from S3
interactions_df = pd.read_parquet(
    "s3://recommendation-project-rapid/processed/all_beauty_dataset/",
    engine="pyarrow",
)

# MF training only needs the (user, item, rating) triples
mf_training_df = interactions_df.loc[:, ['user_id', 'parent_asin', 'rating']].copy()

# Quick summary of what was loaded
interaction_count = len(mf_training_df)
print(f"Loaded {interaction_count} interactions")
print(f"Unique users: {mf_training_df['user_id'].nunique()}")
print(f"Unique items: {mf_training_df['parent_asin'].nunique()}")


## Step 2: Prepare Training Data


In [None]:
# Upload the training CSV and the ID/index mappings to the configured bucket
training_data_s3_uri, mappings = prepare_mf_training_data(
    mf_training_df,
    f"s3://{config['bucket']}/mf-training/training_data.csv",
    config['bucket'],
    boto_session,
)


## Step 3: Train Matrix Factorization Model


In [None]:
# SVD hyperparameters for the Surprise-based Matrix Factorization model
mf_hyperparameters = {
    'n_factors': 64,   # Number of latent factors (embedding dimension)
    'n_epochs': 20,    # Number of training epochs
    'lr_all': 0.005,   # Learning rate
    'reg_all': 0.02,   # Regularization term
}

# Fit the model and collect the resulting embedding matrices
user_embeddings, item_embeddings = train_matrix_factorization(
    training_data_s3_uri,
    mappings,
    config['bucket'],
    sagemaker_session,
    boto_session,
    **mf_hyperparameters,
)


## Step 4: Embeddings Ready

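`train_matrix_factorization` already returned the user and item embedding matrices in the previous step, so no separate extraction is needed; the cell below only prints their shapes.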


In [None]:
# The embedding matrices came back from training; just report their shapes
for label, matrix in (("User", user_embeddings), ("Item", item_embeddings)):
    print(f"{label} embeddings shape: {matrix.shape}")


## Step 5: Build K-NN Index


In [None]:
# Fit the item K-NN index and obtain the row-index -> item-ID lookup
knn_model, idx_to_item = build_knn_index(
    item_embeddings,
    mappings['item_to_idx'],
    n_neighbors=100,
)


## Save Artifacts

Save the trained model artifacts for use in inference:


In [None]:
# Bundle everything inference needs into a single pickled artifact
mf_artifacts = dict(
    mappings=mappings,
    user_embeddings=user_embeddings,
    item_embeddings=item_embeddings,
    knn_model=knn_model,
    idx_to_item=idx_to_item,
)

# Upload the bundle to the configured bucket
model_artifacts_key = "mf-model-artifacts/mf_model_artifacts.pkl"
model_artifacts_s3_uri = f"s3://{config['bucket']}/{model_artifacts_key}"

s3_client = boto_session.client('s3')
artifacts_buffer = io.BytesIO(pickle.dumps(mf_artifacts))
s3_client.upload_fileobj(artifacts_buffer, config['bucket'], model_artifacts_key)

print(f"Model artifacts saved to S3: {model_artifacts_s3_uri}")
print(f"User embeddings shape: {user_embeddings.shape}")
print(f"Item embeddings shape: {item_embeddings.shape}")

## Step 6: Save Model to SageMaker Model Registry


In [None]:
# Names under which the retrieval model is registered
model_package_group_name = "mf-retrieval-models"
model_name = "mf-retrieval-model"

# Collect the registration arguments, then register the model package
registry_args = {
    'model_artifacts_s3_uri': model_artifacts_s3_uri,
    'model_name': model_name,
    'model_package_group_name': model_package_group_name,
    'role_arn': config['role_arn'],
    'sagemaker_session': sagemaker_session,
    'boto_session': boto_session,
    'model_description': "Matrix Factorization model for candidate retrieval in two-stage recommendation system",
}
model_package_arn = save_mf_model_to_registry(**registry_args)

print(f"Model registered: {model_package_arn}")

## Step 7: Generate and Store Candidates in DynamoDB


In [None]:
# Interactions used to exclude items each user has already interacted with.
# The same dataset was loaded into `interactions_df` in Step 1, so reuse it
# instead of downloading it from S3 again; fall back to a fresh read only when
# that variable is not defined in the current kernel.
try:
    full_interactions_df = interactions_df
except NameError:
    full_interactions_df = pd.read_parquet(
        "s3://recommendation-project-rapid/processed/all_beauty_dataset/",
        engine="pyarrow"
    )

# DynamoDB table name (create this table first if it doesn't exist)
dynamodb_table_name = "user-candidates"  # Update with your table name

# Generate and store candidates in DynamoDB
store_candidates_in_dynamodb(
    user_embeddings=user_embeddings,
    item_embeddings=item_embeddings,
    user_mappings=mappings['user_to_idx'],
    idx_to_item=idx_to_item,
    interactions_df=full_interactions_df,
    dynamodb_table_name=dynamodb_table_name,
    boto_session=boto_session,
    k=100,  # Top 100 candidates per user
    batch_size=25  # Users processed per batch-writer context
)
