# Train XGBoost Ranking Model

This notebook trains the XGBoost model for the ranking stage of the two-stage recommendation system.


In [None]:
import sys
import os
import pandas as pd
import json
import numpy as np
from typing import Tuple, List
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

import boto3
import sagemaker
from sagemaker.feature_store.feature_group import FeatureGroup
from sagemaker.feature_store.feature_store import FeatureStore

# Configuration
def get_config():
    """Build the notebook configuration from environment variables.

    Returns:
        dict: One entry per setting listed below. A setting whose environment
        variable is unset takes the fallback shown in the table ('' for the
        role ARN, bucket, and feature-group names).
    """
    # (config key, environment variable, fallback), in output order.
    settings = (
        ('region', 'AWS_REGION', 'ap-south-1'),
        ('role_arn', 'AWS_ROLE_ARN', ''),
        ('bucket', 'S3_BUCKET', ''),
        ('user_feature_group_name', 'USER_FEATURE_GROUP_NAME', ''),
        ('item_feature_group_name', 'ITEM_FEATURE_GROUP_NAME', ''),
        ('dynamodb_table_name', 'DYNAMODB_TABLE_NAME', 'user-candidates'),
    )
    return {
        key: os.environ.get(env_var, fallback)
        for key, env_var, fallback in settings
    }

def initialize_sessions(config: dict = None):
    """Create the boto3 and SageMaker handles used by the rest of the notebook.

    Args:
        config: Configuration dictionary providing at least 'region' and
            'bucket'. When None, it is built with get_config().

    Returns:
        dict: The created objects under the keys 'boto_session',
        'sagemaker_session', 'featurestore_runtime', 'sagemaker_client',
        and 'feature_store', plus the configuration in use under 'config'.
    """
    config = get_config() if config is None else config
    region = config['region']

    # Every client below is derived from this one regional boto3 session.
    boto_session = boto3.Session(region_name=region)
    sagemaker_session = sagemaker.Session(
        boto_session=boto_session,
        default_bucket=config['bucket'],
    )

    featurestore_runtime = boto_session.client(
        service_name='sagemaker-featurestore-runtime', region_name=region
    )
    sagemaker_client = boto_session.client(
        service_name='sagemaker', region_name=region
    )
    feature_store = FeatureStore(sagemaker_session=sagemaker_session)

    print(f"Initialized SageMaker session in {config['region']}")
    print(f"Default bucket: {config['bucket']}")

    handles = {
        'boto_session': boto_session,
        'sagemaker_session': sagemaker_session,
        'featurestore_runtime': featurestore_runtime,
        'sagemaker_client': sagemaker_client,
        'feature_store': feature_store,
    }
    handles['config'] = config
    return handles

# XGBoost Ranking Functions
def prepare_ranking_training_data(interactions_df, user_feature_group, item_feature_group,
                                   feature_store, bucket, sample_size=None):
    """Join interactions with user and item features into a ranking training frame.

    Args:
        interactions_df: Base DataFrame of interactions. It is handed to the
            dataset builder with 'user_id' as record identifier and
            'event_time_seconds' as event time, and must carry the 'rating'
            target through to the result.
        user_feature_group: Feature group with user features.
        item_feature_group: Feature group with item features.
        feature_store: Object exposing ``create_dataset(...)`` (a SageMaker
            ``FeatureStore``).
        bucket: S3 bucket; the joined dataset is written under
            ``s3://{bucket}/ranking-training-datasets/``.
        sample_size: When truthy and smaller than the number of interactions,
            a reproducible random sample (seed 42) of that many rows is used.

    Returns:
        pandas.DataFrame: The joined dataset read back from the CSV produced
        by the dataset builder, with feature-group prefixes stripped from the
        column names.

    Raises:
        ValueError: If the joined dataset has no 'rating' column.
    """
    if sample_size and len(interactions_df) > sample_size:
        interactions_df = interactions_df.sample(n=sample_size, random_state=42)

    print(f"Preparing ranking training data for {len(interactions_df)} interactions...")

    builder = feature_store.create_dataset(
        base=interactions_df,
        event_time_identifier_feature_name='event_time_seconds',
        record_identifier_feature_name='user_id',
        output_path=f"s3://{bucket}/ranking-training-datasets/"
    )

    # Without target_feature_name_in_base the builder joins a feature group on
    # the base record identifier ('user_id'). That is the wrong key for a group
    # identified by another column (e.g. an item id), so join each group on its
    # own record identifier whenever the base frame has a column of that name.
    # Passing None keeps the builder's default join key.
    for feature_group in (user_feature_group, item_feature_group):
        record_identifier = feature_group.describe().get('RecordIdentifierFeatureName')
        join_key = record_identifier if record_identifier in interactions_df.columns else None
        builder = builder.with_feature_group(
            feature_group=feature_group,
            target_feature_name_in_base=join_key,
        )

    # The generated query is not needed here; only the output location is.
    s3_uri, _query = builder.to_csv_file()

    print(f"Ranking training dataset created at: {s3_uri}")

    ranking_df = pd.read_csv(s3_uri)

    # Drop any "<...users...>." / "<...items...>." qualifier so downstream code
    # can refer to features by their bare names.
    ranking_df.columns = ranking_df.columns.str.replace(r'.*users.*\.', '', regex=True)
    ranking_df.columns = ranking_df.columns.str.replace(r'.*items.*\.', '', regex=True)

    if 'rating' not in ranking_df.columns:
        raise ValueError("Rating column not found in training data")

    print(f"Training data shape: {ranking_df.shape}")
    print(f"Features: {[col for col in ranking_df.columns if col != 'rating']}")

    return ranking_df

def _build_xgboost_training_frame(training_df, target_column):
    """Return (label-first, all-numeric training frame, feature column names)."""
    excluded = {target_column, 'user_id', 'parent_asin', 'event_time_seconds', 'calendar_date'}
    feature_columns = [col for col in training_df.columns if col not in excluded]

    X = training_df[feature_columns].copy()
    y = training_df[target_column].copy()

    categorical_cols = X.select_dtypes(include=['object']).columns
    if len(categorical_cols) > 0:
        X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

    # pandas >= 2.0 emits bool dummy columns, which to_csv would write as the
    # text "True"/"False". Cast every bool column to 0/1 so the CSV is numeric.
    bool_cols = X.select_dtypes(include=['bool']).columns
    if len(bool_cols) > 0:
        X[bool_cols] = X[bool_cols].astype(int)

    X = X.fillna(0)

    # The target goes first: the training CSV is written without a header and
    # the built-in XGBoost algorithm reads the label from the first column.
    return pd.concat([y, X], axis=1), list(X.columns)


def train_xgboost_ranker(training_df, role_arn, bucket, sagemaker_session, boto_session,
                         target_column='rating', instance_type='ml.m5.xlarge'):
    """Train an XGBoost model for ranking candidate items.

    The identifier/time columns ('user_id', 'parent_asin',
    'event_time_seconds', 'calendar_date') are dropped, object columns are
    one-hot encoded, and missing values are filled with 0. The resulting CSV
    is uploaded to S3 and a SageMaker training job is run on it.

    Args:
        training_df: DataFrame containing the target and feature columns.
        role_arn: IAM role ARN passed to the SageMaker estimator.
        bucket: S3 bucket for the training CSV
            ('xgboost-ranking/training_data.csv'), the feature-name list
            ('xgboost-ranking/feature_names.json'), and the model artifacts
            ('xgboost-model-artifacts/').
        sagemaker_session: SageMaker session used by the estimator.
        boto_session: boto3 session used for the S3 client and the region.
        target_column: Name of the label column.
        instance_type: Training instance type.

    Returns:
        tuple: (fitted estimator, list of feature column names in the order
        they appear in the training CSV).
    """
    import tempfile

    from sagemaker import image_uris
    from sagemaker.inputs import TrainingInput

    training_data, feature_names = _build_xgboost_training_frame(training_df, target_column)

    s3_client = boto_session.client('s3')

    # Stage the CSV in a private temporary directory instead of a fixed /tmp
    # path, so concurrent runs cannot clobber each other and the scratch file
    # is removed once uploaded.
    s3_key = 'xgboost-ranking/training_data.csv'
    with tempfile.TemporaryDirectory() as staging_dir:
        local_file = os.path.join(staging_dir, 'xgboost_training.csv')
        training_data.to_csv(local_file, index=False, header=False)
        s3_client.upload_file(local_file, bucket, s3_key)

    training_s3_uri = f"s3://{bucket}/{s3_key}"

    # Persist the column order so inference can build rows the same way.
    feature_names_key = 'xgboost-ranking/feature_names.json'
    feature_names_json = json.dumps(feature_names)
    s3_client.put_object(
        Bucket=bucket,
        Key=feature_names_key,
        Body=feature_names_json.encode('utf-8')
    )

    container = image_uris.retrieve("xgboost", boto_session.region_name, version='1.7-1')

    xgb_estimator = sagemaker.estimator.Estimator(
        container,
        role=role_arn,
        instance_count=1,
        instance_type=instance_type,
        output_path=f"s3://{bucket}/xgboost-model-artifacts/",
        sagemaker_session=sagemaker_session
    )

    # NOTE(review): `silent` was superseded by `verbosity` in XGBoost 1.x;
    # confirm the 1.7-1 container still accepts it.
    xgb_estimator.set_hyperparameters(
        objective='reg:squarederror',
        num_round=100,
        max_depth=6,
        eta=0.3,
        gamma=0,
        min_child_weight=1,
        subsample=0.8,
        silent=0
    )

    print("Starting XGBoost training...")
    # The built-in XGBoost algorithm assumes libsvm input unless the channel
    # declares a content type, so the CSV channel must be tagged explicitly.
    train_input = TrainingInput(training_s3_uri, content_type='text/csv')
    xgb_estimator.fit({'train': train_input})

    print(f"XGBoost training complete! Model artifacts: {xgb_estimator.model_data}")

    return xgb_estimator, feature_names


In [None]:
# Build the configuration once and create every AWS/SageMaker handle from it.
sessions = initialize_sessions(get_config())

# Pull out the handles the following cells use directly.
config = sessions['config']
boto_session, sagemaker_session, feature_store = (
    sessions[handle] for handle in ('boto_session', 'sagemaker_session', 'feature_store')
)


## Step 1: Load Feature Groups


In [None]:
# Bind handles to the user and item Feature Groups named in the configuration.
user_fg = FeatureGroup(
    sagemaker_session=sagemaker_session,
    name=config['user_feature_group_name'],
)
item_fg = FeatureGroup(
    sagemaker_session=sagemaker_session,
    name=config['item_feature_group_name'],
)

# describe() is called on both groups as an existence check; a failure is
# reported here but does not stop the notebook.
try:
    user_fg.describe()
    item_fg.describe()
except Exception as e:
    print(f"Error loading Feature Groups: {e}")
    print("Please update Feature Group names in config")
else:
    print("Feature Groups loaded successfully")


## Step 2: Load Interaction Data


In [None]:
# Load interaction data.
# The dataset location defaults to the project's processed-dataset prefix and
# can be overridden with the INTERACTIONS_S3_URI environment variable, so the
# notebook can run against another bucket or dataset without a code change.
interactions_uri = os.getenv(
    'INTERACTIONS_S3_URI',
    "s3://recommendation-project-rapid/processed/all_beauty_dataset/"
)

interactions_df = pd.read_parquet(interactions_uri, engine="pyarrow")

print(f"Loaded {len(interactions_df)} interactions")


## Step 3: Prepare Ranking Training Data


In [None]:
# Join the interactions with the user and item Feature Groups.
# sample_size caps the number of interactions used, which keeps this step
# fast; drop the argument to build the dataset from every interaction.
ranking_training_df = prepare_ranking_training_data(
    interactions_df=interactions_df,
    sample_size=10000,
    user_feature_group=user_fg,
    item_feature_group=item_fg,
    feature_store=feature_store,
    bucket=config['bucket'],
)


## Step 4: Train XGBoost Ranker


In [None]:
# Fit the ranker on the joined dataset. The call returns the fitted estimator
# and the ordered feature names, both of which later cells use.
xgb_estimator, feature_names = train_xgboost_ranker(
    training_df=ranking_training_df,
    target_column='rating',
    role_arn=config['role_arn'],
    bucket=config['bucket'],
    boto_session=boto_session,
    sagemaker_session=sagemaker_session,
)


## Step 5: Deploy XGBoost Model


In [None]:
# Deploy the trained estimator to a single-instance endpoint whose predictor
# serializes requests as CSV and parses CSV responses.
xgb_predictor = xgb_estimator.deploy(
    instance_type='ml.m5.large',
    initial_instance_count=1,
    deserializer=sagemaker.deserializers.CSVDeserializer(),
    serializer=sagemaker.serializers.CSVSerializer(),
)

print(f"XGBoost endpoint deployed: {xgb_predictor.endpoint_name}")


## Step 6: Save Model Information

Save the endpoint name, the ordered feature names, and the model artifact location to a JSON file for use in inference:


In [None]:
import json

# Gather the endpoint name, ordered feature names, and artifact location.
xgb_model_info = dict(
    endpoint_name=xgb_predictor.endpoint_name,
    feature_names=feature_names,
    model_artifacts=xgb_estimator.model_data,
)

# Write the record as indented JSON in the working directory.
with open('xgb_model_info.json', 'w') as f:
    f.write(json.dumps(xgb_model_info, indent=2))

print("XGBoost model info saved to 'xgb_model_info.json'")
print(f"Endpoint: {xgb_predictor.endpoint_name}")
print(f"Number of features: {len(feature_names)}")
