In [None]:
import pandas as pd
import numpy as np
import boto3
import sagemaker
from sagemaker import get_execution_role, image_uris
from sagemaker.session import Session
from sagemaker.inputs import TrainingInput
from sagemaker.predictor import Predictor
from sklearn.preprocessing import StandardScaler 
import joblib  # For saving the scaler if needed

# --- 1. Load Data from S3 ---
# One bucket name and one S3 client are shared by every step in this cell so
# the read path and the two upload targets cannot drift apart.
bucket = 'imba-chien-data-features-dev'
s3_client = boto3.client('s3')

s3_path = f's3://{bucket}/features/train/'
train_df = pd.read_parquet(s3_path)
print("Loaded data shape:", train_df.shape)
if train_df.empty:
    # Fail here with a clear message instead of deep inside the scaler.
    raise ValueError(f"No training rows found under {s3_path}")

# --- 2. Prepare Data: Label as first column ---
# The identifier columns and the label are excluded from the feature matrix.
X = train_df.drop(columns=["user_id", "product_id", "reordered"])
y = train_df["reordered"]

# --- 2a. Scale the features ---
# NOTE: the training file written below contains the *scaled* values, so any
# inference input has to be transformed with this same fitted scaler.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
print("Feature scaling complete. Mean (should be ~0):", X_scaled.mean(axis=0))

# (Optional) Save the scaler for inference use
joblib.dump(scaler, "scaler.pkl")
# Upload scaler.pkl to S3 so it can be used in Lambda or elsewhere
scaler_s3_key = "scale_models/scaler.pkl"
s3_client.upload_file("scaler.pkl", bucket, scaler_s3_key)
print(f"Uploaded scaler to s3://{bucket}/{scaler_s3_key}")

# --- 2b. Combine label and scaled features ---
# The label goes in column 0, followed by the scaled features.
train_data = np.column_stack((y, X_scaled))
train_df_sagemaker = pd.DataFrame(train_data)
# A frame built from a bare ndarray gets integer column labels, which some
# pandas/engine combinations refuse to write ("parquet must have string
# column names"). Cast them to "0", "1", ... so the write works everywhere.
train_df_sagemaker.columns = train_df_sagemaker.columns.astype(str)

# --- 3. Save as Parquet (positional column names, no index) ---
local_parquet = 'train_sagemaker.parquet'
train_df_sagemaker.to_parquet(local_parquet, index=False)
print(f"Saved local parquet: {local_parquet}")

# --- 4. Upload to S3 ---
s3_key = 'sagemaker/train.parquet'
s3_client.upload_file(local_parquet, bucket, s3_key)
print(f"Uploaded to s3://{bucket}/{s3_key}")

# --- 5. Set up SageMaker Training Job ---
sagemaker_session = sagemaker.Session()
# Take the region from the SageMaker session: boto3.Session().region_name is
# None when no default region is configured, which would break the image
# lookup below.
region = sagemaker_session.boto_region_name
role = get_execution_role()
prefix = 'xgboost-model'
output_path = f's3://{bucket}/{prefix}/output'

# NOTE(review): only a 'train' channel is passed to fit() below. SageMaker's
# XGBoost documentation describes early stopping in terms of the validation
# score, so "early_stopping_rounds" probably has no effect here and all
# "num_round" rounds will run -- confirm, and consider adding a 'validation'
# channel.
hyperparameters = {
    "objective": "binary:logistic",
    "max_depth": 7,
    "eta": 0.2,
    "eval_metric": "logloss,auc",
    "num_round": 1000,
    "early_stopping_rounds": 20,
}

# Built-in XGBoost algorithm image for this region.
xgboost_container = image_uris.retrieve("xgboost", region, "1.7-1")

estimator = sagemaker.estimator.Estimator(
    image_uri=xgboost_container,
    hyperparameters=hyperparameters,
    role=role,
    instance_count=1,
    instance_type='ml.m5.2xlarge',
    output_path=output_path,
    sagemaker_session=sagemaker_session,
)

# --- 6. Define Training Input ---
# Reuse the key the training file was uploaded under so the two stay in sync.
train_input = TrainingInput(
    f's3://{bucket}/{s3_key}',
    content_type='application/x-parquet',
)

# --- 7. Launch Training Job ---
print("Starting SageMaker training job...")
# fit() waits for the job to finish by default, so reaching the next line
# means training has completed, not merely been submitted.
estimator.fit({'train': train_input})
print("Training job completed.")

# Deploy Endpoint for Inference
### The endpoint will be deployed by Terraform, so the following is for testing only.

In [None]:
# --- 8. Deploy Endpoint ---
print("Deploying endpoint...")
sm = boto3.client('sagemaker')
endpoint_name = 'xgboost-endpoint'


def _delete_ignoring_missing(delete_call, **kwargs):
    """Run a SageMaker delete call, tolerating only a missing resource.

    SageMaker is expected to report a non-existent endpoint or endpoint
    config as a ``ValidationException``; any other client error (access
    denied, throttling, ...) is re-raised instead of being silently dropped.
    """
    try:
        delete_call(**kwargs)
    except sm.exceptions.ClientError as err:
        if err.response.get("Error", {}).get("Code") != "ValidationException":
            raise
        # Nothing to delete: the resource is already gone.


# Delete the endpoint (if it exists)
_delete_ignoring_missing(sm.delete_endpoint, EndpointName=endpoint_name)
# Endpoint deletion is asynchronous; wait until it is really gone so a
# redeploy under the same name does not collide with the old endpoint.
sm.get_waiter('endpoint_deleted').wait(EndpointName=endpoint_name)

# Delete the endpoint configuration (if it exists)
_delete_ignoring_missing(
    sm.delete_endpoint_config, EndpointConfigName=endpoint_name
)



In [None]:
# A single name is used for both deploying and re-attaching below.
endpoint_name = 'xgboost-endpoint'

predictor = estimator.deploy(
    initial_instance_count=1,
    instance_type='ml.t2.medium',  # or another supported instance type
    endpoint_name=endpoint_name,  # optional: custom endpoint name
)
print("Endpoint deployed.")

# --- 9. Test Endpoint ---
print("Testing endpoint...")

# Re-attach by name so the same smoke test also works against an endpoint
# that was created outside this notebook.
predictor = Predictor(endpoint_name=endpoint_name)

s3_path = 's3://imba-chien-data-features-dev/features/test/'
test_df = pd.read_parquet(s3_path)
if test_df.empty:
    # Without this guard the .iloc[0] below fails with an opaque IndexError.
    raise ValueError(f"No test rows found under {s3_path}")

# Drop the identifier columns; also drop the label if the test set carries
# one, so it is never sent to the model as a feature.
X_test = (
    test_df
    .drop(columns=["user_id", "product_id"])
    .drop(columns=["reordered"], errors="ignore")
)
print(X_test.columns)
# Index(['user_orders_scaled', 'user_periods_scaled',
#        'user_mean_days_since_prior_scaled', 'user_products_scaled',
#        'user_distinct_products_scaled', 'user_reorder_ratio_scaled',
#        'prod_orders_scaled', 'prod_reorders_scaled',
#        'prod_first_orders_scaled', 'prod_second_orders_scaled'],
#       dtype='object')

# NOTE(review): the training cell fits a StandardScaler on the features
# before training, while the test features are sent here exactly as stored.
# Confirm whether they need scaler.transform() first to match training.

# Send the first test row as one CSV line.
sample = X_test.iloc[0].values.reshape(1, -1)
csv_str = ','.join(str(x) for x in sample[0]) + '\n'
response = predictor.predict(csv_str, initial_args={'ContentType': 'text/csv'})
# The default deserializer hands back raw bytes; decode them for readability.
prediction = (
    response.decode('utf-8').strip() if isinstance(response, bytes) else response
)
print("Prediction:", prediction)