In [None]:
# Local model screening identified Linear Regression as the go-to option for affinity and Random Forest for activity.
# For simplicity, a Random Forest regressor and a Random Forest classifier were used.
%%writefile train.py
import argparse, os, joblib
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, f1_score

if __name__ == '__main__':
    # Training entry point: reads data.parquet from the training channel, fits a
    # Random Forest (regressor for 'affinity', classifier for any other
    # target_type), prints the validation metric in the format the tuner's regex
    # expects, and saves the model plus its imputer to the model directory.
    print("üß™ Script started...")
    parser = argparse.ArgumentParser()
    parser.add_argument('--n_estimators', type=int, default=100)
    parser.add_argument('--max_depth', type=int, default=None)
    parser.add_argument('--target_type', type=str, default='affinity')
    parser.add_argument('--model_dir', type=str, default=os.environ.get('SM_MODEL_DIR'))
    parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAIN'))
    # parse_known_args: tolerate any extra arguments the launcher may append.
    args, _ = parser.parse_known_args()

    # Both paths default to environment variables; outside a training container
    # they may be unset, which would otherwise surface as a TypeError in
    # os.path.join further down.
    if args.train is None or args.model_dir is None:
        parser.error(
            "--train and --model_dir must be given when SM_CHANNEL_TRAIN / "
            "SM_MODEL_DIR are not set"
        )

    is_regression = args.target_type == 'affinity'

    # 1. LOAD DATA
    df = pd.read_parquet(os.path.join(args.train, 'data.parquet'))

    # 2. SEPARATE FEATURES AND TARGET
    # The target column depends on the job type.
    target_col = 'binding_affinity' if is_regression else 'active'
    y = df[target_col]

    # 3. FEATURE SELECTION
    # Drop both target columns (whichever exist) and every non-numeric column,
    # so only numeric features reach the model.
    X = df.drop(['binding_affinity', 'active'], axis=1, errors='ignore')
    X = X.select_dtypes(include=[np.number])

    print(f"‚úÖ Dropped non-numeric columns. Remaining features: {len(X.columns)}")

    # 4. TRAIN/TEST SPLIT
    # Split BEFORE fitting any preprocessing so that statistics learned from the
    # held-out rows cannot leak into training.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # 5. PREPROCESSING
    # Medians come from the training rows only; the same fitted imputer is then
    # applied to the held-out rows and saved for inference. Fitting on a
    # DataFrame also records the feature names on the imputer.
    imputer = SimpleImputer(strategy='median')
    X_train = imputer.fit_transform(X_train)
    X_test = imputer.transform(X_test)

    # 6. MODEL SELECTION
    # A fixed seed keeps runs comparable: metric differences between tuning jobs
    # then come from the hyperparameters rather than from forest randomness.
    forest_cls = RandomForestRegressor if is_regression else RandomForestClassifier
    model = forest_cls(
        n_estimators=args.n_estimators,
        max_depth=args.max_depth,
        random_state=42,
    )

    print(f"üöÄ Fitting {args.target_type} model...")
    model.fit(X_train, y_train)

    # 7. LOGGING FOR TUNER
    # The "MSE: <value>" / "F1: <value>" lines are what the tuner's metric
    # regexes match, so their format must not change.
    preds = model.predict(X_test)
    if is_regression:
        print(f"MSE: {mean_squared_error(y_test, preds):.4f}")
    else:
        print(f"F1: {f1_score(y_test, preds):.4f}")

    # 8. SAVE OUTPUTS
    os.makedirs(args.model_dir, exist_ok=True)
    joblib.dump(model, os.path.join(args.model_dir, "model.joblib"))
    joblib.dump(imputer, os.path.join(args.model_dir, "imputer.joblib"))
    print("‚úÖ Model and Imputer saved successfully!")

Overwriting train.py


In [None]:
import sagemaker
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.tuner import IntegerParameter, HyperparameterTuner

# 1. INITIALIZE SESSION & PERMISSIONS
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()

bucket_name = 'BUCKET PLACEHOLDER'
train_path = f"s3://{bucket_name}/gold/train_features"

print(f"üìç Region: {sagemaker_session.boto_region_name}")
print(f"üì¶ Data Path: {train_path}")

# 2. SHARED HYPERPARAMETER RANGES
# Both tracks search the same Random Forest knobs; train.py reads them as
# --n_estimators and --max_depth.
hyperparameter_ranges = {
    'n_estimators': IntegerParameter(50, 250),
    'max_depth': IntegerParameter(5, 25)
}

# 3. CONFIGURE TRACK A: BINDING AFFINITY (Regression)
estimator_affinity = SKLearn(
    entry_point='train.py',
    dependencies=['requirements.txt'],
    role=role,
    instance_type='ml.m5.xlarge',
    framework_version='1.2-1',
    py_version='py3',
    hyperparameters={'target_type': 'affinity'}
)

# The regex must match the "MSE: <value>" line printed by train.py.
tuner_affinity = HyperparameterTuner(
    estimator_affinity,
    objective_metric_name='mse',
    objective_type='Minimize',
    hyperparameter_ranges=hyperparameter_ranges,
    metric_definitions=[{'Name': 'mse', 'Regex': 'MSE: ([0-9\\.]+)'}],
    max_jobs=4,
    max_parallel_jobs=2,
    base_tuning_job_name='affinity-tune'  # hyphen rather than underscore in the job name
)

# 4. CONFIGURE TRACK B: BIOLOGICAL ACTIVITY (Classification)
estimator_activity = SKLearn(
    entry_point='train.py',
    dependencies=['requirements.txt'],  # same dependency file as track A
    role=role,
    instance_type='ml.m5.xlarge',
    framework_version='1.2-1',
    py_version='py3',
    hyperparameters={'target_type': 'activity'}
)

# The regex must match the "F1: <value>" line printed by train.py.
tuner_activity = HyperparameterTuner(
    estimator_activity,
    objective_metric_name='f1-score',
    objective_type='Maximize',
    hyperparameter_ranges=hyperparameter_ranges,
    metric_definitions=[{'Name': 'f1-score', 'Regex': 'F1: ([0-9\\.]+)'}],
    max_jobs=4,
    max_parallel_jobs=2,
    base_tuning_job_name='activity-tune'
)

# 5. TRIGGER BOTH SEARCHES
train_input = sagemaker.inputs.TrainingInput(s3_data=train_path, content_type='application/x-parquet')

# Launch both jobs without blocking so they run side by side.
print("‚ö° Launching Affinity Tuning Job...")
tuner_affinity.fit({'train': train_input}, wait=False)

print("‚ö° Launching Activity Tuning Job...")
tuner_activity.fit({'train': train_input}, wait=False)

print("üéâ Both tuning jobs are in progress.")

# 6. WAIT FOR BOTH
# Block on each tuner in turn. Previously only the activity job was waited on,
# so later cells could read the affinity job while it was still running.
tuner_affinity.wait()
tuner_activity.wait()

In [None]:
import boto3
from botocore.exceptions import ClientError

# Sanity check: report the bucket's region and confirm that the current
# credentials can list objects under the training-data prefix.
s3_client = boto3.client('s3')
target_bucket = 'BUCKET PLACEHOLDER'

try:
    # 1. Find the region the bucket actually lives in.
    # S3 reports an empty LocationConstraint for us-east-1 and the legacy value
    # 'EU' for eu-west-1; translate both into real region names.
    loc = s3_client.get_bucket_location(Bucket=target_bucket)['LocationConstraint']
    if not loc:
        actual_region = 'us-east-1'
    elif loc == 'EU':
        actual_region = 'eu-west-1'
    else:
        actual_region = loc
    print(f"üåç The bucket actually lives in: {actual_region}")

    # 2. Test whether the current credentials can list the bucket.
    print("üïµÔ∏è Testing permissions for role...")
    objects = s3_client.list_objects_v2(Bucket=target_bucket, Prefix='gold/train_features', MaxKeys=5)

    # 'Contents' is only present in the response when at least one key matched.
    if 'Contents' in objects:
        print("‚úÖ Success! Objects found:")
        for obj in objects['Contents']:
            print(f" - {obj['Key']}")
    else:
        print("‚ùå No objects found. Check the prefix/folder name.")

except ClientError as e:
    # Any S3 API failure lands here (access denied, missing bucket, ...); the
    # exception text carries the specific error code.
    print(f"üö´ Permission Error: {e}")

In [16]:
%%writefile requirements.txt
pyarrow
pandas==2.0.3
scikit-learn==1.2.1
joblib

Writing requirements.txt


In [None]:
import os
import boto3

# 1. Setup local paths to mimic SageMaker environment
os.makedirs('model_dir', exist_ok=True)
train_dir = 'local_data'
os.makedirs(train_dir, exist_ok=True)

# 2. Download the data from S3 to this notebook instance
# This proves the notebook can 'see' the data
!aws s3 cp s3REGIONLOC/gold/train_features/data.parquet local_data/data.parquet

# 3. Run the script manually to see the REAL error inside train.py
# This is the "Truth Test" for your script logic
!python train.py --n_estimators 100 --max_depth 5 --target_type affinity --train local_data/ --model_dir model_dir/

In [26]:
%%writefile inference.py
import os
import joblib
import pandas as pd
import numpy as np
import json

def model_fn(model_dir):
    """Deserialize the saved estimator and imputer found in ``model_dir``.

    Returns a dict with the keys ``"model"`` and ``"imputer"``, loaded from
    ``model.joblib`` and ``imputer.joblib`` respectively.
    """
    artifact_files = (
        ("model", "model.joblib"),
        ("imputer", "imputer.joblib"),
    )
    return {
        key: joblib.load(os.path.join(model_dir, filename))
        for key, filename in artifact_files
    }

def input_fn(request_body, request_content_type):
    """Parse a JSON request body into a DataFrame.

    Accepted payload shapes:
      * a list of records:        ``[{"f1": 1, "f2": 2}, ...]``
      * a column-oriented object: ``{"f1": [1, 2], "f2": [3, 4]}``
      * a single flat record:     ``{"f1": 1, "f2": 2}`` (becomes one row)

    The content type is matched on its media type only, so parameters such as
    ``; charset=utf-8`` are accepted.

    Raises:
        ValueError: if the media type is not ``application/json``.
    """
    # Strip optional parameters and normalise case before comparing.
    media_type = (request_content_type or '').split(';', 1)[0].strip().lower()
    if media_type != 'application/json':
        raise ValueError(f"Unsupported content type: {request_content_type}")

    data = json.loads(request_body)

    # pandas cannot build a frame from a dict made only of scalars, so a single
    # flat record is wrapped in a list to become a one-row frame. Dicts with
    # list/dict values (and the empty dict) keep the original column-oriented
    # handling.
    if isinstance(data, dict) and data and not any(
        isinstance(value, (list, tuple, dict)) for value in data.values()
    ):
        data = [data]

    return pd.DataFrame(data)

def predict_fn(input_data, model_dict):
    """Impute the request features and return the model's predictions as a list.

    Args:
        input_data: DataFrame of request rows.
        model_dict: dict holding the fitted estimator under ``"model"`` and the
            fitted imputer under ``"imputer"``.

    Returns:
        The predictions converted to a plain Python list.

    Raises:
        ValueError: if an expected feature holds a value that cannot be
            converted to a number.
    """
    model = model_dict["model"]
    imputer = model_dict["imputer"]

    # 1. Build the feature matrix.
    # If the imputer recorded the feature names it was fitted with, use exactly
    # those columns in exactly that order: extra request fields are dropped and
    # absent ones become NaN, which the imputer then fills. Selecting by dtype
    # alone would depend on JSON key order and on whatever extra numeric fields
    # the caller happened to send.
    expected = getattr(imputer, "feature_names_in_", None)
    if expected is not None:
        X = input_data.reindex(columns=list(expected)).astype(float)
    else:
        # No recorded names: fall back to keeping every numeric column.
        X = input_data.select_dtypes(include=[np.number])

    # 2. Apply the saved imputer so inference uses the training-time fill values.
    X_imputed = imputer.transform(X)

    # 3. Predict
    prediction = model.predict(X_imputed)
    return prediction.tolist()

def output_fn(prediction, content_type):
    """Serialize ``prediction`` to a JSON string.

    ``content_type`` is part of the handler signature but is not consulted;
    the result is always JSON text.
    """
    payload = json.dumps(prediction)
    return payload

Writing inference.py


In [None]:
import boto3
from sagemaker.sklearn.model import SKLearnModel
from sagemaker.analytics import HyperparameterTuningJobAnalytics
from time import strftime, gmtime

# SageMaker API client. It has its own name so that it does not overwrite the
# S3 client which the bucket-check cell keeps in `s3_client`.
sm_client = boto3.client('sagemaker')

def get_best_job_and_artifact(tuner_name, metric_is_minimized=True):
    """Return the best training job of a tuning job and its model artifact.

    Args:
        tuner_name: name of the hyperparameter tuning job to analyse.
        metric_is_minimized: True when a lower objective is better (e.g. MSE),
            False when a higher one is better (e.g. F1).

    Returns:
        Tuple ``(best_job_name, best_metric_value, model_artifact_s3_uri)``.

    Raises:
        RuntimeError: if the tuning job has no completed training job with an
            objective value to rank.
    """
    # 1. Get the leaderboard of training jobs.
    df = HyperparameterTuningJobAnalytics(tuner_name).dataframe()
    if df.empty or 'FinalObjectiveValue' not in df.columns:
        raise RuntimeError(f"Tuning job '{tuner_name}' has no training jobs to rank.")

    # 2. Keep only jobs that can win: they must have completed (failed jobs
    # have no model artifact) and must have reported an objective value.
    if 'TrainingJobStatus' in df.columns:
        df = df[df['TrainingJobStatus'] == 'Completed']
    df = df.dropna(subset=['FinalObjectiveValue'])
    if df.empty:
        raise RuntimeError(
            f"Tuning job '{tuner_name}' has no completed training job with an objective value."
        )

    # 3. Sort so the winner is first: ascending when minimizing, descending
    # when maximizing.
    df = df.sort_values('FinalObjectiveValue', ascending=metric_is_minimized)
    winner = df.iloc[0]
    best_job_name = winner['TrainingJobName']
    best_metric_val = winner['FinalObjectiveValue']

    # 4. Ask SageMaker for the artifact location instead of guessing the S3 path.
    describe_response = sm_client.describe_training_job(TrainingJobName=best_job_name)
    model_artifact = describe_response['ModelArtifacts']['S3ModelArtifacts']

    return best_job_name, best_metric_val, model_artifact

# --- 0. MAKE SURE TUNING HAS FINISHED ---
# Ranking jobs while a tuner is still running can select a model that is not
# the final best, so block on both tuners before analysing them.
tuner_affinity.wait()
tuner_activity.wait()

# --- 1. ANALYZE AFFINITY (Minimize MSE) ---
print("üìä Analyzing Affinity Tuning Job...")
best_aff_name, best_aff_mse, best_aff_s3 = get_best_job_and_artifact(
    tuner_affinity.latest_tuning_job.name, metric_is_minimized=True
)
print(f"   üèÜ Best Job: {best_aff_name}")
print(f"   üìâ Best MSE: {best_aff_mse:.4f}")
print(f"   üì¶ Artifact: {best_aff_s3}")

# --- 2. ANALYZE ACTIVITY (Maximize F1) ---
print("\nüìä Analyzing Activity Tuning Job...")
best_act_name, best_act_f1, best_act_s3 = get_best_job_and_artifact(
    tuner_activity.latest_tuning_job.name, metric_is_minimized=False
)
print(f"   üèÜ Best Job: {best_act_name}")
print(f"   üéØ Best F1:  {best_act_f1:.4f}")
print(f"   üì¶ Artifact: {best_act_s3}")

# --- 3. DEPLOY ---
# The UTC timestamp keeps endpoint names unique across repeated deployments.
timestamp = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
instance_type = 'ml.m5.large'


def _deploy_sklearn_endpoint(model_data, endpoint_name):
    """Create an SKLearnModel from ``model_data`` and deploy it as ``endpoint_name``.

    Both endpoints share the same inference script, framework version and
    instance settings; only the artifact and the endpoint name differ.
    Returns the ``(model, predictor)`` pair.
    """
    model = SKLearnModel(
        model_data=model_data,  # exact artifact path reported by SageMaker
        role=role,
        entry_point='inference.py',
        framework_version='1.2-1',
        py_version='py3',
        dependencies=['requirements.txt']
    )
    predictor = model.deploy(
        instance_type=instance_type,
        initial_instance_count=1,
        endpoint_name=endpoint_name
    )
    return model, predictor


print(f"\nüöÄ Deploying endpoints (Timestamp: {timestamp})...")

# Deploy Affinity
affinity_model, predictor_affinity = _deploy_sklearn_endpoint(
    best_aff_s3, f'biotech-affinity-{timestamp}'
)

# Deploy Activity
activity_model, predictor_activity = _deploy_sklearn_endpoint(
    best_act_s3, f'biotech-activity-{timestamp}'
)

print("\n‚úÖ Deployment Complete!")
print(f"üîó Affinity Endpoint: biotech-affinity-{timestamp}")
print(f"üîó Activity Endpoint: biotech-activity-{timestamp}")