In [47]:
from sagemaker import estimator, Session
from sagemaker.inputs import TrainingInput
import boto3

# IAM role the training job runs under, and the S3 object that holds the dataset
role = 'arn:aws:iam::211125533708:role/sagemaker-stock'
bucket_name = 'stock-data1'
s3_key = 'stock_dataset/final_cleaned.parquet'
s3_data_path = f's3://{bucket_name}/{s3_key}'

# boto3 session built from a named local profile, wrapped in a SageMaker session
session = boto3.Session(profile_name='sagemaker-user')
sagemaker_session = Session(boto_session=session)

# Scikit-Learn training image; lightgbm_train.py pip-installs LightGBM itself
# when it starts, so no custom image is required.
image_uri = '683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-scikit-learn:0.23-1-cpu-py3'

# Estimator configuration collected in one mapping, then unpacked
estimator_settings = dict(
    entry_point='lightgbm_train.py',
    image_uri=image_uri,
    role=role,
    instance_count=1,
    instance_type='ml.m5.4xlarge',
    sagemaker_session=sagemaker_session,
    # Parsed by the training script as the --N_ESTIMATORS argument
    hyperparameters={'N_ESTIMATORS': 10000},
)
lgbm_estimator = estimator.Estimator(**estimator_settings)

# Parquet dataset exposed to the job as the 'train' channel
# (the script reads its location from SM_CHANNEL_TRAIN)
s3_input = TrainingInput(s3_data_path, content_type='application/x-parquet')
training_channels = {'train': s3_input}

# Launch the training job
lgbm_estimator.fit(training_channels)


INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials


INFO:sagemaker:Creating training-job with name: sagemaker-scikit-learn-2024-10-27-07-31-30-448


2024-10-27 07:31:39 Starting - Starting the training job...
2024-10-27 07:31:53 Starting - Preparing the instances for training...
2024-10-27 07:32:20 Downloading - Downloading input data...........2024-10-27 07:34:38,250 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training
2024-10-27 07:34:38,253 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-10-27 07:34:38,296 sagemaker_sklearn_container.training INFO     Invoking user training script.
2024-10-27 07:34:38,451 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-10-27 07:34:38,464 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-10-27 07:34:38,477 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-10-27 07:34:38,487 sagemaker-training-toolkit INFO     Invoking user script
Training Env:
{
    "additional_framework_parameters": {},
    "channel_input_dirs

KeyboardInterrupt: 

In [46]:
%%writefile lightgbm_train.py

import subprocess
import sys

# LightGBM is installed at start-up with the interpreter's own pip; this has to
# finish before the `from lightgbm import ...` line further down is reached.
pip_install_cmd = [sys.executable, '-m', 'pip', 'install', 'lightgbm']
subprocess.check_call(pip_install_cmd)

import argparse
import os
import joblib
import pandas as pd
import numpy as np
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import SelectKBest, f_regression

# Read the boosting-round budget from the command line
cli = argparse.ArgumentParser()
cli.add_argument('--N_ESTIMATORS', type=int, default=10000)
args = cli.parse_args()

# SM_CHANNEL_TRAIN holds the local directory of the 'train' channel
train_dir = os.environ['SM_CHANNEL_TRAIN']
df = pd.read_parquet(os.path.join(train_dir, 'final_cleaned.parquet'))

# frac=1.0 keeps every row: this is a seeded shuffle of the full dataset,
# not a subsample. Lower `frac` for a quick smoke test.
df_sample = df.sample(frac=1.0, random_state=42)

# Target column; features are all remaining numeric columns
target_column = 'Close_x'
numeric_part = df_sample.select_dtypes(include=[np.number])
X = numeric_part.drop(columns=[target_column])
y = df_sample[target_column]

# Cleaning: infinities become NaN, NaNs take the column mean (computed after the
# infinities were removed), then values are clamped to [-1e6, 1e6]
X = X.replace([np.inf, -np.inf], np.nan)
X = X.fillna(X.mean())
X = X.clip(lower=-1e6, upper=1e6)

# 80/20 train/validation split with a fixed seed
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Univariate feature selection with SelectKBest / f_regression
def select_important_features(X, y, min_features=10):
    """Keep the highest-scoring feature columns according to f_regression.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate feature columns.
    y : array-like
        Regression target aligned with the rows of ``X``.
    min_features : int, default 10
        Number of top-scoring columns to keep. When ``X`` has fewer columns
        than this, all of them are kept.

    Returns
    -------
    tuple
        ``(X[selected_columns], selected_columns)``: the reduced frame and the
        index of retained column names, which callers reuse to subset other
        splits.
    """
    # Clamp k to the number of available columns. The earlier expression
    # min(max(min_features, n), n) always evaluated to n, so every column was
    # kept and no selection took place.
    k = min(min_features, X.shape[1])
    selector = SelectKBest(f_regression, k=k)
    selector.fit(X, y)
    selected_columns = X.columns[selector.get_support()]
    return X[selected_columns], selected_columns

# Choose columns using the training split only, then subset the validation
# split to exactly the same columns so both frames line up for the model.
selection_result = select_important_features(X_train, y_train, min_features=10)
X_train_reduced, selected_columns = selection_result
X_val_reduced = X_val[selected_columns]

# Set LightGBM parameters.
# NOTE: 'subsample' / 'colsample_bytree' are aliases of 'bagging_fraction' /
# 'feature_fraction'. When both spellings are supplied LightGBM keeps the
# primary name and ignores the alias (with a warning), so the previously listed
# subsample=0.6 never took effect. Only the primary names are kept here, which
# leaves the effective configuration unchanged.
lgbm_params = {
    'num_leaves': 128,
    'max_depth': -1,  # no depth limit; tree size is bounded by num_leaves
    'learning_rate': 0.005,
    'n_estimators': args.N_ESTIMATORS,
    'min_child_weight': 5,
    'feature_fraction': 0.6,  # fraction of columns sampled per tree
    'bagging_fraction': 0.7,  # fraction of rows sampled when bagging runs
    'bagging_freq': 5,  # re-sample rows every 5 iterations
    'lambda_l1': 0.2,
    'lambda_l2': 1.2,
    'n_jobs': -1,  # Enable multi-threading across all cores
    'force_col_wise': True  # Force column-wise parallelism
}


# Train once and let LightGBM's early-stopping callback monitor the validation
# set. Refitting a fresh model for every candidate tree count (1, 2, ..., N)
# costs O(N^2) boosting rounds and leaves the saved model `patience` rounds
# past its best validation score.
from lightgbm import early_stopping  # kept local: must run after the pip install above

patience = 10  # rounds without validation improvement before stopping

model = LGBMRegressor(**lgbm_params)
model.fit(
    X_train_reduced,
    y_train,
    eval_set=[(X_val_reduced, y_val)],
    eval_metric='rmse',
    callbacks=[early_stopping(stopping_rounds=patience)],
)

# After early stopping, predict() uses the best iteration by default, so this
# is the RMSE of the model that actually gets saved. np.sqrt is used instead of
# the `squared=False` keyword so the script works across scikit-learn versions.
val_preds = model.predict(X_val_reduced)
best_rmse = float(np.sqrt(mean_squared_error(y_val, val_preds)))

print(f"Final Validation RMSE: {best_rmse}")

# Save model to output directory
model_dir = os.environ['SM_MODEL_DIR']
joblib.dump(model, os.path.join(model_dir, 'model.joblib'))


Overwriting lightgbm_train.py
