# NASA Asteroid Hazard Classification with SageMaker and XGBoost

### Imports

In [35]:
import setuptools
import sagemaker
from sklearn.model_selection import train_test_split 
import boto3
import pandas as pd
from sagemaker.inputs import TrainingInput
from sagemaker.serializers import CSVSerializer
from sagemaker import get_execution_role

### Data Processing

In [93]:
data = pd.read_csv('./data/raw/neo_clean.csv')

In [94]:
data.info()
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90000 entries, 0 to 89999
Data columns (total 18 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   absolute_magnitude             90000 non-null  float64
 1   est_diameter_min               90000 non-null  float64
 2   est_diameter_max               90000 non-null  float64
 3   close_approach_date            90000 non-null  object 
 4   relative_velocity              90000 non-null  float64
 5   miss_distance                  90000 non-null  float64
 6   orbit_uncertainty              89967 non-null  float64
 7   minimum_orbit_intersection     89967 non-null  float64
 8   orbital_period                 89967 non-null  float64
 9   mean_motion                    89967 non-null  float64
 10  eccentricity                   89967 non-null  float64
 11  perihelion_distance            89967 non-null  float64
 12  aphelion_distance              89967 non-null 

Unnamed: 0,absolute_magnitude,est_diameter_min,est_diameter_max,close_approach_date,relative_velocity,miss_distance,orbit_uncertainty,minimum_orbit_intersection,orbital_period,mean_motion,eccentricity,perihelion_distance,aphelion_distance,hazardous,velocity_rate_regression,velocity_rate_avg,miss_distance_rate_regression,miss_distance_rate_avg
0,16.7,1.21494,2.716689,31/05/2003,84574.50244,48800509.14,0.0,0.304744,1353.300542,0.266016,0.741697,0.618482,4.170324,False,-4.37574e-07,-5e-06,0.000452,0.004685
1,22.3,0.092163,0.206082,31/05/2003,76177.35114,48676618.1,0.0,0.011805,341.180111,1.055161,0.378715,0.593673,1.317442,False,9.683358e-07,3e-05,0.000518,0.064797
2,20.9,0.175612,0.392681,31/05/2003,40675.45864,20058862.17,1.0,0.130954,636.586551,0.565516,0.223179,1.12502,1.771452,False,-1.395668e-06,-1.2e-05,-0.000869,-0.006343
3,20.15,0.248059,0.554677,31/05/2003,55504.52545,41444758.85,1.0,0.253456,541.981518,0.664229,0.433385,0.737132,1.864749,False,-2.684856e-07,-7e-06,-0.00043,-0.009523
4,21.0,0.167708,0.375008,31/05/2003,45584.8931,37554794.25,6.0,0.219619,1276.938741,0.281924,0.466313,1.229328,3.377597,False,-2.495001e-06,-2e-06,-0.015808,-0.013043


In [95]:
data['hazardous'] = data['hazardous'].apply(lambda x: 0 if x == False else 1)
data.drop(columns = ['close_approach_date', 'absolute_magnitude', 'minimum_orbit_intersection'], inplace = True)
#data.drop(columns = ['close_approach_date'], inplace = True)
#data.dropna(inplace = True)

In [96]:
features = list(data.columns)
features.remove('hazardous')
print(features)

['est_diameter_min', 'est_diameter_max', 'relative_velocity', 'miss_distance', 'orbit_uncertainty', 'orbital_period', 'mean_motion', 'eccentricity', 'perihelion_distance', 'aphelion_distance', 'velocity_rate_regression', 'velocity_rate_avg', 'miss_distance_rate_regression', 'miss_distance_rate_avg']


In [97]:
X, Y = data[features], data['hazardous']

### Features and Labels

In [98]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90000 entries, 0 to 89999
Data columns (total 15 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   est_diameter_min               90000 non-null  float64
 1   est_diameter_max               90000 non-null  float64
 2   relative_velocity              90000 non-null  float64
 3   miss_distance                  90000 non-null  float64
 4   orbit_uncertainty              89967 non-null  float64
 5   orbital_period                 89967 non-null  float64
 6   mean_motion                    89967 non-null  float64
 7   eccentricity                   89967 non-null  float64
 8   perihelion_distance            89967 non-null  float64
 9   aphelion_distance              89967 non-null  float64
 10  hazardous                      90000 non-null  int64  
 11  velocity_rate_regression       83349 non-null  float64
 12  velocity_rate_avg              83349 non-null 

In [99]:
print("Features")
X.info()

Features
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90000 entries, 0 to 89999
Data columns (total 14 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   est_diameter_min               90000 non-null  float64
 1   est_diameter_max               90000 non-null  float64
 2   relative_velocity              90000 non-null  float64
 3   miss_distance                  90000 non-null  float64
 4   orbit_uncertainty              89967 non-null  float64
 5   orbital_period                 89967 non-null  float64
 6   mean_motion                    89967 non-null  float64
 7   eccentricity                   89967 non-null  float64
 8   perihelion_distance            89967 non-null  float64
 9   aphelion_distance              89967 non-null  float64
 10  velocity_rate_regression       83349 non-null  float64
 11  velocity_rate_avg              83349 non-null  float64
 12  miss_distance_rate_regression  83349 

In [100]:
X.head()

Unnamed: 0,est_diameter_min,est_diameter_max,relative_velocity,miss_distance,orbit_uncertainty,orbital_period,mean_motion,eccentricity,perihelion_distance,aphelion_distance,velocity_rate_regression,velocity_rate_avg,miss_distance_rate_regression,miss_distance_rate_avg
0,1.21494,2.716689,84574.50244,48800509.14,0.0,1353.300542,0.266016,0.741697,0.618482,4.170324,-4.37574e-07,-5e-06,0.000452,0.004685
1,0.092163,0.206082,76177.35114,48676618.1,0.0,341.180111,1.055161,0.378715,0.593673,1.317442,9.683358e-07,3e-05,0.000518,0.064797
2,0.175612,0.392681,40675.45864,20058862.17,1.0,636.586551,0.565516,0.223179,1.12502,1.771452,-1.395668e-06,-1.2e-05,-0.000869,-0.006343
3,0.248059,0.554677,55504.52545,41444758.85,1.0,541.981518,0.664229,0.433385,0.737132,1.864749,-2.684856e-07,-7e-06,-0.00043,-0.009523
4,0.167708,0.375008,45584.8931,37554794.25,6.0,1276.938741,0.281924,0.466313,1.229328,3.377597,-2.495001e-06,-2e-06,-0.015808,-0.013043


In [101]:
print("Labels")
Y.info()

Labels
<class 'pandas.core.series.Series'>
RangeIndex: 90000 entries, 0 to 89999
Series name: hazardous
Non-Null Count  Dtype
--------------  -----
90000 non-null  int64
dtypes: int64(1)
memory usage: 703.3 KB


In [102]:
Y.head()

0    0
1    0
2    0
3    0
4    0
Name: hazardous, dtype: int64

In [103]:
features = list(X.columns)
features

['est_diameter_min',
 'est_diameter_max',
 'relative_velocity',
 'miss_distance',
 'orbit_uncertainty',
 'orbital_period',
 'mean_motion',
 'eccentricity',
 'perihelion_distance',
 'aphelion_distance',
 'velocity_rate_regression',
 'velocity_rate_avg',
 'miss_distance_rate_regression',
 'miss_distance_rate_avg']

In [104]:
labels = 'hazardous'
labels

'hazardous'

In [105]:
seed = 7
test_size = 0.2
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)

In [106]:
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=test_size, random_state=seed)

In [107]:
print(X_train.shape)
print(X_val.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_val.shape)
print(Y_test.shape)

(57600, 14)
(14400, 14)
(18000, 14)
(57600,)
(14400,)
(18000,)


In [108]:
trainX = pd.DataFrame(X_train)
trainX[labels] = Y_train

valX = pd.DataFrame(X_val)
valX[labels] = Y_val

testX = pd.DataFrame(X_test)
testX[labels] = Y_test

In [109]:
# Check if the hazardous column contains only 0 and 1
assert trainX['hazardous'].isin([0, 1]).all(), "Training data has invalid labels"
assert valX['hazardous'].isin([0, 1]).all(), "Validation data has invalid labels"
assert testX['hazardous'].isin([0, 1]).all(), "Validation data has invalid labels"

In [110]:
train_path = './data/processed/train_v.1.csv'
trainX.to_csv(train_path, index=False)
val_path = './data/processed/val_v.1.csv'
valX.to_csv(val_path, index=False)
test_path = './data/processed/test_v.1.csv'
testX.to_csv(test_path, index=False)

### Upload Processed Data to S3

In [111]:
sagemaker_client = boto3.client("sagemaker")
session = sagemaker.Session()
region = session.boto_session.region_name
bucket = 's3-nasa-neo-watch'
prefix = "v1"
role = "arn:aws:iam::949672723150:role/qte4288_SageMakerExecutionRole"

In [112]:
trainpath = session.upload_data(path = train_path, bucket = bucket, key_prefix = prefix)
valpath = session.upload_data(path = val_path, bucket = bucket, key_prefix = prefix)
testpath = session.upload_data(path = test_path, bucket = bucket, key_prefix = prefix)

In [113]:
print(trainpath)
print(valpath)
print(testpath)

s3://s3-nasa-neo-watch/v1/train_v.1.csv
s3://s3-nasa-neo-watch/v1/val_v.1.csv
s3://s3-nasa-neo-watch/v1/test_v.1.csv


# Train

In [84]:
#XGBoost Algorithm Container
container = sagemaker.image_uris.retrieve("xgboost", region, "1.7-1")
display(container)

INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.


'683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.7-1'

In [85]:
s3_input_train = TrainingInput(
    s3_data="s3://{}/{}/train_v.1".format(bucket, prefix), content_type="csv"
)
s3_input_validation = TrainingInput(
    s3_data="s3://{}/{}/val_v.1".format(bucket, prefix), content_type="csv"
)

In [86]:
session = sagemaker.Session()

xgb = sagemaker.estimator.Estimator(
    container,
    role,
    instance_count=1,
    instance_type="ml.m4.xlarge",
    output_path="s3://{}/{}/output".format(bucket, prefix),
    sagemaker_session=session,
)
xgb.set_hyperparameters(
    max_depth=5,
    eta=0.2,
    gamma=4,
    min_child_weight=6,
    subsample=0.8,
    verbosity=0,
    objective="binary:logistic",
    num_round=100,
)

xgb.fit({"train": s3_input_train, "validation": s3_input_validation})

INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2024-07-31-07-15-51-850


2024-07-31 07:15:52 Starting - Starting the training job...
2024-07-31 07:16:08 Starting - Preparing the instances for training...
2024-07-31 07:16:35 Downloading - Downloading input data...
2024-07-31 07:17:10 Downloading - Downloading the training image......
2024-07-31 07:18:16 Training - Training image download completed. Training in progress..[2024-07-31 07:18:33.157 ip-10-2-222-55.ec2.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None
[2024-07-31 07:18:33.182 ip-10-2-222-55.ec2.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.
[2024-07-31:07:18:33:INFO] Imported framework sagemaker_xgboost_container.training
[2024-07-31:07:18:33:INFO] Failed to parse hyperparameter objective value binary:logistic to Json.
Returning the value itself
[2024-07-31:07:18:33:INFO] No GPUs detected (normal if no gpus installed)
[2024-07-31:07:18:33:INFO] Running XGBoost Sagemaker in algorithm mode
[2024-07-31:07:18:33:INFO] Determined 0 GPU(s) available on the inst

UnexpectedStatusException: Error for Training job sagemaker-xgboost-2024-07-31-07-15-51-850: Failed. Reason: AlgorithmError: framework error: 
Traceback (most recent call last):
  File "/miniconda3/lib/python3.8/site-packages/sagemaker_xgboost_container/algorithm_mode/train.py", line 319, in train_job
    bst = xgb.train(
  File "/miniconda3/lib/python3.8/site-packages/xgboost/core.py", line 620, in inner_f
    return func(**kwargs)
  File "/miniconda3/lib/python3.8/site-packages/xgboost/training.py", line 185, in train
    bst.update(dtrain, i, obj)
  File "/miniconda3/lib/python3.8/site-packages/xgboost/core.py", line 1918, in update
    _check_call(_LIB.XGBoosterUpdateOneIter(self.handle,
  File "/miniconda3/lib/python3.8/site-packages/xgboost/core.py", line 279, in _check_call
    raise XGBoostError(py_str(_LIB.XGBGetLastError()))
xgboost.core.XGBoostError: [07:18:33] ../src/objective/regression_obj.cu:148: label must be in [0,1] for logistic regression
Stack trace:
  [bt] (0) /miniconda3/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(+0x68eaf9) [0x7f2a0d58daf9]
  [bt] (1) /miniconda3/lib/python3.8/site-p

In [117]:
%%writefile script.py

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score, roc_curve, auc
import sklearn
import joblib
import boto3
import pathlib
from io import StringIO 
import argparse
import joblib
import os
import numpy as np
import pandas as pd
    
def model_fn(model_dir):
    clf = joblib.load(os.path.join(model_dir, "model.joblib"))
    return clf
    
if __name__ == "__main__":

    print("[INFO] Extracting arguments")
    parser = argparse.ArgumentParser()

    # hyperparameters sent by the client are passed as command-line arguments to the script.
    parser.add_argument("--n_estimators", type=int, default=100)
    parser.add_argument("--random_state", type=int, default=0)

    # Data, model, and output directories
    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    parser.add_argument("--test", type=str, default=os.environ.get("SM_CHANNEL_TEST"))
    parser.add_argument("--train-file", type=str, default="train_v.1.csv")
    parser.add_argument("--test-file", type=str, default="test_v.1.csv")

    args, _ = parser.parse_known_args()
    
    print("SKLearn Version: ", sklearn.__version__)
    print("Joblib Version: ", joblib.__version__)

    print("[INFO] Reading data")
    print()
    train_df = pd.read_csv(os.path.join(args.train, args.train_file))
    test_df = pd.read_csv(os.path.join(args.test, args.test_file))
    
    features = list(train_df.columns)
    label = features.pop(-1)
    
    print("Building training and testing datasets")
    print()
    X_train = train_df[features]
    X_test = test_df[features]
    Y_train = train_df[label]
    Y_test = test_df[label]

    print('Column order: ')
    print(features)
    print()
    
    print("Label column is: ",label)
    print()
    
    print("Data Shape: ")
    print()
    print("---- SHAPE OF TRAINING DATA (85%) ----")
    print(X_train.shape)
    print(Y_train.shape)
    print()
    print("---- SHAPE OF TESTING DATA (15%) ----")
    print(X_test.shape)
    print(Y_test.shape)
    print()
    
  
    print("Training RandomForest Model.....")
    print()
    model =  RandomForestClassifier(n_estimators=args.n_estimators, random_state=args.random_state, verbose = 3,n_jobs=-1)
    model.fit(X_train, Y_train)
    print()
    

    model_path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model,model_path)
    print("Model persisted at " + model_path)
    print()

    
    Y_pred_test = model.predict(X_test)
    test_acc = accuracy_score(Y_test,Y_pred_test)
    test_rep = classification_report(Y_test,Y_pred_test)

    print()
    print("---- METRICS RESULTS FOR TESTING DATA ----")
    print()
    print("Total Rows are: ", X_test.shape[0])
    print('[TESTING] Model Accuracy is: ', test_acc)
    print('[TESTING] Testing Report: ')
    print(test_rep)

Overwriting script.py


In [115]:
from sagemaker.sklearn.estimator import SKLearn

FRAMEWORK_VERSION = "0.23-1"

sklearn_estimator = SKLearn(
    entry_point="script.py",
    role=role,
    instance_count=1,
    instance_type="ml.m4.xlarge",
    framework_version=FRAMEWORK_VERSION,
    base_job_name="RF-custom-sklearn",
    hyperparameters={
        "n_estimators": 100,
        "random_state": 0,
    },
    use_spot_instances = True,
    max_wait = 7200,
    max_run = 3600
)

In [116]:
# launch training job, with asynchronous call
sklearn_estimator.fit({"train": trainpath, "test": testpath}, wait=True)
# sklearn_estimator.fit({"train": datapath}, wait=True)

INFO:sagemaker:Creating training-job with name: RF-custom-sklearn-2024-07-31-07-32-57-944


2024-07-31 07:32:59 Starting - Starting the training job...
2024-07-31 07:33:24 Starting - Preparing the instances for training......
2024-07-31 07:34:13 Downloading - Downloading input data...
2024-07-31 07:34:33 Downloading - Downloading the training image...
2024-07-31 07:35:29 Training - Training image download completed. Training in progress..2024-07-31 07:35:41,669 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training
2024-07-31 07:35:41,673 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-07-31 07:35:41,717 sagemaker_sklearn_container.training INFO     Invoking user training script.
2024-07-31 07:35:41,913 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-07-31 07:35:41,927 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-07-31 07:35:41,942 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-07-31 

UnexpectedStatusException: Error for Training job RF-custom-sklearn-2024-07-31-07-32-57-944: Failed. Reason: AlgorithmError: framework error: 
Traceback (most recent call last):
  File "/miniconda3/lib/python3.7/site-packages/sagemaker_containers/_trainer.py", line 84, in train
    entrypoint()
  File "/miniconda3/lib/python3.7/site-packages/sagemaker_sklearn_container/training.py", line 39, in main
    train(environment.Environment())
  File "/miniconda3/lib/python3.7/site-packages/sagemaker_sklearn_container/training.py", line 35, in train
    runner_type=runner.ProcessRunnerType)
  File "/miniconda3/lib/python3.7/site-packages/sagemaker_training/entry_point.py", line 100, in run
    wait, capture_error
  File "/miniconda3/lib/python3.7/site-packages/sagemaker_training/process.py", line 291, in run
    cwd=environment.code_dir,
  File "/miniconda3/lib/python3.7/site-packages/sagemaker_training/process.py", line 208, in check_error
    info=extra_info,
sagemaker_training.errors.ExecuteUserScriptError: ExecuteUserScriptError:
ExitCode 1
ErrorMessage ""
Command "/miniconda3/bin/python script.py --n_estimators 100 --r