# NASA Asteroid Hazard Classification with SageMaker and XGBoost

### Imports

In [1]:
import setuptools
import sagemaker
from sklearn.model_selection import train_test_split 
import boto3
import pandas as pd

sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Users/shubhigupta/Library/Application Support/sagemaker/config.yaml


### Data Processing

In [17]:
# Load the cleaned near-Earth-object dataset from the local raw-data folder.
raw_csv_path = './data/raw/neo_clean.csv'
data = pd.read_csv(raw_csv_path)

In [18]:
# Schema overview (dtypes and non-null counts), followed by a preview of the
# first rows. `data.head()` must stay last so the notebook renders it.
data.info()
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90000 entries, 0 to 89999
Data columns (total 18 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   absolute_magnitude             90000 non-null  float64
 1   est_diameter_min               90000 non-null  float64
 2   est_diameter_max               90000 non-null  float64
 3   close_approach_date            90000 non-null  object 
 4   relative_velocity              90000 non-null  float64
 5   miss_distance                  90000 non-null  float64
 6   orbit_uncertainty              89967 non-null  float64
 7   minimum_orbit_intersection     89967 non-null  float64
 8   orbital_period                 89967 non-null  float64
 9   mean_motion                    89967 non-null  float64
 10  eccentricity                   89967 non-null  float64
 11  perihelion_distance            89967 non-null  float64
 12  aphelion_distance              89967 non-null 

Unnamed: 0,absolute_magnitude,est_diameter_min,est_diameter_max,close_approach_date,relative_velocity,miss_distance,orbit_uncertainty,minimum_orbit_intersection,orbital_period,mean_motion,eccentricity,perihelion_distance,aphelion_distance,hazardous,velocity_rate_regression,velocity_rate_avg,miss_distance_rate_regression,miss_distance_rate_avg
0,16.7,1.21494,2.716689,31/05/2003,84574.50244,48800509.14,0.0,0.304744,1353.300542,0.266016,0.741697,0.618482,4.170324,False,-4.37574e-07,-5e-06,0.000452,0.004685
1,22.3,0.092163,0.206082,31/05/2003,76177.35114,48676618.1,0.0,0.011805,341.180111,1.055161,0.378715,0.593673,1.317442,False,9.683358e-07,3e-05,0.000518,0.064797
2,20.9,0.175612,0.392681,31/05/2003,40675.45864,20058862.17,1.0,0.130954,636.586551,0.565516,0.223179,1.12502,1.771452,False,-1.395668e-06,-1.2e-05,-0.000869,-0.006343
3,20.15,0.248059,0.554677,31/05/2003,55504.52545,41444758.85,1.0,0.253456,541.981518,0.664229,0.433385,0.737132,1.864749,False,-2.684856e-07,-7e-06,-0.00043,-0.009523
4,21.0,0.167708,0.375008,31/05/2003,45584.8931,37554794.25,6.0,0.219619,1276.938741,0.281924,0.466313,1.229328,3.377597,False,-2.495001e-06,-2e-06,-0.015808,-0.013043


In [19]:
# Encode the target as integers: values equal to False become 0, everything
# else becomes 1. The vectorised comparison yields the same mapping as a
# per-row lambda and is idempotent (0/1 inputs map to themselves).
data['hazardous'] = data['hazardous'].ne(False).astype('int64')

# Remove columns that are not used as model inputs. Reassigning instead of
# dropping in place, together with errors='ignore', keeps this cell safe to
# re-run: a second execution no longer raises KeyError for columns that are
# already gone.
data = data.drop(
    columns=['close_approach_date', 'absolute_magnitude', 'minimum_orbit_intersection'],
    errors='ignore',
)

# NOTE(review): rows with missing values are kept (no dropna is applied) —
# confirm that the downstream model accepts NaN inputs.

In [20]:
# Every column except the target is treated as a model input.
features = [*data.columns]
features.remove('hazardous')
print(features)

['est_diameter_min', 'est_diameter_max', 'relative_velocity', 'miss_distance', 'orbit_uncertainty', 'orbital_period', 'mean_motion', 'eccentricity', 'perihelion_distance', 'aphelion_distance', 'velocity_rate_regression', 'velocity_rate_avg', 'miss_distance_rate_regression', 'miss_distance_rate_avg']


In [23]:
# Separate the frame into the feature matrix and the target vector.
X = data[features]
Y = data['hazardous']

### Features and Labels

In [24]:
# Summarise the feature matrix: dtype and non-null count for each column.
print("Features")
X.info()

Features
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90000 entries, 0 to 89999
Data columns (total 14 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   est_diameter_min               90000 non-null  float64
 1   est_diameter_max               90000 non-null  float64
 2   relative_velocity              90000 non-null  float64
 3   miss_distance                  90000 non-null  float64
 4   orbit_uncertainty              89967 non-null  float64
 5   orbital_period                 89967 non-null  float64
 6   mean_motion                    89967 non-null  float64
 7   eccentricity                   89967 non-null  float64
 8   perihelion_distance            89967 non-null  float64
 9   aphelion_distance              89967 non-null  float64
 10  velocity_rate_regression       83349 non-null  float64
 11  velocity_rate_avg              83349 non-null  float64
 12  miss_distance_rate_regression  83349 

In [26]:
# Preview the first rows of the feature matrix.
X.head()

Unnamed: 0,est_diameter_min,est_diameter_max,relative_velocity,miss_distance,orbit_uncertainty,orbital_period,mean_motion,eccentricity,perihelion_distance,aphelion_distance,velocity_rate_regression,velocity_rate_avg,miss_distance_rate_regression,miss_distance_rate_avg
0,1.21494,2.716689,84574.50244,48800509.14,0.0,1353.300542,0.266016,0.741697,0.618482,4.170324,-4.37574e-07,-5e-06,0.000452,0.004685
1,0.092163,0.206082,76177.35114,48676618.1,0.0,341.180111,1.055161,0.378715,0.593673,1.317442,9.683358e-07,3e-05,0.000518,0.064797
2,0.175612,0.392681,40675.45864,20058862.17,1.0,636.586551,0.565516,0.223179,1.12502,1.771452,-1.395668e-06,-1.2e-05,-0.000869,-0.006343
3,0.248059,0.554677,55504.52545,41444758.85,1.0,541.981518,0.664229,0.433385,0.737132,1.864749,-2.684856e-07,-7e-06,-0.00043,-0.009523
4,0.167708,0.375008,45584.8931,37554794.25,6.0,1276.938741,0.281924,0.466313,1.229328,3.377597,-2.495001e-06,-2e-06,-0.015808,-0.013043


In [25]:
# Summarise the target vector: dtype and non-null count.
print("Labels")
Y.info()

Labels
<class 'pandas.core.series.Series'>
RangeIndex: 90000 entries, 0 to 89999
Series name: hazardous
Non-Null Count  Dtype
--------------  -----
90000 non-null  int64
dtypes: int64(1)
memory usage: 703.3 KB


In [27]:
# Preview the first encoded target values.
Y.head()

0    0
1    0
2    0
3    0
4    0
Name: hazardous, dtype: int64

In [32]:
# Re-derive the feature names from the feature matrix itself and display them.
features = [*X.columns]
features

['est_diameter_min',
 'est_diameter_max',
 'relative_velocity',
 'miss_distance',
 'orbit_uncertainty',
 'orbital_period',
 'mean_motion',
 'eccentricity',
 'perihelion_distance',
 'aphelion_distance',
 'velocity_rate_regression',
 'velocity_rate_avg',
 'miss_distance_rate_regression',
 'miss_distance_rate_avg']

In [34]:
# Name of the target column; used below when the labels are re-attached to
# the train and test frames.
labels = 'hazardous'
labels

'hazardous'

In [28]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
seed, test_size = 7, 0.2
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=test_size,
    random_state=seed,
)

In [30]:
# Report the dimensions of each split, one per line.
for split_part in (X_train, X_test, Y_train, Y_test):
    print(split_part.shape)

(72000, 14)
(18000, 14)
(72000,)
(18000,)


In [35]:
# Re-attach the target column to each split so that every exported CSV carries
# both the features and the label.
# Work on explicit copies: wrapping a DataFrame in pd.DataFrame(...) does not
# copy by default, so adding the label column could emit SettingWithCopyWarning
# and, depending on the pandas version, also modify X_train / X_test themselves.
trainX = X_train.copy()
trainX[labels] = Y_train

testX = X_test.copy()
testX[labels] = Y_test

In [37]:
# Persist both splits locally before uploading them.
# NOTE(review): to_csv is called with its defaults, so the row index is written
# as an extra leading column — confirm the training script expects that.
train_path = './data/processed/train_v.1.csv'
test_path = './data/processed/test_v.1.csv'
for frame, csv_path in ((trainX, train_path), (testX, test_path)):
    frame.to_csv(csv_path)

### Upload Processed Data to S3

In [41]:
# Destination for the processed data in S3.
bucket = 's3-nasa-neo-watch'
prefix = "sagemaker/nasa_neo_watch/sklearncontainer"

# AWS handles: a low-level SageMaker client plus a SageMaker session, from
# which the active region name is read.
sagemaker_client = boto3.client("sagemaker")
session = sagemaker.Session()
region = session.boto_session.region_name

In [42]:
# Upload both CSV files under the same bucket/prefix and keep the returned values.
upload_target = {"bucket": bucket, "key_prefix": prefix}
trainpath = session.upload_data(path=train_path, **upload_target)
testpath = session.upload_data(path=test_path, **upload_target)

In [43]:
# Show the values returned by the two uploads, one per line.
print(trainpath, testpath, sep="\n")

s3://s3-nasa-neo-watch/sagemaker/nasa_neo_watch/sklearncontainer/train_v.1.csv
s3://s3-nasa-neo-watch/sagemaker/nasa_neo_watch/sklearncontainer/test_v.1.csv
