In [2]:
import os
import pickle as pkl
import tarfile
from datetime import datetime

import boto3
import numpy as np
import pandas as pd
import sagemaker
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [None]:
!pip install xgboost
import xgboost as xgb


### Get the data

In [44]:
# Read only the columns used downstream: the sighting features plus the
# 'researchOutcome' label. Every other column in the CSV is skipped.
ufo_columns = [
    'shape',
    'duration',
    'witnesses',
    'weather',
    'physicalEvidence',
    'contact',
    'researchOutcome',
]
data = pd.read_csv('ufo_fullset.csv', usecols=ufo_columns)
# Bare last expression so the notebook renders a preview of the first rows.
data.head()


Unnamed: 0,shape,duration,witnesses,weather,physicalEvidence,contact,researchOutcome
0,circle,4,1,rain,N,N,explained
1,disk,4,1,partly cloudy,Y,N,explained
2,circle,49,1,clear,N,N,explained
3,disk,13,1,partly cloudy,N,N,explained
4,circle,17,1,mostly cloudy,N,N,explained


### Clean the data

In [57]:
# Turn the raw sightings frame into an all-integer matrix with the label
# ('explained') in column 0; later cells treat column 0 as the label and the
# remaining columns as features.
dummy_columns = ['researchOutcome', 'shape', 'weather']
# One level per encoded column is dropped so the remaining dummies are not redundant.
baseline_columns = ['researchOutcome_unexplained', 'shape_unknown', 'weather_clear']

# Built as a chain from `data` (no in-place mutation), so re-running this cell
# always starts from the untouched raw frame.
data_clean = (
    data
    .fillna('unknown')
    .replace(' ', '_', regex=True)   # e.g. 'partly cloudy' -> 'partly_cloudy'
    .replace({'Y': 1, 'N': 0})       # yes/no flags -> 1/0
)

# dtype=int: pandas >= 2.0 produces bool dummies by default, which would turn
# the exported matrix into an object array mixing ints and bools.
data_clean = pd.get_dummies(data_clean, columns=dummy_columns, dtype=int)
data_clean = data_clean.drop(columns=baseline_columns)

# Strip the get_dummies prefixes so columns read 'circle', 'rain', 'explained', ...
for prefix in ('researchOutcome_', 'shape_', 'weather_'):
    data_clean.columns = data_clean.columns.str.replace(prefix, '', regex=False)

# Move the label to the front.
headers = ['explained'] + [c for c in data_clean.columns if c != 'explained']
data_clean = data_clean[headers]

print(data_clean.head())
# info() prints its own report and returns None, so it is not wrapped in print().
data_clean.info()

# Force a homogeneous integer matrix. This raises here, not later at save
# time, if any non-numeric value (e.g. a leftover 'unknown') survived cleaning.
data_np = data_clean.to_numpy(dtype=int)

   explained  duration  witnesses  physicalEvidence  contact  probable  box  \
0          1         4          1                 0        0         0    0   
1          1         4          1                 1        0         0    0   
2          1        49          1                 0        0         0    0   
3          1        13          1                 0        0         0    0   
4          1        17          1                 0        0         0    0   

   circle  disk  light  ...  pyramid  sphere  square  triangle  fog  \
0       1     0      0  ...        0       0       0         0    0   
1       0     1      0  ...        0       0       0         0    0   
2       1     0      0  ...        0       0       0         0    0   
3       0     1      0  ...        0       0       0         0    0   
4       1     0      0  ...        0       0       0         0    0   

   mostly_cloudy  partly_cloudy  rain  snow  stormy  
0              0              0     1     0 

### Split the data into train, validation, and test sets and save them as CSV files

In [66]:
# Shuffle the rows and cut them into train (first 70%), validation (next 20%)
# and test (last 10%) subsets.
split_seed = 42               # fixed seed: the same rows land in each subset on every run
train_end_fraction = 0.7      # rows [0, 70%) of the shuffled order -> train
validate_end_fraction = 0.9   # rows [70%, 90%) -> validation, the rest -> test

n_rows = data_np.shape[0]
# A seeded generator instead of the global np.random state, so the split (and
# therefore the reported metrics) is reproducible after Restart & Run All.
indices = np.random.default_rng(split_seed).permutation(n_rows)
train_end = int(n_rows * train_end_fraction)
validate_end = int(n_rows * validate_end_fraction)

train = data_np[indices[:train_end]]
validate = data_np[indices[train_end:validate_end]]
test = data_np[indices[validate_end:]]

print(train.shape)
print(validate.shape)
print(test.shape)

(12600, 21)
(3600, 21)
(1800, 21)


In [70]:
# Write each split to a local CSV file and upload it to S3 under the same name.
s3_client = boto3.client('s3')
bucket_name = 'acg-ml-certification-df'

train_name, validate_name, test_name, test_true_name = (
    'train.csv',
    'validate.csv',
    'test.csv',
    'test-true.csv',
)

# The test split is stored as two files: the features (every column except
# column 0) and, separately, the true labels (column 0 only).
csv_exports = [
    (train_name, train),
    (validate_name, validate),
    (test_name, test[:, 1:]),
    (test_true_name, test[:, 0]),
]

# Write all files first, then upload them, in the order listed above.
for csv_name, rows in csv_exports:
    np.savetxt(csv_name, rows.astype(int), fmt='%i', delimiter=',')

for csv_name, _ in csv_exports:
    s3_client.upload_file(csv_name, bucket_name, csv_name)

## Train XGBoost

In [88]:
# Launch a SageMaker training job with the XGBoost container image. The job
# writes its artifacts to the session's default bucket.
output_bucket = sagemaker.Session().default_bucket()
output_path = f"s3://{output_bucket}"
# Timestamped job name so every run gets a distinct name.
job_name = f'xgboost-{datetime.now().strftime("%Y%m%d%H%M%S")}'

hyperparameters = dict(
    num_round=50,
    eta=0.2,
    objective="binary:logistic",
)

region = boto3.Session().region_name
xgboost_container = sagemaker.image_uris.retrieve("xgboost", region, "1.2-2")

estimator_settings = {
    'image_uri': xgboost_container,
    'hyperparameters': hyperparameters,
    'role': sagemaker.get_execution_role(),
    'instance_count': 1,
    'instance_type': 'ml.m5.2xlarge',
    'volume_size': 5,
    'output_path': output_path,
}
estimator = sagemaker.estimator.Estimator(**estimator_settings)

# Both channels are CSV files read from the bucket named by `bucket_name`.
train_input, validation_input = (
    sagemaker.inputs.TrainingInput(s3_uri, content_type="csv")
    for s3_uri in (f"s3://{bucket_name}/train.csv", f"s3://{bucket_name}/validate.csv")
)

# Run the training job.
channels = {'train': train_input, 'validation': validation_input}
estimator.fit(inputs=channels, job_name=job_name)


2021-05-29 03:06:36 Starting - Starting the training job...
2021-05-29 03:07:02 Starting - Launching requested ML instancesProfilerReport-1622257596: InProgress
......
2021-05-29 03:08:03 Starting - Preparing the instances for training......
2021-05-29 03:09:03 Downloading - Downloading input data...
2021-05-29 03:09:33 Training - Training image download completed. Training in progress..[34m[2021-05-29 03:09:35.323 ip-10-0-183-194.ec2.internal:1 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2021-05-29:03:09:35:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2021-05-29:03:09:35:INFO] Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34m[2021-05-29:03:09:35:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2021-05-29:03:09:35:INFO] Running XGBoost Sagemaker in algorithm mode[0m
[34m[2021-05-29:03:09:35:INFO] Determined delimiter of CSV input is ','[0m
[34m[2021-05-2

### Evaluate the model

In [89]:
# Download the trained model artifact, unpack it and predict the held-out test rows.
model_key = job_name + '/output/model.tar.gz'
boto3.resource('s3').Bucket(output_bucket).download_file(model_key, 'model.tar.gz')

# Unpack with the standard library, not by shelling out to `gtar`: it needs
# no external binary, and a missing or corrupt archive raises here instead of
# being ignored the way os.system's exit status was.
with tarfile.open('model.tar.gz') as archive:
    archive.extractall()

# NOTE(security): pickle.load can execute arbitrary code from the file. Only
# load artifacts produced by a training job you trust.
with open("xgboost-model", "rb") as f:
    booster = pkl.load(f)

# Features are every column except column 0 (the label). Adding 0.5 and
# truncating to int rounds each score to 0 or 1, i.e. a 0.5 decision threshold.
pred = (booster.predict(xgb.DMatrix(test[:, 1:].astype(int))) + 0.5).astype(int)

In [100]:
# Compare the predictions with the true labels, which are column 0 of the test split.
# The labels are cast to int, as every other use of the split is, so the
# metrics always receive a numeric array and never an object-dtype one.
y_true = test[:, 0].astype(int)
print(f"Accuracy:  {accuracy_score(y_true, pred):.4f}")
print(f"Recall:    {recall_score(y_true, pred):.4f}")
print(f"Precision: {precision_score(y_true, pred):.4f}")
print(f"f1:        {f1_score(y_true, pred):.4f}")


Accuracy:  0.9744
Recall:    0.9837
Precision: 0.9806
f1:        0.9822
