In [1]:
import os
import pickle as pkl
import tarfile
from datetime import datetime

import boto3
import numpy as np
import pandas as pd
import sagemaker
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [2]:
!pip install xgboost
import xgboost as xgb




### Get the data

In [11]:
# Columns kept for modelling: the sighting attributes plus the label
# (researchOutcome). Any other column in the CSV is not loaded.
selected_columns = [
    'shape', 'duration', 'witnesses', 'weather', 'physicalEvidence',
    'contact', 'researchOutcome', 'longitude', 'latitude',
]
data = pd.read_csv('ufo_fullset.csv', usecols=selected_columns)
data.head()


Unnamed: 0,shape,duration,witnesses,weather,latitude,longitude,physicalEvidence,contact,researchOutcome
0,circle,4,1,rain,47.329444,-122.578889,N,N,explained
1,disk,4,1,partly cloudy,52.664913,-1.034894,Y,N,explained
2,circle,49,1,clear,38.951667,-92.333889,N,N,explained
3,disk,13,1,partly cloudy,41.496944,-71.367778,N,N,explained
4,circle,17,1,mostly cloudy,47.606389,-122.330833,N,N,explained


### Clean the data

In [12]:
# Build a fully numeric, label-first matrix (data_np) from the raw frame.
# Every step returns a new object instead of mutating in place, so the raw
# `data` frame is untouched and this cell can be re-run safely.
data_clean = data.copy()

# Missing shapes are imputed with 'circle'.
data_clean['shape'] = data_clean['shape'].fillna('circle')

# Replace spaces inside string values (e.g. 'partly cloudy' -> 'partly_cloudy')
# so the one-hot column names derived from them contain no spaces.
data_clean = data_clean.replace(' ', '_', regex=True)

# Encode the two Y/N flag columns explicitly. Mapping per column yields a
# numeric dtype directly instead of relying on a frame-wide replace() and
# implicit object -> int downcasting.
yes_no_map = {'Y': 1, 'N': 0}
for flag_column in ('physicalEvidence', 'contact'):
    data_clean[flag_column] = data_clean[flag_column].map(yes_no_map)

# Label-encode the target in order of first appearance. outcome_map keeps the
# outcome-name -> integer-id mapping for later use.
outcome_map = {v: i for i, v in enumerate(data_clean.researchOutcome.unique())}
data_clean['researchOutcome'] = data_clean['researchOutcome'].map(outcome_map)

# One-hot encode the categorical columns. The explicit integer dtype keeps the
# 0/1 representation regardless of pandas version (newer releases default to bool).
data_clean = pd.get_dummies(data_clean, columns=['shape', 'weather'], dtype=np.uint8)

# One level of each categorical is dropped (presumably as the reference level).
data_clean = data_clean.drop(columns=['shape_triangle', 'weather_clear'])

# Strip the dummy prefixes; regex=False makes the literal match explicit.
data_clean.columns = data_clean.columns.str.replace("shape_", "", regex=False)
data_clean.columns = data_clean.columns.str.replace("weather_", "", regex=False)

# Move the label to column 0: downstream cells read column 0 as the label and
# columns 1.. as the features.
headers = ['researchOutcome'] + [c for c in data_clean.columns if c != 'researchOutcome']
data_clean = data_clean[headers]

print(data_clean.head())
# info() prints its report itself and returns None, so it must not be wrapped
# in print() (that would emit a stray "None").
data_clean.info()

# Force a float matrix; this raises immediately if any non-numeric column
# slipped through instead of silently producing an object array.
data_np = data_clean.to_numpy(dtype=float)

   researchOutcome  duration  witnesses   latitude   longitude  \
0                0         4          1  47.329444 -122.578889   
1                0         4          1  52.664913   -1.034894   
2                0        49          1  38.951667  -92.333889   
3                0        13          1  41.496944  -71.367778   
4                0        17          1  47.606389 -122.330833   

   physicalEvidence  contact  box  circle  disk  ...  oval  pyramid  sphere  \
0                 0        0    0       1     0  ...     0        0       0   
1                 1        0    0       0     1  ...     0        0       0   
2                 0        0    0       1     0  ...     0        0       0   
3                 0        0    0       0     1  ...     0        0       0   
4                 0        0    0       1     0  ...     0        0       0   

   square  fog  mostly_cloudy  partly_cloudy  rain  snow  stormy  
0       0    0              0              0     1     0     

### Split the data into train, validate and test sets and save the train/validate csv files

In [13]:
# Shuffle the rows and cut them 70% / 20% / 10% into train / validate / test.
# A fixed seed makes the split (and therefore every metric reported below)
# reproducible across kernel restarts.
SPLIT_SEED = 42
n_rows = data_np.shape[0]
indices = np.random.default_rng(SPLIT_SEED).permutation(n_rows)

train_end = int(n_rows * 0.7)
validate_end = int(n_rows * 0.9)

train = data_np[indices[:train_end]]
validate = data_np[indices[train_end:validate_end]]
test = data_np[indices[validate_end:]]

# The three splits must partition the data set exactly.
assert len(train) + len(validate) + len(test) == n_rows

print(train.shape)
print(validate.shape)
print(test.shape)

(12600, 21)
(3600, 21)
(1800, 21)


In [14]:
# Destination bucket and the client used to upload the training inputs.
bucket_name = 'acg-ml-certification-df'
s3_client = boto3.client('s3')

train_name, validate_name = 'train-3.csv', 'validate-3.csv'

# Write both splits to local comma-delimited files first ...
for csv_path, split_rows in ((train_name, train), (validate_name, validate)):
    np.savetxt(csv_path, split_rows, delimiter=',')

# ... then upload each file, using the local file name as the S3 object key.
for name in (train_name, validate_name):
    s3_client.upload_file(name, bucket_name, name)

## Train XGBoost

In [15]:
# Model artifacts are written to the session's default bucket; the job name is
# timestamped so that each run gets its own output prefix.
output_bucket = sagemaker.Session().default_bucket()
output_path = f"s3://{output_bucket}"
job_name = f'xgboost-{datetime.now().strftime("%Y%m%d%H%M%S")}'

hyperparameters = {
    # Derived from the label encoding so it cannot drift from the data.
    'num_class': len(outcome_map),
    'num_round': 100,
    'objective': "multi:softmax"}

xgboost_container = sagemaker.image_uris.retrieve("xgboost", boto3.Session().region_name, "1.2-2")

estimator = sagemaker.estimator.Estimator(image_uri=xgboost_container,
                                          hyperparameters=hyperparameters,
                                          role=sagemaker.get_execution_role(),
                                          instance_count=1,
                                          instance_type='ml.m4.xlarge',
                                          volume_size=5,
                                          output_path=output_path)

# Define the data type and paths of the training and validation datasets.
# The object keys reuse train_name / validate_name so they always match the
# files uploaded in the previous cell.
train_input = sagemaker.inputs.TrainingInput(f"s3://{bucket_name}/{train_name}", content_type="csv")
validation_input = sagemaker.inputs.TrainingInput(f"s3://{bucket_name}/{validate_name}", content_type="csv")

# Execute the XGBoost training job.
estimator.fit(inputs={'train': train_input, 'validation': validation_input}, job_name=job_name)


2021-05-29 04:22:18 Starting - Starting the training job...
2021-05-29 04:22:20 Starting - Launching requested ML instancesProfilerReport-1622262138: InProgress
......
2021-05-29 04:23:32 Starting - Preparing the instances for training.........
2021-05-29 04:25:12 Downloading - Downloading input data
2021-05-29 04:25:12 Training - Downloading the training image.....[34m[2021-05-29 04:25:55.566 ip-10-0-130-241.ec2.internal:1 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2021-05-29:04:25:55:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2021-05-29:04:25:55:INFO] Failed to parse hyperparameter objective value multi:softmax to Json.[0m
[34mReturning the value itself[0m
[34m[2021-05-29:04:25:55:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2021-05-29:04:25:55:INFO] Running XGBoost Sagemaker in algorithm mode[0m
[34m[2021-05-29:04:25:55:INFO] Determined delimiter of CSV input is ','[0m
[34m[2021-05-29:04:25:55:INFO] Determi

Training seconds: 73
Billable seconds: 73


### Evaluate the model

In [16]:
# Download the artifact produced by the training job and unpack it locally.
model_key = job_name + '/output/model.tar.gz'
boto3.resource('s3').Bucket(output_bucket).download_file(model_key, 'model.tar.gz')

# Extract with the standard library instead of shelling out to `gtar`: it does
# not depend on a platform-specific binary, and a failure raises here instead
# of being silently ignored.
with tarfile.open('model.tar.gz') as model_archive:
    model_archive.extractall()

# NOTE(security): pickle.load executes arbitrary code from the file. This is
# only acceptable because the artifact comes from the training job started by
# this notebook; never load a model pickle from an untrusted source.
with open("xgboost-model", "rb") as f:
    booster = pkl.load(f)

# Column 0 of each split is the label, so only columns 1.. are fed to the model.
pred = booster.predict(xgb.DMatrix(test[:, 1:])).astype(int)

#### Testing stats

In [17]:
# Score the held-out test split: column 0 is the encoded label and the
# remaining columns are the features.
test_labels = test[:, 0]
test_matrix = xgb.DMatrix(test[:, 1:])
pred = booster.predict(test_matrix).astype(int)

# Recall, precision and f1 are macro-averaged over the classes.
test_accuracy = accuracy_score(test_labels, pred)
test_recall = recall_score(test_labels, pred, average='macro')
test_precision = precision_score(test_labels, pred, average='macro')
test_f1 = f1_score(test_labels, pred, average='macro')

print(f"Accuracy:  {test_accuracy:.4f}")
print(f"Recall:    {test_recall:.4f}")
print(f"Precision: {test_precision:.4f}")
print(f"f1:        {test_f1:.4f}")

Accuracy:  0.9422
Recall:    0.9118
Precision: 0.9029
f1:        0.9073


#### Validation stats

In [18]:
# Score the validation split the same way as the test split: column 0 is the
# encoded label and the remaining columns are the features.
validate_labels = validate[:, 0]
validate_matrix = xgb.DMatrix(validate[:, 1:])
pred = booster.predict(validate_matrix).astype(int)

# Recall, precision and f1 are macro-averaged over the classes.
validate_accuracy = accuracy_score(validate_labels, pred)
validate_recall = recall_score(validate_labels, pred, average='macro')
validate_precision = precision_score(validate_labels, pred, average='macro')
validate_f1 = f1_score(validate_labels, pred, average='macro')

print(f"Accuracy:  {validate_accuracy:.4f}")
print(f"Recall:    {validate_recall:.4f}")
print(f"Precision: {validate_precision:.4f}")
print(f"f1:        {validate_f1:.4f}")

Accuracy:  0.9389
Recall:    0.9102
Precision: 0.8933
f1:        0.9014
