In [2]:
import boto3
from datetime import datetime
import numpy as np
import os
import pandas as pd
import sagemaker


### Get the data

In [44]:
# Read only the columns this notebook works with; any other column in the
# CSV is skipped at parse time.
ufo_columns = [
    'shape',
    'duration',
    'witnesses',
    'weather',
    'physicalEvidence',
    'contact',
    'researchOutcome',
]
data = pd.read_csv('ufo_fullset.csv', usecols=ufo_columns)

# Preview the first rows as the cell output.
data.head()


Unnamed: 0,shape,duration,witnesses,weather,physicalEvidence,contact,researchOutcome
0,circle,4,1,rain,N,N,explained
1,disk,4,1,partly cloudy,Y,N,explained
2,circle,49,1,clear,N,N,explained
3,disk,13,1,partly cloudy,N,N,explained
4,circle,17,1,mostly cloudy,N,N,explained


### Clean the data

In [57]:
# Clean the raw frame and one-hot encode its categorical columns.
# Produces `data_clean` (target column 'explained' first) and `data_np`,
# the same table as a NumPy array for the split/save cells.

# Missing values become the literal category 'unknown'; spaces inside string
# values become underscores so the dummy column names contain no spaces.
# Chained (no inplace=True) so re-running the cell always starts from `data`.
data_clean = data.fillna('unknown').replace(' ', '_', regex=True)

# Encode the Y/N flags as 1/0. Only the two flag columns are touched, so a
# literal 'Y' or 'N' in any other column is left alone.
flag_columns = ['physicalEvidence', 'contact']
data_clean[flag_columns] = data_clean[flag_columns].replace({'Y': 1, 'N': 0})

# dtype=int keeps the dummies as 0/1 integers; recent pandas versions default
# to bool, which would turn `.values` below into an object array.
data_clean = pd.get_dummies(
    data_clean,
    columns=['researchOutcome', 'shape', 'weather'],
    dtype=int,
)

# Drop one level per encoded variable.
# NOTE(review): 'researchOutcome_probable' is kept and ends up as a feature
# named 'probable' next to the target 'explained'. Both come from the same
# source column, so this looks like label leakage -- confirm it is intended.
data_clean = data_clean.drop(
    columns=['researchOutcome_unexplained', 'shape_unknown', 'weather_clear']
)

# Strip the get_dummies prefixes. The pattern is anchored to the start of the
# name and regex=True is explicit, so the result does not depend on the pandas
# version's default and cannot remove text from the middle of a name.
data_clean.columns = data_clean.columns.str.replace(
    r'^(researchOutcome|shape|weather)_', '', regex=True
)
assert data_clean.columns.is_unique, "column names collide after prefix stripping"

# Move the target to column 0; the save step relies on that position.
headers = ['explained'] + [c for c in data_clean.columns if c != 'explained']
data_clean = data_clean[headers]

print(data_clean.head())
# info() prints its own report and returns None, so it must not be wrapped in
# print() (that would emit an extra "None" line).
data_clean.info()

data_np = data_clean.values

   explained  duration  witnesses  physicalEvidence  contact  probable  box  \
0          1         4          1                 0        0         0    0   
1          1         4          1                 1        0         0    0   
2          1        49          1                 0        0         0    0   
3          1        13          1                 0        0         0    0   
4          1        17          1                 0        0         0    0   

   circle  disk  light  ...  pyramid  sphere  square  triangle  fog  \
0       1     0      0  ...        0       0       0         0    0   
1       0     1      0  ...        0       0       0         0    0   
2       1     0      0  ...        0       0       0         0    0   
3       0     1      0  ...        0       0       0         0    0   
4       1     0      0  ...        0       0       0         0    0   

   mostly_cloudy  partly_cloudy  rain  snow  stormy  
0              0              0     1     0 

### Split the data into train, validate and test sets and save them as csv files

In [66]:
# Shuffle the rows of `data_np` and cut them into train / validate / test.
# The shuffle is seeded so that Restart & Run All reproduces the same split;
# a local RandomState is used so NumPy's global random state is not modified.
SPLIT_SEED = 42
TRAIN_END_FRACTION = 0.7      # rows [0, 70%)   -> train
VALIDATE_END_FRACTION = 0.9   # rows [70%, 90%) -> validate, remainder -> test

n_rows = data_np.shape[0]
indices = np.random.RandomState(SPLIT_SEED).permutation(n_rows)

train_end = int(n_rows * TRAIN_END_FRACTION)
validate_end = int(n_rows * VALIDATE_END_FRACTION)

train = data_np[indices[:train_end]]
validate = data_np[indices[train_end:validate_end]]
test = data_np[indices[validate_end:]]

# Every row must land in exactly one of the three splits.
assert len(train) + len(validate) + len(test) == n_rows

print(train.shape)
print(validate.shape)
print(test.shape)

(12600, 21)
(3600, 21)
(1800, 21)


In [70]:
# Write the three splits to headerless integer CSV files in the working
# directory, then upload each file to S3 under a key equal to its file name.
s3_client = boto3.client('s3')
bucket_name = 'acg-ml-certification-df'

train_name, validate_name = 'train.csv', 'validate.csv'
test_name, test_true_name = 'test.csv', 'test-true.csv'

# (file name, rows) pairs. The test split is stored twice: once without
# column 0 (the 'explained' target, moved first during cleaning) and once
# as column 0 alone.
csv_outputs = [
    (train_name, train),
    (validate_name, validate),
    (test_name, test[:, 1:]),
    (test_true_name, test[:, 0]),
]

# All files are written before any upload starts.
for csv_name, rows in csv_outputs:
    np.savetxt(csv_name, rows.astype(int), fmt='%i', delimiter=',')

for name in [entry[0] for entry in csv_outputs]:
    s3_client.upload_file(name, bucket_name, name)