In [3]:
import os

In [4]:
import sklearn
import pandas as pd

import sagemaker
import boto3

___

In [5]:
# Load the Boston house-prices dataset that ships with scikit-learn.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2 —
# confirm the installed scikit-learn version still provides it before re-running.
from sklearn.datasets import load_boston
boston = load_boston()

In [6]:
# Print the dataset description text stored under the 'DESCR' key.
print(boston['DESCR'])

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [7]:
# Feature matrix: one named column per entry in boston['feature_names'].
X = pd.DataFrame(boston['data'], columns=boston['feature_names'])

# Target vector, named MEDV so the column keeps a label when it is
# concatenated with the features later on.
y = pd.Series(data=boston['target'], name='MEDV')

In [8]:
from sklearn.model_selection import train_test_split

# Step 1: hold out 20% of all rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

# Step 2: take 25% of the remaining rows as the validation set
# (0.25 * 0.8 = 0.2 of the full data, i.e. roughly a 60/20/20 split).
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=1
)


- The data needs to have the target variable in the first column.
- Amazon SageMaker requires that a CSV file has no header row and that the target variable is in the first column.
- More details: https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-training.html

In [9]:
# For each split, place the target series (MEDV) in the first column and the
# feature columns after it.
train = pd.concat([y_train, X_train], axis='columns')
validation = pd.concat([y_val, X_val], axis='columns')
test = pd.concat([y_test, X_test], axis='columns')

In [10]:
# Display the assembled training frame; MEDV should appear as the first column.
train

Unnamed: 0,MEDV,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
458,14.9,7.75223,0.0,18.10,0.0,0.713,6.301,83.7,2.7831,24.0,666.0,20.2,272.21,16.23
36,20.0,0.09744,0.0,5.96,0.0,0.499,5.841,61.4,3.3779,5.0,279.0,19.2,377.56,11.41
383,12.3,7.99248,0.0,18.10,0.0,0.700,5.520,100.0,1.5331,24.0,666.0,20.2,396.90,24.56
352,18.6,0.07244,60.0,1.69,0.0,0.411,5.884,18.5,10.7103,4.0,411.0,18.3,392.33,7.79
154,17.0,1.41385,0.0,19.58,1.0,0.871,6.129,96.0,1.7494,5.0,403.0,14.7,321.02,15.12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
220,26.7,0.35809,0.0,6.20,1.0,0.507,6.951,88.5,2.8617,8.0,307.0,17.4,391.70,9.71
469,20.1,13.07510,0.0,18.10,0.0,0.580,5.713,56.7,2.8237,24.0,666.0,20.2,396.90,14.76
133,18.4,0.32982,0.0,21.89,0.0,0.624,5.822,95.4,2.4699,4.0,437.0,21.2,388.69,15.03
439,12.8,9.39063,0.0,18.10,0.0,0.740,5.627,93.9,1.8172,24.0,666.0,20.2,396.90,22.88


In [11]:
# Write the train and validation splits to local CSV files without the
# column-name header row and without the DataFrame index.
train.to_csv('train.csv', header=False, index=False)
validation.to_csv('validation.csv', header=False, index=False)

## Upload files to S3
___

In [12]:
# Resolve the default S3 bucket of the current SageMaker session; the datasets
# are uploaded to this bucket in the cells below.
bucket = sagemaker.Session().default_bucket()
# Echo the bucket name as the cell output.
bucket

'sagemaker-eu-west-3-260598086981'

In [13]:
# S3 key prefix under which the header-less CSV files are stored.
prefix = "demo-sagemaker-boston"

In [14]:
# Upload both header-less CSV files to <bucket>/<prefix>/data/.
s3_client = boto3.client('s3')
s3_client.upload_file(
    Filename='train.csv',
    Bucket=bucket,
    Key=f'{prefix}/data/train.csv',
)
s3_client.upload_file(
    Filename='validation.csv',
    Bucket=bucket,
    Key=f'{prefix}/data/validation.csv',
)

In [15]:
# List the objects under the data/ prefix to confirm both CSV files were uploaded.
! aws s3 ls {bucket}/{prefix}/data/

2021-10-21 15:52:37      23427 train.csv
2021-10-21 15:52:37       7818 validation.csv


___
- For Autopilot, the CSV files need to include a header row, so the same splits are written again with headers.

In [16]:
# Write the same train and validation splits again, this time keeping the
# column-name header row (the DataFrame index is still omitted).
train.to_csv('train_header.csv', header=True, index=False)
validation.to_csv('validation_header.csv', header=True, index=False)

In [17]:
# The Autopilot files live under their own S3 key prefix.
# NOTE(review): this reassigns `prefix`, so re-running earlier cells after this
# one would target the Autopilot prefix instead of the original one.
prefix = "demo-sagemaker-boston-autopilot"

# Upload the CSV files that include a header row to <bucket>/<prefix>/data/.
s3_client = boto3.client('s3')
s3_client.upload_file(
    Filename='train_header.csv',
    Bucket=bucket,
    Key=f'{prefix}/data/train_header.csv',
)
s3_client.upload_file(
    Filename='validation_header.csv',
    Bucket=bucket,
    Key=f'{prefix}/data/validation_header.csv',
)

In [18]:
# Recursively list the objects under the Autopilot data prefix to confirm the upload.
! aws s3 ls {bucket}/{prefix}/data --recursive

2021-10-21 15:52:52      23490 demo-sagemaker-boston-autopilot/data/train_header.csv
2021-10-21 15:52:52       7881 demo-sagemaker-boston-autopilot/data/validation_header.csv
