# Introduction to Data Formats

In [1]:
import numpy as np
import pandas as pd
import boto3

import sagemaker.amazon.common as smac

In [2]:
np.random.seed(5)

In [3]:
!aws s3 ls

2019-10-17 07:03:33 aws-deepracer-ef5d6995-11ca-4800-a5bf-694244303368
2019-09-06 03:55:51 cdktoolkit-stagingbucket-cfb7w421pk09
2019-09-06 03:55:51 cdktoolkit-stagingbucket-ohzm67k4ffb7
2019-09-17 04:42:44 cloud9-507922848584-sam-deployments-us-east-1
2019-08-01 18:08:18 cloudtrail-awslogs-507922848584-psq7khfg-isengard-do-not-delete
2019-08-16 07:23:22 codepipeline-us-east-2-531781646601
2019-11-07 18:11:10 debnsuma-ml-sagemaker
2019-10-14 08:03:04 deeplens-sagemaker-cd804e45-2557-44bf-889d-441b68c2593c
2019-08-01 18:23:18 do-not-delete-gatedgarden-audit-507922848584
2019-08-14 10:44:27 elasticbeanstalk-us-east-2-507922848584
2019-09-06 04:00:46 hello-cdk-1-myhelloconstructbucket0daec57e1-yiqg4smxb5bc
2019-09-06 04:00:46 hello-cdk-1-myhelloconstructbucket18d9883be-dg8s1cspliig
2019-09-06 04:00:47 hello-cdk-1-myhelloconstructbucket2c1da3656-12vb7vvjv834y
2019-09-06 04:00:46 hello-cdk-1-myhelloconstructbucket398a5de67-1qmaqal6sde8m
2019-10-25 13:19:02 my-ai-bucket-suman


In [4]:
s3_bucket_name = "debnsuma-ml-sagemaker"

## Sample DataSet

In [19]:
n = 10                                              # number of samples to generate

# The draws below consume the global NumPy RNG in this exact order: x1, x2, x3, y.
x1 = np.random.random_sample(size=n)                # n floats in the half-open interval [0, 1)
x2 = np.random.randint(low=1, high=100, size=n)     # n integers in [1, 100), i.e. 1 through 99
x3 = 10 * np.random.random_sample(size=n)           # n floats in the half-open interval [0, 10)

y = np.random.randint(low=0, high=2, size=n)        # label (response variable): each value is 0 or 1

#### Let's create a DataFrame using this dataset

In [22]:
# Combine the three feature arrays and the label array into a single frame,
# one column per array, keeping the column order x1, x2, x3, y.
df = pd.DataFrame(dict(x1=x1, x2=x2, x3=x3, y=y))

In [23]:
df

Unnamed: 0,x1,x2,x3,y
0,0.207779,68,1.343637,1
1,0.362459,80,3.018861,1
2,0.944759,18,1.364801,1
3,0.742238,88,3.177002,1
4,0.427671,12,6.791827,1
5,0.165975,31,6.013412,0
6,0.528426,27,9.972255,1
7,0.109175,9,5.609146,1
8,0.703617,52,5.487262,0
9,0.784509,62,6.423495,1


#### Save this dataset as a CSV file on this SageMaker notebook instance

In [26]:
df.to_csv("myfile.csv", index=False)

#### Function to read a file from Amazon S3

In [35]:
def download_from_s3(bucket, key, filename):
    """Download one object from Amazon S3 to a local file.

    Args:
        bucket: Name of the S3 bucket that holds the object.
        key: Key of the object within the bucket.
        filename: Local path the downloaded object is written to.
    """
    # A new S3 client is created on every call.
    boto3.client('s3').download_file(bucket, key, filename)
        

#### Function to upload a file to Amazon S3

In [32]:
def upload_to_s3(file_name, bucket, key):
    """Upload one local file to Amazon S3.

    Args:
        file_name: Path of the local file to upload.
        bucket: Name of the destination S3 bucket.
        key: Key the object is stored under in the bucket.
    """
    # A new S3 client is created on every call.
    boto3.client('s3').upload_file(file_name, bucket, key)

Uploading the file ```myfile.csv``` to S3 Bucket 

In [33]:
upload_to_s3("myfile.csv", s3_bucket_name, "myfile_destination.csv" )

Download the file ```myfile_destination.csv``` to SageMaker Instance

In [36]:
download_from_s3(s3_bucket_name, "myfile_destination.csv", "myfile_from_s3.csv")

### RecordIO Format


We will use the SageMaker SDK's `write_numpy_to_dense_tensor()` function to create RecordIO files.


Data Types: Int32, Float32, Float64  

Reference:
https://github.com/aws/sagemaker-python-sdk/blob/master/src/sagemaker/amazon/common.py

In [38]:
df.head()

Unnamed: 0,x1,x2,x3,y
0,0.207779,68,1.343637,1
1,0.362459,80,3.018861,1
2,0.944759,18,1.364801,1
3,0.742238,88,3.177002,1
4,0.427671,12,6.791827,1


In [44]:
data = df[['x1', 'x2', 'x3']].to_numpy()

In [45]:
data

array([[ 0.20777913, 68.        ,  1.34363728],
       [ 0.36245928, 80.        ,  3.01886098],
       [ 0.94475876, 18.        ,  1.36480094],
       [ 0.74223792, 88.        ,  3.17700203],
       [ 0.42767057, 12.        ,  6.79182681],
       [ 0.16597462, 31.        ,  6.01341156],
       [ 0.52842608, 27.        ,  9.97225467],
       [ 0.10917502,  9.        ,  5.6091459 ],
       [ 0.70361658, 52.        ,  5.48726197],
       [ 0.78450887, 62.        ,  6.4234954 ]])

In [46]:
type(data)

numpy.ndarray

In [47]:
response = df[['y']].to_numpy()

In [48]:
response

array([[1],
       [1],
       [1],
       [1],
       [1],
       [0],
       [1],
       [1],
       [0],
       [1]])

In [51]:
# Flatten to a single dimension array of 10 elements
response = response.ravel()

In [52]:
response

array([1, 1, 1, 1, 1, 0, 1, 1, 0, 1])

#### Function to create a RecordIO_File

In [55]:
def write_recordio_file(filename, x, y=None):
    """Write features, and optionally labels, to a RecordIO file.

    Args:
        filename: Path of the file to create; it is opened in binary write
            mode, so an existing file at that path is overwritten.
        x: Feature array passed to ``smac.write_numpy_to_dense_tensor``
            (presumably a 2-D NumPy array with one row per record; confirm
            against the SageMaker SDK).
        y: Optional label array passed through as the labels argument.
            When left as ``None``, ``None`` is forwarded to the SDK call.
    """
    with open(filename, 'wb') as fh:
        # Serialize the arguments supplied by the caller rather than the
        # notebook-level globals `data` / `response`, so the function works
        # for any input and on a fresh kernel.
        smac.write_numpy_to_dense_tensor(fh, x, y)

#### Function to read a RecordIO_File

In [63]:
def read_recordio_file(filename, recordToPrint=10):
    """Print the leading records of a RecordIO file.

    Args:
        filename: Path of the RecordIO file, opened in binary read mode.
        recordToPrint: Maximum number of records to print (default 10).
    """
    with open(filename, 'rb') as stream:
        for position, entry in enumerate(smac.read_records(stream)):
            # Stop as soon as the requested number of records has been shown.
            if position >= recordToPrint:
                return
            print(f"Record: {position}", entry, sep="\n")

In [60]:
write_recordio_file("my_file.recordio", data, response)

In [59]:
df.head(3)

Unnamed: 0,x1,x2,x3,y
0,0.207779,68,1.343637,1
1,0.362459,80,3.018861,1
2,0.944759,18,1.364801,1


In [69]:
read_recordio_file("my_file.recordio", 1)

Record: 0
features {
  key: "values"
  value {
    float64_tensor {
      values: 0.20777913154073901
      values: 68.0
      values: 1.3436372785235862
    }
  }
}
label {
  key: "values"
  value {
    int32_tensor {
      values: 1
    }
  }
}



In [71]:
read_recordio_file("my_file.recordio", 2)

Record: 0
features {
  key: "values"
  value {
    float64_tensor {
      values: 0.20777913154073901
      values: 68.0
      values: 1.3436372785235862
    }
  }
}
label {
  key: "values"
  value {
    int32_tensor {
      values: 1
    }
  }
}

Record: 1
features {
  key: "values"
  value {
    float64_tensor {
      values: 0.3624592780248084
      values: 80.0
      values: 3.018860977327299
    }
  }
}
label {
  key: "values"
  value {
    int32_tensor {
      values: 1
    }
  }
}



Writing this recordio file ```my_file.recordio``` to S3

In [73]:
upload_to_s3("my_file.recordio", s3_bucket_name, "my_file.recordio")

In [74]:
download_from_s3(s3_bucket_name, "my_file.recordio", "my_file_from_s3.recordio")