In [1]:
%matplotlib inline
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import os

<h1>PCA Cloud Prediction Invocation Template</h1>
<h4>Invoke SageMaker Prediction Service</h4>

In [2]:
import boto3
import re
from sagemaker import get_execution_role
import sagemaker

# SDK 2 serializers and deserializers
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer

In [3]:
# SDK 2.0

# RealTimePredictor renamed to Predictor
# https://sagemaker.readthedocs.io/en/stable/v2.html

# Create a predictor and point to an existing endpoint

endpoint_name = 'pca-biketrain-v1'
predictor = sagemaker.predictor.Predictor(endpoint_name=endpoint_name)

In [4]:
predictor.serializer = CSVSerializer()
predictor.deserializer = JSONDeserializer()

In [5]:
# We are not going to use numeric features: 'temp','atemp','humidity','windspeed'
# Instead, we are going to use new components (aka features) generated by PCA for model training and testing
columns = ['count', 'season', 'holiday', 'workingday', 'weather','year', 'month', 'day', 'dayofweek','hour']

# PCA Training
colums_for_pca = ['temp','atemp','humidity','windspeed']

In [6]:
df = pd.read_csv('train_normalized.csv')
df_test = pd.read_csv('test_normalized.csv')

In [7]:
df.head(2)

Unnamed: 0,count,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,year,month,day,dayofweek,hour
0,2.833213,1,0,0,1,0.118761,0.173736,0.977605,0.0,2011,1,1,5,0
1,3.713572,1,0,0,1,0.110467,0.166986,0.979751,0.0,2011,1,1,5,1


In [8]:
df_test.head(2)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,year,month,day,dayofweek,hour
0,2011-01-20 00:00:00,1,0,1,1,0.167404,0.178475,0.879419,0.408344,2011,1,20,3,0
1,2011-01-20 01:00:00,1,0,1,1,0.181869,0.232625,0.95541,0.0,2011,1,20,3,1


In [9]:
df[colums_for_pca].head()

Unnamed: 0,temp,atemp,humidity,windspeed
0,0.118761,0.173736,0.977605,0.0
1,0.110467,0.166986,0.979751,0.0
2,0.110467,0.166986,0.979751,0.0
3,0.127792,0.186947,0.974023,0.0
4,0.127792,0.186947,0.974023,0.0


In [10]:
# test = df[colums_for_pca].head().as_matrix()
test = df[colums_for_pca].head().values

In [11]:
result = predictor.predict(test)

In [12]:
result

{'projections': [{'projection': [-0.09002441167831421,
    -0.3131008744239807,
    -0.9453434944152832]},
  {'projection': [-0.08605290949344635,
    -0.32269516587257385,
    -0.9424567222595215]},
  {'projection': [-0.08605290949344635,
    -0.32269516587257385,
    -0.9424567222595215]},
  {'projection': [-0.09603984653949738,
    -0.2985489070415497,
    -0.9494175910949707]},
  {'projection': [-0.09603984653949738,
    -0.2985489070415497,
    -0.9494175910949707]}]}

In [13]:
l = [values['projection'] for values in result['projections']]

In [14]:
l

[[-0.09002441167831421, -0.3131008744239807, -0.9453434944152832],
 [-0.08605290949344635, -0.32269516587257385, -0.9424567222595215],
 [-0.08605290949344635, -0.32269516587257385, -0.9424567222595215],
 [-0.09603984653949738, -0.2985489070415497, -0.9494175910949707],
 [-0.09603984653949738, -0.2985489070415497, -0.9494175910949707]]

In [15]:
df_temp = pd.DataFrame(l)

In [16]:
df_temp

Unnamed: 0,0,1,2
0,-0.090024,-0.313101,-0.945343
1,-0.086053,-0.322695,-0.942457
2,-0.086053,-0.322695,-0.942457
3,-0.09604,-0.298549,-0.949418
4,-0.09604,-0.298549,-0.949418


In [17]:
# For large number of predictions, we can split the input data and
# Query the prediction service.
# array_split is convenient to specify how many splits are needed
def get_projection(arr_features):
    projections = []
    for arr in np.array_split(arr_features,100):        
        if arr.shape[0] > 0:
            print (arr.shape)
            result = predictor.predict(arr)
            projections += [values['projection'] for values in result['projections']]
    return projections
        

In [18]:
def replace_features(predictor, df, colums_for_pca):
    
    # arr_features = df[colums_for_pca].as_matrix()
    arr_features = df[colums_for_pca].values
    
    projections = get_projection(arr_features)
    df_projection = pd.DataFrame(projections)
    
    tcols = []
    # New column names
    for i in range(df_projection.shape[1]):       
        tcols.append('component_' + str(i))
    
    df_projection.columns = tcols
    print ('components:',tcols)
    
    
    for col in df_projection.columns:
        df[col] = df_projection[col]
    
    df.drop(colums_for_pca, inplace=True, axis=1)
    
    return tcols

In [19]:
df.head(2)

Unnamed: 0,count,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,year,month,day,dayofweek,hour
0,2.833213,1,0,0,1,0.118761,0.173736,0.977605,0.0,2011,1,1,5,0
1,3.713572,1,0,0,1,0.110467,0.166986,0.979751,0.0,2011,1,1,5,1


In [20]:
new_cols = replace_features(predictor,df,colums_for_pca)

(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(109, 4)
(108, 4)
(108, 4)
(108, 4)
(108, 4)
(108, 4)
(108, 4)
(108, 4)
(108, 4)
(108, 4)
(108, 4)
(108, 4)
(108, 4)
(108, 4)
(108, 4)
components: ['component_0', 'component_1', 'component_2']


In [21]:
replace_features(predictor,df_test,colums_for_pca)

(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(65, 4)
(64, 4)
(64, 4)
(64, 4)
(64, 4)
(64, 4)
(64, 4)
(64, 4)
components: ['component_0', 'component_1', 'component_2']


['component_0', 'component_1', 'component_2']

In [22]:
for col in new_cols:
    columns.append(col)

In [23]:
columns

['count',
 'season',
 'holiday',
 'workingday',
 'weather',
 'year',
 'month',
 'day',
 'dayofweek',
 'hour',
 'component_0',
 'component_1',
 'component_2']

In [24]:
## Training, Validation and Test Set
### Target Variable as first column followed by input features
### Training, Validation files do not have a column header

In [25]:
# Training = 70% of the data
# Validation = 30% of the data
# Randomize the datset
np.random.seed(5)
l = list(df.index)
np.random.shuffle(l)
df = df.iloc[l]

In [26]:
rows = df.shape[0]
train = int(.7 * rows)
test = int(.3 * rows)

In [27]:
rows, train, test

(10886, 7620, 3265)

In [28]:
# Write Training Set
df[:train].to_csv('bike_train_pca.csv'
                          ,index=False,header=False
                          ,columns=columns)

In [29]:
# Write Validation Set
df[train:].to_csv('bike_validation_pca.csv'
                          ,index=False,header=False
                          ,columns=columns)

In [30]:
# Test Data has only input features
df_test.to_csv('bike_test_pca.csv',index=False)

In [31]:
# Write Column List
with open('bike_train_column_list_pca.txt','w') as f:
    f.write(','.join(columns))

In [32]:
# Cleanup Endpoint to avoid unnecessary charges
predictor.delete_endpoint()