In [23]:
import io
import os
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

import boto3
import sagemaker
from sagemaker import get_execution_role, PCA

In [24]:
session = sagemaker.Session()
print(session)

role = get_execution_role()
print(role)

bucket = session.default_bucket()

<sagemaker.session.Session object at 0x7fe919b1ef98>
arn:aws:iam::240038582877:role/service-role/AmazonSageMaker-ExecutionRole-20191028T202433


### Customers

In [11]:
# load data
prefix = 'segmentation'

data_key= prefix + '/customers_scaled.csv'
data_location = 's3://{}/{}'.format(bucket, data_key)

<sagemaker.session.Session object at 0x7fe91a566b70>
arn:aws:iam::240038582877:role/service-role/AmazonSageMaker-ExecutionRole-20191028T202433


In [12]:
tmp_list = []

for chunk in pd.read_csv(data_location, chunksize=100000):
    tmp_list.append(chunk)
    
df = pd.concat(tmp_list, axis=0)
del tmp_list

In [19]:
# prepare space to store model artifacts
prefix = 'pca'
output_path = 's3://{}/{}/'.format(bucket, prefix)
output_path

's3://sagemaker-us-east-2-240038582877/pca/'

In [25]:
#current features less 1
N_COMPONENTS = 94
pca = PCA(role=role,
          train_instance_count=1,
          train_instance_type='ml.c4.xlarge',
          output_path=output_path,
          num_components=N_COMPONENTS,
          sagemaker_session=session)

In [26]:
#must convert to RecordSet for built in sagemaker models

#convert df to np array
train_data_np = df.values.astype('float32')

#convert to RecordSet format
formatted_train_data = pca.record_set(train_data_np)

In [27]:
pca.fit(formatted_train_data)

2020-04-07 03:45:18 Starting - Starting the training job...
2020-04-07 03:45:19 Starting - Launching requested ML instances...
2020-04-07 03:46:18 Starting - Preparing the instances for training......
2020-04-07 03:47:00 Downloading - Downloading input data...
2020-04-07 03:47:41 Training - Downloading the training image..[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[04/07/2020 03:47:57 INFO 140434131449664] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-conf.json: {u'_num_gpus': u'auto', u'_log_level': u'info', u'subtract_mean': u'true', u'force_dense': u'true', u'epochs': 1, u'algorithm_mode': u'regular', u'extra_components': u'-1', u'_kvstore': u'dist_sync', u'_num_kv_servers': u'auto'}[0m
[34m[04/07/2020 03:47:57 INFO 140434131449664] Reading provided configuration from /opt/ml/input/config/hyperparameters.json: {u'feature_dim': u'95', u'mini_batch_s


2020-04-07 03:48:11 Uploading - Uploading generated training model
2020-04-07 03:48:11 Completed - Training job completed
Training seconds: 71
Billable seconds: 71


In [30]:
#get from AWS console
training_job_name = 'pca-2020-04-07-03-45-18-512'
model_key = os.path.join(prefix, training_job_name, 'output/model.tar.gz')
print(model_key)

boto3.resource('s3').Bucket(bucket).download_file(model_key, 'model.tar.gz')

os.system('tar -zxvf model.tar.gz')
os.system('unzip model_algo-1')

pca/pca-2020-04-07-03-45-18-512/output/model.tar.gz


2304

In [32]:
#!pip install mxnet
import mxnet as mx

pca_model_params = mx.ndarray.load('model_algo-1')

print(pca_model_params)

Collecting mxnet
[?25l  Downloading https://files.pythonhosted.org/packages/81/f5/d79b5b40735086ff1100c680703e0f3efc830fa455e268e9e96f3c857e93/mxnet-1.6.0-py2.py3-none-any.whl (68.7MB)
[K    100% |████████████████████████████████| 68.7MB 707kB/s eta 0:00:01
[?25hCollecting graphviz<0.9.0,>=0.8.1 (from mxnet)
  Downloading https://files.pythonhosted.org/packages/53/39/4ab213673844e0c004bed8a0781a0721a3f6bb23eb8854ee75c236428892/graphviz-0.8.4-py2.py3-none-any.whl
Collecting numpy<2.0.0,>1.16.0 (from mxnet)
[?25l  Downloading https://files.pythonhosted.org/packages/07/08/a549ba8b061005bb629b76adc000f3caaaf881028b963c2e18f811c6edc1/numpy-1.18.2-cp36-cp36m-manylinux1_x86_64.whl (20.2MB)
[K    100% |████████████████████████████████| 20.2MB 2.5MB/s eta 0:00:01
[31mfastai 1.0.60 requires nvidia-ml-py3, which is not installed.[0m
Installing collected packages: graphviz, numpy, mxnet
  Found existing installation: numpy 1.15.4
    Uninstalling numpy-1.15.4:
      Successfully uninstalled

In [None]:
#mean - mean subtracted from a component in order to center it
#v - makeup of the principal component (same as components_)
# singular values of the components for the PCA transformation, % of variance from the projected future space