In [6]:
import io
import os
import posixpath
import tarfile
import zipfile

import boto3
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sagemaker
from sagemaker import get_execution_role, PCA

In [7]:
# Open a SageMaker session and look up the IAM role this notebook runs as;
# both are echoed so the environment is visible in the cell output.
session = sagemaker.Session()
role = get_execution_role()
print(session, role, sep='\n')

# Default S3 bucket for this session; later cells build their S3 paths from it.
bucket = session.default_bucket()

<sagemaker.session.Session object at 0x7f0a72379160>
arn:aws:iam::240038582877:role/service-role/AmazonSageMaker-ExecutionRole-20191028T202433


### Customers

In [8]:
# load data
# S3 location of the scaled customer data: s3://<bucket>/segmentation/customers_scaled.csv
prefix = 'segmentation'
data_key = '{}/customers_scaled.csv'.format(prefix)
data_location = 's3://' + bucket + '/' + data_key

In [9]:
# Read the CSV in 100,000-row chunks, then concatenate the pieces once.
tmp_list = list(pd.read_csv(data_location, chunksize=100000))

# The 'Unnamed: 0' column is dropped after concatenation.
df = pd.concat(tmp_list, axis=0).drop(columns=['Unnamed: 0'])
print(df.shape)

# The per-chunk frames are not needed once they have been concatenated.
del tmp_list

(191652, 94)


In [10]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,LNR,AGER_TYP,D19_BANKEN_ANZ_12,D19_BANKEN_ANZ_24,D19_BANKEN_DATUM,D19_BANKEN_DIREKT,D19_BANKEN_GROSS,D19_BANKEN_LOKAL,D19_BANKEN_OFFLINE_DATUM,D19_BANKEN_ONLINE_DATUM,...,SEMIO_REL,SEMIO_SOZ,SEMIO_TRADV,SEMIO_VERT,SHOPPER_TYP,VERS_TYP,ZABEOTYP,ONLINE_PURCHASE,ANREDE_KZ,ALTERSKATEGORIE_GROB
0,0.050221,0.75,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,...,0.166667,0.833333,0.0,0.833333,1.0,0.666667,0.4,0.0,0.0,0.375
1,0.050232,0.0,0.0,0.166667,0.555556,0.0,0.833333,0.0,1.0,1.0,...,0.166667,0.333333,0.0,1.0,1.0,0.666667,0.4,0.0,0.0,0.375
2,0.750693,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.166667,0.0,0.333333,0.5,1.0,0.4,0.0,1.0,0.375
3,0.750698,0.5,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,...,0.333333,0.833333,0.5,1.0,0.25,0.666667,0.0,0.0,0.0,0.375
4,0.750703,0.0,0.166667,0.333333,0.222222,0.714286,0.0,0.428571,1.0,0.666667,...,0.5,0.5,0.5,0.666667,0.5,1.0,0.0,0.0,0.0,0.25


In [11]:
# prepare space to store model artifacts
# S3 prefix under which the PCA model artifacts are stored.
# NOTE: this rebinds `prefix`; cells that run after this one see the 'pca' value.
prefix = 'pca'
output_path = 's3://' + bucket + '/' + prefix + '/'
output_path

's3://sagemaker-us-east-2-240038582877/pca/'

In [15]:
# Number of principal components the estimator is asked to keep.
N_COMPONENTS = 10

# Built-in SageMaker PCA estimator: a single ml.c4.xlarge training instance,
# with model artifacts written under output_path.
pca = PCA(
    role=role,
    sagemaker_session=session,
    output_path=output_path,
    train_instance_count=1,
    train_instance_type='ml.c4.xlarge',
    num_components=N_COMPONENTS,
)

In [99]:
# Built-in SageMaker models take their training data as a RecordSet,
# so cast the frame to a float32 array and wrap it with the estimator's helper.
train_data_np = df.values.astype(np.float32)
formatted_train_data = pca.record_set(train_data_np)

In [None]:
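# Training call left commented out -- presumably so that re-running the notebook
# does not launch a new training job. Uncomment to retrain.
# pca.fit(formatted_train_data)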

2020-04-09 02:09:26 Starting - Starting the training job...
2020-04-09 02:09:27 Starting - Launching requested ML instances...
2020-04-09 02:10:24 Starting - Preparing the instances for training......
2020-04-09 02:11:26 Downloading - Downloading input data...
2020-04-09 02:11:56 Training - Training image download completed. Training in progress..[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[04/09/2020 02:11:59 INFO 140505833207616] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-conf.json: {u'_num_gpus': u'auto', u'_log_level': u'info', u'subtract_mean': u'true', u'force_dense': u'true', u'epochs': 1, u'algorithm_mode': u'regular', u'extra_components': u'-1', u'_kvstore': u'dist_sync', u'_num_kv_servers': u'auto'}[0m
[34m[04/09/2020 02:11:59 INFO 140505833207616] Reading provided configuration from /opt/ml/input/config/hyperparameters.json: {u'feature_di


2020-04-09 02:12:12 Uploading - Uploading generated training model
2020-04-09 02:12:12 Completed - Training job completed


In [12]:
# Name of the completed training job (copied from the AWS console).
training_job_name = 'pca-2020-04-09-02-04-47-061'

# S3 keys always use '/' separators, so join with posixpath instead of the
# platform-dependent os.path.
model_key = posixpath.join(prefix, training_job_name, 'output/model.tar.gz')
print(model_key)

boto3.resource('s3').Bucket(bucket).download_file(model_key, 'model.tar.gz')

# Unpack with the standard library so a corrupt or missing archive raises an
# exception instead of returning a shell exit status that nobody checks.
with tarfile.open('model.tar.gz', 'r:gz') as model_archive:
    model_archive.extractall()

# The previous `unzip model_algo-1` shell call returned a non-zero status that
# was ignored. Only unzip when the extracted file really is a zip archive;
# otherwise it is left as-is for the loader in the next cell.
if zipfile.is_zipfile('model_algo-1'):
    with zipfile.ZipFile('model_algo-1') as model_zip:
        model_zip.extractall()

pca/pca-2020-04-09-02-04-47-061/output/model.tar.gz


2304

In [13]:
# Requires the mxnet package (`pip install mxnet` if it is missing).
import mxnet as mx

# Load the parameter arrays saved in the model_algo-1 file and show them.
pca_model_params = mx.nd.load('model_algo-1')
print(pca_model_params)

{'s': 
[141.1204  143.32965 145.11024 146.76447 149.50151 151.03342 157.1557
 158.33104 167.0297  188.32408 193.03432 221.53874 319.5286  337.85233
 675.31635]
<NDArray 15 @cpu(0)>, 'v': 
[[ 7.0923059e-03 -5.8247638e-04  3.1522124e-03 ... -1.3421000e-03
   3.8028313e-04  5.9330574e-04]
 [ 2.4519821e-01 -9.0075172e-02  5.8993749e-02 ...  5.7734650e-02
  -1.7181677e-01  1.2691927e-01]
 [-1.9225318e-02 -1.3404696e-02  8.6637484e-03 ... -9.3044837e-05
   2.0812510e-02  9.0478146e-03]
 ...
 [ 1.2520333e-02  3.1468025e-03 -2.0302340e-02 ... -2.4974789e-02
   8.4419683e-02  1.3095209e-03]
 [ 6.9954127e-02  1.1673045e-02 -1.6498663e-02 ...  5.7309365e-01
   3.6814865e-02 -8.5172832e-02]
 [-4.5499880e-02 -2.4861801e-02  4.9221837e-03 ...  1.8885460e-02
  -7.0295915e-02  5.8576241e-02]]
<NDArray 94x15 @cpu(0)>, 'mean': 
[[0.49999985 0.33608964 0.01517333 0.02658724 0.92973363 0.09230943
  0.07074629 0.01899574 0.98516315 0.95562726 0.0634968  0.13981819
  0.2182542  0.14265218 0.07692737 0.34914

In [16]:
# Contents of the loaded model parameters:
#   mean - mean subtracted from a component in order to center it
#   v    - makeup of the principal components (same as components_)
#   s    - singular values of the components for the PCA transformation;
#          related to the % of variance from the projected feature space
#
# explained-variance-ratio ~= square(s) / sum(square(s))

n_principal_components = 5
start_idx = N_COMPONENTS - n_principal_components

s = pd.DataFrame(pca_model_params['s'].asnumpy())
v = pd.DataFrame(pca_model_params['v'].asnumpy())

# NOTE(review): start_idx is derived from N_COMPONENTS rather than len(s). If the
# loaded model holds a different number of components than N_COMPONENTS, this
# slice is not the top n_principal_components rows -- confirm.
print(s.iloc[start_idx:])

             0
5   151.033417
6   157.155701
7   158.331039
8   167.029694
9   188.324081
10  193.034317
11  221.538742
12  319.528595
13  337.852325
14  675.316345


In [17]:
#what is the smallest number of principal components that captures at least 80% of the
#total variance in the dataset?

def explained_variance(s, n_top_components):
    """Approximate the share of total variance captured by the top components.

    Uses the approximation explained-variance-ratio ~= square(s) / sum(square(s)).

    Args:
        s: DataFrame of singular values, one per row. The rows are assumed to be
            in ascending order, so the largest (top) components are the last rows.
        n_top_components: number of top components to include. Values larger
            than len(s) are treated as len(s); values <= 0 select no components.

    Returns:
        pandas Series (one entry per column of ``s``) holding the fraction of the
        summed squared singular values contributed by the selected rows.
    """
    squared = np.square(s)
    # Derive the slice from the argument instead of a notebook-level start_idx,
    # so the result depends only on the inputs. Clamp so that an oversized
    # request does not turn into a negative (from-the-end) index.
    first_row = max(len(s) - n_top_components, 0)
    return squared.iloc[first_row:].sum() / squared.sum()

In [20]:
# Evaluate the explained-variance helper for n_top_components = 10.
n_top_components = 10
exp_variance = explained_variance(s, n_top_components=n_top_components)
print(exp_variance)

0    0.894593
dtype: float32
