In [1]:
import io
import os
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

import boto3
import sagemaker
from sagemaker import get_execution_role, PCA

In [2]:
session = sagemaker.Session()
print(session)

role = get_execution_role()
print(role)

bucket = session.default_bucket()

<sagemaker.session.Session object at 0x7f456ccbe940>
arn:aws:iam::240038582877:role/service-role/AmazonSageMaker-ExecutionRole-20191028T202433


In [None]:
# load data
prefix = 'segmentation'

data_key= prefix + '/azdias_scaled.csv'
data_location = 's3://{}/{}'.format(bucket, data_key)

In [None]:
tmp_list = []

for chunk in pd.read_csv(data_location, chunksize=100000):
    tmp_list.append(chunk)
    
df = pd.concat(tmp_list, axis=0).drop(['Unnamed: 0'], axis=1)
print(df.shape)
del tmp_list

In [None]:
df.head()

In [None]:
# prepare space to store model artifacts
prefix = 'pca'
output_path = 's3://{}/{}/'.format(bucket, prefix)
output_path

In [None]:
N_COMPONENTS = 10
pca = PCA(role=role,
          train_instance_count=1,
          train_instance_type='ml.c4.xlarge',
          output_path=output_path,
          num_components=N_COMPONENTS,
          sagemaker_session=session)

In [None]:
#must convert to RecordSet for built in sagemaker models

#convert df to np array (pass this to the deployed PCA model later)
train_data_np = df.values.astype('float32')

#convert to RecordSet format
formatted_train_data = pca.record_set(train_data_np)

In [None]:
pca.fit(formatted_train_data)

In [None]:
#get from AWS console
training_job_name = 'pca-2020-04-19-17-25-20-625'
model_key = os.path.join(prefix, training_job_name, 'output/model.tar.gz')
print(model_key)

boto3.resource('s3').Bucket(bucket).download_file(model_key, 'model.tar.gz')

os.system('tar -zxvf model.tar.gz')
os.system('unzip model_algo-1')

In [None]:
import mxnet as mx

pca_model_params = mx.ndarray.load('model_algo-1')

# print(pca_model_params)

In [None]:
#mean - mean subtracted from a component in order to center it
#v - makeup of the principal component (same as components_)
#s - singular values of the components for the PCA transformation, % of variance from the projected future space

#explained-variance-ratio ~= square(s) / sum(square(s))

s=pd.DataFrame(pca_model_params['s'].asnumpy())
v=pd.DataFrame(pca_model_params['v'].asnumpy())

n_principal_components = 5

start_idx = N_COMPONENTS - n_principal_components

print(s.iloc[start_idx:])

In [None]:
#what is the smallest number or principal comonents that captures at least 80% of the
#total variance in the dataset?

def explained_variance(s, n_top_components):
    exp_variance = np.square(s.iloc[start_idx:]).sum() / np.square(s).sum()
    return exp_variance

In [None]:
n_top_components = 10
exp_variance = explained_variance(s, n_top_components)
print(exp_variance)

In [None]:
import seaborn as sns

def display_component(v, features_list, component_num, n_weights=10):
        row_idx = N_COMPONENTS-component_num
        v_l_row = v.iloc[:, row_idx]
        v_1 = np.squeeze(v_l_row.values)
        
        compos = pd.DataFrame(list(zip(v_1, features_list)), columns=['weights', 'features'])
        
        # get absolute value
        #weights can be neg/pos and we'll sort by magnitude
        
        compos['abs_weights'] = compos['weights'].apply(lambda x: abs(x))
        sorted_compos = compos.sort_values('abs_weights', ascending=False).head(n_weights)
        
        ax=plt.subplots(figsize=(10,6))
        ax=sns.barplot(data=sorted_compos, x="weights", y="features", palette="Blues_d")
        ax.set_title("PCA Component Makeup Component #{}".format(component_num))
        plt.show()

In [None]:
display_component(v, df.columns.values, component_num=1, n_weights=10)

In [None]:
training_job_name = pca.latest_training_job.name
training_job_name

In [None]:
attached_estimator = pca.attach(training_job_name)

In [None]:
%%time

# pca_predictor = pca.deploy(initial_instance_count=1, instance_type='ml.t2.medium')
pca_predictor = attached_estimator.deploy(initial_instance_count=1, instance_type='ml.t2.medium')

In [None]:
%%time

#try chunking data to PCA endpoint
train_pca = []

for chunk in np.array_split(train_data_np, 5000):
    train_pca.append(pca_predictor.predict(chunk))

In [None]:
flattened_train_pca = [list for sublist in train_pca for list in sublist]
del train_pca

In [None]:
def create_transformed_df(flattened_train_pca, scaled_df, n_top_components):
    start_idx = N_COMPONENTS - n_top_components
    new_values = [x.label['projection'].float32_tensor.values for x in flattened_train_pca]
    df = pd.DataFrame(index=scaled_df.index.values, data=new_values)
    
    top_components = df.iloc[:,start_idx:]
    return top_components.iloc[:,::-1]

In [None]:
flattened_train_pca = flattened_train_pca[:-1]
len(flattened_train_pca)

In [None]:
df = df[:-1]
df.shape[0]

In [None]:
len(flattened_train_pca) == df.shape[0]

In [None]:
import numpy as np
set1, set2, set3, set4 = np.array_split(flattened_train_pca, 4)

In [None]:
print(len(set1))
print(len(set2))
print(len(set3))
print(len(set4))

In [None]:
%%time

import math
start_idx = 668415
end_idx = 891220
chunk_num = 4

top_n=5
cols = ['c1', 'c2', 'c3', 'c4', 'c5']


print('beginning', start_idx, end_idx)
print('scaled df index values: %.2d and x shape %.2d' % (df.iloc[start_idx:end_idx].shape[0], set1.shape[0]))
print()
df_transformed = create_transformed_df(set1, df.iloc[start_idx:end_idx], n_top_components=top_n)
df_transformed.columns = cols

file_name = '{}_transformed.csv'.format(chunk_num)
print(file_name)
df_transformed.to_csv(file_name)

In [None]:
print(df.iloc[0:222805].shape[0])
print(df.iloc[222805:445610].shape[0])
print(df.iloc[445610:668415].shape[0])
print(df.iloc[668415:891220].shape[0])
print(df.shape[0])

In [None]:
len(flattened_train_pca)

In [None]:
session.delete_endpoint(pca_predictor.endpoint)

### KMeans Training

In [11]:
df1_transformed = pd.read_csv('1_transformed.csv')

In [12]:
df2_transformed = pd.read_csv('2_transformed.csv')

In [14]:
df3_transformed = pd.read_csv('3_transformed.csv')

In [16]:
df4_transformed = pd.read_csv('4_transformed.csv')

In [17]:
df1_transformed = df1_transformed.append(df4_transformed)
print(df1_transformed.shape[0])

del df4_transformed

891220


In [19]:
df1_transformed.head()

Unnamed: 0.1,Unnamed: 0,c1,c2,c3,c4,c5
0,0,-1.72167,0.423881,0.200654,-1.47056,-0.037066
1,1,-1.050703,0.961696,0.697668,1.330895,0.107373
2,2,-0.168961,-0.205433,0.822409,1.033169,1.122331
3,3,0.038208,-1.279235,-0.058969,0.382375,0.860651
4,4,2.664234,0.44764,-0.592593,-0.372019,-1.152814


In [20]:
# KMeans

from sagemaker import KMeans

prefix = 'segmentation'
output_path = 's3://{}/{}/'.format(bucket, prefix)
print(output_path)

s3://sagemaker-us-east-2-240038582877/segmentation/


In [21]:
kmeans = KMeans(
    role=role,
    train_instance_count=1,
    train_instance_type='ml.c4.xlarge',
    output_path=output_path,
    sagemaker_session=session,
    k=8
)

In [23]:
%%time
train_data_np = df1_transformed.values.astype('float32')
formatted_train_data = kmeans.record_set(train_data_np)

CPU times: user 19.8 s, sys: 118 ms, total: 19.9 s
Wall time: 20.8 s


In [24]:
%%time
kmeans.fit(formatted_train_data, logs=True)

2020-04-19 18:17:29 Starting - Starting the training job...
2020-04-19 18:17:30 Starting - Launching requested ML instances...
2020-04-19 18:18:26 Starting - Preparing the instances for training......
2020-04-19 18:19:14 Downloading - Downloading input data......
2020-04-19 18:20:28 Training - Training image download completed. Training in progress.
2020-04-19 18:20:28 Uploading - Uploading generated training model
2020-04-19 18:20:28 Completed - Training job completed
[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[04/19/2020 18:20:18 INFO 140392115726144] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-input.json: {u'_enable_profiler': u'false', u'_tuning_objective_metric': u'', u'_num_gpus': u'auto', u'local_lloyd_num_trials': u'auto', u'_log_level': u'info', u'_kvstore': u'auto', u'local_lloyd_init_method': u'kmeans++', u'force_dense': u'true', u'epochs':

Training seconds: 74
Billable seconds: 74
CPU times: user 430 ms, sys: 24.8 ms, total: 455 ms
Wall time: 3min 11s


In [25]:
%%time
kmeans_predictor = kmeans.deploy(initial_instance_count=1, instance_type='ml.t2.medium')

---------------!CPU times: user 267 ms, sys: 5.28 ms, total: 272 ms
Wall time: 7min 31s


In [26]:
%%time

#try chunking data to PCA endpoint
cluster_info = []

for chunk in np.array_split(train_data_np, 2000):
    cluster_info.append(kmeans_predictor.predict(chunk))

CPU times: user 1min 4s, sys: 971 ms, total: 1min 5s
Wall time: 2min 47s


In [28]:
#explore clusters
data_idx = 0

print(df1_transformed.index[data_idx])
cluster_info[data_idx]

0


[label {
   key: "closest_cluster"
   value {
     float32_tensor {
       values: 5.0
     }
   }
 }
 label {
   key: "distance_to_cluster"
   value {
     float32_tensor {
       values: 403.1842956542969
     }
   }
 }, label {
   key: "closest_cluster"
   value {
     float32_tensor {
       values: 5.0
     }
   }
 }
 label {
   key: "distance_to_cluster"
   value {
     float32_tensor {
       values: 402.1838684082031
     }
   }
 }, label {
   key: "closest_cluster"
   value {
     float32_tensor {
       values: 5.0
     }
   }
 }
 label {
   key: "distance_to_cluster"
   value {
     float32_tensor {
       values: 401.1827697753906
     }
   }
 }, label {
   key: "closest_cluster"
   value {
     float32_tensor {
       values: 5.0
     }
   }
 }
 label {
   key: "distance_to_cluster"
   value {
     float32_tensor {
       values: 400.1821594238281
     }
   }
 }, label {
   key: "closest_cluster"
   value {
     float32_tensor {
       values: 5.0
     }
   }
 }
 label {
 

In [29]:
flattened_cluster_info = [list for sublist in cluster_info for list in sublist]

In [30]:
len(flattened_cluster_info)

891220

In [31]:
cluster_labels = [c.label['closest_cluster'].float32_tensor.values[0] for c in flattened_cluster_info]

In [32]:
cluster_df = pd.DataFrame(cluster_labels)[0].value_counts()
cluster_df.to_csv('german_cluster_df.csv')
print('Cluster Membership')
print(cluster_df)

Cluster Membership
1.0    664862
6.0    222121
5.0       776
0.0       775
4.0       736
2.0       692
7.0       640
3.0       618
Name: 0, dtype: int64


  from ipykernel import kernelapp as app


In [None]:
session.delete_endpoint(kmeans_predictor.endpoint)