In [None]:
import pandas as pd
import numpy as np
from sagemaker import get_execution_role

# Name of the S3 bucket used by this notebook. The placeholder value must be
# replaced before running the rest of the notebook.
bucket = 'change-me'

# Execution role returned by the SageMaker SDK; it is passed to the KMeans
# estimator further down.
role = get_execution_role()

In [None]:
# Fail fast if the placeholder bucket name was left untouched.
assert bucket != 'change-me', "Please change your bucket id"

In [None]:
# Fetch a fresh copy of the MovieLens 100k dataset into /tmp.
# NOTE: this step is not strictly required if you've already run the 01_... notebook
# Remove any previous archive and extracted directory so the cell can be re-run.
!rm -f /tmp/ml-100k.zip
!rm -rf /tmp/ml-100k
# -O writes the download to the given path.
!wget -O /tmp/ml-100k.zip http://files.grouplens.org/datasets/movielens/ml-100k.zip
# -j drops the directory structure stored in the archive, -o overwrites without
# prompting, -d selects the extraction directory.
!unzip -j -o /tmp/ml-100k.zip -d /tmp/ml-100k

In [None]:
# Load the pipe-delimited MovieLens user table. The file has no header row,
# so column names are supplied here and `userid` becomes the index.
users = pd.read_csv(
    "/tmp/ml-100k/u.user",
    sep='|',
    header=None,
    index_col=['userid'],
    names=['userid', 'age', 'gender', 'occupation', 'postal_code'],
)

# Drop the postal code before encoding. `axis` is passed by keyword: the
# positional form `drop('postal_code', 1)` was deprecated in pandas 1.3 and
# raises a TypeError from pandas 2.0 onwards.
users = users.drop('postal_code', axis=1)
users.head()

In [None]:
# Expand the categorical columns of `users` into indicator columns so that
# every feature is numeric.
users_onehot = pd.get_dummies(data=users)
users_onehot.head()

In [None]:
# Convert the encoded frame to a plain NumPy matrix.
# The built-in K-Means algorithm requires float32 input.
users_onehot_df = users_onehot.values.astype(dtype=np.float32)
print(users_onehot_df)

In [None]:
from sagemaker import KMeans

# S3 prefixes for the training input and for the training artifacts.
data_location = 's3://{}/recommender_workshop/kmeans/data'.format(bucket)
output_location = 's3://{}/recommender_workshop/kmeans/output'.format(bucket)

print('training data will be uploaded to: {}'.format(data_location))
print('training artifacts will be uploaded to: {}'.format(output_location))

#!aws s3 cp /tmp/ml-100k/u.user $data_location/u.user

# Number of clusters; the grouping and plotting cells below also read this.
k_value = 5

# Estimator settings collected in one place, then passed as keyword arguments.
kmeans_settings = dict(
    role=role,
    train_instance_count=1,
    train_instance_type='ml.c4.2xlarge',
    output_path=output_location,
    k=k_value,
    data_location=data_location,
)
kmeans = KMeans(**kmeans_settings)

In [None]:
%%time

kmeans.fit(kmeans.record_set(users_onehot_df))

In [None]:
%%time

kmeans_predictor = kmeans.deploy(initial_instance_count=1,
                                 instance_type='ml.m4.xlarge')

In [None]:
# Show the encoded rows that the next cell sends to the endpoint.
# Re-encoding only `users.head()` would create indicator columns solely for the
# categories present in those five rows, so the displayed columns would not
# match the feature layout the model was trained on.
users_onehot.head()

In [None]:
# Smoke-test the endpoint with the first five encoded users.
sample_rows = users_onehot_df[:5]
result = kmeans_predictor.predict(sample_rows)
print(result)

In [None]:
# Score every user against the deployed model.
result = kmeans_predictor.predict(users_onehot_df)

# Read each record once and keep (cluster id, distance) pairs. The labels are
# float32 values, so the cluster id is cast to int before it is used as an index.
assignments = [
    (
        int(r.label['closest_cluster'].float32_tensor.values[0]),
        r.label['distance_to_cluster'].float32_tensor.values[0],
    )
    for r in result
]

# cluster[i] holds the distances of all users assigned to cluster i, in
# response order. Built in a single pass rather than rescanning the whole
# response once per cluster.
cluster = [[] for _ in range(k_value)]
for cluster_id, distance in assignments:
    cluster[cluster_id].append(distance)

# The original line was `sorted(zip())`, which always evaluates to an empty
# list because zip() received no arguments.
# NOTE(review): presumably the intent was the (cluster, distance) pairs in
# sorted order - confirm before relying on this variable.
cluster_zip = sorted(assignments)


In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

for i in range(k_value):
    fig,ax = plt.subplots()
    ax.hist(cluster[i])

plt.plot()

In [None]:
import sagemaker

# Clean up: delete the endpoint created by the deploy cell.
endpoint_name = kmeans_predictor.endpoint
sagemaker.Session().delete_endpoint(endpoint_name)