In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

import boto3
from sagemaker import get_execution_role
import sagemaker.amazon.common as smac

Get data from S3

In [2]:
role = get_execution_role()
bucket = 'ml-labs-saras'
prefix = 'UFO-dataset'
data_key = 'UFO-fullset.csv'
data_location = 's3://{}/{}/{}'.format(bucket, prefix, data_key)

df =pd.read_csv(data_location, low_memory=False)


In [3]:
df.head()

Unnamed: 0,reportedTimestamp,eventDate,eventTime,shape,duration,witnesses,weather,firstName,lastName,latitude,longitude,sighting,physicalEvidence,contact,researchOutcome
0,1977-04-04T04:02:23.340Z,1977-03-31,23:46,circle,4,1,rain,Ila,Bashirian,47.329444,-122.578889,Y,N,N,explained
1,1982-11-22T02:06:32.019Z,1982-11-15,22:04,disk,4,1,partly cloudy,Eriberto,Runolfsson,52.664913,-1.034894,Y,Y,N,explained
2,1992-12-07T19:06:52.482Z,1992-12-07,19:01,circle,49,1,clear,Miller,Watsica,38.951667,-92.333889,Y,N,N,explained
3,2011-02-24T21:06:34.898Z,2011-02-21,20:56,disk,13,1,partly cloudy,Clifton,Bechtelar,41.496944,-71.367778,Y,N,N,explained
4,1991-03-09T16:18:45.501Z,1991-03-09,11:42,circle,17,1,mostly cloudy,Jayda,Ebert,47.606389,-122.330833,Y,N,N,explained


In [4]:
df.shape

(18000, 15)

Cleaning, transforming and preparing data required to be fed to the algorithm

Create another dataframe with just latitude and longiture

In [5]:
df_geo = df[['latitude', 'longitude']]

In [6]:
df_geo.head()

Unnamed: 0,latitude,longitude
0,47.329444,-122.578889
1,52.664913,-1.034894
2,38.951667,-92.333889
3,41.496944,-71.367778
4,47.606389,-122.330833


In [7]:
df_geo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18000 entries, 0 to 17999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   latitude   18000 non-null  float64
 1   longitude  18000 non-null  float64
dtypes: float64(2)
memory usage: 281.4 KB


Check for missing values

In [8]:
missing_values = df_geo.isnull().values.any()
print('Are there any missing value {}'.format(missing_values))
if missing_values:
    df_geo[df_geo.isnull().any(axis=1)]

Are there any missing value False


Transform dataframe into a numpy.ndarray - that's what K-means expects as its input - each row as a record object

In [9]:
data_train = df_geo.values.astype('float32')
data_train

array([[  47.329445, -122.57889 ],
       [  52.664913,   -1.034894],
       [  38.951668,  -92.333885],
       ...,
       [  36.86639 ,  -83.888885],
       [  35.385834,  -94.39833 ],
       [  29.883055,  -97.94111 ]], dtype=float32)

Create and train model

In [11]:
from sagemaker import KMeans

num_clusters = 10
output_location = 's3://' + bucket + '/model-artifacts'

kmeans = KMeans(role=role,
               train_instance_count=1,
               train_instance_type='ml.c4.xlarge',
               output_path=output_location,
               k=num_clusters)

train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [12]:
job_name = 'kmeans-geo-job-{}'.format(datetime.now().strftime('%Y%m%d%H%M%S'))
print('job name {}'.format(job_name))

job name kmeans-geo-job-20211207123229


In [13]:
%%time
kmeans.fit(kmeans.record_set(data_train), job_name=job_name) #creates 10 clusters on the 18000 lat and long records

Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.


2021-12-07 12:32:39 Starting - Starting the training job...
2021-12-07 12:32:48 Starting - Launching requested ML instancesProfilerReport-1638880359: InProgress
......
2021-12-07 12:33:50 Starting - Preparing the instances for training............
2021-12-07 12:35:51 Downloading - Downloading input data...
2021-12-07 12:36:40 Training - Training image download completed. Training in progress..[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[12/07/2021 12:36:45 INFO 140164129539904] Reading default configuration from /opt/amazon/lib/python3.7/site-packages/algorithm/resources/default-input.json: {'init_method': 'random', 'mini_batch_size': '5000', 'epochs': '1', 'extra_center_factor': 'auto', 'local_lloyd_max_iter': '300', 'local_lloyd_tol': '0.0001', 'local_lloyd_init_method': 'kmeans++', 'local_lloyd_num_trials': 'auto', 'half_life_time_size': '0', 'eval_metrics': '["msd"]', 'force_dense': 'true', '_disable_wait_

At this point Sagemaker has created a model artifcact and placed it in our output folder in our S3 bucket. We need to do few things to see the latitude and longitude for our 10 clusters and the center point of those clusters.

Here we are going to de-serialize' our model artifact. We'll open and review them in our notebook instance. We can unzip our model artifact which will contain model_algo-1. This is a serialized Apache MXNetobject.

From here we can load the serialized object into a numpy.ndarray and extract the clustered centroids from the numpy.ndarray.

After we extract the results into a dataframe, we can create a csv and put into an S3 bucket and visualize with Quicksight.

In [14]:
import os
model_key = 'model-artifacts/' + job_name + '/output/model.tar.gz'

boto3.resource('s3').Bucket(bucket).download_file(model_key, 'model.tar.gz')
os.system('tar -zxvf model.tar.gz')
os.system('unzip model_alog-1')

2304

Model is now downloaded to our jupiter instance. Next we need to use the MXNet library to de-serialize our model object we just unzipped

In [15]:
!pip install mxnet

Collecting mxnet
  Downloading mxnet-1.8.0.post0-py2.py3-none-manylinux2014_x86_64.whl (46.9 MB)
[K     |████████████████████████████████| 46.9 MB 312 kB/s  eta 0:00:01     |████████████████▍               | 24.0 MB 23.1 MB/s eta 0:00:01
[?25hCollecting graphviz<0.9.0,>=0.8.1
  Downloading graphviz-0.8.4-py2.py3-none-any.whl (16 kB)
Installing collected packages: graphviz, mxnet
Successfully installed graphviz-0.8.4 mxnet-1.8.0.post0
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m


In [16]:
import mxnet as mx

#creating an ndarray for the model we just downloaded
Kmeans_model_parms = mx.ndarray.load('model_algo-1')

#create a dataframe of the center point of our clusters
cluster_centroid_kmeans = pd.DataFrame(Kmeans_model_parms[0].asnumpy())
cluster_centroid_kmeans.columns = df_geo.columns
cluster_centroid_kmeans

Unnamed: 0,latitude,longitude
0,48.310387,-123.021492
1,41.184879,-75.183205
2,-7.306179,122.170113
3,48.126362,5.071083
4,33.48563,-95.251297
5,41.148087,-87.472839
6,39.743664,-106.243248
7,30.593796,-81.402596
8,35.69508,-117.945465
9,25.124838,-148.613235


Upload Dataframe to S3 and view on Quicksight

In [18]:
#convert to csv
from io import  StringIO

csv_buffer = StringIO()
cluster_centroid_kmeans.to_csv(csv_buffer, index=False)
s3_resource = boto3.resource('s3')
s3_resource.Object(bucket, 'results/ten_locations_kmeans.csv').put(Body=csv_buffer.getvalue())

{'ResponseMetadata': {'RequestId': '44CTGPCWVEV1E23D',
  'HostId': 'qDxt9ljTRC9sq9cQomleq7x2sy3wN85/8Ud2wf3iEbte2fo6suiXOLAnT2QXuPxkeSTK4g018Oc=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'qDxt9ljTRC9sq9cQomleq7x2sy3wN85/8Ud2wf3iEbte2fo6suiXOLAnT2QXuPxkeSTK4g018Oc=',
   'x-amz-request-id': '44CTGPCWVEV1E23D',
   'date': 'Tue, 07 Dec 2021 14:11:14 GMT',
   'etag': '"f1b7adf16874109fff75513d34e4e28f"',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 0},
 'ETag': '"f1b7adf16874109fff75513d34e4e28f"'}

Create a QuickSight account and view it on a map. Quicksight can directly access from S3.