# Load Libraries

In [28]:
# pip3 install s3fs
from sagemaker import Session
import sagemaker
import boto3
import re
from sagemaker import get_execution_role
import numpy as np
import pandas as pd
import os

In [29]:
role = get_execution_role()

bucket_name = 'eliezerraj-908671954593-dataset'
prefix_name = 'customer/notebook/output'
output_path = f's3://{bucket_name}/{prefix_name}'
data_location = f"s3://{bucket_name}/{prefix_name}/train"

train_data_file = f's3://{bucket_name}/{prefix_name}/train/train_data.csv'

print("---------------------------------")
print(f"bucket_name '{bucket_name}':")
print(f"prefix_name '{prefix_name}':")
print(f"data_location '{data_location}':")
print(f"output_path '{output_path}':")

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
---------------------------------
bucket_name 'eliezerraj-908671954593-dataset':
prefix_name 'customer/notebook/output':
data_location 's3://eliezerraj-908671954593-dataset/customer/notebook/output/train':
output_path 's3://eliezerraj-908671954593-dataset/customer/notebook/output':


In [30]:
#from sklearn.preprocessing import StandardScaler
#scaler = StandardScaler()

#df_data = pd.read_csv(train_data_file, skiprows = 1, header=None)
#df_data_scaled = scaler.fit_transform(df_data).astype('float32')
#df_data_scaled

# Id the dataset is already scaled
df_data = pd.read_csv(train_data_file, header=None)
df_data_scaled = df_data.to_numpy().astype('float32')
df_data_scaled

array([[-0.16540559,  0.50336814, -0.5026349 ,  0.6200389 ],
       [ 0.3335704 ,  2.0431986 ,  0.8342485 , -0.736292  ],
       [ 0.58305836,  0.50336814,  0.8342485 ,  1.2982044 ],
       ...,
       [-0.29014957, -1.0364624 , -0.5026349 , -0.736292  ],
       [-2.0365655 , -0.26654714,  0.8342485 , -0.05812655],
       [-0.41489357, -0.26654714,  0.8342485 , -0.736292  ]],
      dtype=float32)

# Build Model

In [31]:
from sagemaker import KMeans

kmeans = KMeans(role=role,
                instance_count=1,
                instance_type='ml.m4.xlarge',
                data_location=data_location,
                output_path=output_path,
                init_method='kmeans++',
                k=5,
                epochs=5)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [32]:
kmeans.fit(kmeans.record_set(df_data_scaled))

INFO:sagemaker.image_uris:Same images used for training and inference. Defaulting to image scope: inference.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker.image_uris:Same images used for training and inference. Defaulting to image scope: inference.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating training-job with name: kmeans-2024-04-29-14-51-14-549


2024-04-29 14:51:14 Starting - Starting the training job...
2024-04-29 14:51:30 Starting - Preparing the instances for training...
2024-04-29 14:52:03 Downloading - Downloading input data...
2024-04-29 14:52:33 Downloading - Downloading the training image.........
2024-04-29 14:54:14 Training - Training image download completed. Training in progress.
2024-04-29 14:54:14 Uploading - Uploading generated training model.[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[04/29/2024 14:54:09 INFO 140277581055808] Reading default configuration from /opt/amazon/lib/python3.8/site-packages/algorithm/resources/default-input.json: {'init_method': 'random', 'mini_batch_size': '5000', 'epochs': '1', 'extra_center_factor': 'auto', 'local_lloyd_max_iter': '300', 'local_lloyd_tol': '0.0001', 'local_lloyd_init_method': 'kmeans++', 'local_lloyd_num_trials': 'auto', 'half_life_time_size': '0', 'eval_metrics': '["msd"]', 'force_dense':

# NOT USE FOR INFERENCE - Deploy serverless model

In [None]:
job_name = kmeans.latest_training_job.name

model_output_path = f's3://{bucket_name}/{prefix_name}/{job_name}/output/model.tar.gz'

print("model_output_path : ", model_output_path)

In [None]:
# Get the model
from time import gmtime, strftime

client = boto3.client(service_name="sagemaker")
region = boto3.Session().region_name

model_name = "kmeans-serverless-customer-v1-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
print("Model name: " + model_name)

In [None]:
# Get the container image
from sagemaker import image_uris
container = image_uris.retrieve("kmeans", region, "latest")

print("containere: " + container)

In [None]:
# dummy environment variables
byo_container_env_vars = {"SAGEMAKER_CONTAINER_LOG_LEVEL": "20"}

create_model_response = client.create_model(
    ModelName=model_name,
    Containers=[
        {
            "Image": container,
            "Mode": "SingleModel",
            "ModelDataUrl": model_output_path,
            "Environment": byo_container_env_vars,
        }
    ],
    ExecutionRoleArn=role,
)

print("Model Arn: " + create_model_response["ModelArn"])

In [None]:
# Serveless endpoint config
model_epc_name = "mlops-serverless-epc-customer-v1-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())

endpoint_config_response = client.create_endpoint_config(
    EndpointConfigName=model_epc_name,
    ProductionVariants=[
        {
            "VariantName": "byoVariant",
            "ModelName": model_name,
            "ServerlessConfig": {
                "MemorySizeInMB": 4096,
                "MaxConcurrency": 1,
            },
        },
    ],
)

# EC2 endpoint
kmeans_predictor = kmeans.deploy(initial_instance_count=1, instance_type="ml.m4.xlarge")

print("Endpoint Configuration Arn: " + endpoint_config_response["EndpointConfigArn"])

In [13]:
# Serveless endpoint

endpoint_name = "kmeans-serverless-ep-customer-v1" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())

create_endpoint_response = client.create_endpoint(
    EndpointName=endpoint_name,
    EndpointConfigName=model_epc_name,
)

print("Endpoint Arn: " + create_endpoint_response["EndpointArn"])

Endpoint Arn: arn:aws:sagemaker:us-east-2:908671954593:endpoint/kmeans-serverless-ep-customer-v12024-04-27-02-56-37


# Deploy INFERENCE EC2 Model, Endpoint Config and Endpoint

In [33]:
# EC2 endpoint
kmeans_predictor = kmeans.deploy(initial_instance_count=1, instance_type="ml.m4.xlarge")

INFO:sagemaker.image_uris:Same images used for training and inference. Defaulting to image scope: inference.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating model with name: kmeans-2024-04-29-14-58-01-252
INFO:sagemaker:Creating endpoint-config with name kmeans-2024-04-29-14-58-01-252
INFO:sagemaker:Creating endpoint with name kmeans-2024-04-29-14-58-01-252


-------!

In [34]:
# Install model
kmeans_predictor.serializer = sagemaker.serializers.CSVSerializer()

# Inference Validation

In [53]:
df_data_scaled

array([[-0.16540559,  0.50336814, -0.5026349 ,  0.6200389 ],
       [ 0.3335704 ,  2.0431986 ,  0.8342485 , -0.736292  ],
       [ 0.58305836,  0.50336814,  0.8342485 ,  1.2982044 ],
       ...,
       [-0.29014957, -1.0364624 , -0.5026349 , -0.736292  ],
       [-2.0365655 , -0.26654714,  0.8342485 , -0.05812655],
       [-0.41489357, -0.26654714,  0.8342485 , -0.736292  ]],
      dtype=float32)

In [54]:
result = kmeans_predictor.predict(df_data_scaled)

clusters = [r.label["closest_cluster"].float32_tensor.values[0] for r in result]

print("===> (CLUSTER )result for 4 posicion : ", clusters[0:4])

===> (CLUSTER )result for 4 posicion :  [0.0, 2.0, 1.0, 0.0]


In [61]:
dataset_name = 'customer_profile.csv'
dataset_location = 's3://{}/customer/{}'.format(bucket_name, dataset_name)
dataset_location

df_inference = pd.read_csv(dataset_location)

In [62]:
df_inference["CLUSTER"] = clusters
df_inference.head(10)

Unnamed: 0,CLIENTNUM,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,CLUSTER
0,768805383,45,M,3,High School,Married,$60K - $80K,0.0
1,818770008,49,F,5,Graduate,Single,Less than $40K,2.0
2,713982108,51,M,3,Graduate,Married,$80K - $120K,1.0
3,769911858,40,F,4,High School,Unknown,Less than $40K,0.0
4,709106358,40,M,3,Uneducated,Married,$60K - $80K,0.0
5,713061558,44,M,2,Graduate,Married,$40K - $60K,2.0
6,810347208,51,M,4,Unknown,Married,$120K +,1.0
7,818906208,32,M,0,High School,Unknown,$60K - $80K,4.0
8,710930508,37,M,3,Uneducated,Single,$60K - $80K,0.0
9,719661558,48,M,2,Graduate,Single,$80K - $120K,1.0


In [64]:
df_inference_train = pd.read_csv(train_data_file, names=['Customer_Age', 
                                                      'Dependent_count', 
                                                      'Education_Level_Quality',
                                                      'Income_Category_Quality'])
df_inference_train["CLUSTER"] = clusters
df_inference_train.head(10)

Unnamed: 0,Customer_Age,Dependent_count,Education_Level_Quality,Income_Category_Quality,CLUSTER
0,-0.165406,0.503368,-0.502635,0.620039,0.0
1,0.33357,2.043199,0.834248,-0.736292,2.0
2,0.583058,0.503368,0.834248,1.298204,1.0
3,-0.789126,1.273283,-0.502635,-0.736292,0.0
4,-0.789126,0.503368,-1.171077,0.620039,0.0
5,-0.29015,-0.266547,0.834248,-0.058127,2.0
6,0.583058,1.273283,-1.171077,1.97637,1.0
7,-1.787077,-1.806378,-0.502635,0.620039,4.0
8,-1.163358,0.503368,-1.171077,0.620039,0.0
9,0.208826,-0.266547,0.834248,1.298204,1.0


# Add a new data

In [139]:
# new data customes
df_new_customer_data = [{ 'Customer_Age': 20, 
                         'Dependent_count': 1, 
                         'Education_Level_Quality': 1, 
                         'Income_Category_Quality': 1 }]

df_new_customer_data = pd.DataFrame(df_new_customer_data)
df_new_customer_data

Unnamed: 0,Customer_Age,Dependent_count,Education_Level_Quality,Income_Category_Quality
0,20,1,1,1


In [144]:
df_customer_test = pd.concat([df_new_customer_data, df_data_raw], axis=0)
df_customer_test.head(3)

Unnamed: 0,Customer_Age,Dependent_count,Education_Level_Quality,Income_Category_Quality
0,20,1,1,1
0,45,3,1,3
1,49,5,3,1


In [141]:
df_new_customer_data_scaled_02 = scaler.fit_transform(df_customer_test).astype('float32')
df_new_customer_data_scaled_02

array([[-3.2820961 , -1.0363563 , -0.50260377, -0.736236  ],
       [-0.16500166,  0.50346863, -0.50260377,  0.62012565],
       [ 0.33373347,  2.0432935 ,  0.8343289 , -0.736236  ],
       ...,
       [-0.28968543, -1.0363563 , -0.50260377, -0.736236  ],
       [-2.0352583 , -0.26644385,  0.8343289 , -0.05805517],
       [-0.41436923, -0.26644385,  0.8343289 , -0.736236  ]],
      dtype=float32)

In [142]:
new_customer_result_02 = kmeans_predictor.predict(df_new_customer_data_scaled_02)
new_customer_clusters_02 = [r.label["closest_cluster"].float32_tensor.values[0] for r in new_customer_result_02]

In [148]:
df_customer_test["CLUSTER"] = new_customer_clusters_02
df_customer_test.head(11)

Unnamed: 0,Customer_Age,Dependent_count,Education_Level_Quality,Income_Category_Quality,CLUSTER
0,20,1,1,1,4.0
0,45,3,1,3,1.0
1,49,5,3,1,3.0
2,51,3,3,4,2.0
3,40,4,1,1,1.0
4,40,3,0,3,1.0
5,44,2,3,2,3.0
6,51,4,0,5,2.0
7,32,0,1,3,4.0
8,37,3,0,3,1.0


In [149]:
print(kmeans_predictor.endpoint)

See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


kmeans-2024-04-26-02-18-34-583


In [150]:
kmeans_predictor.delete_endpoint()

INFO:sagemaker:Deleting endpoint configuration with name: kmeans-2024-04-26-02-18-34-583
INFO:sagemaker:Deleting endpoint with name: kmeans-2024-04-26-02-18-34-583
