In [1]:
# Load Libraries
# pip3 install fsspec / conda install -c conda-forge fsspec
# pip3 install s3fs

import pandas as pd
import os
import boto3
import datetime
from sagemaker import Session

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [3]:
# Load data from csv
print('start time:', datetime.datetime.now())

s3_client = boto3.client("s3")
s3 = boto3.resource('s3')

dataset_name = 'customer_profile.csv'
bucket_name = "eliezerraj-908671954593-dataset/customer"

location_scaler = 's3://{}'.format(bucket_name)
data_location = 's3://{}/{}'.format(bucket_name, dataset_name)

data = pd.read_csv(data_location)

print("---------------------")
print("data_location : ", data_location)
print("location_scaler : ", location_scaler)
print("shape : ", data.shape)
print("index : ",data.index)

start time: 2024-04-29 17:56:00.007104
---------------------
data_location :  s3://eliezerraj-908671954593-dataset/customer/customer_profile.csv
location_scaler :  s3://eliezerraj-908671954593-dataset/customer
shape :  (10127, 7)
index :  RangeIndex(start=0, stop=10127, step=1)


In [6]:
#Cleaning and econding
df_customer = data.filter(['CLIENTNUM','Customer_Age','Gender','Dependent_count','Education_Level','Marital_Status','Income_Category','Months_on_book'], axis=1)

print("-------------------- show categorical data  ----------------------------")
print(df_customer["Education_Level"].unique())
print(df_customer["Income_Category"].unique())
print("--------------------- show categorical data  ---------------------------")

education_map = {'Uneducated': 0, 'Unknown':0, 'High School': 1, 'College':2,'Graduate': 3,'Post-Graduate':4 , 'Doctorate': 5}
income_map = { 'Unknown': 0,'Less than $40K': 1, '$40K - $60K':2, '$60K - $80K': 3, '$80K - $120K':4 ,'$120K +': 5}

df_customer['Education_Level_Quality'] = df_customer['Education_Level'].map(education_map)
df_customer['Income_Category_Quality'] = df_customer['Income_Category'].map(income_map)
df_customer.head(2)

-------------------- show categorical data  ----------------------------
['High School' 'Graduate' 'Uneducated' 'Unknown' 'College' 'Post-Graduate'
 'Doctorate']
['$60K - $80K' 'Less than $40K' '$80K - $120K' '$40K - $60K' '$120K +'
 'Unknown']
--------------------- show categorical data  ---------------------------


Unnamed: 0,CLIENTNUM,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Education_Level_Quality,Income_Category_Quality
0,768805383,45,M,3,High School,Married,$60K - $80K,1,3
1,818770008,49,F,5,Graduate,Single,Less than $40K,3,1


In [21]:
# show dataframe date

df_training = df_customer[['Customer_Age',
                           'Dependent_count',
                           'Education_Level_Quality', 
                           'Income_Category_Quality']]
df_training.head(3)

Unnamed: 0,Customer_Age,Dependent_count,Education_Level_Quality,Income_Category_Quality
0,45,3,1,3
1,49,5,3,1
2,51,3,3,4


In [22]:
# Fitting kmeans to the dataset with k=3
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

df_training = df_customer[['Customer_Age',
                           'Dependent_count',
                           'Education_Level_Quality',
                           'Income_Category_Quality']]

df_training_scaled = scaler.fit_transform(df_training).astype('float32')
df_training_scaled

array([[-0.16540559,  0.50336814, -0.5026349 ,  0.6200389 ],
       [ 0.3335704 ,  2.0431986 ,  0.8342485 , -0.736292  ],
       [ 0.58305836,  0.50336814,  0.8342485 ,  1.2982044 ],
       ...,
       [-0.29014957, -1.0364624 , -0.5026349 , -0.736292  ],
       [-2.0365655 , -0.26654714,  0.8342485 , -0.05812655],
       [-0.41489357, -0.26654714,  0.8342485 , -0.736292  ]],
      dtype=float32)

In [35]:
# Save model
import s3fs
import os
import joblib

fs = s3fs.S3FileSystem()
output_file = os.path.join(location_scaler, "scaler.joblib")

with fs.open(output_file, 'wb') as f:
    joblib.dump(scaler, f)

print("model saved :", output_file)

model saved : s3://eliezerraj-908671954593-dataset/customer/scaler.joblib


In [23]:
# Load model
import s3fs
import os
import joblib

fs = s3fs.S3FileSystem()
output_file = os.path.join(location_scaler, "scaler.joblib")

with fs.open(output_file, 'rb') as f:
    scaler_load = joblib.load(f)

print("model loaded !")

model loaded !


In [25]:
# Testing scaler
print(df_training.head(3))

df_training_scaled_by_loaded = scaler_load.fit_transform(df_training).astype('float32')

df_training_scaled_by_loaded

   Customer_Age  Dependent_count  Education_Level_Quality  \
0            45                3                        1   
1            49                5                        3   
2            51                3                        3   

   Income_Category_Quality  
0                        3  
1                        1  
2                        4  


array([[-0.16540559,  0.50336814, -0.5026349 ,  0.6200389 ],
       [ 0.3335704 ,  2.0431986 ,  0.8342485 , -0.736292  ],
       [ 0.58305836,  0.50336814,  0.8342485 ,  1.2982044 ],
       ...,
       [-0.29014957, -1.0364624 , -0.5026349 , -0.736292  ],
       [-2.0365655 , -0.26654714,  0.8342485 , -0.05812655],
       [-0.41489357, -0.26654714,  0.8342485 , -0.736292  ]],
      dtype=float32)

In [40]:
# Add new data to scale
import pandas as pd

df_new_customer_data = [{ 'Customer_Age': 23, 
                         'Dependent_count': 1, 
                         'Education_Level_Quality': 1, 
                         'Income_Category_Quality': 1 }]

df_new_customer_data = pd.DataFrame(df_new_customer_data)
df_new_customer_data

Unnamed: 0,Customer_Age,Dependent_count,Education_Level_Quality,Income_Category_Quality
0,23,1,1,1


In [41]:
df_payload_scaled = scaler_load.fit_transform(df_new_customer_data).astype('float32')
df_payload_scaled

array([[0., 0., 0., 0.]], dtype=float32)

In [42]:
df_final = pd.concat([df_new_customer_data, df_training], axis=0)
df_final

Unnamed: 0,Customer_Age,Dependent_count,Education_Level_Quality,Income_Category_Quality
0,23,1,1,1
0,45,3,1,3
1,49,5,3,1
2,51,3,3,4
3,40,4,1,1
...,...,...,...,...
10122,50,2,3,2
10123,41,2,0,2
10124,44,1,1,1
10125,30,2,3,2


In [43]:
df_final_scaled = scaler_load.fit_transform(df_final).astype('float32')
df_final_scaled

array([[-2.9084144 , -1.0363563 , -0.50260377, -0.736236  ],
       [-0.16505745,  0.50346863, -0.50260377,  0.62012565],
       [ 0.3337347 ,  2.0432935 ,  0.8343289 , -0.736236  ],
       ...,
       [-0.2897555 , -1.0363563 , -0.50260377, -0.736236  ],
       [-2.035528  , -0.26644385,  0.8343289 , -0.05805517],
       [-0.41445354, -0.26644385,  0.8343289 , -0.736236  ]],
      dtype=float32)