In [32]:
# Load Libraries
import pandas as pd
import os
import boto3
import datetime

In [33]:
print('start time:', datetime.datetime.now())

# Load data from csv
s3_client = boto3.client("s3")

dataset_name = 'customer_profile.csv'
data_bucket_name = "eliezerraj-908671954593-dataset/customer"

data_location = 's3://{}/{}'.format(data_bucket_name, dataset_name)

#data = pd.read_csv(data_location, sep=';',engine='python')
data = pd.read_csv(data_location)

print("shape : ", data.shape)
print("indesx : ",data.index)

start time: 2024-04-27 02:33:38.429700
shape :  (10127, 7)
indesx :  RangeIndex(start=0, stop=10127, step=1)


In [34]:
#Cleaning and econding
df_customer = data.filter(['CLIENTNUM','Customer_Age','Gender','Dependent_count','Education_Level','Marital_Status','Income_Category','Months_on_book'], axis=1)

print("-------------------- show categorical data  ----------------------------")
print(df_customer["Education_Level"].unique())
print(df_customer["Income_Category"].unique())
print("--------------------- show categorical data  ---------------------------")

education_map = {'Uneducated': 0, 'Unknown':0, 'High School': 1, 'College':2,'Graduate': 3,'Post-Graduate':4 , 'Doctorate': 5}
income_map = { 'Unknown': 0,'Less than $40K': 1, '$40K - $60K':2, '$60K - $80K': 3, '$80K - $120K':4 ,'$120K +': 5}

df_customer['Education_Level_Quality'] = df_customer['Education_Level'].map(education_map)
df_customer['Income_Category_Quality'] = df_customer['Income_Category'].map(income_map)
df_customer.head(2)

-------------------- show categorical data  ----------------------------
['High School' 'Graduate' 'Uneducated' 'Unknown' 'College' 'Post-Graduate'
 'Doctorate']
['$60K - $80K' 'Less than $40K' '$80K - $120K' '$40K - $60K' '$120K +'
 'Unknown']
--------------------- show categorical data  ---------------------------


Unnamed: 0,CLIENTNUM,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Education_Level_Quality,Income_Category_Quality
0,768805383,45,M,3,High School,Married,$60K - $80K,1,3
1,818770008,49,F,5,Graduate,Single,Less than $40K,3,1


In [35]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [36]:
df_training = df_customer[['Customer_Age',
                           'Dependent_count',
                           'Education_Level_Quality', 
                           'Income_Category_Quality']]
df_training.head(3)

Unnamed: 0,Customer_Age,Dependent_count,Education_Level_Quality,Income_Category_Quality
0,45,3,1,3
1,49,5,3,1
2,51,3,3,4


In [37]:
##Fitting kmeans to the dataset with k=3
scaler = StandardScaler()

df_training = df_customer[['Customer_Age',
                           'Dependent_count',
                           'Education_Level_Quality',
                           'Income_Category_Quality']]

df_training_scaled = scaler.fit_transform(df_training).astype('float32')

model = KMeans(n_clusters = 4, random_state = 0)
result_prediction = model.fit_predict(df_training_scaled)

In [31]:
df_training_scaled

array([[-0.16540559,  0.50336814, -0.5026349 ,  0.6200389 ],
       [ 0.3335704 ,  2.0431986 ,  0.8342485 , -0.736292  ],
       [ 0.58305836,  0.50336814,  0.8342485 ,  1.2982044 ],
       ...,
       [-0.29014957, -1.0364624 , -0.5026349 , -0.736292  ],
       [-2.0365655 , -0.26654714,  0.8342485 , -0.05812655],
       [-0.41489357, -0.26654714,  0.8342485 , -0.736292  ]],
      dtype=float32)

In [38]:
# convert array into dataframe 
df_data_scaled = pd.DataFrame(df_training_scaled).astype('float32')
df_data_scaled.to_csv("df_data_scaled.csv")

In [21]:
df_training_final = df_customer[['CLIENTNUM','Customer_Age','Dependent_count','Education_Level_Quality', 'Income_Category_Quality']]
df_training_final["CLUSTER"] = result_prediction
df_training_final.head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_training_final["CLUSTER"] = result_prediction


Unnamed: 0,CLIENTNUM,Customer_Age,Dependent_count,Education_Level_Quality,Income_Category_Quality,CLUSTER
0,768805383,45,3,1,3,2
1,818770008,49,5,3,1,3
2,713982108,51,3,3,4,2
3,769911858,40,4,1,1,0
4,709106358,40,3,0,3,0


In [22]:
# new data customes
df_new_customer_data = [{ 'CLIENTNUM': 9999, 
                         'Customer_Age': 49, 
                         'Dependent_count': 5, 
                         'Education_Level_Quality': 3, 
                         'Income_Category_Quality': 1 }]

df_new_customer_data = pd.DataFrame(df_new_customer_data)
df_new_customer_data

Unnamed: 0,CLIENTNUM,Customer_Age,Dependent_count,Education_Level_Quality,Income_Category_Quality
0,9999,49,5,3,1


In [23]:
df_customer_test = df_customer[['CLIENTNUM','Customer_Age','Dependent_count','Education_Level_Quality','Income_Category_Quality']]
df_customer_test = pd.concat([df_new_customer_data, df_customer_test], axis=0)
df_customer_test

Unnamed: 0,CLIENTNUM,Customer_Age,Dependent_count,Education_Level_Quality,Income_Category_Quality
0,9999,49,5,3,1
0,768805383,45,3,1,3
1,818770008,49,5,3,1
2,713982108,51,3,3,4
3,769911858,40,4,1,1
...,...,...,...,...,...
10122,772366833,50,2,3,2
10123,710638233,41,2,0,2
10124,716506083,44,1,1,1
10125,717406983,30,2,3,2


In [24]:
df_customer_prediction = df_customer_test[['Customer_Age','Dependent_count','Education_Level_Quality','Income_Category_Quality']]
df_customer_prediction

Unnamed: 0,Customer_Age,Dependent_count,Education_Level_Quality,Income_Category_Quality
0,49,5,3,1
0,45,3,1,3
1,49,5,3,1
2,51,3,3,4
3,40,4,1,1
...,...,...,...,...
10122,50,2,3,2
10123,41,2,0,2
10124,44,1,1,1
10125,30,2,3,2


In [25]:
df_customer_prediction_scaled = scaler.fit_transform(df_customer_prediction)

In [26]:
df_customer_prediction_result = model.predict(df_customer_prediction_scaled)
df_customer_prediction_result[:1]

array([3], dtype=int32)

In [27]:
df_customer_test["CLUSTER"] = df_customer_prediction_result
df_customer_test

Unnamed: 0,CLIENTNUM,Customer_Age,Dependent_count,Education_Level_Quality,Income_Category_Quality,CLUSTER
0,9999,49,5,3,1,3
0,768805383,45,3,1,3,2
1,818770008,49,5,3,1,3
2,713982108,51,3,3,4,2
3,769911858,40,4,1,1,0
...,...,...,...,...,...,...
10122,772366833,50,2,3,2,3
10123,710638233,41,2,0,2,0
10124,716506083,44,1,1,1,0
10125,717406983,30,2,3,2,3


In [28]:
df_new_customer_data["CLUSTER"] = df_customer_prediction_result[:1]
df_new_customer_data

Unnamed: 0,CLIENTNUM,Customer_Age,Dependent_count,Education_Level_Quality,Income_Category_Quality,CLUSTER
0,9999,49,5,3,1,3


In [29]:
df_customer_test

Unnamed: 0,CLIENTNUM,Customer_Age,Dependent_count,Education_Level_Quality,Income_Category_Quality,CLUSTER
0,9999,49,5,3,1,3
0,768805383,45,3,1,3,2
1,818770008,49,5,3,1,3
2,713982108,51,3,3,4,2
3,769911858,40,4,1,1,0
...,...,...,...,...,...,...
10122,772366833,50,2,3,2,3
10123,710638233,41,2,0,2,0
10124,716506083,44,1,1,1,0
10125,717406983,30,2,3,2,3


In [30]:
def order_cluster(cluster_field_name, target_field_name,df,ascending):
    new_cluster_field_name = 'new_' + cluster_field_name
    df_new = df.groupby(cluster_field_name)[target_field_name].mean().reset_index()
    df_new = df_new.sort_values(by=target_field_name,ascending=ascending).reset_index(drop=True)
    df_new['index'] = df_new.index
    df_final = pd.merge(df,df_new[[cluster_field_name,'index']], on=cluster_field_name)
    df_final = df_final.drop([cluster_field_name],axis=1)
    df_final = df_final.rename(columns={"index":cluster_field_name})
    return df_final

In [31]:
# Look at the distribuition
df_customer_test_order = order_cluster('CLUSTER','Dependent_count',df_customer_test,False)
df_customer_test_order.groupby('CLUSTER')['Dependent_count'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
CLUSTER,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,2971.0,2.662067,1.21726,0.0,2.0,3.0,4.0,5.0
1,2655.0,2.648588,1.239748,0.0,2.0,3.0,4.0,5.0
2,2643.0,2.639425,1.110369,0.0,2.0,3.0,3.0,5.0
3,1859.0,0.994083,0.799938,0.0,0.0,1.0,2.0,3.0


In [32]:
df_customer_test_order = order_cluster('CLUSTER','Customer_Age',df_customer_test,False)
df_customer_test_order.groupby('CLUSTER')['Customer_Age'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
CLUSTER,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,1859.0,56.528779,4.748418,41.0,53.0,56.0,60.0,73.0
1,2643.0,46.653046,6.269401,26.0,42.0,47.0,52.0,62.0
2,2971.0,42.921912,6.527741,26.0,39.0,44.0,47.0,58.0
3,2655.0,42.666667,6.574769,26.0,38.0,43.0,47.0,58.0


In [33]:
df_customer_test_order = order_cluster('CLUSTER','Education_Level_Quality',df_customer_test,False)
df_customer_test_order.groupby('CLUSTER')['Education_Level_Quality'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
CLUSTER,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,2971.0,3.210367,0.839176,2.0,3.0,3.0,3.0,5.0
1,1859.0,1.681011,1.393094,0.0,0.0,1.0,3.0,5.0
2,2643.0,1.566023,1.342435,0.0,0.0,1.0,3.0,5.0
3,2655.0,0.355179,0.478658,0.0,0.0,0.0,1.0,1.0


In [34]:
df_customer_test_order = order_cluster('CLUSTER','Income_Category_Quality',df_customer_test,False)
df_customer_test_order.groupby('CLUSTER')['Income_Category_Quality'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
CLUSTER,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,2643.0,4.068483,0.689108,3.0,4.0,4.0,5.0,5.0
1,1859.0,1.493814,1.052108,0.0,1.0,1.0,2.0,5.0
2,2971.0,1.423763,0.955488,0.0,1.0,1.0,2.0,4.0
3,2655.0,1.266667,0.840121,0.0,1.0,1.0,2.0,3.0
