In [159]:
import pandas as pd
from sklearn import preprocessing
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from pyclustering.cluster.kmeans import kmeans
from pyclustering.utils.metric import type_metric, distance_metric
from pyclustering.cluster.kmeans import kmeans
from pyclustering.utils.metric import distance_metric
from pyclustering.cluster.center_initializer import random_center_initializer
from pyclustering.cluster.encoder import type_encoding
from pyclustering.cluster.encoder import cluster_encoder

In [207]:
def read_csv(path: str, **kwargs) -> pd.DataFrame:
    '''read the CSV file at path and return it as a dataframe

    path: location of the CSV file (anything pandas.read_csv accepts)
    kwargs: optional keyword arguments forwarded unchanged to
            pandas.read_csv (e.g. sep, usecols, dtype)

    returns: the parsed pandas DataFrame'''

    return pd.read_csv(path, **kwargs)


def drop_cols(df: 'dataframe', cols: list):
    '''remove the columns named in cols from df

    the frame is modified in place and nothing is returned'''
    df.drop(labels=cols, axis='columns', inplace=True)

    
def label_encoder(df: 'dataframe', col: 'str'):
    '''replace the values of df[col] with integer codes, in place

    the codes are produced by a freshly fitted sklearn LabelEncoder'''
    raw = df[col].values
    encoder = preprocessing.LabelEncoder()
    # fit() returns the encoder itself, so fitting and transforming can be chained
    df[col] = encoder.fit(raw).transform(raw)


# This function is not used: normalization turned out not to be necessary for
# k-means here, since clustering the normalized data gave the same result.
def normalize(df: 'dataframe', cols: list):
    '''rescale each listed column of df in place

    every column is handed to sklearn's Normalizer as one single sample
    (a 1 x n_rows row vector), so the column as a whole is scaled to unit
    norm (Normalizer defaults to the L2 norm)'''
    scaler = preprocessing.Normalizer()
    for name in cols:
        # one row holding the entire column -> the column is normalized as a unit
        as_row = df[name].values.reshape(1, -1)
        df[name] = scaler.fit(as_row).transform(as_row)[0]


def k_means_manhattan(df, n_clusters=5, random_state=5, label_col='labels'):
    '''cluster the rows of df with k-means under the Manhattan (L1) distance

    Uses pyclustering's kmeans with randomly initialised centers. The cluster
    index of every row is written to df[label_col] (in place) and is also
    returned, so both `k_means_manhattan(df)` and
    `df['labels'] = k_means_manhattan(df)` leave the labels in df.

    df: dataframe whose columns (other than label_col) are numeric features
    n_clusters: number of clusters / initial centers (default 5)
    random_state: seed passed to the random center initializer (default 5)
    label_col: name of the column that receives the cluster labels

    returns: list with one cluster index per row, in row order
    '''
    # Leave out a label column written by an earlier call, so that re-running
    # the cell does not cluster on the previous labels.
    features = df.drop(columns=[label_col], errors='ignore').values

    initial_centers = random_center_initializer(
        features, n_clusters, random_state=random_state).initialize()
    model = kmeans(features, initial_centers=initial_centers,
                   metric=distance_metric(type_metric.MANHATTAN))
    model.process()

    # Re-encode the model's cluster output as one cluster index per row.
    encoder = cluster_encoder(model.get_cluster_encoding(),
                              model.get_clusters(), features)
    labels = encoder.set_encoding(
        type_encoding.CLUSTER_INDEX_LABELING).get_clusters()

    # creating a label column for the dataframe
    df[label_col] = labels
    return labels

In [147]:
# Encode the 'Genre' column as integer codes (modifies df in place).
label_encoder(df, 'Genre')

In [130]:
# Rescale the three numeric columns in place.
# NOTE(review): the comment above normalize() says the function is unused, so
# this cell may be a leftover experiment — confirm before relying on it.
normalize(df, ['Spending_Score', 'Annual_Income_(k$)', 'Age'])

In [208]:
# Cluster the customers; k_means_manhattan writes the result to df['labels'].
k_means_manhattan(df)

In [209]:
# Display the frame, which now includes the 'labels' column.
df

Unnamed: 0,Age,Annual_Income_(k$),Spending_Score,labels
0,19,15,39,1
1,21,15,81,1
2,20,16,6,1
3,23,16,77,1
4,31,17,40,1
...,...,...,...,...
195,35,120,79,2
196,45,126,28,4
197,32,126,74,2
198,32,137,18,4


In [148]:
# Remove the 'CustomerID' and 'Genre' columns from df (in place).
# Not idempotent: once the columns are gone, re-running this cell raises KeyError.
drop_cols(df, ['CustomerID','Genre'])

In [189]:
# k_means_manhattan stores the cluster labels in df['labels'] itself, so its
# result must not be assigned back to the column. The trailing ';' keeps the
# cell from echoing any return value.
k_means_manhattan(df);

In [146]:
# Load the mall-customer data with the read_csv helper defined in this notebook
# (the file is a CSV; no read_excel helper exists here).
df = read_csv('Mall_Customers.csv')
df.head(10)

Unnamed: 0,CustomerID,Genre,Age,Annual_Income_(k$),Spending_Score
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77
4,5,Female,31,17,40
5,6,Female,22,17,76
6,7,Female,35,18,6
7,8,Female,23,18,94
8,9,Male,64,19,3
9,10,Female,30,19,72


In [26]:
# Summary statistics (count, mean, std, quartiles) of the numeric columns.
df.describe()

Unnamed: 0,CustomerID,Age,Annual_Income_(k$),Spending_Score
count,200.0,200.0,200.0,200.0
mean,100.5,38.85,60.56,50.2
std,57.879185,13.969007,26.264721,25.823522
min,1.0,18.0,15.0,1.0
25%,50.75,28.75,41.5,34.75
50%,100.5,36.0,61.5,50.0
75%,150.25,49.0,78.0,73.0
max,200.0,70.0,137.0,99.0


In [28]:
# Column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   CustomerID          200 non-null    int64 
 1   Genre               200 non-null    object
 2   Age                 200 non-null    int64 
 3   Annual_Income_(k$)  200 non-null    int64 
 4   Spending_Score      200 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 7.9+ KB


In [29]:
# Number of missing values per column.
df.isnull().sum()

CustomerID            0
Genre                 0
Age                   0
Annual_Income_(k$)    0
Spending_Score        0
dtype: int64