In [None]:
from sklearn.cluster import KMeans
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from matplotlib import pyplot as plt
%matplotlib inline

In [None]:
# Read the income dataset from the working directory. Later cells rely on
# its 'Age' and 'Income($)' columns.
df = pd.read_csv(filepath_or_buffer="income.csv")
# Show the first five rows as a quick sanity check of the parsed file.
df.head(n=5)

In [None]:
# Scatter of the two features exactly as loaded (no scaling applied yet).
plt.scatter(x=df['Age'], y=df['Income($)'])
plt.xlabel(xlabel='Age')
plt.ylabel(ylabel='Income($)')

In [None]:
# Cluster the rows into 3 groups using the raw Age / Income($) features.
#
# random_state pins the centroid initialisation so the label assigned to each
# group (0/1/2) is the same on every run; the plotting cell maps those labels
# to fixed colours, so unseeded runs would shuffle the colours.
# n_init is spelled out because its default changed from 10 to 'auto' in
# scikit-learn 1.4 (and 1.2-1.3 emit a FutureWarning when it is omitted).
km = KMeans(n_clusters=3, n_init=10, random_state=42)

# fit_predict fits the model and returns one cluster index per row.
y_predicted = km.fit_predict(df[['Age', 'Income($)']])
y_predicted

In [None]:
# Store each row's predicted cluster index in a 'cluster' column so the
# frame can be filtered by cluster when plotting.
df = df.assign(cluster=y_predicted)
df.head(n=5)

In [None]:
# Fitted centroid coordinates, one row per cluster. Columns follow the feature
# order passed to fit_predict: Age first, then Income($).
km.cluster_centers_

In [None]:
# Split the frame by cluster label (0, 1, 2) so each group can be drawn in
# its own colour.
df1, df2, df3 = (df[df['cluster'] == label] for label in range(3))

# Clusters 0, 1, 2 are drawn in green, red and black respectively.
for members, shade in zip((df1, df2, df3), ('green', 'red', 'black')):
    plt.scatter(members['Age'], members['Income($)'], color=shade)

# Overlay the centroids: column 0 is Age, column 1 is Income($).
plt.scatter(
    km.cluster_centers_[:, 0],
    km.cluster_centers_[:, 1],
    color='purple',
    marker='*',
    label='centroid',
)
plt.xlabel('Age')
plt.ylabel('Income ($)')
plt.legend()

<h4 style='color:purple'>Preprocessing using min max scaler</h4>

The KMeans clustering algorithm is sensitive to the scale of the data because it assigns each point to the nearest centroid using Euclidean distance. If one feature (e.g., Income) has a much larger range or variance than another feature (e.g., Age), then the distance computation, and therefore the clustering, will be dominated by the feature with the larger range.

By scaling the features to a consistent range, you ensure that each feature contributes comparably to the distance computations. This can significantly change (and often improve) the resulting clusters.

Without scaling, if the range of Income is 20,000-100,000 and the range of Age is 20-70, then differences in Income will massively outweigh differences in Age when computing distances, even if Age is an important feature for clustering.

With scaling, both Age and Income will have a range between 0 and 1. This means both features have an equal say in the clustering process.

So, by using MinMaxScaler, you're making sure that the clustering isn't unduly influenced by the original scales of the variables, which often leads to more meaningful and balanced clusters.


In [None]:
# Min-max scale both clustering features with a single fitted scaler.
#
# MinMaxScaler learns a separate min/max for every column it is fitted on, so
# fitting once on both columns produces the same scaled values as fitting each
# column on its own. Unlike re-fitting one scaler per column, the fitted
# `scaler` afterwards still holds the parameters for BOTH features, so it can
# be reused to transform new rows or to inverse_transform back to raw units.
feature_cols = ['Age', 'Income($)']
scaler = MinMaxScaler()
scaled = scaler.fit_transform(df[feature_cols])

# Overwrite each column with its scaled values (now floats in [0, 1]).
# Assigning column by column replaces the column outright, so an integer
# column such as Age cleanly becomes float.
for idx, col in enumerate(feature_cols):
    df[col] = scaled[:, idx]

**Scaling the data**

- `MinMaxScaler` is applied to the Income and Age columns, and each column is rescaled independently of the other.
- The scaler scales and translates each feature individually so that it lies in the range [0, 1]: $x' = (x - x_{\min}) / (x_{\max} - x_{\min})$.
- This means that, after scaling, the smallest value of Income and of Age will be 0 and the largest value will be 1.

In [None]:
# Preview the frame after scaling: Age and Income($) were overwritten with the
# MinMaxScaler output in the previous cell. The 'cluster' column at this point
# still holds the labels from the fit on the unscaled data.
df.head()

In [None]:
# Scatter of the min-max scaled features. Axis labels are set so the figure
# can be read on its own and compared with the unscaled scatter above.
plt.scatter(df['Age'], df['Income($)'])
plt.xlabel('Age (scaled)')
plt.ylabel('Income($) (scaled)')

In [None]:
# Re-fit a 3-cluster model, this time on the scaled Age / Income($) features.
#
# random_state pins the centroid initialisation so the label assigned to each
# group (0/1/2) is the same on every run; the plotting cell maps those labels
# to fixed colours, so unseeded runs would shuffle the colours.
# n_init is spelled out because its default changed from 10 to 'auto' in
# scikit-learn 1.4 (and 1.2-1.3 emit a FutureWarning when it is omitted).
km = KMeans(n_clusters=3, n_init=10, random_state=42)

# fit_predict fits the model and returns one cluster index per row.
y_predicted = km.fit_predict(df[['Age', 'Income($)']])
y_predicted

In [None]:
# Replace the 'cluster' column with the labels from the latest fit so the
# plotting cell filters on up-to-date assignments.
df = df.assign(cluster=y_predicted)
df.head(n=5)

In [None]:
# Centroids of the most recently fitted model, one row per cluster. Columns
# follow the feature order passed to fit_predict: Age first, then Income($).
km.cluster_centers_

In [None]:
# Split the frame by cluster label (0, 1, 2) so each group can be drawn in
# its own colour. df1/df2/df3 are kept as names for any cell that inspects them.
df1, df2, df3 = (df[df['cluster'] == label] for label in range(3))

# Clusters 0, 1, 2 are drawn in green, red and black respectively.
for members, shade in zip((df1, df2, df3), ('green', 'red', 'black')):
    plt.scatter(members['Age'], members['Income($)'], color=shade)

# Overlay the centroids: column 0 is Age, column 1 is Income($).
plt.scatter(
    km.cluster_centers_[:, 0],
    km.cluster_centers_[:, 1],
    color='purple',
    marker='*',
    label='centroid',
)

# Label the axes (the features were min-max scaled earlier in the notebook)
# so this figure is readable on its own, like the unscaled cluster plot.
plt.xlabel('Age (scaled)')
plt.ylabel('Income($) (scaled)')
plt.legend()

<h4 style='color:purple'>Elbow Plot</h4>

In [None]:
# Elbow method: fit KMeans for k = 1..9 and record each model's inertia
# (sum of squared distances from every sample to its nearest centroid).
sse = []
k_rng = range(1, 10)
for k in k_rng:
    # A loop-local name is used on purpose: rebinding `km` here would leave it
    # pointing at the 9-cluster model, so re-running the centroid / cluster
    # plot cells afterwards would silently show the wrong model.
    # random_state and n_init are fixed so the SSE curve is reproducible
    # across runs and across scikit-learn versions.
    elbow_model = KMeans(n_clusters=k, n_init=10, random_state=42)
    elbow_model.fit(df[['Age', 'Income($)']])
    sse.append(elbow_model.inertia_)

In [None]:
# Elbow curve: sum of squared error against the number of clusters K.
# The "elbow" (where the curve stops dropping sharply) suggests a suitable K.
plt.xlabel(xlabel='K')
plt.ylabel(ylabel='Sum of squared error')
plt.plot(k_rng, sse)