<a href="https://colab.research.google.com/github/chibaba/DATA-SCIENCE-AND-MACHINE-LEARNING/blob/master/Finding_the_Closest_Centroids_in_Our_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [24]:
import pandas as pd
from sklearn.cluster import KMeans
import altair as alt

In [25]:
file_url = 'https://raw.githubusercontent.com/'\
           'PacktWorkshops/The-Data-Science-Workshop/'\
           'master/Chapter05/DataSet/taxstats2015.csv'
df = pd.read_csv(file_url, \
                 usecols=['Postcode', \
                          'Average total business income', \
                          'Average total business expenses'])

In [26]:
X = df[['Average total business income', \
        'Average total business expenses']]

In [27]:
business_income_min = df['Average total business income'].min()
business_income_max = df['Average total business income'].max()
business_expenses_min = df['Average total business expenses']\
                        .min()
business_expenses_max = df['Average total business expenses']\
                        .max()

In [28]:
print(business_income_min)
print(business_income_max)
print(business_expenses_min)
print(business_expenses_max)

0
876324
0
884659


In [29]:
import random
random.seed(42)

In [30]:
centroids = pd.DataFrame()


In [31]:
centroids\
['Average total business income'] = random.sample\
                                    (range\
                                    (business_income_min, \
                                     business_income_max), 4)

In [32]:
centroids\
['Average total business expenses'] = random.sample\
                                      (range\
                                      (business_expenses_min,\
                                       business_expenses_max), 4)

In [33]:
centroids['cluster'] = centroids.index
centroids

Unnamed: 0,Average total business income,Average total business expenses,cluster
0,670487,288389,0
1,116739,256787,1
2,26225,234053,2
3,777572,146316,3


In [34]:
chart1 = alt.Chart(df.head()).mark_circle()\
            .encode(x='Average total business income', \
                    y='Average total business expenses', \
                    color=alt.value('orange'), \
                    tooltip=['Postcode', \
                             'Average total business income', \
                             'Average total business expenses'])\
                   .interactive()

In [35]:
chart2 = alt.Chart(centroids).mark_circle(size=100)\
            .encode(x='Average total business income', \
                    y='Average total business expenses', \
                    color=alt.value('black'), \
                    tooltip=['cluster', \
                             'Average total business income',\
                             'Average total business expenses'])\
                   .interactive()

In [36]:
chart1 + chart2


In [37]:
def squared_euclidean(data_x, data_y, \
                      centroid_x, centroid_y, ):
    return (data_x - centroid_x)**2 + (data_y - centroid_y)**2

In [38]:
data_x = df.at[0, 'Average total business income']
data_y = df.at[0, 'Average total business expenses']

In [39]:
distances = [squared_euclidean\
             (data_x, data_y, centroids.at\
              [i, 'Average total business income'], \
              centroids.at[i, \
              'Average total business expenses']) \
              for i in range(4)]
distances

[215601466600, 10063365460, 34245932020, 326873037866]

In [40]:
cluster_index = distances.index(min(distances))


In [41]:
df.at[0, 'cluster'] = cluster_index


In [42]:
df.head()


Unnamed: 0,Postcode,Average total business income,Average total business expenses,cluster
0,2000,210901,222191,1.0
1,2006,69983,48971,
2,2007,575099,639499,
3,2008,53329,32173,
4,2009,237539,222993,


In [44]:
distances = [squared_euclidean\
             (df.at[1, 'Average total business income'], \
              df.at[1, 'Average total business expenses'], \
              centroids.at[i, 'Average total business income'],\
              centroids.at[i, \
                           'Average total business expenses'])\
             for i in range(4)]
df.at[1, 'cluster'] = distances.index(min(distances))
distances = [squared_euclidean\
             (df.at[2, 'Average total business income'], \
              df.at[2, 'Average total business expenses'], \
              centroids.at[i, 'Average total business income'],\
              centroids.at[i, \
                           'Average total business expenses'])\
             for i in range(4)]
df.at[2, 'cluster'] = distances.index(min(distances))
distances = [squared_euclidean\
             (df.at[3, 'Average total business income'], \
              df.at[3, 'Average total business expenses'], \
              centroids.at[i, 'Average total business income'],\
              centroids.at[i, \
                           'Average total business expenses'])\
             for i in range(4)]
df.at[3, 'cluster'] = distances.index(min(distances))
distances = [squared_euclidean\
             (df.at[4, 'Average total business income'], \
              df.at[4, 'Average total business expenses'], \
              centroids.at[i, \
              'Average total business income'], \
              centroids.at[i, \
              'Average total business expenses']) \
             for i in range(4)]
df.at[4, 'cluster'] = distances.index(min(distances))
df.head()

Unnamed: 0,Postcode,Average total business income,Average total business expenses,cluster
0,2000,210901,222191,1.0
1,2006,69983,48971,2.0
2,2007,575099,639499,0.0
3,2008,53329,32173,2.0
4,2009,237539,222993,1.0


In [45]:
chart1 = alt.Chart(df.head()).mark_circle()\
            .encode(x='Average total business income', \
                    y='Average total business expenses', \
                    color='cluster:N', \
                    tooltip=['Postcode', 'cluster', \
                             'Average total business income', \
                             'Average total business expenses'])\
                   .interactive()
chart2 = alt.Chart(centroids).mark_circle(size=100)\
            .encode(x='Average total business income', \
                    y='Average total business expenses', \
                    color=alt.value('black'), \
                    tooltip=['cluster', \
                             'Average total business income',\
                             'Average total business expenses'])\
                   .interactive()
chart1 + chart2

In [46]:
from sklearn.preprocessing import MinMaxScaler
min_max_scaler = MinMaxScaler()

In [47]:
min_max_scaler.fit(X)


MinMaxScaler(copy=True, feature_range=(0, 1))

In [48]:
X_min_max = min_max_scaler.transform(X)
X_min_max

array([[0.24066555, 0.25116005],
       [0.07985973, 0.05535579],
       [0.65626298, 0.72287627],
       ...,
       [0.05203897, 0.03244188],
       [0.0606488 , 0.04504561],
       [0.13814183, 0.10186976]])

In [49]:
X_min_max[:,0].min(), X_min_max[:,0].max(), \
X_min_max[:,1].min(), X_min_max[:,1].max()

(0.0, 1.0, 0.0, 1.0)

In [50]:
from sklearn.preprocessing import StandardScaler
standard_scaler = StandardScaler()

In [51]:
X_scaled = standard_scaler.fit_transform(X)
X_scaled

array([[ 2.23613093,  3.06484229],
       [-0.36778919, -0.42507078],
       [ 8.96587802, 11.47246637],
       ...,
       [-0.81828928, -0.83347643],
       [-0.67887077, -0.60883418],
       [ 0.57596971,  0.40396994]])

In [52]:
X_scaled[:,0].min(), X_scaled[:,0].max(), \
X_scaled[:,1].min(), X_scaled[:,1].max()

(-1.6609535924297942, 14.531993341096587, -1.41170357415988, 16.41177526627243)

In [54]:
kmeans = KMeans(random_state=42, n_clusters=3, \
                init='k-means++', n_init=5)
kmeans.fit(X_scaled)
df['cluster7'] = kmeans.predict(X_scaled)
alt.Chart(df).mark_circle()\
             .encode(x='Average net tax', \
                     y='Average total deductions', \
                     color='cluster7:N', \
                     tooltip=['Postcode', 'cluster7', \
                              'Average net tax', \
                              'Average total deductions'])\
                    .interactive()

ValueError: ignored

alt.Chart(...)