In [1]:
# Import the required libraries and dependencies
import pandas as pd
import hvplot.pandas
from pathlib import Path
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the housing dataset from the working directory into a DataFrame
data_path = Path('new_house_data.csv')
cali_df = pd.read_csv(data_path)

In [3]:
# Preview the first rows of the raw data as loaded from the CSV
cali_df.head()

Unnamed: 0.1,Unnamed: 0,longitude,latitude,housing_median_age,population,households,median_income,median_house_value,ocean_proximity,avg_rooms_per_household,avg_bedrooms_per_household
0,0,-122.23,37.88,41.0,322.0,126.0,8.3252,452600.0,NEAR BAY,6.984127,1.02381
1,1,-122.22,37.86,21.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,6.238137,0.97188
2,2,-122.24,37.85,52.0,496.0,177.0,7.2574,352100.0,NEAR BAY,8.288136,1.073446
3,3,-122.25,37.85,52.0,558.0,219.0,5.6431,341300.0,NEAR BAY,5.817352,1.073059
4,4,-122.25,37.85,52.0,565.0,259.0,3.8462,342200.0,NEAR BAY,6.281853,1.081081


In [4]:
# Impute missing avg_bedrooms_per_household values with the column's mode
# (its most frequent value).
mode_value = cali_df['avg_bedrooms_per_household'].mode()[0]
# Assign the filled column back rather than calling fillna(inplace=True) on the
# column selection: that chained in-place form is deprecated in pandas 2.x and
# does not modify cali_df when Copy-on-Write is enabled.
cali_df['avg_bedrooms_per_household'] = (
    cali_df['avg_bedrooms_per_household'].fillna(mode_value)
)

In [5]:
# Non-null count per column, to check whether any missing values remain
cali_df.count()

Unnamed: 0                    20640
longitude                     20640
latitude                      20640
housing_median_age            20640
population                    20640
households                    20640
median_income                 20640
median_house_value            20640
ocean_proximity               20640
avg_rooms_per_household       20640
avg_bedrooms_per_household    20640
dtype: int64

In [6]:
# Scale house values for inflation and drop the leftover 'Unnamed: 0' column.
INFLATION_FACTOR = 1.69  # multiplier applied to median_house_value

# Build the adjusted frame in one expression and rebind cali_df only at the end.
# If this cell is re-run, the drop raises KeyError (the column is already gone)
# before cali_df is rebound, so the prices are never multiplied a second time.
cali_df = (
    cali_df
    .assign(median_house_value=cali_df['median_house_value'] * INFLATION_FACTOR)
    .drop(columns='Unnamed: 0')
)

In [7]:
# Preview the frame after the inflation scaling and the removal of the
# 'Unnamed: 0' column
cali_df.head()

Unnamed: 0,longitude,latitude,housing_median_age,population,households,median_income,median_house_value,ocean_proximity,avg_rooms_per_household,avg_bedrooms_per_household
0,-122.23,37.88,41.0,322.0,126.0,8.3252,764894.0,NEAR BAY,6.984127,1.02381
1,-122.22,37.86,21.0,2401.0,1138.0,8.3014,605865.0,NEAR BAY,6.238137,0.97188
2,-122.24,37.85,52.0,496.0,177.0,7.2574,595049.0,NEAR BAY,8.288136,1.073446
3,-122.25,37.85,52.0,558.0,219.0,5.6431,576797.0,NEAR BAY,5.817352,1.073059
4,-122.25,37.85,52.0,565.0,259.0,3.8462,578318.0,NEAR BAY,6.281853,1.081081


In [8]:
# List the available columns in order to choose which ones to scale
cali_df.columns


Index(['longitude', 'latitude', 'housing_median_age', 'population',
       'households', 'median_income', 'median_house_value', 'ocean_proximity',
       'avg_rooms_per_household', 'avg_bedrooms_per_household'],
      dtype='object')

In [9]:
# Standardise the numeric features. longitude, latitude and ocean_proximity
# are not in this list, so they are not passed to the scaler.
columns_to_scale = [
    'housing_median_age',
    'population',
    'households',
    'median_income',
    'median_house_value',
    'avg_rooms_per_household',
    'avg_bedrooms_per_household',
]
numeric_features = cali_df[columns_to_scale]
scaler = StandardScaler()
cali_scaled = scaler.fit_transform(numeric_features)

In [10]:
# Create a DataFrame from the scaled array.
# Reuse cali_df's index so every scaled row keeps the label of the row it was
# computed from; the later concat with the ocean_proximity dummies (which carry
# cali_df's index) then aligns by label even if cali_df's index is not 0..n-1.
cali_scaled_df = pd.DataFrame(
    cali_scaled,
    columns=columns_to_scale,
    index=cali_df.index,
)
cali_scaled_df.head()

Unnamed: 0,housing_median_age,population,households,median_income,median_house_value,avg_rooms_per_household,avg_bedrooms_per_household
0,0.982143,-0.974429,-0.977033,2.344766,2.129631,0.628559,-0.152553
1,-0.607019,0.861439,1.669961,2.332238,1.314156,0.327041,-0.262155
2,1.856182,-0.820777,-0.843637,1.782699,1.258693,1.15562,-0.04779
3,1.856182,-0.766028,-0.733781,0.932968,1.1651,0.156966,-0.048606
4,1.856182,-0.759847,-0.629157,-0.012881,1.1729,0.344711,-0.031676


In [11]:
# One-hot encode the categorical ocean_proximity column as 0/1 integer columns
proximity_labels = cali_df['ocean_proximity']
ocean_proximity_dummies = pd.get_dummies(proximity_labels, dtype=int)
ocean_proximity_dummies.head()

Unnamed: 0,<1H OCEAN,INLAND,ISLAND,NEAR BAY,NEAR OCEAN
0,0,0,0,1,0
1,0,0,0,1,0
2,0,0,0,1,0
3,0,0,0,1,0
4,0,0,0,1,0


In [12]:
# Concatenate the one-hot ocean_proximity columns with the scaled features.
# Selecting columns_to_scale first keeps this cell idempotent: on a re-run any
# dummy columns appended by a previous execution are left out before the
# dummies are added again, instead of being duplicated.
cali_scaled_df = pd.concat(
    [cali_scaled_df[columns_to_scale], ocean_proximity_dummies],
    axis=1,
)
cali_scaled_df.head()

Unnamed: 0,housing_median_age,population,households,median_income,median_house_value,avg_rooms_per_household,avg_bedrooms_per_household,<1H OCEAN,INLAND,ISLAND,NEAR BAY,NEAR OCEAN
0,0.982143,-0.974429,-0.977033,2.344766,2.129631,0.628559,-0.152553,0,0,0,1,0
1,-0.607019,0.861439,1.669961,2.332238,1.314156,0.327041,-0.262155,0,0,0,1,0
2,1.856182,-0.820777,-0.843637,1.782699,1.258693,1.15562,-0.04779,0,0,0,1,0
3,1.856182,-0.766028,-0.733781,0.932968,1.1651,0.156966,-0.048606,0,0,0,1,0
4,1.856182,-0.759847,-0.629157,-0.012881,1.1729,0.344711,-0.031676,0,0,0,1,0


## Find the best k value for the model

In [18]:
# Candidate cluster counts to evaluate: every integer from 1 through 20
k = [n_clusters for n_clusters in range(1, 21)]

In [19]:
# Fit one K-Means model per candidate k and record its inertia.
# n_init is passed explicitly as 10 -- the default reported in the warning the
# original run emitted -- so the results are unchanged and scikit-learn no
# longer raises a FutureWarning on every fit.
inertia = [
    KMeans(n_clusters=i, random_state=7, n_init=10).fit(cali_scaled_df).inertia_
    for i in k
]
inertia

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super().

[158400.46579457365,
 133693.81572351672,
 112599.20714885852,
 93332.97430272878,
 82782.57247632978,
 74009.96020151577,
 67633.70960533015,
 62689.37228466217,
 58761.48568243565,
 55380.07966259046,
 52493.90165341168,
 49975.952599528966,
 48052.19228700085,
 46430.183286996995,
 44901.642842262554,
 43662.8373132304,
 42349.25050304145,
 41213.83154030004,
 40073.46602956686,
 38991.45641081751]

In [20]:
# Pair every candidate k with the inertia measured for it
elbow_curve = dict(k=k, inertia=inertia)

# Tabulate the pairs (one row per k) so they can be plotted as an elbow curve
elbow_plot_df = pd.DataFrame(data=elbow_curve)

elbow_plot_df.head()

Unnamed: 0,k,inertia
0,1,158400.465795
1,2,133693.815724
2,3,112599.207149
3,4,93332.974303
4,5,82782.572476


In [24]:
# Draw inertia against k as a line chart, with one x tick per candidate k,
# so the optimal number of clusters can be identified visually.
line_options = {
    'x': 'k',
    'y': 'inertia',
    'xticks': k,
    'label': 'Non-PCA elbow curve',
}
elbow_plot = elbow_plot_df.hvplot.line(**line_options)
display(elbow_plot)

print('The optimal value for K is 4 based on the above figure using the elbow method')

The optimal value for K is 4 based on the above figure using the elbow method


## Build the clustering model without using PCA


In [25]:
# Instantiate the K-Means model with the k chosen from the elbow curve.
# n_init is set explicitly to 10 (the default reported in the warning the
# original fit emitted) to keep the results and silence the FutureWarning.
model = KMeans(n_clusters=4, random_state=42, n_init=10)

In [26]:
# Fit the K-Means model on the scaled numeric features together with the
# one-hot encoded ocean_proximity columns
model.fit(cali_scaled_df)

  super()._check_params_vs_input(X, default_n_init=10)


In [32]:
# Predict the cluster of every row with the fitted model
cluster_labels = model.predict(cali_scaled_df)
# Print the array computed above. The original printed `labels`, a name that is
# not assigned in any cell shown here and raises NameError on a fresh kernel.
print(cluster_labels)


[1 1 1 ... 0 0 0]


In [31]:
# Copy the feature frame so the cluster labels can be added without putting an
# extra column on cali_scaled_df itself
cali_scaled_df_copy = cali_scaled_df.copy()

In [33]:
# Store each row's predicted cluster in a new 'cluster label' column of the copy
cali_scaled_df_copy['cluster label'] = cluster_labels

In [35]:
# Preview the copy with its new 'cluster label' column
cali_scaled_df_copy.head()

Unnamed: 0,housing_median_age,population,households,median_income,median_house_value,avg_rooms_per_household,avg_bedrooms_per_household,<1H OCEAN,INLAND,ISLAND,NEAR BAY,NEAR OCEAN,cluster label
0,0.982143,-0.974429,-0.977033,2.344766,2.129631,0.628559,-0.152553,0,0,0,1,0,1
1,-0.607019,0.861439,1.669961,2.332238,1.314156,0.327041,-0.262155,0,0,0,1,0,1
2,1.856182,-0.820777,-0.843637,1.782699,1.258693,1.15562,-0.04779,0,0,0,1,0,1
3,1.856182,-0.766028,-0.733781,0.932968,1.1651,0.156966,-0.048606,0,0,0,1,0,1
4,1.856182,-0.759847,-0.629157,-0.012881,1.1729,0.344711,-0.031676,0,0,0,1,0,1
