In [None]:
#Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

In [None]:
#Importing train and test dataset
test = pd.read_csv("test.csv",sep=",")
train = pd.read_csv("train.csv",sep=",")

In [None]:
#-----------------------------------------------------------------------------------------------------------------------------------------------------
#TRAIN DATASET
#-----------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
#Train - description
train.head(3) #initially we have a lot of null values

In [None]:
#Train - shape
train.shape #we have initially 1460 rows and 81 columns 

In [None]:
#Train - info
train.info()

In [None]:
#Studying null values
(train.isnull().sum()/train.shape[0]).sort_values(ascending=False).head(20) #finding % of null values in each column

In [None]:
#Deleting all columns with more than 10% of null values
remove = train.columns[(train.isnull().sum()/train.shape[0]) > 0.1]
train = train.drop(remove,axis=1)

In [None]:
#Checking numeric columns
numeric_columns = train.columns[train.dtypes != 'object']

In [None]:
#Checking categoric columns
categoric_columns = train.columns[train.dtypes == 'object']

In [None]:
#Checking and treating null values on numeric train base
train_numeric = train.loc[:,numeric_columns]
train_numeric.head()
train_numeric.isnull().sum().sort_values(ascending=False) #Two columns with null values: GarageYrBlt and MasVnrArea

In [None]:
#GarageYrBlt
top_garageyrblt = train_numeric.groupby(train_numeric['GarageYrBlt']).size().sort_values(ascending=False).head(5).tolist()
#In order, we get: 2005.0,2006.0,2004.0,2003.0,2007.0. Will use these values to fill the null
train_numeric['GarageYrBlt'].fillna(pd.Series(np.random.choice(top_garageyrblt,size=len(train_numeric.index))), inplace=True)
#Checking if we still have null values
train_numeric.isnull().sum().sort_values(ascending=False)

In [None]:
#MasVnrArea
top_masva = train_numeric.groupby(train_numeric['MasVnrArea']).size().sort_values(ascending=False).head(1)
train_numeric['MasVnrArea'].fillna(top_masva, inplace=True)
#Checking if we still have null values
train_numeric.isnull().sum().sort_values(ascending=False) #No more null values

In [None]:
#Checking and treating null values on categoric train base
train_categoric = train.loc[:,categoric_columns]
train_categoric.head()
train_categoric.isnull().sum().sort_values(ascending=False)
'''
Columns with null values:
GarageCond       81
GarageQual       81
GarageFinish     81
GarageType       81
BsmtExposure     38
BsmtFinType2     38
BsmtCond         37
BsmtFinType1     37
BsmtQual         37
Electrical        1
'''

In [None]:
#GarageCond
garage_cond = train_categoric.groupby(train_categoric['GarageCond']).size().sort_values(ascending=False).head(5)
#Since we have a huge difference between 'TA' and other values in this column, we will replace null values for 'TA'
train_categoric['GarageCond'].fillna('TA',inplace=True)
#Checking if we still have null values on GarageCond
train_categoric.isnull().sum().sort_values(ascending=False)

In [None]:
#GarageQual, GarageFinish, GarageType
garage_qual = train_categoric.groupby(train_categoric['GarageQual']).size().sort_values(ascending=False).head(5) #same case as GarageCond
garage_finish = train_categoric.groupby(train_categoric['GarageFinish']).size().sort_values(ascending=False).head(5)
garage_type = train_categoric.groupby(train_categoric['GarageType']).size().sort_values(ascending=False).head(5)

#Garage qual is the same case as GarageCond -> Fill with TA
train_categoric['GarageQual'].fillna('TA',inplace=True)
#For GarageFinish, we have not much difference between the data, so we will random fill the null values
train_categoric['GarageFinish'].fillna(pd.Series(np.random.choice(garage_finish.tolist(),size=len(train_categoric.index))), inplace=True)
#For GarageType, we have a huge difference between "Attchd" and other values, so we can fill the null values with this item
train_categoric['GarageType'].fillna('Attchd',inplace=True)

In [None]:
#BsmtExposure, BsmtFinType2, BsmtCond, BsmtQual, BsmtFinType1
bsmt_exposure = train_categoric.groupby(train_categoric['BsmtExposure']).size().sort_values(ascending=False).head(5)
#Replacing for No, since it's an outstanding class in the data
train_categoric['BsmtExposure'].fillna('No',inplace=True)
bsmt_fin2 = train_categoric.groupby(train_categoric['BsmtFinType2']).size().sort_values(ascending=False).head(5)
#Replacing for Unf, since it's an outstanding class in the data
train_categoric['BsmtFinType2'].fillna('Unf',inplace=True)
bsmt_fin1 = train_categoric.groupby(train_categoric['BsmtFinType1']).size().sort_values(ascending=False).head(2)
#Random fill with top 2 values (430 and 418)
train_categoric['BsmtFinType1'].fillna(pd.Series(np.random.choice(bsmt_fin1.tolist(),size=len(train_categoric.index))), inplace=True)
bsmt_cond = train_categoric.groupby(train_categoric['BsmtCond']).size().sort_values(ascending=False).head(5)
#Filling null with TA, since it's the outstanding value
train_categoric['BsmtCond'].fillna('TA',inplace=True)
bsmt_qual = train_categoric.groupby(train_categoric['BsmtQual']).size().sort_values(ascending=False).head(5)
#Random filling null with top 2 values (649 and 618)
train_categoric['BsmtQual'].fillna(pd.Series(np.random.choice(bsmt_qual.tolist(),size=len(train_categoric.index))), inplace=True)

In [None]:
#Electrical
electrical = train_categoric.groupby(train_categoric['Electrical']).size().sort_values(ascending=False).head(5)
#Fill with 'Sbrkr' which is the most outstanding value
train_categoric['Electrical'].fillna('SBrkr',inplace=True)

In [None]:
#Getting our final train_dataset by combining both numeric and categoric data
train_dataset = pd.concat([train_numeric,train_categoric],axis=1)

In [None]:
#-----------------------------------------------------------------------------------------------------------------------------------------------------
#TEST DATASET
#-----------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
#Here we will do the same thing we've done above to check for null values and understand our columns
test.head(3)
test.shape #1459 rows and 80 columns (no 'Sales price' column, which is what we want to predict)
test.info()

In [None]:
#Removing all columns that have more than 10% of null values
(test.isnull().sum()/test.shape[0]).sort_values(ascending=False).head(20)
remove_test = test.columns[(test.isnull().sum()/test.shape[0]) > 0.1]
test = test.drop(remove_test,axis=1)

In [None]:
numeric_tst = test.columns[test.dtypes != 'object']
categoric_tst = test.columns[test.dtypes == 'object']

In [None]:
#Treating numeric values
test_numeric = test.loc[:,numeric_tst]
test_numeric.head(5)
test_numeric.isnull().sum().sort_values(ascending=False).head(10) #10 columns with null values
'''
GarageYrBlt     78
MasVnrArea      15
BsmtHalfBath     2
BsmtFullBath     2
BsmtUnfSF        1
GarageCars       1
GarageArea       1
BsmtFinSF1       1
BsmtFinSF2       1
TotalBsmtSF      1
'''

In [None]:
#Grouping columns
garage_yrblt = test_numeric.groupby(test_numeric['GarageYrBlt']).size().sort_values(ascending=False).head(5)
masvnrarea = test_numeric.groupby(test_numeric['MasVnrArea']).size().sort_values(ascending=False).head(5)
bsmthalfbath = test_numeric.groupby(test_numeric['BsmtHalfBath']).size().sort_values(ascending=False).head(5)
bsmtfullbath = test_numeric.groupby(test_numeric['BsmtFullBath']).size().sort_values(ascending=False).head(5)
bsmtunfsf = test_numeric.groupby(test_numeric['BsmtUnfSF']).size().sort_values(ascending=False).head(5)
garagecars = test_numeric.groupby(test_numeric['GarageCars']).size().sort_values(ascending=False).head(5)
garagearea = test_numeric.groupby(test_numeric['GarageArea']).size().sort_values(ascending=False).head(5)
bsmtfinsf1 = test_numeric.groupby(test_numeric['BsmtFinSF1']).size().sort_values(ascending=False).head(5)
bsmtfinsf2 = test_numeric.groupby(test_numeric['BsmtFinSF2']).size().sort_values(ascending=False).head(5)
totalbsmtsf = test_numeric.groupby(test_numeric['TotalBsmtSF']).size().sort_values(ascending=False).head(2)

#Garage_yrblt - Random fill with top 5 values
test_numeric['GarageYrBlt'].fillna(pd.Series(np.random.choice(garage_yrblt.tolist(),size=len(test_numeric.index))), inplace=True)
#MasVnrArea - Fill with 0.0
test_numeric['MasVnrArea'].fillna('0.0',inplace=True)
#BmstHalfBath - Fill with 0.0
test_numeric['BsmtHalfBath'].fillna('0.0',inplace=True)
#Bsmtfullbath - fill with 0.0
test_numeric['BsmtFullBath'].fillna('0.0',inplace=True)
#Bsmtunfsf - fill with 0.0
test_numeric['BsmtUnfSF'].fillna('0.0',inplace=True)
#Garagecars - fill with '2.0'
test_numeric['GarageCars'].fillna('2.0',inplace=True)
#GarageArea - randomic fill among top 5 values
test_numeric['GarageArea'].fillna(pd.Series(np.random.choice(garagearea.tolist(),size=len(test_numeric.index))), inplace=True)
#BsmtfinsF1 (Fill with '0.0') and BsmtfinsF2 (Fill with '0.0')
test_numeric['BsmtFinSF1'].fillna('0.0',inplace=True)
test_numeric['BsmtFinSF2'].fillna('0.0',inplace=True)
#TotalBsmtSF - Randomic fill among top 2
test_numeric['TotalBsmtSF'].fillna(pd.Series(np.random.choice(totalbsmtsf.tolist(),size=len(test_numeric.index))),inplace=True)

In [None]:
test_categoric = test.loc[:,categoric_tst]
test_categoric.head(5)
test_categoric.isnull().sum().sort_values(ascending=False).head(16) #16 columns with null values
'''
GarageCond      78
GarageQual      78
GarageFinish    78
GarageType      76
BsmtCond        45
BsmtExposure    44
BsmtQual        44
BsmtFinType1    42
BsmtFinType2    42
MSZoning         4
Functional       2
Utilities        2
Exterior1st      1
Exterior2nd      1
SaleType         1
KitchenQual      1
'''

In [None]:
#GarageCond      78
tst_garagecond = test_categoric.groupby(test_categoric['GarageCond']).size().sort_values(ascending=False).head(5) 
test_categoric['GarageCond'].fillna('TA',inplace=True)
#GarageQual      78
tst_garagequal = test_categoric.groupby(test_categoric['GarageQual']).size().sort_values(ascending=False).head(5)
test_categoric['GarageQual'].fillna('TA',inplace=True)
#GarageFinish    78
tst_garagefinish = test_categoric.groupby(test_categoric['GarageFinish']).size().sort_values(ascending=False).head(5)
test_categoric['GarageFinish'].fillna('Unf',inplace=True)
#GarageType      76
tst_garagetype = test_categoric.groupby(test_categoric['GarageType']).size().sort_values(ascending=False).head(5)
test_categoric['GarageType'].fillna('Attchd',inplace=True)
#BsmtCond        45
tst_bsmtcond = test_categoric.groupby(test_categoric['BsmtCond']).size().sort_values(ascending=False).head(5)
test_categoric['BsmtCond'].fillna('TA',inplace=True)
#BsmtExposure    44
tst_bsmtexposure = test_categoric.groupby(test_categoric['BsmtExposure']).size().sort_values(ascending=False).head(5)
test_categoric['BsmtExposure'].fillna('No',inplace=True)
#BsmtQual        44
tst_bsmtqual = test_categoric.groupby(test_categoric['BsmtQual']).size().sort_values(ascending=False).head(2)
test_categoric['BsmtQual'].fillna(pd.Series(np.random.choice((tst_bsmtqual.tolist()),size=len(test_categoric.index))),inplace=True)
#BsmtFinType1    42
tst_bsmtfintype1 = test_categoric.groupby(test_categoric['BsmtFinType1']).size().sort_values(ascending=False).head(2)
test_categoric['BsmtFinType1'].fillna(pd.Series(np.random.choice((tst_bsmtfintype1.tolist()),size=len(test_categoric.index))),inplace=True)
#BsmtFinType2    42
tst_bsmtfintype2 = test_categoric.groupby(test_categoric['BsmtFinType2']).size().sort_values(ascending=False).head(5)
test_categoric['BsmtFinType2'].fillna('Unf',inplace=True)
#MSZoning         4
tst_mszoning = test_categoric.groupby(test_categoric['MSZoning']).size().sort_values(ascending=False).head(5)
test_categoric['MSZoning'].fillna('RL',inplace=True)
#Functional       2
tst_functional = test_categoric.groupby(test_categoric['Functional']).size().sort_values(ascending=False).head(5)
test_categoric['Functional'].fillna('Typ',inplace=True)
#Utilities        2
tst_utilities = test_categoric.groupby(test_categoric['Utilities']).size().sort_values(ascending=False).head(5) #Only AllPub
test_categoric['Utilities'].fillna('AllPub',inplace=True)
#Exterior1st      1
tst_exterior1st = test_categoric.groupby(test_categoric['Exterior1st']).size().sort_values(ascending=False).head(5)
test_categoric['Exterior1st'].fillna('VinylSd',inplace=True)
#Exterior2nd      1
tst_exterior2nd = test_categoric.groupby(test_categoric['Exterior2nd']).size().sort_values(ascending=False).head(5)
test_categoric['Exterior2nd'].fillna('VinylSd',inplace=True)
#SaleType         1
tst_saletype = test_categoric.groupby(test_categoric['SaleType']).size().sort_values(ascending=False).head(5)
test_categoric['SaleType'].fillna('WD',inplace=True)
#KitchenQual      1
tst_kitchenqual = test_categoric.groupby(test_categoric['KitchenQual']).size().sort_values(ascending=False).head(5)
test_categoric['KitchenQual'].fillna('TA',inplace=True)

In [None]:
#Getting our final test dataset with no null values
test_dataset = pd.concat([test_numeric,test_categoric],axis=1)

In [None]:
#Comparing train and test datasets
columns_in_train_not_in_test = (set(train_dataset.columns)) - (set(test_dataset.columns))
columns_in_test_not_in_train = (set(test_dataset.columns)) - (set(train_dataset.columns))

In [None]:
#-----------------------------------------------------------------------------------------------------------------------------------------------------
#ORGANIZING DATASETS + CLEANING SOME COLUMNS
#-----------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
#First we will clean our database by removing some columns that may not add important info to our study. 

columns_to_remove = ['LandContour','LotConfig','YearRemodAdd','RoofStyle','Exterior2nd',
                     'MasVnrArea','ExterCond','BsmtQual','BsmtExposure','BsmtFinSF1','BsmtFinType1','BsmtFinType2','BsmtFinSF2','BsmtUnfSF',
                     'HeatingQC','Electrical','Functional','GarageYrBlt','GarageFinish','GarageCars','GarageCond','3SsnPorch',
                     'ScreenPorch','MoSold','YrSold']

'''['Lot Frontage','Alley','LandContour','LotConfig','Land Slope','Condition 2','House Style','YearRemodAdd','RoofStyle','Exterior2nd',
                     'MasVnrArea','ExterCond','BsmtQual','BsmtExposure','BsmtFinSF1','BsmtFinType1','BsmtFinType2','BsmtFinSF2','BsmtUnfSF',
                     'HeatingQC','Electrical','Functional','FireplaceQu','GarageYrBlt','GarageFinish','GarageCars','GarageCond','3SsnPorch',
                     'ScreenPorch','MiscFeature','MoSold','YrSold']'''

#['Lot Frontage', 'Alley', 'Land Slope', 'Condition 2', 'House Style', 'FireplaceQu', 'MiscFeature'] 

train_final1 = train_dataset.drop(columns_to_remove,axis=1) #This leaves us with 49 columns to treat
train_final1.head()

In [None]:
#Treating the rest of the columns we have on our database

#MSZoning - Only letting the first letter of the values (this way we will make RH, RL, RP and RM into one (Residential))
group_msz = train_final1.groupby(train_final1['MSZoning']).size()
train_final1['MSZoning'] = train_final1['MSZoning'].str[0]

#LotShape - Transform data to keep only the first letter (R for Regular and I for Irregular)
group_ls = train_final1.groupby(train_final1['LotShape']).size()
train_final1['LotShape'] = train_final1['LotShape'].str[0]

#Create column (Overall) with the sum of OverallQual and OverallCond
#train_final1['Overall'] = train_final1['OverallCond'] + train_final1['OverallQual']
#group_ov = train_final1.groupby(train_final1['Overall']).size()
#train_final1 = train_final1.drop(columns=['OverallCond','OverallQual'],axis=1)

In [None]:
#1stFlrSF and 2ndFlrSF - Put it into groups of data
#--------------------------------------------------------------------------------------------------------------------------------------------------
#1stFlrSF
#--------------------------------------------------------------------------------------------------------------------------------------------------
#Using kmeans to understand what is the better way to divide the groups
fflrsf = train_final1['1stFlrSF'].values.reshape(-1,1)
sse = []
for i in range(1,11):
    kmeans = KMeans(n_clusters=i)
    kmeans.fit(fflrsf)
    sse.append(kmeans.inertia_) #Lower SSE values indicate more compact and well-defined clusters.

""" plt.plot(range(1,11),sse,marker='x')
plt.xlabel('Clusters nmb')
plt.ylabel('Inertia sum') #sum of the distances to the center
plt.title('Kmeans for 1stFlrSF')
plt.show() """

#3 groups is our optimal number of clusters for 1stFlrSF

kmeans = KMeans(n_clusters=3)
kmeans.fit(fflrsf)
clusters = kmeans.predict(fflrsf)
df_fstflrsf = pd.DataFrame({'Value':train_final1['1stFlrSF'],'Cluster':clusters})

train_final1['1stFlrSF Cluster'] = clusters

In [None]:
#--------------------------------------------------------------------------------------------------------------------------------------------------
#2ndFlrSF
#--------------------------------------------------------------------------------------------------------------------------------------------------

sflrsf = train_final1['2ndFlrSF'].values.reshape(-1,1)
sse=[]
for i in range(1,11):
    kmeans=KMeans(n_clusters=i)
    kmeans.fit(sflrsf)
    sse.append(kmeans.inertia_)

""" plt.plot(range(1,11),sse,marker='x')
plt.xlabel('Clusters nmb')
plt.ylabel('Inertia sum')
plt.title('Kmeans for 2ndFlrSF')
plt.show() """

#3 groups is our optimal number of clusters for 2ndFlrSF

kmeans = KMeans(n_clusters=3)
kmeans.fit(sflrsf)
clusters_1 = kmeans.predict(sflrsf)
df_sftflrsf = pd.DataFrame({'Value':train_final1['2ndFlrSF'],'Cluster':clusters_1})
train_final1['2ndFlrSF Cluster'] = clusters_1
train_final1.drop(columns=['2ndFlrSF'],axis=1)

In [None]:
#GrLivArea - Put it into groups of data
grlivarea = train_final1['GrLivArea'].values.reshape(-1,1)
sse=[]
for i in range(1,11):
    kmeans = KMeans(n_clusters=i)
    kmeans.fit(grlivarea)
    sse.append(kmeans.inertia_)

plt.plot(range(1,11),sse,marker='x')
plt.xlabel('Clusters nmb')
plt.ylabel('Inertia sum')
plt.title('Kmeans for GrLivArea')
plt.show()

#3 is the most correct number of clusters

kmeans = KMeans(n_clusters=3)
kmeans.fit(grlivarea)
clusters_2 = kmeans.predict(grlivarea)
df_grlivarea = pd.DataFrame({'Value':train_final1['GrLivArea'],'Cluster':clusters_2})
train_final1['GrLivArea Cluster'] = clusters_2

In [None]:
#Sum BsmtFullBath + BsmtHalfBath + FullBath + HalfBath
train_final1['House Total Bath'] = train_final1['BsmtFullBath'] + train_final1['BsmtHalfBath'] + train_final1['FullBath'] + train_final1['HalfBath']

In [None]:
#WoodDeckSF - group
wooddecksf = train_final1['WoodDeckSF'].values.reshape(-1,1)
sse=[]
for i in range(1,11):
    kmeans = KMeans(n_clusters=i)
    kmeans.fit(wooddecksf)
    sse.append(kmeans.inertia_)

plt.plot(range(1,11),sse,marker='x')
plt.xlabel('Clusters nmb')
plt.ylabel('Inertia sum')
plt.title('Kmeans for WoodDeckSF')
plt.show()

#3 is the most correct number of clusters

kmeans = KMeans(n_clusters=3)
kmeans.fit(wooddecksf)
clusters_3 = kmeans.predict(wooddecksf)
train_final1['WoodDeckSF Cluster'] = clusters_3

In [None]:
#Grouping LotArea
lotarea = train_final1['LotArea'].values.reshape(-1,1)
sse=[]
for i in range(1,15):
    kmeans = KMeans(n_clusters=i)
    kmeans.fit(lotarea)
    sse.append(kmeans.inertia_)

plt.plot(range(1,15),sse,marker='x')
plt.xlabel('Clusters nmb')
plt.ylabel('Inertia sum')
plt.title('Kmeans for LotArea')
plt.show()

#3 is the most correct number of clusters

kmeans = KMeans(n_clusters=3)
kmeans.fit(lotarea)
clusters_4 = kmeans.predict(lotarea)
train_final1['LotArea Cluster'] = clusters_4

In [None]:
#YearBuilt
yearbuilt = train_final1['YearBuilt'].values.reshape(-1,1)
sse=[]
for i in range(1,15):
    kmeans = KMeans(n_clusters=i)
    kmeans.fit(yearbuilt)
    sse.append(kmeans.inertia_)

plt.plot(range(1,15),sse,marker='x')
plt.xlabel('Clusters nmb')
plt.ylabel('Inertia sum')
plt.title('Kmeans for YearBuilt')
plt.show()

kmeans = KMeans(n_clusters=3)
kmeans.fit(yearbuilt)
clusters_5 = kmeans.predict(yearbuilt)
train_final1['YearBuilt cluster'] = clusters_5

In [None]:
#TotalBsmtSF
totalbsmtsf_k = train_final1['TotalBsmtSF'].values.reshape(-1,1)
sse=[]
for i in range(1,15):
    kmeans = KMeans(n_clusters=i)
    kmeans.fit(totalbsmtsf_k)
    sse.append(kmeans.inertia_)

plt.plot(range(1,15),sse,marker='x')
plt.xlabel('Clusters nmb')
plt.ylabel('Inertia sum')
plt.title('Kmeans for Total BsmtSF')
plt.show()

kmeans = KMeans(n_clusters=3)
kmeans.fit(totalbsmtsf_k)
clusters_6 = kmeans.predict(totalbsmtsf_k)
train_final1['TotalBsmtSF'] = clusters_6

In [None]:
#Cleaning database 1
columns_to_delete_1 = ['LotArea','OverallQual','OverallCond','YearBuilt','1stFlrSF','2ndFlrSF','BsmtFullBath','BsmtHalfBath','FullBath','HalfBath']
columns_to_delete_2 = ['BedroomAbvGr','KitchenAbvGr']
#train_final1 = train_final1.drop(columns_to_delete_1,axis=1)
train_final1 = train_final1.drop(columns_to_delete_2,axis=1)

In [None]:
#GrLivArea
grlivarea = train_final1['GrLivArea'].values.reshape(-1,1)
sse=[]
for i in range(1,15):
    kmeans = KMeans(n_clusters=i)
    kmeans.fit(grlivarea)
    sse.append(kmeans.inertia_)

plt.plot(range(1,15),sse,marker='x')
plt.xlabel('Clusters nmb')
plt.ylabel('Inertia sum')
plt.title('Kmeans for Total GrLivArea')
plt.show()

kmeans = KMeans(n_clusters=3)
kmeans.fit(grlivarea)
clusters_7 = kmeans.predict(grlivarea)
train_final1['GrLivArea'] = clusters_7

In [None]:
#GarageArea
garagearea_k = train_final1['GarageArea'].values.reshape(-1,1)
sse=[]
for i in range(1,15):
    kmeans = KMeans(n_clusters=i)
    kmeans.fit(garagearea_k)
    sse.append(kmeans.inertia_)

plt.plot(range(1,15),sse,marker='x')
plt.xlabel('Clusters nmb')
plt.ylabel('Inertia sum')
plt.title('Kmeans for Total GarageArea')
plt.show()

kmeans = KMeans(n_clusters=3)
kmeans.fit(garagearea_k)
clusters_8 = kmeans.predict(garagearea_k)
train_final1['GarageArea'] = clusters_8

In [63]:
#SaleCondition
salec = train_final1.groupby(train_final1['SaleCondition']).size()
salec
#Normal = 1, the rest of the conditions will be trated as Abnorml
train_final1['SaleCondition'] = train_final1['SaleCondition'].map({'Abnorml':'0','AdjLand':'0','Alloca':'0','Family':'0','Normal':'1','Partial':'0'})

SaleCondition
0     262
1    1198
dtype: int64

In [65]:
columns_to_delete_3 = ['WoodDeckSF','Fireplaces','OpenPorchSF']
train_final1 = train_final1.drop(columns_to_delete_3,axis=1)
train_final1.head()

Unnamed: 0,Id,MSSubClass,TotalBsmtSF,LowQualFinSF,GrLivArea,TotRmsAbvGrd,GarageArea,EnclosedPorch,PoolArea,MiscVal,...,PavedDrive,SaleType,SaleCondition,1stFlrSF Cluster,2ndFlrSF Cluster,GrLivArea Cluster,House Total Bath,WoodDeckSF Cluster,LotArea Cluster,YearBuilt cluster
0,1,60,1,0,0,8,2,0,0,0,...,Y,WD,,0,2,1,4,1,0,1
1,2,20,1,0,1,6,2,0,0,0,...,Y,WD,,1,1,0,3,0,0,2
2,3,60,1,0,0,6,2,0,0,0,...,Y,WD,,0,2,1,4,1,0,1
3,4,70,2,0,0,7,2,272,0,0,...,Y,WD,,0,2,1,2,1,0,0
4,5,60,1,0,2,9,1,0,0,0,...,Y,WD,,1,0,2,4,2,0,1
