In [903]:
#Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
#machine learning
from sklearn import tree
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [904]:
#Importing train and test dataset
test = pd.read_csv("test.csv",sep=",")
train = pd.read_csv("train.csv",sep=",")

In [905]:
#-----------------------------------------------------------------------------------------------------------------------------------------------------
#TRAIN DATASET
#-----------------------------------------------------------------------------------------------------------------------------------------------------

In [906]:
#Train - description
train.head(3) #initially we have a lot of null values

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500


In [907]:
#Train - shape
train.shape #we have initially 1460 rows and 81 columns 

(1460, 81)

In [908]:
#Train - info
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [909]:
#Studying null values
(train.isnull().sum()/train.shape[0]).sort_values(ascending=False).head(20) #finding % of null values in each column

PoolQC          0.995205
MiscFeature     0.963014
Alley           0.937671
Fence           0.807534
MasVnrType      0.597260
FireplaceQu     0.472603
LotFrontage     0.177397
GarageYrBlt     0.055479
GarageCond      0.055479
GarageType      0.055479
GarageFinish    0.055479
GarageQual      0.055479
BsmtFinType2    0.026027
BsmtExposure    0.026027
BsmtQual        0.025342
BsmtCond        0.025342
BsmtFinType1    0.025342
MasVnrArea      0.005479
Electrical      0.000685
Id              0.000000
dtype: float64

In [910]:
#Deleting all columns with more than 10% of null values
remove = train.columns[(train.isnull().sum()/train.shape[0]) > 0.1]
train = train.drop(remove,axis=1)

In [911]:
#Checking numeric columns
numeric_columns = train.columns[train.dtypes != 'object']

In [912]:
#Checking categoric columns
categoric_columns = train.columns[train.dtypes == 'object']

In [913]:
#Checking and treating null values on numeric train base
train_numeric = train.loc[:,numeric_columns]
train_numeric.head()
train_numeric.isnull().sum().sort_values(ascending=False) #Two columns with null values: GarageYrBlt and MasVnrArea

GarageYrBlt      81
MasVnrArea        8
Id                0
OpenPorchSF       0
KitchenAbvGr      0
TotRmsAbvGrd      0
Fireplaces        0
GarageCars        0
GarageArea        0
WoodDeckSF        0
EnclosedPorch     0
HalfBath          0
3SsnPorch         0
ScreenPorch       0
PoolArea          0
MiscVal           0
MoSold            0
YrSold            0
BedroomAbvGr      0
FullBath          0
MSSubClass        0
BsmtFinSF2        0
LotArea           0
OverallQual       0
OverallCond       0
YearBuilt         0
YearRemodAdd      0
BsmtFinSF1        0
BsmtUnfSF         0
BsmtHalfBath      0
TotalBsmtSF       0
1stFlrSF          0
2ndFlrSF          0
LowQualFinSF      0
GrLivArea         0
BsmtFullBath      0
SalePrice         0
dtype: int64

In [914]:
#GarageYrBlt
top_garageyrblt = train_numeric.groupby(train_numeric['GarageYrBlt']).size().sort_values(ascending=False).head(5).tolist()
#In order, we get: 2005.0,2006.0,2004.0,2003.0,2007.0. Will use these values to fill the null
train_numeric['GarageYrBlt'].fillna(pd.Series(np.random.choice(top_garageyrblt,size=len(train_numeric.index))), inplace=True)
#Checking if we still have null values
train_numeric.isnull().sum().sort_values(ascending=False)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_numeric['GarageYrBlt'].fillna(pd.Series(np.random.choice(top_garageyrblt,size=len(train_numeric.index))), inplace=True)


MasVnrArea       8
Id               0
OpenPorchSF      0
KitchenAbvGr     0
TotRmsAbvGrd     0
Fireplaces       0
GarageYrBlt      0
GarageCars       0
GarageArea       0
WoodDeckSF       0
EnclosedPorch    0
HalfBath         0
3SsnPorch        0
ScreenPorch      0
PoolArea         0
MiscVal          0
MoSold           0
YrSold           0
BedroomAbvGr     0
FullBath         0
MSSubClass       0
BsmtFinSF2       0
LotArea          0
OverallQual      0
OverallCond      0
YearBuilt        0
YearRemodAdd     0
BsmtFinSF1       0
BsmtUnfSF        0
BsmtHalfBath     0
TotalBsmtSF      0
1stFlrSF         0
2ndFlrSF         0
LowQualFinSF     0
GrLivArea        0
BsmtFullBath     0
SalePrice        0
dtype: int64

In [915]:
#MasVnrArea
top_masva = train_numeric.groupby(train_numeric['MasVnrArea']).size().sort_values(ascending=False).head(1)
train_numeric['MasVnrArea'].fillna(top_masva, inplace=True)
#Checking if we still have null values
train_numeric.isnull().sum().sort_values(ascending=False) #No more null values

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_numeric['MasVnrArea'].fillna(top_masva, inplace=True)


MasVnrArea       8
Id               0
OpenPorchSF      0
KitchenAbvGr     0
TotRmsAbvGrd     0
Fireplaces       0
GarageYrBlt      0
GarageCars       0
GarageArea       0
WoodDeckSF       0
EnclosedPorch    0
HalfBath         0
3SsnPorch        0
ScreenPorch      0
PoolArea         0
MiscVal          0
MoSold           0
YrSold           0
BedroomAbvGr     0
FullBath         0
MSSubClass       0
BsmtFinSF2       0
LotArea          0
OverallQual      0
OverallCond      0
YearBuilt        0
YearRemodAdd     0
BsmtFinSF1       0
BsmtUnfSF        0
BsmtHalfBath     0
TotalBsmtSF      0
1stFlrSF         0
2ndFlrSF         0
LowQualFinSF     0
GrLivArea        0
BsmtFullBath     0
SalePrice        0
dtype: int64

In [916]:
#Checking and treating null values on categoric train base
train_categoric = train.loc[:,categoric_columns]
train_categoric.head()
train_categoric.isnull().sum().sort_values(ascending=False)
'''
Columns with null values:
GarageCond       81
GarageQual       81
GarageFinish     81
GarageType       81
BsmtExposure     38
BsmtFinType2     38
BsmtCond         37
BsmtFinType1     37
BsmtQual         37
Electrical        1
'''

'\nColumns with null values:\nGarageCond       81\nGarageQual       81\nGarageFinish     81\nGarageType       81\nBsmtExposure     38\nBsmtFinType2     38\nBsmtCond         37\nBsmtFinType1     37\nBsmtQual         37\nElectrical        1\n'

In [917]:
#GarageCond
garage_cond = train_categoric.groupby(train_categoric['GarageCond']).size().sort_values(ascending=False).head(5)
#Since we have a huge difference between 'TA' and other values in this column, we will replace null values for 'TA'
train_categoric['GarageCond'].fillna('TA',inplace=True)
#Checking if we still have null values on GarageCond
train_categoric.isnull().sum().sort_values(ascending=False)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_categoric['GarageCond'].fillna('TA',inplace=True)


GarageQual       81
GarageFinish     81
GarageType       81
BsmtExposure     38
BsmtFinType2     38
BsmtCond         37
BsmtFinType1     37
BsmtQual         37
Electrical        1
HeatingQC         0
Heating           0
MSZoning          0
CentralAir        0
Functional        0
GarageCond        0
PavedDrive        0
SaleType          0
KitchenQual       0
Foundation        0
Street            0
ExterCond         0
LotShape          0
LandContour       0
Utilities         0
LotConfig         0
LandSlope         0
Neighborhood      0
Condition1        0
Condition2        0
BldgType          0
HouseStyle        0
RoofStyle         0
RoofMatl          0
Exterior1st       0
Exterior2nd       0
ExterQual         0
SaleCondition     0
dtype: int64

In [918]:
#GarageQual, GarageFinish, GarageType
garage_qual = train_categoric.groupby(train_categoric['GarageQual']).size().sort_values(ascending=False).head(5) #same case as GarageCond
garage_finish = train_categoric.groupby(train_categoric['GarageFinish']).size().sort_values(ascending=False).head(5)
garage_type = train_categoric.groupby(train_categoric['GarageType']).size().sort_values(ascending=False).head(5)

#Garage qual is the same case as GarageCond -> Fill with TA
train_categoric['GarageQual'].fillna('TA',inplace=True)
#For GarageFinish, we have not much difference between the data, so we will random fill the null values
train_categoric['GarageFinish'].fillna(pd.Series(np.random.choice(garage_finish.tolist(),size=len(train_categoric.index))), inplace=True)
#For GarageType, we have a huge difference between "Attchd" and other values, so we can fill the null values with this item
train_categoric['GarageType'].fillna('Attchd',inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_categoric['GarageQual'].fillna('TA',inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_categoric['GarageFinish'].fillna(pd.Series(np.random.choice(garage_finish.tolist(),size=len(train_categoric.index))), inplace=True)
The behavior will change in pandas 3.0

In [919]:
#BsmtExposure, BsmtFinType2, BsmtCond, BsmtQual, BsmtFinType1
bsmt_exposure = train_categoric.groupby(train_categoric['BsmtExposure']).size().sort_values(ascending=False).head(5)
#Replacing for No, since it's an outstanding class in the data
train_categoric['BsmtExposure'].fillna('No',inplace=True)
bsmt_fin2 = train_categoric.groupby(train_categoric['BsmtFinType2']).size().sort_values(ascending=False).head(5)
#Replacing for Unf, since it's an outstanding class in the data
train_categoric['BsmtFinType2'].fillna('Unf',inplace=True)
bsmt_fin1 = train_categoric.groupby(train_categoric['BsmtFinType1']).size().sort_values(ascending=False).head(2)
#Random fill with top 2 values (430 and 418)
train_categoric['BsmtFinType1'].fillna(pd.Series(np.random.choice(bsmt_fin1.tolist(),size=len(train_categoric.index))), inplace=True)
bsmt_cond = train_categoric.groupby(train_categoric['BsmtCond']).size().sort_values(ascending=False).head(5)
#Filling null with TA, since it's the outstanding value
train_categoric['BsmtCond'].fillna('TA',inplace=True)
bsmt_qual = train_categoric.groupby(train_categoric['BsmtQual']).size().sort_values(ascending=False).head(5)
#Random filling null with top 2 values (649 and 618)
train_categoric['BsmtQual'].fillna(pd.Series(np.random.choice(bsmt_qual.tolist(),size=len(train_categoric.index))), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_categoric['BsmtExposure'].fillna('No',inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_categoric['BsmtFinType2'].fillna('Unf',inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on whic

In [920]:
#Electrical
electrical = train_categoric.groupby(train_categoric['Electrical']).size().sort_values(ascending=False).head(5)
#Fill with 'Sbrkr' which is the most outstanding value
train_categoric['Electrical'].fillna('SBrkr',inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_categoric['Electrical'].fillna('SBrkr',inplace=True)


In [921]:
#Getting our final train_dataset by combining both numeric and categoric data
train_dataset = pd.concat([train_numeric,train_categoric],axis=1)

In [922]:
#-----------------------------------------------------------------------------------------------------------------------------------------------------
#TEST DATASET
#-----------------------------------------------------------------------------------------------------------------------------------------------------

In [923]:
#Here we will do the same thing we've done above to check for null values and understand our columns
test.head(3)
test.shape #1459 rows and 80 columns (no 'Sales price' column, which is what we want to predict)
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1459 non-null   int64  
 1   MSSubClass     1459 non-null   int64  
 2   MSZoning       1455 non-null   object 
 3   LotFrontage    1232 non-null   float64
 4   LotArea        1459 non-null   int64  
 5   Street         1459 non-null   object 
 6   Alley          107 non-null    object 
 7   LotShape       1459 non-null   object 
 8   LandContour    1459 non-null   object 
 9   Utilities      1457 non-null   object 
 10  LotConfig      1459 non-null   object 
 11  LandSlope      1459 non-null   object 
 12  Neighborhood   1459 non-null   object 
 13  Condition1     1459 non-null   object 
 14  Condition2     1459 non-null   object 
 15  BldgType       1459 non-null   object 
 16  HouseStyle     1459 non-null   object 
 17  OverallQual    1459 non-null   int64  
 18  OverallC

In [924]:
#Removing all columns that have more than 10% of null values
(test.isnull().sum()/test.shape[0]).sort_values(ascending=False).head(20)
remove_test = test.columns[(test.isnull().sum()/test.shape[0]) > 0.1]
test = test.drop(remove_test,axis=1)

In [925]:
numeric_tst = test.columns[test.dtypes != 'object']
categoric_tst = test.columns[test.dtypes == 'object']

In [926]:
#Treating numeric values
test_numeric = test.loc[:,numeric_tst]
test_numeric.head(5)
test_numeric.isnull().sum().sort_values(ascending=False).head(10) #10 columns with null values
'''
GarageYrBlt     78
MasVnrArea      15
BsmtHalfBath     2
BsmtFullBath     2
BsmtUnfSF        1
GarageCars       1
GarageArea       1
BsmtFinSF1       1
BsmtFinSF2       1
TotalBsmtSF      1
'''

'\nGarageYrBlt     78\nMasVnrArea      15\nBsmtHalfBath     2\nBsmtFullBath     2\nBsmtUnfSF        1\nGarageCars       1\nGarageArea       1\nBsmtFinSF1       1\nBsmtFinSF2       1\nTotalBsmtSF      1\n'

In [927]:
#Grouping columns
garage_yrblt = test_numeric.groupby(test_numeric['GarageYrBlt']).size().sort_values(ascending=False).head(5)
masvnrarea = test_numeric.groupby(test_numeric['MasVnrArea']).size().sort_values(ascending=False).head(5)
bsmthalfbath = test_numeric.groupby(test_numeric['BsmtHalfBath']).size().sort_values(ascending=False).head(5)
bsmtfullbath = test_numeric.groupby(test_numeric['BsmtFullBath']).size().sort_values(ascending=False).head(5)
bsmtunfsf = test_numeric.groupby(test_numeric['BsmtUnfSF']).size().sort_values(ascending=False).head(5)
garagecars = test_numeric.groupby(test_numeric['GarageCars']).size().sort_values(ascending=False).head(5)
garagearea = test_numeric.groupby(test_numeric['GarageArea']).size().sort_values(ascending=False).head(5)
bsmtfinsf1 = test_numeric.groupby(test_numeric['BsmtFinSF1']).size().sort_values(ascending=False).head(5)
bsmtfinsf2 = test_numeric.groupby(test_numeric['BsmtFinSF2']).size().sort_values(ascending=False).head(5)
totalbsmtsf = test_numeric.groupby(test_numeric['TotalBsmtSF']).size().sort_values(ascending=False).head(2)

#Garage_yrblt - Random fill with top 5 values
test_numeric['GarageYrBlt'].fillna(pd.Series(np.random.choice(garage_yrblt.tolist(),size=len(test_numeric.index))), inplace=True)
#MasVnrArea - Fill with 0.0
test_numeric['MasVnrArea'].fillna('0.0',inplace=True)
#BmstHalfBath - Fill with 0.0
test_numeric['BsmtHalfBath'].fillna('0.0',inplace=True)
#Bsmtfullbath - fill with 0.0
test_numeric['BsmtFullBath'].fillna('0.0',inplace=True)
#Bsmtunfsf - fill with 0.0
test_numeric['BsmtUnfSF'].fillna('0.0',inplace=True)
#Garagecars - fill with '2.0'
test_numeric['GarageCars'].fillna('2.0',inplace=True)
#GarageArea - randomic fill among top 5 values
test_numeric['GarageArea'].fillna(pd.Series(np.random.choice(garagearea.tolist(),size=len(test_numeric.index))), inplace=True)
#BsmtfinsF1 (Fill with '0.0') and BsmtfinsF2 (Fill with '0.0')
test_numeric['BsmtFinSF1'].fillna('0.0',inplace=True)
test_numeric['BsmtFinSF2'].fillna('0.0',inplace=True)
#TotalBsmtSF - Randomic fill among top 2
test_numeric['TotalBsmtSF'].fillna(pd.Series(np.random.choice(totalbsmtsf.tolist(),size=len(test_numeric.index))),inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_numeric['GarageYrBlt'].fillna(pd.Series(np.random.choice(garage_yrblt.tolist(),size=len(test_numeric.index))), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_numeric['MasVnrArea'].fillna('0.0',inplace=True)
  test_numeric['MasVnrArea'].fillna('0.0',inplac

In [928]:
test_categoric = test.loc[:,categoric_tst]
test_categoric.head(5)
test_categoric.isnull().sum().sort_values(ascending=False).head(16) #16 columns with null values
'''
GarageCond      78
GarageQual      78
GarageFinish    78
GarageType      76
BsmtCond        45
BsmtExposure    44
BsmtQual        44
BsmtFinType1    42
BsmtFinType2    42
MSZoning         4
Functional       2
Utilities        2
Exterior1st      1
Exterior2nd      1
SaleType         1
KitchenQual      1
'''

'\nGarageCond      78\nGarageQual      78\nGarageFinish    78\nGarageType      76\nBsmtCond        45\nBsmtExposure    44\nBsmtQual        44\nBsmtFinType1    42\nBsmtFinType2    42\nMSZoning         4\nFunctional       2\nUtilities        2\nExterior1st      1\nExterior2nd      1\nSaleType         1\nKitchenQual      1\n'

In [929]:
#GarageCond      78
tst_garagecond = test_categoric.groupby(test_categoric['GarageCond']).size().sort_values(ascending=False).head(5) 
test_categoric['GarageCond'].fillna('TA',inplace=True)
#GarageQual      78
tst_garagequal = test_categoric.groupby(test_categoric['GarageQual']).size().sort_values(ascending=False).head(5)
test_categoric['GarageQual'].fillna('TA',inplace=True)
#GarageFinish    78
tst_garagefinish = test_categoric.groupby(test_categoric['GarageFinish']).size().sort_values(ascending=False).head(5)
test_categoric['GarageFinish'].fillna('Unf',inplace=True)
#GarageType      76
tst_garagetype = test_categoric.groupby(test_categoric['GarageType']).size().sort_values(ascending=False).head(5)
test_categoric['GarageType'].fillna('Attchd',inplace=True)
#BsmtCond        45
tst_bsmtcond = test_categoric.groupby(test_categoric['BsmtCond']).size().sort_values(ascending=False).head(5)
test_categoric['BsmtCond'].fillna('TA',inplace=True)
#BsmtExposure    44
tst_bsmtexposure = test_categoric.groupby(test_categoric['BsmtExposure']).size().sort_values(ascending=False).head(5)
test_categoric['BsmtExposure'].fillna('No',inplace=True)
#BsmtQual        44
tst_bsmtqual = test_categoric.groupby(test_categoric['BsmtQual']).size().sort_values(ascending=False).head(2)
test_categoric['BsmtQual'].fillna(pd.Series(np.random.choice((tst_bsmtqual.tolist()),size=len(test_categoric.index))),inplace=True)
#BsmtFinType1    42
tst_bsmtfintype1 = test_categoric.groupby(test_categoric['BsmtFinType1']).size().sort_values(ascending=False).head(2)
test_categoric['BsmtFinType1'].fillna(pd.Series(np.random.choice((tst_bsmtfintype1.tolist()),size=len(test_categoric.index))),inplace=True)
#BsmtFinType2    42
tst_bsmtfintype2 = test_categoric.groupby(test_categoric['BsmtFinType2']).size().sort_values(ascending=False).head(5)
test_categoric['BsmtFinType2'].fillna('Unf',inplace=True)
#MSZoning         4
tst_mszoning = test_categoric.groupby(test_categoric['MSZoning']).size().sort_values(ascending=False).head(5)
test_categoric['MSZoning'].fillna('RL',inplace=True)
#Functional       2
tst_functional = test_categoric.groupby(test_categoric['Functional']).size().sort_values(ascending=False).head(5)
test_categoric['Functional'].fillna('Typ',inplace=True)
#Utilities        2
tst_utilities = test_categoric.groupby(test_categoric['Utilities']).size().sort_values(ascending=False).head(5) #Only AllPub
test_categoric['Utilities'].fillna('AllPub',inplace=True)
#Exterior1st      1
tst_exterior1st = test_categoric.groupby(test_categoric['Exterior1st']).size().sort_values(ascending=False).head(5)
test_categoric['Exterior1st'].fillna('VinylSd',inplace=True)
#Exterior2nd      1
tst_exterior2nd = test_categoric.groupby(test_categoric['Exterior2nd']).size().sort_values(ascending=False).head(5)
test_categoric['Exterior2nd'].fillna('VinylSd',inplace=True)
#SaleType         1
tst_saletype = test_categoric.groupby(test_categoric['SaleType']).size().sort_values(ascending=False).head(5)
test_categoric['SaleType'].fillna('WD',inplace=True)
#KitchenQual      1
tst_kitchenqual = test_categoric.groupby(test_categoric['KitchenQual']).size().sort_values(ascending=False).head(5)
test_categoric['KitchenQual'].fillna('TA',inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_categoric['GarageCond'].fillna('TA',inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_categoric['GarageQual'].fillna('TA',inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we ar

In [930]:
#Getting our final test dataset with no null values
test_dataset = pd.concat([test_numeric,test_categoric],axis=1)

In [931]:
#Comparing train and test datasets
columns_in_train_not_in_test = (set(train_dataset.columns)) - (set(test_dataset.columns))
columns_in_test_not_in_train = (set(test_dataset.columns)) - (set(train_dataset.columns))

In [932]:
#-----------------------------------------------------------------------------------------------------------------------------------------------------
#ORGANIZING DATASETS + CLEANING SOME COLUMNS
#-----------------------------------------------------------------------------------------------------------------------------------------------------

In [933]:
#First we will clean our database by removing some columns that may not add important info to our study. 

#Some columns were chosen to be deleted initially from our database
columns_to_remove = ['LandContour','LotConfig','YearRemodAdd','RoofStyle','Exterior2nd',
                     'MasVnrArea','ExterCond','BsmtQual','BsmtExposure','BsmtFinSF1','BsmtFinType1','BsmtFinType2','BsmtFinSF2','BsmtUnfSF',
                     'HeatingQC','Electrical','Functional','GarageYrBlt','GarageFinish','GarageCars','GarageCond','3SsnPorch',
                     'ScreenPorch','MoSold','YrSold','LowQualFinSF','BedroomAbvGr','KitchenAbvGr','WoodDeckSF','OpenPorchSF', 'EnclosedPorch',
                     'MiscVal','LandSlope','Condition1','Condition2','BldgType','Foundation','KitchenQual','GarageQual','SaleCondition']

train_final1 = train_dataset.drop(columns_to_remove,axis=1)
test_final1 = test_dataset.drop(columns_to_remove,axis=1)

In [934]:
#LotArea
#Predicting clusters with train dataset
lotarea_train = train_final1['LotArea'].values.reshape(-1,1)
sse=[]
for i in range(1,11):
    kmeans = KMeans(n_clusters=i)
    kmeans.fit(lotarea_train)
    sse.append(kmeans.inertia_)
#Ploting graph to understand the best clusters number
""" plt.plot(range(1,11),sse,marker='x')
plt.xlabel('Cluster numbers')
plt.ylabel('Inertia sum')
plt.title('Kmeans - Lot Area')
plt.show() """
#Fitting our predict for train
kmeans = KMeans(n_clusters=3)
kmeans.fit(lotarea_train)
clusters = kmeans.predict(lotarea_train)
train_final1['LotArea'] = clusters
#Using the clusters to fit our test dataset
lotarea_test = test_final1['LotArea'].values.reshape(-1,1)
clusters_test = kmeans.predict(lotarea_test)
test_final1['LotArea'] = clusters_test

In [935]:
#GrLivArea
grlivarea_train = train_final1['GrLivArea'].values.reshape(-1,1)
sse=[]
for j in range(1,11):
    kmeans_1 = KMeans(n_clusters=j)
    kmeans_1.fit(grlivarea_train)
    sse.append(kmeans_1.inertia_)

""" plt.plot(range(1,11),sse,marker='x')
plt.xlabel('Cluster numbers')
plt.ylabel('Inertia sum')
plt.title('Kmeans - GrLivArea')
plt.show() """

kmeans_1 = KMeans(n_clusters=3)
kmeans_1.fit(grlivarea_train)
clusters_1 = kmeans_1.predict(grlivarea_train)
train_final1['GrLivArea'] = clusters_1

grlivarea_test = test_final1['GrLivArea'].values.reshape(-1,1)
clusters_1_test = kmeans_1.predict(grlivarea_test)
test_final1['GrLivArea'] = clusters_1_test

In [936]:
# YearBuilt
yrbuilt_train = train_final1['YearBuilt'].values.reshape(-1,1)
sse=[]
for k in range(1,11):
    kmeans_2 = KMeans(n_clusters=k)
    kmeans_2.fit(yrbuilt_train)
    sse.append(kmeans_2.inertia_)

""" plt.plot(range(1,11),sse,marker='x')
plt.xlabel('Cluster numbers')
plt.ylabel('Inertia sum')
plt.title('Kmeans - YearBuilt')
plt.show() """

kmeans_2 = KMeans(n_clusters=3)
kmeans_2.fit(yrbuilt_train)
clusters_2 = kmeans_2.predict(yrbuilt_train)
train_final1['YearBuilt'] = clusters_2

yrbuilt_test = test_final1['YearBuilt'].values.reshape(-1,1)
clusters_2_test = kmeans_2.predict(yrbuilt_test)
test_final1['YearBuilt'] = clusters_2_test

In [937]:
#TotalBsmtSF
totalbsmtsf_train = train_final1['TotalBsmtSF'].values.reshape(-1,1)
sse=[]
for l in range(1,11):
    kmeans_3 = KMeans(n_clusters=l)
    kmeans_3.fit(totalbsmtsf_train)
    sse.append(kmeans_3.inertia_)

""" plt.plot(range(1,11),sse,marker='x')
plt.xlabel('Cluster numbers')
plt.ylabel('Inertia sum')
plt.title('Kmeans - TotalBsmtSF')
plt.show() """

kmeans_3 = KMeans(n_clusters=3)
kmeans_3.fit(totalbsmtsf_train)
clusters_3 = kmeans_3.predict(totalbsmtsf_train)
train_final1['TotalBsmtSF'] = clusters_3

totalbsmtsf_test = test_final1['TotalBsmtSF'].values.reshape(-1,1)
clusters_3_test = kmeans_3.predict(totalbsmtsf_test)
test_final1['TotalBsmtSF'] = clusters_3_test

In [938]:
#1stFlrSF
fstflrsf_train = train_final1['1stFlrSF'].values.reshape(-1,1)
sse=[]
for m in range(1,11):
    kmeans_4 = KMeans(n_clusters=m)
    kmeans_4.fit(fstflrsf_train)
    sse.append(kmeans_4.inertia_)

""" plt.plot(range(1,11),sse,marker='x')
plt.xlabel('Cluster numbers')
plt.ylabel('Inertia sum')
plt.title('Kmeans - 1stFlrSF')
plt.show() """

kmeans_4 = KMeans(n_clusters=3)
kmeans_4.fit(fstflrsf_train)
clusters_4 = kmeans_4.predict(fstflrsf_train)
train_final1['1stFlrSF'] = clusters_4

fstflrsf_test = test_final1['1stFlrSF'].values.reshape(-1,1)
clusters_4_test = kmeans_4.predict(fstflrsf_test)
test_final1['1stFlrSF'] = clusters_4_test

In [939]:
#2ndFlrSF
sndflrsf_train = train_final1['2ndFlrSF'].values.reshape(-1,1)
sse=[]
for n in range(1,11):
    kmeans_5 = KMeans(n_clusters=n)
    kmeans_5.fit(sndflrsf_train)
    sse.append(kmeans_5.inertia_)

""" plt.plot(range(1,11),sse,marker='x')
plt.xlabel('Cluster numbers')
plt.ylabel('Inertia sum')
plt.title('Kmeans - 2ndFlrSF')
plt.show() """

kmeans_5 = KMeans(n_clusters=3)
kmeans_5.fit(sndflrsf_train)
clusters_5 = kmeans_5.predict(sndflrsf_train)
train_final1['2ndFlrSF'] = clusters_5

sndflrsf_test = test_final1['2ndFlrSF'].values.reshape(-1,1)
clusters_5_test = kmeans_5.predict(sndflrsf_test)
test_final1['2ndFlrSF'] = clusters_5_test

In [940]:
#GarageArea
garagearea_train = train_final1['GarageArea'].values.reshape(-1,1)
sse=[]
for o in range(1,11):
    kmeans_6 = KMeans(n_clusters=o)
    kmeans_6.fit(garagearea_train)
    sse.append(kmeans_6.inertia_)

""" plt.plot(range(1,11),sse,marker='x')
plt.xlabel('Cluster numbers')
plt.ylabel('Inertia sum')
plt.title('Kmeans - GarageArea')
plt.show() """

kmeans_6 = KMeans(n_clusters=3)
kmeans_6.fit(garagearea_train)
clusters_6 = kmeans_6.predict(garagearea_train)
train_final1['GarageArea'] = clusters_6

garagearea_test = test_final1['GarageArea'].values.reshape(-1,1)
clusters_6_test = kmeans_6.predict(garagearea_test)
test_final1['GarageArea'] = clusters_6_test

In [941]:
##Other treatments columns:
#Overall = OverallQual + OverallCond

train_final1['Overall'] = train_final1['OverallCond'] + train_final1['OverallQual']
test_final1['Overall'] = test_final1['OverallCond'] + test_final1['OverallQual']

#Total Bath = BsmtFullBath + BsmtHalfBath + FullBath + HalfBath
#print(test_final1[['BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath']].dtypes)
#It was found that BsmtFullBath and BsmtHalfBath in test_final1 is type object, so we have to convert it to int to be able to sum
train_final1['Total Bath'] = train_final1['BsmtFullBath'] + train_final1['BsmtHalfBath'] + train_final1['FullBath'] + train_final1['HalfBath']

test_final1['BsmtFullBath'] = pd.to_numeric(test_final1['BsmtFullBath'], errors='coerce')
test_final1['BsmtHalfBath'] = pd.to_numeric(test_final1['BsmtHalfBath'], errors='coerce')
test_final1['BsmtFullBath'] = test_final1['BsmtFullBath'].fillna(0).astype(int)
test_final1['BsmtHalfBath'] = test_final1['BsmtHalfBath'].fillna(0).astype(int)
test_final1['Total Bath'] = test_final1['BsmtFullBath'] + test_final1['BsmtHalfBath'] + test_final1['FullBath'] + test_final1['HalfBath']

In [942]:
#Fireplaces = 1 or 0
fireplaces_train = train_final1.groupby(train_final1['Fireplaces']).size()
fireplaces_train
"""fireplaces_test = test_final1.groupby(test_final1['Fireplaces']).size()
fireplaces_test """

#Replacing values for 0 (no fireplace) and 1 (there are fireplaces)
sub = {0: 0, 1: 1, 2: 1, 3: 1, 4: 1}
train_final1['Fireplaces'] = train_final1['Fireplaces'].map(sub)
test_final1['Fireplaces'] = test_final1['Fireplaces'].map(sub)

In [943]:
#PoolArea = 1 or 0
pool_train = train_final1.groupby(train_final1['PoolArea']).size()
pool_test = test_final1.groupby(test_final1['PoolArea']).size()

#Replacing values 0 (no pool), 1 (there is a pool) when value > 0
train_final1['PoolArea'] = train_final1['PoolArea'].apply(lambda x: 0 if x == 0 else (1 if x >= 1 else x))
test_final1['PoolArea'] = test_final1['PoolArea'].apply(lambda x: 0 if x == 0 else (1 if x >= 1 else x))

In [944]:
#MSZoning = Keep only the first letter of each value (group residentials)
train_final1['MSZoning'] = train_final1['MSZoning'].str[0]
test_final1['MSZoning'] = test_final1['MSZoning'].str[0]

In [945]:
#Street = Paved = 1, Gravel = 0

street_train = train_final1.groupby(train_final1['Street']).size()
street_train

street_test = test_final1.groupby(test_final1['Street']).size()
street_test

sub_street = {'Pave': 1, 'Grvl': 0}
train_final1['Street'] = train_final1['Street'].map(sub_street)
test_final1['Street'] = test_final1['Street'].map(sub_street)

In [946]:
#LotShape = Regular, Irregular
lotshape = train_final1.groupby(train_final1['LotShape']).size()
lotshape_test = test_final1.groupby(test_final1['LotShape']).size()

train_final1['LotShape'] = train_final1['LotShape'].str[0]
test_final1['LotShape'] = test_final1['LotShape'].str[0]

sub_lot = {'R': 1, 'I': 0}
train_final1['LotShape'] = train_final1['LotShape'].map(sub_lot)
test_final1['LotShape'] = test_final1['LotShape'].map(sub_lot)

In [947]:
#CentralAir = 1 or 0
ca_train = train_final1.groupby(train_final1['CentralAir']).size()
ca_test = test_final1.groupby(test_final1['CentralAir']).size()

sub_ca = {'Y':1,'N':0}
train_final1['CentralAir'] = train_final1['CentralAir'].map(sub_ca)
test_final1['CentralAir'] = test_final1['CentralAir'].map(sub_ca)

In [948]:
#PavedDrive = 1 or 0
paved_train = train_final1.groupby(train_final1['PavedDrive']).size()
paved_test = test_final1.groupby(test_final1['PavedDrive']).size()

#converting 'N' to 0 and 'P'/'Y' to 1
sub_paved = {'N':0,'P':1,'Y':1}
train_final1['PavedDrive'] = train_final1['PavedDrive'].map(sub_paved)
test_final1['PavedDrive'] = test_final1['PavedDrive'].map(sub_paved)

In [949]:
columns_to_remove_2 = ['OverallQual','OverallCond','BsmtHalfBath','BsmtFullBath','FullBath','HalfBath','Utilities','HouseStyle','RoofMatl','Heating'
                       ,'Id','Exterior1st']

train_final1 = train_final1.drop(columns=columns_to_remove_2,axis=1)
test_final1 = test_final1.drop(columns=columns_to_remove_2,axis=1)

In [950]:
#Encoding categoric columns
train_final1_encoded = pd.get_dummies(train_final1)
test_final1_encoded = pd.get_dummies(test_final1)

In [951]:
print(train_final1_encoded.isnull().sum().sort_values(ascending=False))

MSSubClass              0
Neighborhood_Timber     0
BsmtCond_Fa             0
ExterQual_TA            0
ExterQual_Gd            0
                       ..
Neighborhood_Gilbert    0
Neighborhood_IDOTRR     0
Neighborhood_MeadowV    0
Neighborhood_Mitchel    0
SaleType_WD             0
Length: 69, dtype: int64


In [952]:
#Saving databases as csv to open and analyze the data
train_final1_encoded.to_csv('train_final1.csv',index=False)
test_final1_encoded.to_csv('test_final1.csv',index=False)

In [953]:
#Separating x and y for our train model
X = train_final1_encoded.drop('SalePrice',axis=1)
y = train_final1_encoded.SalePrice

In [954]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.33,random_state=42)

In [955]:
#LinearRegression
l_reg = LinearRegression().fit(X_train,y_train)
y_lr = l_reg.predict(X_test)
ma_lr = mean_absolute_error(y_test,y_lr)
ms_lr = mean_squared_error(y_test,y_lr)

In [956]:
#Tree
tree_reg = tree.DecisionTreeRegressor(random_state=42).fit(X_train,y_train)
y_tree = tree_reg.predict(X_test)
ma_tree = mean_absolute_error(y_test,y_tree)
ms_tree = mean_squared_error(y_test,y_tree)

In [957]:
#KNN
knn = KNeighborsClassifier(n_neighbors = 2).fit(X_train,y_train)
y_knn = knn.predict(X_test)
ma_knn = mean_absolute_error(y_test,y_knn)
ms_knn = mean_squared_error(y_test,y_knn)

In [958]:
#Support Vector Machines
svc = SVC().fit(X_train,y_train)
y_svc = svc.predict(X_test)
ma_svc = mean_absolute_error(y_test,y_svc)
ms_svc = mean_squared_error(y_test,y_svc)

In [959]:
#Gaussian Naive Bayes
gaussian = GaussianNB().fit(X_train,y_train)
y_gau = gaussian.predict(X_test)
ma_gau = mean_absolute_error(y_test,y_gau)
ms_gau = mean_squared_error(y_test,y_gau)

In [960]:
#Perceptron
perceptron = Perceptron().fit(X_train,y_train)
y_per = perceptron.predict(X_test)
ma_per = mean_absolute_error(y_test,y_per)
ms_per = mean_squared_error(y_test,y_per)

In [961]:
#Linear SVC
linear_svc = LinearSVC().fit(X_train,y_train)
y_lsvc = linear_svc.predict(X_test)
ma_lsvc = mean_absolute_error(y_test,y_lsvc)
ms_lsvc = mean_squared_error(y_test,y_lsvc)



In [962]:
#Stochastic Gradient Descent
sgdc = SGDClassifier().fit(X_train,y_train)
y_sgdc = sgdc.predict(X_test)
ma_sgdc = mean_absolute_error(y_test,y_sgdc)
ms_sgdc = mean_squared_error(y_test,y_sgdc)

In [963]:
#Random Forest
r_forest = RandomForestClassifier(n_estimators=100).fit(X_train,y_train)
y_rf = r_forest.predict(X_test)
ma_rf = mean_absolute_error(y_test,y_rf)
ms_rf = mean_squared_error(y_test,y_rf)

In [964]:
#Models Evaluation

models_evaluation = pd.DataFrame(
    {
        'Model':['Linear Regression','Tree','KNN','SVC','Gaussian','Perceptron','Linear SVC','SGDC','Random Forest'],
        'Abs Error': [ma_lr,ma_tree,ma_knn,ma_svc,ma_gau,ma_per,ma_lsvc,ma_sgdc,ma_rf],
        'Squared Error': [ms_lr,ms_tree,ms_knn,ms_svc,ms_gau,ms_per,ms_lsvc,ms_sgdc,ms_rf]
    }
)

#As the result will be evaluated by squared error (Kaggle description), we will sort by it
models_evaluation.sort_values(by='Squared Error',ascending=True)

print(models_evaluation) #Linear Regression is the most accurate model to predict this data

               Model     Abs Error  Squared Error
0  Linear Regression  23629.015943   1.474572e+09
1               Tree  28736.394537   1.930919e+09
2                KNN  26814.358921   2.302711e+09
3                SVC  60887.410788   9.392358e+09
4           Gaussian  33507.201245   3.498199e+09
5         Perceptron  61861.294606   7.461244e+09
6         Linear SVC  55986.771784   6.490616e+09
7               SGDC  51909.269710   6.483276e+09
8      Random Forest  25219.966805   1.513960e+09


In [965]:
#Using Linear Regression to predict y_test
y_pred = l_reg.predict(test_final1_encoded)

In [968]:
#Add saleprice column to test dataset
test_final1['SalePrice'] = y_pred
test_final1['Id'] = test['Id']

In [970]:
#Extracting only Id and Sale Price from test_final1
submission_file = test_final1[['Id','SalePrice']]
submission_file.to_csv('submission_file.csv',index=False)