## House Price Prediction

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the training set from the CSV file in the working directory
data = pd.read_csv("train.csv")
# Preview the first five rows
data.head(n=5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
# Report the dimensions of the DataFrame as (rows, columns)
print("SHAPE: ", data.shape)

print('*******************************')

# For each column, report whether it holds at least one missing value
print(data.isna().any())

SHAPE:  (1460, 81)
*******************************
Id               False
MSSubClass       False
MSZoning         False
LotFrontage       True
LotArea          False
                 ...  
MoSold           False
YrSold           False
SaleType         False
SaleCondition    False
SalePrice        False
Length: 81, dtype: bool


In [4]:
# List the column labels of the loaded DataFrame
data.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [5]:
# Percentage of missing values per column: null count / row count * 100
data.isna().sum() / data.shape[0] * 100


Id                0.000000
MSSubClass        0.000000
MSZoning          0.000000
LotFrontage      17.739726
LotArea           0.000000
                   ...    
MoSold            0.000000
YrSold            0.000000
SaleType          0.000000
SaleCondition     0.000000
SalePrice         0.000000
Length: 81, dtype: float64

In [6]:
# Keep only the predictor columns used by the model, plus the target (SalePrice).
# The trailing .copy() makes the subset an independent DataFrame: the later
# `data[col] = ...` assignment in the scaling cell then writes to a frame that
# pandas does not track as a slice of the originally loaded one, so it no longer
# relies on a SettingWithCopyWarning being silenced by the warnings filter.
data = data[[
    'OverallQual', 'LotArea', '1stFlrSF', '2ndFlrSF', 'OverallCond', 'GrLivArea',
    'TotRmsAbvGrd', 'GarageCars', 'GarageArea', 'TotalBsmtSF', 'FullBath', 'WoodDeckSF',
    'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
    'YearBuilt', 'SalePrice',
]].copy()
data

Unnamed: 0,OverallQual,LotArea,1stFlrSF,2ndFlrSF,OverallCond,GrLivArea,TotRmsAbvGrd,GarageCars,GarageArea,TotalBsmtSF,FullBath,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,YearBuilt,SalePrice
0,7,8450,856,854,5,1710,8,2,548,856,2,0,61,0,0,0,0,0,2003,208500
1,6,9600,1262,0,8,1262,6,2,460,1262,2,298,0,0,0,0,0,0,1976,181500
2,7,11250,920,866,5,1786,6,2,608,920,2,0,42,0,0,0,0,0,2001,223500
3,7,9550,961,756,5,1717,7,3,642,756,1,0,35,272,0,0,0,0,1915,140000
4,8,14260,1145,1053,5,2198,9,3,836,1145,2,192,84,0,0,0,0,0,2000,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,6,7917,953,694,5,1647,7,2,460,953,2,0,40,0,0,0,0,0,1999,175000
1456,6,13175,2073,0,6,2073,7,2,500,1542,2,349,0,0,0,0,0,0,1978,210000
1457,7,9042,1188,1152,9,2340,9,1,252,1152,2,0,60,0,0,0,0,2500,1941,266500
1458,5,9717,1078,0,6,1078,5,1,240,1078,1,366,0,112,0,0,0,0,1950,142125


In [7]:
# Percentage of missing values per column in the current `data` frame
data.isna().sum() / data.shape[0] * 100


OverallQual      0.0
LotArea          0.0
1stFlrSF         0.0
2ndFlrSF         0.0
OverallCond      0.0
GrLivArea        0.0
TotRmsAbvGrd     0.0
GarageCars       0.0
GarageArea       0.0
TotalBsmtSF      0.0
FullBath         0.0
WoodDeckSF       0.0
OpenPorchSF      0.0
EnclosedPorch    0.0
3SsnPorch        0.0
ScreenPorch      0.0
PoolArea         0.0
MiscVal          0.0
YearBuilt        0.0
SalePrice        0.0
dtype: float64

In [8]:
# Feature scaling: create a RobustScaler with its default parameters.
# It is fitted and applied to selected columns in the next cell.
from sklearn.preprocessing import RobustScaler
rs = RobustScaler()
rs

RobustScaler()

In [9]:
# Columns to rescale with the RobustScaler instance `rs`.
col = [
    'LotArea', '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'WoodDeckSF', 'OpenPorchSF',
    'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'SalePrice',
]
# NOTE(review): the scaler is fitted on every row of `data`, i.e. before the
# train/test split in the following cell, and `col` includes the target
# `SalePrice`. Fitting on the training rows only, and leaving the target out
# of the feature scaler, would keep test-set statistics out of preprocessing
# and keep predictions in price units.
data[col] = rs.fit_transform(data[col])
# Preview the first rows after scaling
data.head()

Unnamed: 0,OverallQual,LotArea,1stFlrSF,2ndFlrSF,OverallCond,GrLivArea,TotRmsAbvGrd,GarageCars,GarageArea,TotalBsmtSF,FullBath,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,YearBuilt,SalePrice
0,7,-0.254076,-0.453608,1.173077,5,0.38007,8,2,548,856,2,0.0,0.529412,0.0,0.0,0.0,0.0,0.0,2003,0.541506
1,6,0.030015,0.343643,0.0,8,-0.31209,6,2,460,1262,2,1.77381,-0.367647,0.0,0.0,0.0,0.0,0.0,1976,0.220173
2,7,0.437624,-0.327933,1.18956,5,0.497489,6,2,608,920,2,0.0,0.25,0.0,0.0,0.0,0.0,0.0,2001,0.720024
3,7,0.017663,-0.247423,1.038462,5,0.390885,7,3,642,756,1,0.0,0.147059,272.0,0.0,0.0,0.0,0.0,1915,-0.273728
4,8,1.181201,0.113893,1.446429,5,1.134029,9,3,836,1145,2,1.142857,0.867647,0.0,0.0,0.0,0.0,0.0,2000,1.035406


In [10]:
# Separate predictors from the target, then hold out 20% of the rows for testing
from sklearn.model_selection import train_test_split
# X: every column except the target
X = data.drop(columns='SalePrice')
# y: the target column
y = data['SalePrice']
# random_state is fixed so the same split is produced on every run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Build a linear regression model with default parameters.
# Training happens in the next cell via lr.fit.
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr

LinearRegression()

In [12]:
# Fit the linear regression on the training split
lr.fit(X_train, y_train)

LinearRegression()

In [13]:
# Predict the target for the held-out test rows
y_pred = lr.predict(X_test)


In [14]:
# Evaluate the fit on the test split with the coefficient of determination (R^2).
# This is a regression goodness-of-fit score, not a classification accuracy.
from sklearn.metrics import r2_score
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print("r2_score:", r2)

r2_score: 0.8060469293870014
