## Insert the dataset

In [125]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [126]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [127]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

## Missing data

In [128]:
missing_columns = []
missing = train.columns[train.isnull().any()]
for i in missing:
    missing_columns.append(i)
    print(i,"\t", train[i].isnull().sum(), train[i].dtypes)

LotFrontage 	 259 float64
Alley 	 1369 object
MasVnrType 	 8 object
MasVnrArea 	 8 float64
BsmtQual 	 37 object
BsmtCond 	 37 object
BsmtExposure 	 38 object
BsmtFinType1 	 37 object
BsmtFinType2 	 38 object
Electrical 	 1 object
FireplaceQu 	 690 object
GarageType 	 81 object
GarageYrBlt 	 81 float64
GarageFinish 	 81 object
GarageQual 	 81 object
GarageCond 	 81 object
PoolQC 	 1453 object
Fence 	 1179 object
MiscFeature 	 1406 object


In [129]:
droped_items = []
for i in missing:
    percentage = (train[i].isnull().sum()/len(train))*100
    if percentage>80:
        droped_items.append(i)
        train.drop([i], axis=1, inplace=True)
    print(i, "\t", "%.2f" %percentage,"\t")

LotFrontage 	 17.74 	
Alley 	 93.77 	
MasVnrType 	 0.55 	
MasVnrArea 	 0.55 	
BsmtQual 	 2.53 	
BsmtCond 	 2.53 	
BsmtExposure 	 2.60 	
BsmtFinType1 	 2.53 	
BsmtFinType2 	 2.60 	
Electrical 	 0.07 	
FireplaceQu 	 47.26 	
GarageType 	 5.55 	
GarageYrBlt 	 5.55 	
GarageFinish 	 5.55 	
GarageQual 	 5.55 	
GarageCond 	 5.55 	
PoolQC 	 99.52 	
Fence 	 80.75 	
MiscFeature 	 96.30 	


In [130]:
for i in droped_items:
    missing_columns.remove(i)

In [131]:
missing_object_columns=[]
missing_float_columns=[]
for i in missing_columns:
    if train[i].dtypes==object:
        missing_object_columns.append(i)
    elif train[i].dtypes==float:
        missing_float_columns.append(i)

In [132]:
for i in missing_float_columns:
    train[i].fillna(train[i].mean(), inplace=True)

In [133]:
for i in missing_object_columns:
    train[i].fillna(train[i].value_counts().idxmax(), inplace=True)

In [134]:
columns = train.columns[train.any()]
for i in columns:
    print(i,"\t",train[i].isnull().sum())

Id 	 0
MSSubClass 	 0
MSZoning 	 0
LotFrontage 	 0
LotArea 	 0
Street 	 0
LotShape 	 0
LandContour 	 0
Utilities 	 0
LotConfig 	 0
LandSlope 	 0
Neighborhood 	 0
Condition1 	 0
Condition2 	 0
BldgType 	 0
HouseStyle 	 0
OverallQual 	 0
OverallCond 	 0
YearBuilt 	 0
YearRemodAdd 	 0
RoofStyle 	 0
RoofMatl 	 0
Exterior1st 	 0
Exterior2nd 	 0
MasVnrType 	 0
MasVnrArea 	 0
ExterQual 	 0
ExterCond 	 0
Foundation 	 0
BsmtQual 	 0
BsmtCond 	 0
BsmtExposure 	 0
BsmtFinType1 	 0
BsmtFinSF1 	 0
BsmtFinType2 	 0
BsmtFinSF2 	 0
BsmtUnfSF 	 0
TotalBsmtSF 	 0
Heating 	 0
HeatingQC 	 0
CentralAir 	 0
Electrical 	 0
1stFlrSF 	 0
2ndFlrSF 	 0
LowQualFinSF 	 0
GrLivArea 	 0
BsmtFullBath 	 0
BsmtHalfBath 	 0
FullBath 	 0
HalfBath 	 0
BedroomAbvGr 	 0
KitchenAbvGr 	 0
KitchenQual 	 0
TotRmsAbvGrd 	 0
Functional 	 0
Fireplaces 	 0
FireplaceQu 	 0
GarageType 	 0
GarageYrBlt 	 0
GarageFinish 	 0
GarageCars 	 0
GarageArea 	 0
GarageQual 	 0
GarageCond 	 0
PavedDrive 	 0
WoodDeckSF 	 0
OpenPorchSF 	 0
Enclosed

## Encode labels into categorical variables

In [135]:
object_columns=[]
for i in train.columns:
    if train[i].dtypes==object:
        object_columns.append(i)
        print(i,"\t",train[i].dtypes)

MSZoning 	 object
Street 	 object
LotShape 	 object
LandContour 	 object
Utilities 	 object
LotConfig 	 object
LandSlope 	 object
Neighborhood 	 object
Condition1 	 object
Condition2 	 object
BldgType 	 object
HouseStyle 	 object
RoofStyle 	 object
RoofMatl 	 object
Exterior1st 	 object
Exterior2nd 	 object
MasVnrType 	 object
ExterQual 	 object
ExterCond 	 object
Foundation 	 object
BsmtQual 	 object
BsmtCond 	 object
BsmtExposure 	 object
BsmtFinType1 	 object
BsmtFinType2 	 object
Heating 	 object
HeatingQC 	 object
CentralAir 	 object
Electrical 	 object
KitchenQual 	 object
Functional 	 object
FireplaceQu 	 object
GarageType 	 object
GarageFinish 	 object
GarageQual 	 object
GarageCond 	 object
PavedDrive 	 object
SaleType 	 object
SaleCondition 	 object


In [136]:
from sklearn import preprocessing
label = preprocessing.LabelEncoder()
for i in object_columns:
    train[i] = label.fit_transform(train[i])

In [137]:
count=0
for i in train.columns:
    count = count+1
    print(i,"\t",train[i].dtypes)
print("columns count",count)

Id 	 int64
MSSubClass 	 int64
MSZoning 	 int32
LotFrontage 	 float64
LotArea 	 int64
Street 	 int32
LotShape 	 int32
LandContour 	 int32
Utilities 	 int32
LotConfig 	 int32
LandSlope 	 int32
Neighborhood 	 int32
Condition1 	 int32
Condition2 	 int32
BldgType 	 int32
HouseStyle 	 int32
OverallQual 	 int64
OverallCond 	 int64
YearBuilt 	 int64
YearRemodAdd 	 int64
RoofStyle 	 int32
RoofMatl 	 int32
Exterior1st 	 int32
Exterior2nd 	 int32
MasVnrType 	 int32
MasVnrArea 	 float64
ExterQual 	 int32
ExterCond 	 int32
Foundation 	 int32
BsmtQual 	 int32
BsmtCond 	 int32
BsmtExposure 	 int32
BsmtFinType1 	 int32
BsmtFinSF1 	 int64
BsmtFinType2 	 int32
BsmtFinSF2 	 int64
BsmtUnfSF 	 int64
TotalBsmtSF 	 int64
Heating 	 int32
HeatingQC 	 int32
CentralAir 	 int32
Electrical 	 int32
1stFlrSF 	 int64
2ndFlrSF 	 int64
LowQualFinSF 	 int64
GrLivArea 	 int64
BsmtFullBath 	 int64
BsmtHalfBath 	 int64
FullBath 	 int64
HalfBath 	 int64
BedroomAbvGr 	 int64
KitchenAbvGr 	 int64
KitchenQual 	 int32
TotRmsAbv

In [138]:
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,3,65.0,8450,1,3,3,0,4,...,0,0,0,0,0,2,2008,8,4,208500
1,2,20,3,80.0,9600,1,3,3,0,2,...,0,0,0,0,0,5,2007,8,4,181500
2,3,60,3,68.0,11250,1,0,3,0,4,...,0,0,0,0,0,9,2008,8,4,223500
3,4,70,3,60.0,9550,1,0,3,0,0,...,272,0,0,0,0,2,2006,8,0,140000
4,5,60,3,84.0,14260,1,0,3,0,2,...,0,0,0,0,0,12,2008,8,4,250000


## Train and test split

In [139]:
from sklearn.model_selection import train_test_split
x = train.drop(['SalePrice'], axis=1)
y = train['SalePrice']
x_train,x_test,y_train,y_test = train_test_split(x,y, test_size=0.33, random_state=1)

## Linear regression

In [140]:
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
import math
model = linear_model.LinearRegression()
model.fit(x_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [141]:
prediction=model.predict(x_test)
prediction

array([212554.86479198, 185825.32032913,  97155.35486615,  69517.0936127 ,
       134681.8889279 , 317083.6871228 , 291783.28019474, 145089.24843463,
       228140.71964655, 245129.95246482, 202747.06376112,  72415.80431256,
       200315.1842786 , 365543.56172389, 240795.4430361 , 107363.66398819,
        90618.3550955 , 108622.93879039, 273153.67958822, 116398.5921623 ,
       107154.77871604,  94158.1092298 , 218084.61521752, 363681.16097068,
       108001.39588679, 228949.33919056, 142201.33893287, 175412.14174015,
       412704.83225542, 108463.21706145, 136543.08233164, 123048.22979211,
       124013.38507574,  89194.48037949, 154572.27857612, 349150.7220514 ,
       137340.99495657,  82464.85113735, 255203.26974488, 104897.7019006 ,
       139907.55438239, 146487.1343882 , 124228.6132353 , 125675.44609488,
       154786.89931074, 182389.70348941, 119947.58082131, 210034.78380073,
       246237.55233456, 278986.31080944,  93236.86042765, 280828.64338969,
        82921.20096981, 2

In [142]:
print(r2_score(y_test,prediction))
mse = mean_squared_error(y_test,prediction)
rmse = math.sqrt(mse)
print(mse)
print(rmse)

0.8377505986396725
1153065448.249267
33956.81740459885
