# House price prediction 

In [2]:
import pandas as pd
import numpy as np
import  matplotlib.pyplot as plt
import seaborn as sns
import sklearn
%matplotlib inline
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")

## Reading the datasets

In [3]:
df=pd.read_csv("kc_house_data.csv")
df.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


### The "date" column

In [4]:
df['date']

0        20141013T000000
1        20141209T000000
2        20150225T000000
3        20141209T000000
4        20150218T000000
              ...       
21608    20140521T000000
21609    20150223T000000
21610    20140623T000000
21611    20150116T000000
21612    20141015T000000
Name: date, Length: 21613, dtype: object

### Number of columns

In [5]:
df.columns

Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15'],
      dtype='object')

In [6]:
df.dtypes

id                 int64
date              object
price            float64
bedrooms           int64
bathrooms        float64
sqft_living        int64
sqft_lot           int64
floors           float64
waterfront         int64
view               int64
condition          int64
grade              int64
sqft_above         int64
sqft_basement      int64
yr_built           int64
yr_renovated       int64
zipcode            int64
lat              float64
long             float64
sqft_living15      int64
sqft_lot15         int64
dtype: object

### Converting date column into date time format

In [7]:
df['date']=pd.to_datetime(df['date'])

In [8]:
df['date']

0       2014-10-13
1       2014-12-09
2       2015-02-25
3       2014-12-09
4       2015-02-18
           ...    
21608   2014-05-21
21609   2015-02-23
21610   2014-06-23
21611   2015-01-16
21612   2014-10-15
Name: date, Length: 21613, dtype: datetime64[ns]

In [9]:
df.columns

Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15'],
      dtype='object')

### Extracting month and year from the date column 

In [10]:
df['Month'] = df['date'].apply(lambda date: date.month)
df['Year'] = df['date'].apply(lambda date: date.year)

### Rounding of the Bathrooms and Floors 

In [11]:
df['bathrooms'] = np.round(df['bathrooms'])
df['floors'] = np.round(df['floors'])

### Dropping "date" and "id" columns

In [12]:
df.drop(['date','id'],inplace=True,axis=1)

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   price          21613 non-null  float64
 1   bedrooms       21613 non-null  int64  
 2   bathrooms      21613 non-null  float64
 3   sqft_living    21613 non-null  int64  
 4   sqft_lot       21613 non-null  int64  
 5   floors         21613 non-null  float64
 6   waterfront     21613 non-null  int64  
 7   view           21613 non-null  int64  
 8   condition      21613 non-null  int64  
 9   grade          21613 non-null  int64  
 10  sqft_above     21613 non-null  int64  
 11  sqft_basement  21613 non-null  int64  
 12  yr_built       21613 non-null  int64  
 13  yr_renovated   21613 non-null  int64  
 14  zipcode        21613 non-null  int64  
 15  lat            21613 non-null  float64
 16  long           21613 non-null  float64
 17  sqft_living15  21613 non-null  int64  
 18  sqft_l

## X and y Split

In [14]:
X=df.drop('price',axis=1)

In [15]:
y=df['price']

## Train test Split

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=50)

In [17]:
MSE_Score = []
R2_Score = []
Algorithm = []
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

##Linear Regression

In [18]:
#import LinearRegressor
from sklearn.linear_model import LinearRegression
Algorithm.append('LinearRegression')
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Predicting the Test Set Results
predicted= regressor.predict(X_test)

In [19]:
# Appending the Scores For Visualisation at a Later Part
MSE_Score.append(mean_squared_error(y_test, predicted))
R2_Score.append(r2_score(y_test, predicted))

In [20]:
for i in Algorithm, MSE_Score, R2_Score:
    print(i,end=',')

['LinearRegression'],[39837353863.439804],[0.7105007250973994],

# Support Vector Regression 

In [21]:
from sklearn.svm import SVR
Algorithm.append('Support Vector Regression')
regressor = SVR()
regressor.fit(X_train, y_train)

# Predicting the Test Set Results
predicted= regressor.predict(X_test)

In [22]:
# Appending the Scores For Visualisation at a Later Part
MSE_Score.append(mean_squared_error(y_test, predicted))
R2_Score.append(r2_score(y_test, predicted))

In [23]:
for i in Algorithm, MSE_Score, R2_Score:
    print(i,end=',')

['LinearRegression', 'Support Vector Regression'],[39837353863.439804, 146300343975.5921],[0.7105007250973994, -0.06316909612324051],

##K-Nearest Neigbor Regression

In [24]:
from sklearn.neighbors import KNeighborsRegressor
Algorithm.append('KNeighborsRegressor')
knr = KNeighborsRegressor(n_neighbors = 10)
knr.fit(X_train,y_train)

predicted = knr.predict(X_test)

In [25]:
# Appending the Scores For Visualisation at a Later Part
MSE_Score.append(mean_squared_error(y_test, predicted))
R2_Score.append(r2_score(y_test, predicted))

In [26]:
for i in Algorithm, MSE_Score, R2_Score:
    print(i,end=',')

['LinearRegression', 'Support Vector Regression', 'KNeighborsRegressor'],[39837353863.439804, 146300343975.5921, 64746532588.86919],[0.7105007250973994, -0.06316909612324051, 0.5294849577311571],

##Decision Tree Regression



In [27]:
# Fitting Decision Tree to the Training Set
from sklearn.tree import DecisionTreeRegressor
Algorithm.append('DecisionTreeRegression')
reg = DecisionTreeRegressor()
reg.fit(X_train, y_train)

# Predicting the Test Set Results
predicted = reg.predict(X_test)

In [28]:
# Appending the Scores For Visualisation at a Later Part
MSE_Score.append(mean_squared_error(y_test, predicted))
R2_Score.append(r2_score(y_test, predicted))

In [29]:
for i in Algorithm, MSE_Score, R2_Score:
    print(i,end=',')

['LinearRegression', 'Support Vector Regression', 'KNeighborsRegressor', 'DecisionTreeRegression'],[39837353863.439804, 146300343975.5921, 64746532588.86919, 37102806570.78933],[0.7105007250973994, -0.06316909612324051, 0.5294849577311571, 0.730372764317748],

##Random Forest Regression

In [30]:
#import RandomForestRegressor
from sklearn.ensemble import RandomForestRegressor
Algorithm.append('RandomForestRegression')
rf = RandomForestRegressor()
rf.fit(X_train,y_train)

# Predicting the Test Set Results
predicted=rf.predict(X_test)

In [31]:
# Appending the Scores For Visualisation at a Later Part
MSE_Score.append(mean_squared_error(y_test, predicted))
R2_Score.append(r2_score(y_test, predicted))

In [32]:
for i in Algorithm, MSE_Score, R2_Score:
    print(i,end=',')

['LinearRegression', 'Support Vector Regression', 'KNeighborsRegressor', 'DecisionTreeRegression', 'RandomForestRegression'],[39837353863.439804, 146300343975.5921, 64746532588.86919, 37102806570.78933, 17500009853.51035],[0.7105007250973994, -0.06316909612324051, 0.5294849577311571, 0.730372764317748, 0.8728268905423187],

##Extreme Gradient Boost Algorithm

In [33]:
import xgboost as xgb
# Fitting XGBoost Regressor to the Training Set
Algorithm.append('Extreme Gradient Boost Algorithm')
reg = xgb.XGBRegressor()
reg.fit(X_train, y_train)

# Predicting the Test Set Results
predicted = reg.predict(X_test)



In [34]:
# Appending the Scores For Visualisation at a Later Part
MSE_Score.append(mean_squared_error(y_test, predicted))
R2_Score.append(r2_score(y_test, predicted))

In [35]:
for i in Algorithm, MSE_Score, R2_Score:
    print(i,end=',')

['LinearRegression', 'Support Vector Regression', 'KNeighborsRegressor', 'DecisionTreeRegression', 'RandomForestRegression', 'Extreme Gradient Boost Algorithm'],[39837353863.439804, 146300343975.5921, 64746532588.86919, 37102806570.78933, 17500009853.51035, 17262956838.557987],[0.7105007250973994, -0.06316909612324051, 0.5294849577311571, 0.730372764317748, 0.8728268905423187, 0.8745495620876587],

#Comparison Dataframe

In [36]:
Comparison = pd.DataFrame(list(zip(Algorithm, MSE_Score, R2_Score)), columns = ['Algorithm', 'MSE_Score', 'R2_Score'])
Comparison

Unnamed: 0,Algorithm,MSE_Score,R2_Score
0,LinearRegression,39837350000.0,0.710501
1,Support Vector Regression,146300300000.0,-0.063169
2,KNeighborsRegressor,64746530000.0,0.529485
3,DecisionTreeRegression,37102810000.0,0.730373
4,RandomForestRegression,17500010000.0,0.872827
5,Extreme Gradient Boost Algorithm,17262960000.0,0.87455
