In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from math import sqrt
from sklearn import metrics
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Show which files are available in the notebook's input directory.
input_dir = "../input"
print(os.listdir(input_dir))

# Any results you write to the current directory are saved as output.

In [None]:
# Read the raw house-sales table from the input directory.
csv_path = "../input/kc_house_data.csv"
house = pd.read_csv(csv_path)

# Keep the sale price aside: it is the regression target (y) used by the
# train/test splits further down.
price = house.loc[:, 'price']

In [None]:
# Print the frame summary (columns, dtypes, non-null counts) as a quick
# sanity check that the CSV loaded as expected.
house.info()

# Feature analysis and transformation

In [None]:
# Calendar year that construction / renovation years are measured against.
# NOTE(review): each row has its own sale 'date'; 2018 is kept because the
# original analysis used it -- confirm whether "age at sale" was intended.
REFERENCE_YEAR = 2018

# Split the sale date into numeric parts so the models can use it.
sale_date = pd.to_datetime(house['date'])
house['year'] = sale_date.dt.year
house['month'] = sale_date.dt.month
house['day'] = sale_date.dt.day

# Years since the last renovation. A raw value of 0 is left at 0
# (presumably "never renovated" -- verify against the data dictionary).
house['yr_renovated'] = np.where(
    house['yr_renovated'] == 0, 0, REFERENCE_YEAR - house['yr_renovated']
)
# Age of the building in years relative to REFERENCE_YEAR.
house['yr_built'] = (REFERENCE_YEAR - house['yr_built']).abs()

# One-hot encode the zipcode. The prefix gives the new columns *string* names
# ("zipcode_<code>"); without it they are integers, and scikit-learn refuses
# to fit on a frame whose column names mix int and str. The dtype is pinned so
# the encoding does not depend on the installed pandas version.
zipcode_dummies = pd.get_dummies(house['zipcode'], prefix='zipcode', dtype=np.uint8)

# Append the dummies and remove the columns they replace, plus the row id,
# which is dropped here as in the original cell.
house = pd.concat([house, zipcode_dummies], axis=1).drop(columns=['date', 'zipcode', 'id'])


In [None]:
# visualising houses on the map with indication of house price
house.plot(kind="scatter", x="long", y="lat", alpha=0.4, figsize=(16,8), c=price,
           cmap=plt.get_cmap("jet"), colorbar=True, sharex=False)
# As we can see most of the houses are cheaper than 300 thousands and most of the expensive houses are 
#located near the water (water is mainly white spacec on the graph)

In [None]:
#visualizationg of price corresponging to sqrt_living with indication of condiditon of the property
plt.figure(figsize = (12,8))
g = sns.FacetGrid(data=house, hue='condition',size= 5, aspect=2)
g.map(plt.scatter, "sqft_living", "price")
plt.show()

In [None]:
#creating correlation matrix to know the relation between target feature and other features
features = ['price','sqft_living','grade','sqft_above','sqft_living15', 'bathrooms','view','sqft_basement',
            'bedrooms', 'lat', 'waterfront', 'floors', 'sqft_lot', 'sqft_lot15','yr_renovated','yr_built',
            'condition', 'year','long', 'day', 'month']
f, ax = plt.subplots(figsize=(20, 15))
plt.title('Correlation Matrix',fontsize=25)
sns.heatmap(house[features].corr(), linewidths=0.25, vmax=1.0, square=True, cmap="RdBu_r", linecolor='k', annot=True)

In [None]:
from sklearn.model_selection import train_test_split

# Remove the target from the feature frame (it is already held in `price`).
# errors='ignore' keeps this cell re-runnable: on a second run 'price' is
# already gone and a plain drop would raise KeyError.
house = house.drop(columns='price', errors='ignore')

# 80/20 split with a fixed seed so every model below sees the same rows.
X_train, X_test, y_train, y_test = train_test_split(house, price, test_size=0.2, random_state=49)
print("Linear regression datasets")
print (X_train.shape, y_train.shape)
print (X_test.shape, y_test.shape)

# Linear regression

In [None]:
from sklearn import linear_model

# Ordinary least squares on all engineered features.
linear = linear_model.LinearRegression()
linear.fit(X_train, y_train)
predictions = linear.predict(X_test)

# Compute each error once, then report.
mse_test = mean_squared_error(y_test, predictions)
mse_train = mean_squared_error(y_train, linear.predict(X_train))

print("MSE test: %.2f"% mse_test)
print("MSE train: %.2f"% mse_train)
# Root mean squared error on the test split
print("RMSE: %.2f"% sqrt(mse_test))
# r2_score is the coefficient of determination (1 is a perfect prediction);
# the printed label is kept unchanged.
print('Variance score: %.2f' % r2_score(y_test, predictions))

In [None]:
# Hard-coded scores (plotted below as MSE) for models with a growing number of
# features; the values are recorded here rather than recomputed in the notebook.
# NOTE(review): presumably features were added in the order of the correlation
# list and the last entry (90) is the full feature set -- confirm.
train_score = [67931066834.67, 62306584063.49, 61471318688.55, 61331972036.77, 61062544452.85, 57063428453.64,
               57063428453.64, 56474065847.59, 48498227841.03, 46620127163.50, 46471326069.48, 46450188074.47,
               46346785423.06, 46175074862.26, 41929732178.36, 41609035250.42, 41422672632.55, 41210098136.32,
               40699278920.83, 40694987157.64, 25994480772.17]
test_score = [70035987663.86,  64466114800.02, 63394337312.83, 63346196234.39, 62906940485.03, 57520064549.14,
              57520064549.14, 57147409666.80, 48670558382.01, 44954342653.28, 44870210542.26, 44664863538.50,
              44802783588.71, 44445870879.89, 40142222536.94, 39705475435.78, 39456722730.82, 39327197095.20,
              38930511620.08, 38903082114.59, 24585784318.73]
# One x value per score: 1..20 features, then 90. The previous list repeated 16
# and stopped at 19, which shifted the last scores one step to the left.
number_of_features = list(range(1, 21)) + [90]

In [None]:
# Train vs. test MSE as a function of the number of features.
fig, ax = plt.subplots(figsize=(16, 5))
# Label both curves so the legend tells them apart.
ax.plot(number_of_features, train_score, label='Train')
ax.plot(number_of_features, test_score, label='Test')
# Zoom in on the first 20 feature counts; the final 90-feature point lies
# outside both axis limits on purpose.
ax.set_xlim([0, 20])
ax.set_ylim([38000000000, 71000000000])
plt.rcParams['font.size'] = 12
ax.set_title('Train Test Error')
ax.set_xlabel('Number of features')
ax.set_ylabel('MSE')
ax.grid(True)
ax.legend()
plt.show()

# Polynomial regression

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Expand the feature frame with degree-2 polynomial terms. This relies on
# `house` no longer containing 'price' (dropped before the first split).
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(house)

# Same split parameters as the linear model (20% test, seed 49).
X_train_poly, X_test_poly, y_train_poly, y_test_poly = train_test_split(
    X_poly, price, test_size=0.2, random_state=49
)

model = linear_model.LinearRegression()
model.fit(X_train_poly, y_train_poly)
predictions_poly = model.predict(X_test_poly)

mse_poly = mean_squared_error(y_test_poly, predictions_poly)
# Mean squared error on the test split
print("MSE: %.2f"% mse_poly)
# Root mean squared error
print("RMSE: %.2f"% sqrt(mse_poly))
# r2_score is the coefficient of determination (1 is a perfect prediction)
print('Variance score: %.2f' % r2_score(y_test_poly, predictions_poly))

# Random forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

# 500-tree forest. random_state pins the bootstrap/feature sampling so the
# reported scores are reproducible on re-run (same seed as the data split);
# n_jobs=-1 trains the trees on all available cores.
rf_model = RandomForestRegressor(n_estimators=500, random_state=49, n_jobs=-1)
rf_model.fit(X_train, y_train)
predictions_rf = rf_model.predict(X_test)

mse_rf = mean_squared_error(y_test, predictions_rf)
# Mean squared error on the test split
print("MSE: %.2f"% mse_rf)
# Root mean squared error
print("RMSE: %.2f"% sqrt(mse_rf))
# r2_score is the coefficient of determination (1 is a perfect prediction)
print('Variance score: %.2f' % r2_score(y_test, predictions_rf))

# XGBoost

In [None]:
from xgboost import XGBRegressor

# Gradient-boosted trees; hyper-parameters gathered in one place for readability.
xgb_params = dict(
    n_estimators=100,
    learning_rate=0.08,
    gamma=0,
    subsample=0.75,
    colsample_bytree=1,
    max_depth=7,
)
xgb = XGBRegressor(**xgb_params)
xgb.fit(X_train, y_train)
predictions_xgboost = xgb.predict(X_test)

mse_xgboost = mean_squared_error(y_test, predictions_xgboost)
# Mean squared error on the test split
print("MSE: %.2f"% mse_xgboost)
# Root mean squared error
print("RMSE: %.2f"% sqrt(mse_xgboost))
# r2_score is the coefficient of determination (1 is a perfect prediction)
print('Variance score: %.2f' % r2_score(y_test, predictions_xgboost))