In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
import matplotlib
import seaborn as sns
# Default size (width, height in inches) for every matplotlib figure in this notebook.
matplotlib.rcParams["figure.figsize"] = (20,10)
import warnings
# NOTE(review): this silences *every* warning for the rest of the session,
# including library deprecation notices that flag soon-to-break API usage.
warnings.filterwarnings('ignore')

In [2]:
# Load the housing data set (path is relative to the notebook's working directory).
# Presumably the output of an earlier cleaning step, judging by the file name — TODO confirm.
df_10 = pd.read_csv("./data/clean_data.csv")

In [3]:
# One-hot encode the categorical location column: one indicator column per distinct value.
dummies = pd.get_dummies(df_10["location"])
dummies.head()

Unnamed: 0,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,5th Phase JP Nagar,6th Phase JP Nagar,7th Phase JP Nagar,8th Phase JP Nagar,9th Phase JP Nagar,...,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur,other
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Append the location indicators to the original frame. The 'other' indicator is
# left out, so a row whose location indicators are all zero stands for 'other'.
df_11 = pd.concat([df_10, dummies.drop(columns='other')], axis=1)
df_11.head()

Unnamed: 0,location,total_sqft,bath,price,bhk,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,...,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
0,1st Block Jayanagar,2850.0,4.0,428.0,4,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1st Block Jayanagar,1630.0,3.0,194.0,3,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1st Block Jayanagar,1875.0,2.0,235.0,3,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1st Block Jayanagar,1200.0,2.0,130.0,3,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1st Block Jayanagar,1235.0,2.0,148.0,2,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Remove the text 'location' column from the frame used for modelling.
df_12 = df_11.drop(columns='location')

In [6]:
# Preview the first rows of the frame after dropping 'location'.
df_12.head()

Unnamed: 0,total_sqft,bath,price,bhk,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,5th Phase JP Nagar,...,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
0,2850.0,4.0,428.0,4,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1630.0,3.0,194.0,3,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1875.0,2.0,235.0,3,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1200.0,2.0,130.0,3,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1235.0,2.0,148.0,2,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# (rows, columns) of the frame that feeds the model.
df_12.shape

(7251, 245)

In [8]:
# Separate the regression target (price) from the predictor columns.
y = df_12['price']
X = df_12.drop(columns='price')

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
	X,
	y,
	test_size=0.2,
	random_state=10,
)

In [10]:
from sklearn.linear_model import LinearRegression

# Fit ordinary least squares on the training split (fit() returns the estimator itself),
# then report the score on the held-out split.
lr_clf = LinearRegression().fit(X_train, y_train)
lr_clf.score(X_test, y_test)

0.8452277697874291

In [11]:
# Cross-validate with five independent random 80/20 shuffles of the full data set
# (ShuffleSplit draws repeated random splits rather than disjoint k-fold partitions).
from sklearn.model_selection import ShuffleSplit, cross_val_score

cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
cross_val_score(LinearRegression(), X, y, cv=cv)

array([0.82430186, 0.77166234, 0.85089567, 0.80837764, 0.83653286])

In [12]:
# Use gridsearchcv to find the best performing algorithm
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor

In [13]:
def find_best_model_using_gridsearchcv(X, y):
	"""Grid-search several regressors and summarise the best setting of each.

	Linear regression, Lasso and a decision tree are each tuned with
	GridSearchCV over a small hyper-parameter grid, using five random 80/20
	shuffle splits (fixed seed) as the cross-validation scheme.

	Parameters
	----------
	X : feature matrix accepted by scikit-learn estimators.
	y : regression target aligned with ``X``.

	Returns
	-------
	pandas.DataFrame
		One row per algorithm with columns ``model``, ``best_score`` (the
		``best_score_`` of the search, i.e. the mean cross-validated score of
		the winning setting) and ``best_params``.
	"""
	# LinearRegression lost its `normalize` option in scikit-learn 1.2. Keep
	# searching over it where the installed version still has it; otherwise
	# search over `fit_intercept` so the grid stays valid.
	if 'normalize' in LinearRegression().get_params():
		linear_params = {'normalize': [True, False]}
	else:
		linear_params = {'fit_intercept': [True, False]}

	# The squared-error criterion is spelled 'mse' before scikit-learn 1.0 and
	# 'squared_error' afterwards ('mse' was removed in 1.2). It is the default
	# in every release, so the estimator's default gives the valid spelling.
	squared_error_name = DecisionTreeRegressor().get_params()['criterion']

	algos = {
		'linear_regressor' : {
			'model' : LinearRegression(),
			'params' : linear_params
		},
		'lasso' : {
			'model' : Lasso(),
			'params' : {
				'alpha' : [1, 2],
				'selection' : ['random', 'cyclic']
			}
		},
		'decision_tree': {
			'model' : DecisionTreeRegressor(),
			'params' : {
				'criterion' : [squared_error_name, 'friedman_mse'],
				'splitter' : ['best', 'random']
			}
		}
	}

	scores = []
	# Same splitter for every algorithm so their scores are directly comparable.
	cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
	for algo_name, config in algos.items():
		gs = GridSearchCV(config['model'], config['params'], cv=cv, return_train_score=False)
		gs.fit(X, y)
		scores.append({
			'model' : algo_name,
			'best_score' : gs.best_score_,
			'best_params' : gs.best_params_
		})
	return pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])

In [14]:
# Compare the candidate regressors on the full feature matrix and target.
find_best_model_using_gridsearchcv(X,y)

Unnamed: 0,model,best_score,best_params
0,linear_regressor,0.818354,{'normalize': True}
1,lasso,0.687432,"{'alpha': 1, 'selection': 'random'}"
2,decision_tree,0.721713,"{'criterion': 'mse', 'splitter': 'best'}"


In [15]:
def predict_price(location, sqft, bath, bhk):
	"""Predict the price of a home with the fitted linear model ``lr_clf``.

	Builds a single feature row laid out like the training matrix ``X``:
	the first three positions hold ``sqft``, ``bath`` and ``bhk`` (this
	assumes X's leading columns are total_sqft, bath, bhk, in that order),
	and the indicator column named ``location`` is set to 1.

	Parameters
	----------
	location : name of a location indicator column in ``X``. A name with no
		matching column leaves every indicator at 0, which is how the
		dropped 'other' category is represented.
	sqft : total square footage.
	bath : number of bathrooms.
	bhk : number of bedrooms (BHK).

	Returns
	-------
	The model's predicted price for that single row.
	"""
	x = np.zeros(len(X.columns))
	x[0] = sqft
	x[1] = bath
	x[2] = bhk

	# The match array is empty for a location without its own column. Indexing
	# it blindly with [0] raised IndexError, so check for a match first.
	loc_matches = np.flatnonzero(X.columns == location)
	if loc_matches.size > 0:
		x[loc_matches[0]] = 1
	return lr_clf.predict([x])[0]

In [16]:
# Example inference: 1000 sqft, 2 bathrooms, 2 BHK in '1st Phase JP Nagar'.
predict_price('1st Phase JP Nagar', 1000, 2, 2)


83.49904677170528

In [17]:
# Example inference: same location and area, but with 3 bathrooms and 3 BHK.
predict_price('1st Phase JP Nagar', 1000, 3, 3)

86.80519395197086

In [18]:
# Serialize the fitted linear model to a binary pickle file so it can be loaded for inference.
# NOTE(review): joblib is imported here but not used in this cell.
# NOTE: load this file only with the same scikit-learn version that produced it, and never
# unpickle a model file obtained from an untrusted source.
import joblib, pickle
with open('bangalore_house_prices_model.pickle', 'wb') as f:
	pickle.dump(lr_clf, f)


In [19]:
import json

# Save the lower-cased feature names in training-column order, so a consumer of
# the pickled model can rebuild an input row with the same layout.
columns = {'data_columns': list(map(str.lower, X.columns))}

with open("columns.json", "w") as f:
	json.dump(columns, f)