# 0.0. Imports

In [1]:
import re
import pandas as pd
import numpy as np

from geopy.geocoders import Nominatim

from sklearn import model_selection as ms
from sklearn import preprocessing as pp
from sklearn import dummy
from sklearn import metrics
from sklearn import linear_model as lm
from sklearn import ensemble as en

## 0.1. Helper Function

## 0.2. Load Data

In [2]:
df_raw = pd.read_csv('../data/Bengaluru_House_Data.csv')

# 1.0. Data Description

In [282]:
df1 = df_raw.copy()

In [281]:
df2.isna().mean()

area_type       0.000000
availability    0.000000
location        0.000075
size            0.001201
society         0.413063
total_sqft      0.000000
bath            0.005480
balcony         0.045721
price           0.000000
lat             0.000000
lon             0.000000
qt_bedroom      0.000000
dtype: float64

In [284]:
df1.describe(include=[object])

Unnamed: 0,area_type,availability,location,size,society,total_sqft
count,13320,13320,13319,13304,7818,13320
unique,4,81,1305,31,2688,2117
top,Super built-up Area,Ready To Move,Whitefield,2 BHK,GrrvaGr,1200
freq,8790,10581,540,5199,80,843


# 2.0. Feature Engineering

In [245]:
df2 = df1.copy()

In [81]:
# geolocator = Nominatim(user_agent="geoapiExercises")

# def location_lat(x):
#     if geolocator.geocode(x, timeout=None):
#         return geolocator.geocode(x, timeout=None).raw['lat']
#     else: 
#         return x

# df2['lat'] = df2['location'].apply(location_lat)

# def location_lon(x):
#     if geolocator.geocode(x, timeout=None):
#         return geolocator.geocode(x, timeout=None).raw['lon']
#     else: 
#         return x

# df2['lon'] = df2['location'].apply(location_lon)

# df2.to_csv('add_lat_and_lon_Bengaluru_House_Data.csv', index=False)

In [3]:
df2 = pd.read_csv('../data/add_lat_and_lon_Bengaluru_House_Data.csv')

In [280]:
# size
df2['qt_bedroom'] = df2['size'].apply(lambda x: str(x).split()[0])

In [None]:
'size', 'bath'

# 3.0. Data Filtering

In [303]:
df3 = df2.copy()

## 3.1. Filter Rows

In [304]:
# filter float
df3_lat_lon_float = df3.loc[df3['lat'].apply(lambda x: bool(re.search('^[0-9-][0-9][\.]+', x))), :]
df3_lat_lon_float.loc[:, 'lat'] = df3_lat_lon_float.loc[:, 'lat'].astype(float)

df3_lat_lon_str = df3.loc[~df3['lat'].apply(lambda x: bool(re.search('^[0-9-][0-9][\.]+', x))), :]
df3_lat_lon_float.loc[:, 'lon'] = df3_lat_lon_float.loc[:, 'lon'].astype(float)

In [305]:
df3 = df3_lat_lon_float.copy()

## 3.2. Filter Columns

In [306]:
drop_cols = ['location', 'society', 'size']
df3 = df3.drop(drop_cols, axis=1)

In [307]:
df3.isna().mean()

area_type       0.000000
availability    0.000000
total_sqft      0.000000
bath            0.005830
balcony         0.046557
price           0.000000
lat             0.000000
lon             0.000000
qt_bedroom      0.000000
dtype: float64

In [309]:
df3 = df3.dropna()

In [310]:
df3.isna().mean()

area_type       0.0
availability    0.0
total_sqft      0.0
bath            0.0
balcony         0.0
price           0.0
lat             0.0
lon             0.0
qt_bedroom      0.0
dtype: float64

# 4.0. EDA

In [312]:
df4 = df3.copy()

# 5.0. Data Preparation

In [313]:
df5 = df4.copy()

## 5.1. Standardization

## 5.2. Rescaling

## 5.3. Encoding



In [319]:
le = pp.LabelEncoder()
df5['area_type'] = le.fit_transform(df5[['area_type']])
df5['availability'] = le.fit_transform(df5[['availability']])

  return f(*args, **kwargs)


In [320]:
df5.head()

Unnamed: 0,area_type,availability,total_sqft,bath,balcony,price,lat,lon,qt_bedroom
0,3,38,1056,2.0,1.0,39.07,12.846854,77.676927,2
1,2,77,2600,5.0,3.0,120.0,12.895768,77.867101,4
2,0,77,1440,2.0,3.0,62.0,12.905568,77.545544,3
4,3,77,1200,2.0,1.0,51.0,12.580537,77.333067,2
5,3,77,1170,2.0,1.0,38.0,44.373058,-71.611858,2


# 6.0. Feature Selection

In [341]:
df6 = df5.copy()

In [342]:
X = df5.drop(['price', 'total_sqft'], axis=1)
y = df5['price'].copy()

x_train, x_val, y_train, y_val = ms.train_test_split(X, y, test_size=0.2, random_state=42)

# 7.0. Model Training

## 7.1. Average Model

In [347]:
# model definition and fit
model_baseline = dummy.DummyRegressor(strategy='mean').fit(x_train, y_train)

# model predict
yhat_baseline = model_baseline.predict(x_val)

# model perfomance
mae = metrics.mean_absolute_error(y_val, yhat_baseline)
mape = metrics.mean_absolute_percentage_error(y_val, yhat_baseline)
rmse = np.sqrt(metrics.mean_squared_error(y_val, yhat_baseline))

print('MAE: {} | MAPE: {} | RMSE: {}'.format(mae, mape, rmse))

MAE: 69.18655009838794 | MAPE: 0.9189057519989557 | RMSE: 130.53405853555498


## 7.2. Linear Regression Model

In [345]:
# model definition and fit
model_lr = lm.LinearRegression().fit(x_train, y_train)

# model predict
yhat_lr = model_lr.predict(x_val)

# model perfomance
mae = metrics.mean_absolute_error(y_val, yhat_lr)
mape = metrics.mean_absolute_percentage_error(y_val, yhat_lr)
rmse = np.sqrt(metrics.mean_squared_error(y_val, yhat_lr))

print('MAE: {} | MAPE: {} | RMSE: {}'.format(mae, mape, rmse))

MAE: 51.39237039425772 | MAPE: 0.5167872441161455 | RMSE: 113.84866787174654


## 7.3. Random Forest Model

In [361]:
# model definition and fit
model_rf = en.RandomForestRegressor().fit(x_train, y_train)

# model predict
yhat_rf = model_rf.predict(x_val)

# model perfomance
mae = metrics.mean_absolute_error(y_val, yhat_rf)
mape = metrics.mean_absolute_percentage_error(y_val, yhat_rf) 
rmse = np.sqrt(metrics.mean_squared_error(y_val, yhat_rf))

print('MAE: {} | MAPE: {} | RMSE: {}'.format(mae, mape, rmse))

MAE: 39.84512783025325 | MAPE: 0.357494658863791 | RMSE: 103.6417705071636


# 8.0. Hyperparameter Fine Tuning

# 9.0. Model Perfomance

# 10.0. Deploy to Product