In [1]:
# libraries to visualise data in browser
import webbrowser
import os

In [2]:
# lets import pre-processing libraries
import numpy as np
import pandas as pd

In [3]:
# import the data fetched from scraping
house_df = pd.read_csv("housing_data.csv")

# get a glance of the dataframe
house_df.head()

Unnamed: 0,id,category_id,title,sale_price,num_of_bhk,site,product_url,total_sqft,livable_sqft,num_bedrooms,...,flat_condition,flooring_type,parking,furnishing_state,direction_facing,property_on,ownership,year_built,location,inserted_timestamp
0,1,1,1BHK Apartment for Sale,1.1 Crores,1BHK,CommonFloor,https://www.commonfloor.com/listing/1bhk-apart...,600.0,420.0,1.0,...,New Sale,,No,,,Ground floor,,,Borivali East,2018-09-12 15:37:33
1,2,1,2BHK Apartment for Sale,75 Lakhs,2BHK,CommonFloor,https://www.commonfloor.com/listing/2bhk-apart...,955.0,620.0,2.0,...,Resale,,No,,,Ground floor,,,Mira Road,2018-09-12 15:37:38
2,3,1,1BHK Apartment for Sale,73 Lakhs,1BHK,CommonFloor,https://www.commonfloor.com/listing/1bhk-apart...,489.0,342.0,1.0,...,New Sale,,No,,,Ground floor,,,Dahisar East,2018-09-12 15:37:42
3,4,1,3BHK Apartment for Sale,2.74 Crores,3BHK,CommonFloor,https://www.commonfloor.com/listing/semi-furni...,1330.0,950.0,3.0,...,New Sale,Marble Flooring,Yes,semi furnished,East,5th floor,Freehold,Under Construction,Andheri West,2018-09-12 15:37:46
4,5,1,1BHK Apartment for Sale,80 Lakhs,1BHK,CommonFloor,https://www.commonfloor.com/listing/1bhk-apart...,575.0,375.0,1.0,...,Resale,,No,,,Ground floor,,,Kandivali West,2018-09-12 15:37:50


In [4]:
# lets check the shape of the dataset
house_df.shape

(6073, 21)

In [5]:
# get all the columns
house_df.columns

Index(['id', 'category_id', 'title', 'sale_price', 'num_of_bhk', 'site',
       'product_url', 'total_sqft', 'livable_sqft', 'num_bedrooms',
       'num_bathrooms', 'flat_condition', 'flooring_type', 'parking',
       'furnishing_state', 'direction_facing', 'property_on', 'ownership',
       'year_built', 'location', 'inserted_timestamp'],
      dtype='object')

In [6]:
# drop irrelevant columns
columns = ['id', 'category_id', 'title','site', 'product_url', 'inserted_timestamp']
house_df.drop(columns, inplace=True, axis=1)

# dataframe after deleting irrelevant columns
house_df.head()

Unnamed: 0,sale_price,num_of_bhk,total_sqft,livable_sqft,num_bedrooms,num_bathrooms,flat_condition,flooring_type,parking,furnishing_state,direction_facing,property_on,ownership,year_built,location
0,1.1 Crores,1BHK,600.0,420.0,1.0,2.0,New Sale,,No,,,Ground floor,,,Borivali East
1,75 Lakhs,2BHK,955.0,620.0,2.0,2.0,Resale,,No,,,Ground floor,,,Mira Road
2,73 Lakhs,1BHK,489.0,342.0,1.0,2.0,New Sale,,No,,,Ground floor,,,Dahisar East
3,2.74 Crores,3BHK,1330.0,950.0,3.0,3.0,New Sale,Marble Flooring,Yes,semi furnished,East,5th floor,Freehold,Under Construction,Andheri West
4,80 Lakhs,1BHK,575.0,375.0,1.0,2.0,Resale,,No,,,Ground floor,,,Kandivali West


In [7]:
# Create a web page view of the data for easy viewing
# html = house_df[0:100].to_html()

# # Save the html to a temporary file
# with open("data.html", "w") as f:
#     f.write(html)

# # Open the web page in our web browser
# full_filename = os.path.abspath("data.html")
# webbrowser.open("file://{}".format(full_filename))

In [8]:
# get unique values of following columns

column_list = ["num_of_bhk","num_bedrooms","num_bathrooms","flat_condition","flooring_type","parking","furnishing_state","direction_facing","property_on","ownership","year_built"]

#column_list = ["property_on"]

for curr_col in column_list:
    print("Current Column Name: " + curr_col)
    print("Column Unique Values: ", house_df[curr_col].unique())

Current Column Name: num_of_bhk
Column Unique Values:  ['1BHK' '2BHK' '3BHK' nan '4BHK' '0.5BHK' '4+BHK']
Current Column Name: num_bedrooms
Column Unique Values:  [ 1.  2.  3.  4. nan]
Current Column Name: num_bathrooms
Column Unique Values:  [ 2.  3.  1.  5.  4. nan  7.  6.]
Current Column Name: flat_condition
Column Unique Values:  ['New Sale' 'Resale' nan]
Current Column Name: flooring_type
Column Unique Values:  [nan 'Marble Flooring' 'Vitrified Tiles' 'Ceramic' 'Marbonite' 'Wooden'
 'Granite' 'Normal Tiles' 'Vetrified' 'Mosaic Tiles']
Current Column Name: parking
Column Unique Values:  ['No' 'Yes' nan]
Current Column Name: furnishing_state
Column Unique Values:  [nan 'semi furnished' 'not furnished' 'fully furnished' 'Not furnished'
 'Fully Furnished furnished' 'Unfurnished furnished' 'Fully furnished'
 'Semi furnished' 'Semi-Furnished furnished']
Current Column Name: direction_facing
Column Unique Values:  [nan 'East' 'West' 'North' 'North-East' 'South-East' 'North-West'
 'South-

In [9]:
# Data Exploration/Analysis

# lets print data info
house_df.info()

# lets describe the data
house_df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6073 entries, 0 to 6072
Data columns (total 15 columns):
sale_price          5721 non-null object
num_of_bhk          4420 non-null object
total_sqft          5600 non-null float64
livable_sqft        5655 non-null float64
num_bedrooms        5662 non-null float64
num_bathrooms       5662 non-null float64
flat_condition      5644 non-null object
flooring_type       3084 non-null object
parking             5721 non-null object
furnishing_state    3914 non-null object
direction_facing    2606 non-null object
property_on         5721 non-null object
ownership           3120 non-null object
year_built          2272 non-null object
location            5720 non-null object
dtypes: float64(4), object(11)
memory usage: 711.8+ KB


Unnamed: 0,total_sqft,livable_sqft,num_bedrooms,num_bathrooms
count,5600.0,5655.0,5662.0,5662.0
mean,857.697857,615.316357,1.703992,1.752031
std,969.901755,451.202257,0.815402,0.822516
min,120.0,1.0,1.0,1.0
25%,535.0,389.5,1.0,1.0
50%,693.0,501.0,2.0,2.0
75%,1000.0,735.0,2.0,2.0
max,55055.0,16800.0,4.0,7.0


In [10]:
# Find missing values in each column
#lets see which are the columns with missing values
house_df.isnull().sum()

sale_price           352
num_of_bhk          1653
total_sqft           473
livable_sqft         418
num_bedrooms         411
num_bathrooms        411
flat_condition       429
flooring_type       2989
parking              352
furnishing_state    2159
direction_facing    3467
property_on          352
ownership           2953
year_built          3801
location             353
dtype: int64

In [11]:
#Lets check which rows have null sale_price column
house_df[house_df['sale_price'].isnull()].head()

Unnamed: 0,sale_price,num_of_bhk,total_sqft,livable_sqft,num_bedrooms,num_bathrooms,flat_condition,flooring_type,parking,furnishing_state,direction_facing,property_on,ownership,year_built,location
5590,,,,,,,,,,,,,,,
5591,,,,,,,,,,,,,,,
5593,,,,,,,,,,,,,,,
5598,,,,,,,,,,,,,,,
5601,,,,,,,,,,,,,,,


In [12]:
# lets drop those rows who has sale price null
house_df.drop(house_df[house_df['sale_price'].isnull()].index, inplace=True)

In [13]:
# lets check again the missing values count for each column
house_df.isnull().sum()

sale_price             0
num_of_bhk          1301
total_sqft           121
livable_sqft          66
num_bedrooms          59
num_bathrooms         59
flat_condition        77
flooring_type       2637
parking                0
furnishing_state    1807
direction_facing    3115
property_on            0
ownership           2601
year_built          3449
location               1
dtype: int64

In [14]:
# in order to deal with missing values in each column I will start by dealing with least missing value to column to highest missing values column.
# In that case I will start with the location attribute.

# Dealing with missing value in location column
# Lets check with rows has missing location value.
house_df[house_df['location'].isnull()]

Unnamed: 0,sale_price,num_of_bhk,total_sqft,livable_sqft,num_bedrooms,num_bathrooms,flat_condition,flooring_type,parking,furnishing_state,direction_facing,property_on,ownership,year_built,location
3887,60 Lakhs,2BHK,755.0,521.0,2.0,2.0,,,No,Unfurnished furnished,"Garden Facing,Road S",Ground floor,freehold,,


In [15]:
# In order to replace the missing value in location column, we can find the location of all the house which has price 60 Lakhs and 2BHK apartment.
house_loc_df = house_df[(house_df['sale_price'] == '60 Lakhs') & (house_df['num_of_bhk'] == '2BHK')]
house_loc_df.head()

Unnamed: 0,sale_price,num_of_bhk,total_sqft,livable_sqft,num_bedrooms,num_bathrooms,flat_condition,flooring_type,parking,furnishing_state,direction_facing,property_on,ownership,year_built,location
28,60 Lakhs,2BHK,950.0,650.0,2.0,2.0,Resale,Vitrified Tiles,Yes,fully furnished,East,Ground floor,Freehold,2010.0,Virar West
927,60 Lakhs,2BHK,800.0,640.0,2.0,2.0,Resale,Vitrified Tiles,Yes,semi furnished,East,1st floor,Freehold,,Mira Road
1353,60 Lakhs,2BHK,1250.0,970.0,2.0,2.0,Resale,,No,,,Ground floor,,,Badlapur East
1503,60 Lakhs,2BHK,1325.0,1060.0,2.0,2.0,Resale,Vitrified Tiles,Yes,semi furnished,,11th floor,Freehold,,Bhiwandi
1506,60 Lakhs,2BHK,1075.0,860.0,2.0,2.0,Resale,Vitrified Tiles,Yes,semi furnished,,9th floor,Freehold,,Bhiwandi


In [16]:
# check the shape
house_loc_df.shape

(17, 15)

In [17]:
# lets check unique location values for the custom dataframe
house_loc_df['location'].unique()

array(['Virar West', 'Mira Road', 'Badlapur East', 'Bhiwandi',
       'Kalyan West', 'Goregaon East', 'Dombivli East', nan, 'Kashimira'],
      dtype=object)

In [18]:
# lets check the location count
house_loc_df['location'].value_counts()

Dombivli East    6
Mira Road        3
Bhiwandi         2
Virar West       1
Kalyan West      1
Kashimira        1
Goregaon East    1
Badlapur East    1
Name: location, dtype: int64

In [19]:
# from the above ouput we can see that "Dombivli East" appears to be most frequent so I will assign that value to the missing location data.

house_df["location"] = house_df["location"].fillna('Dombivli East')

# now lets check if the missing value is assigned properly
house_df.iloc[3887]

sale_price                       60 Lakhs
num_of_bhk                           2BHK
total_sqft                            755
livable_sqft                          521
num_bedrooms                            2
num_bathrooms                           2
flat_condition                        NaN
flooring_type                         NaN
parking                                No
furnishing_state    Unfurnished furnished
direction_facing     Garden Facing,Road S
property_on                  Ground floor
ownership                        freehold
year_built                            NaN
location                    Dombivli East
Name: 3887, dtype: object

In [20]:
# lets check again the missing values count for each column
house_df.isnull().sum().sort_values()

sale_price             0
parking                0
property_on            0
location               0
num_bedrooms          59
num_bathrooms         59
livable_sqft          66
flat_condition        77
total_sqft           121
num_of_bhk          1301
furnishing_state    1807
ownership           2601
flooring_type       2637
direction_facing    3115
year_built          3449
dtype: int64

In [21]:
# lets deal with missing values in num_bedrooms and num_bathrooms
# My only reason to deal with them together is because the number of missing values are same in both and there might be some pattern associated due to this.

missing_bed_bath_df = house_df[house_df['num_bedrooms'].isnull() & house_df['num_bathrooms'].isnull()]

In [22]:
# lets have glance over the data
missing_bed_bath_df.head()

Unnamed: 0,sale_price,num_of_bhk,total_sqft,livable_sqft,num_bedrooms,num_bathrooms,flat_condition,flooring_type,parking,furnishing_state,direction_facing,property_on,ownership,year_built,location
111,22.55 Lakhs,,,,,,,,No,,,Ground floor,,,Badlapur East
116,22.45 Lakhs,1BHK,,,,,,,No,,,Ground floor,,,Badlapur East
147,35.5 Lakhs,2BHK,,,,,,,No,,,Ground floor,,,Virar West
174,1.95 Crores,2BHK,,,,,,,No,,,Ground floor,,,Chembur
189,1.6 Crores,2BHK,,,,,,,No,,,Ground floor,,,Malad east


In [23]:
# check the shape
missing_bed_bath_df.shape

(59, 15)

In [24]:
# From the above 2 output we can see that there a 
# there are multiple columns which are null when we group by null on both num_bedrooms & num_bathrooms
# Also the number of rows are only 59 so dropping them would make sense, as we need to fill multiple column missing data which might cause issue in our prediction

row_index_list = house_df[house_df['num_bedrooms'].isnull() & house_df['num_bathrooms'].isnull()].index
house_df.drop(row_index_list, axis=0, inplace=True)

In [25]:
# lets check if the rows are deleted.
house_df[house_df['num_bedrooms'].isnull() & house_df['num_bathrooms'].isnull()]

Unnamed: 0,sale_price,num_of_bhk,total_sqft,livable_sqft,num_bedrooms,num_bathrooms,flat_condition,flooring_type,parking,furnishing_state,direction_facing,property_on,ownership,year_built,location


In [26]:
# lets check again the missing values count for each column
house_df.isnull().sum().sort_values()

sale_price             0
num_bedrooms           0
num_bathrooms          0
parking                0
property_on            0
location               0
livable_sqft           7
flat_condition        18
total_sqft            62
num_of_bhk          1289
furnishing_state    1751
ownership           2542
flooring_type       2581
direction_facing    3058
year_built          3392
dtype: int64

In [27]:
# now that we have taken care of num_bedrooms & num_bathrooms
# lets take care of livable_sqft feature.
# lets get the null values in livable_sqft feature

house_df[house_df['livable_sqft'].isnull()]

Unnamed: 0,sale_price,num_of_bhk,total_sqft,livable_sqft,num_bedrooms,num_bathrooms,flat_condition,flooring_type,parking,furnishing_state,direction_facing,property_on,ownership,year_built,location
1220,1 Crore,4BHK,1340.0,,4.0,3.0,Resale,Vitrified Tiles,Yes,semi furnished,East,1st floor,freehold,2008.0,Kalyan West
1241,17 Lakhs,0.5BHK,375.0,,1.0,1.0,New Sale,Marble Flooring,Yes,not furnished,,2nd floor,,2012.0,Badlapur East
1330,1.65 Crores,1BHK,637.0,,1.0,1.0,Resale,Vitrified Tiles,Yes,fully furnished,,7th floor,freehold,2007.0,Wadala East
3594,2 Crores,,942.0,,2.0,2.0,Resale,Normal Tiles,Yes,,,Ground floor,Freehold,,Santacruz East
3732,45 Lakhs,,650.0,,1.0,1.0,Resale,,No,not furnished,,Ground floor,,,Dombivli East
3738,33 Lakhs,1BHK,575.0,,1.0,1.0,New Sale,Normal Tiles,Yes,semi furnished,East,2nd floor,freehold,2014.0,Kandivali West
5255,1.2 Crores,2BHK,650.0,,2.0,2.0,Resale,Vitrified Tiles,Yes,,,Ground floor,Freehold,,Mumbai Central


In [28]:
house_df[(house_df['num_of_bhk']=='4BHK') & (house_df['total_sqft'] > 1000) & (house_df['total_sqft'] < 1500)].head()

Unnamed: 0,sale_price,num_of_bhk,total_sqft,livable_sqft,num_bedrooms,num_bathrooms,flat_condition,flooring_type,parking,furnishing_state,direction_facing,property_on,ownership,year_built,location
529,1 Crore,4BHK,1429.0,1000.0,4.0,3.0,Resale,,Yes,Not furnished,,Ground floor,,,Dombivli East
1165,4.38 Crores,4BHK,1393.0,1393.0,4.0,4.0,New Sale,Vitrified Tiles,Yes,not furnished,East,8th floor,Freehold,Under Construction,Vikhroli East
1220,1 Crore,4BHK,1340.0,,4.0,3.0,Resale,Vitrified Tiles,Yes,semi furnished,East,1st floor,freehold,2008,Kalyan West
1346,4.1 Crores,4BHK,1125.0,1125.0,4.0,4.0,Resale,Vitrified Tiles,Yes,not furnished,East,6th floor,Freehold,2018,Chembur
2159,1.95 Crores,4BHK,1300.0,910.0,4.0,4.0,New Sale,,No,Fully Furnished furnished,,Ground floor,,,Kurla East


In [29]:
diff = house_df['total_sqft'] - house_df['livable_sqft']
diff.head()

0    180.0
1    335.0
2    147.0
3    380.0
4    200.0
dtype: float64

In [30]:
total_livable_sqft_diff_mean = diff.mean()
total_livable_sqft_diff_mean = round(total_livable_sqft_diff_mean)
total_livable_sqft_diff_mean

244.0

In [31]:
# when filling missing values for livable_sqft there was a lot of confusion on how should I approach
# First I thought about targetting individual rows and based on num_of_bhk & total_sqft feature I would decide the livable sqft for each row, but that would be time consuming and I personally think this is very weird approach
# So instead I decided to find out the difference between total_sqft and livable_sqft for all the rows and take the average of difference.
# And I will use the average difference value to subtract from total_sqft for every livable_sqft where the value is null.
# Vice-versa I will use the logic to fill the total_sqft, so that the values are somewhat consistent

In [32]:
house_df['livable_sqft'] = house_df['livable_sqft'].fillna(house_df['total_sqft'] - total_livable_sqft_diff_mean)

In [33]:
house_df[(house_df['num_of_bhk']=='4BHK') & (house_df['total_sqft'] > 1000) & (house_df['total_sqft'] < 1500)].head()

Unnamed: 0,sale_price,num_of_bhk,total_sqft,livable_sqft,num_bedrooms,num_bathrooms,flat_condition,flooring_type,parking,furnishing_state,direction_facing,property_on,ownership,year_built,location
529,1 Crore,4BHK,1429.0,1000.0,4.0,3.0,Resale,,Yes,Not furnished,,Ground floor,,,Dombivli East
1165,4.38 Crores,4BHK,1393.0,1393.0,4.0,4.0,New Sale,Vitrified Tiles,Yes,not furnished,East,8th floor,Freehold,Under Construction,Vikhroli East
1220,1 Crore,4BHK,1340.0,1096.0,4.0,3.0,Resale,Vitrified Tiles,Yes,semi furnished,East,1st floor,freehold,2008,Kalyan West
1346,4.1 Crores,4BHK,1125.0,1125.0,4.0,4.0,Resale,Vitrified Tiles,Yes,not furnished,East,6th floor,Freehold,2018,Chembur
2159,1.95 Crores,4BHK,1300.0,910.0,4.0,4.0,New Sale,,No,Fully Furnished furnished,,Ground floor,,,Kurla East


In [34]:
# first check the count and display few rows which has null total_sqft values
house_df[house_df['total_sqft'].isnull()].head()

Unnamed: 0,sale_price,num_of_bhk,total_sqft,livable_sqft,num_bedrooms,num_bathrooms,flat_condition,flooring_type,parking,furnishing_state,direction_facing,property_on,ownership,year_built,location
5,2.47 Crores,3BHK,,900.0,3.0,3.0,New Sale,Vitrified Tiles,Yes,not furnished,West,8th floor,Freehold,2018.0,Kandivali West
8,1.84 Crores,3BHK,,900.0,3.0,3.0,New Sale,Vitrified Tiles,Yes,not furnished,West,9th floor,Freehold,2016.0,Kandivali West
11,1.5 Crores,2BHK,,667.0,2.0,2.0,New Sale,Vitrified Tiles,Yes,not furnished,East,3th floor,Freehold,2017.0,Kandivali West
12,95 Lakhs,1BHK,,473.0,1.0,2.0,Resale,Vitrified Tiles,Yes,semi furnished,North,2nd floor,Freehold,,Kandivali East
13,1.99 Crores,2BHK,,727.0,2.0,2.0,New Sale,Vitrified Tiles,Yes,not furnished,East,9th floor,Freehold,2018.0,Kandivali West


In [35]:
house_df[house_df['total_sqft'].isnull()].shape

(62, 15)

In [36]:
# lets deal with the total_sqft feature, I will use the reverse process of what I did for livable_sqft feature.
house_df['total_sqft'] = house_df['total_sqft'].fillna(house_df['livable_sqft'] + total_livable_sqft_diff_mean)

In [37]:
house_df.iloc[5]

sale_price              2.47 Crores
num_of_bhk                     3BHK
total_sqft                     1144
livable_sqft                    900
num_bedrooms                      3
num_bathrooms                     3
flat_condition             New Sale
flooring_type       Vitrified Tiles
parking                         Yes
furnishing_state      not furnished
direction_facing               West
property_on               8th floor
ownership                  Freehold
year_built                     2018
location             Kandivali West
Name: 5, dtype: object

In [38]:
# lets check again the missing values count for each column
house_df.isnull().sum().sort_values()

sale_price             0
total_sqft             0
livable_sqft           0
num_bedrooms           0
num_bathrooms          0
parking                0
property_on            0
location               0
flat_condition        18
num_of_bhk          1289
furnishing_state    1751
ownership           2542
flooring_type       2581
direction_facing    3058
year_built          3392
dtype: int64

In [39]:
# lets deal with missing values for flat_condition feature
# house_df[(house['year_built']) & house_df['year_built'].astype(float) > 2015.0]

house_int_year_built = house_df[house_df['year_built'].astype(str).str.isdigit()]
house_int_year_built['year_built'].unique()

array(['2018', '2014', '2016', '2017', '2013', '2010', '2002', '2008',
       '2012', '2015', '2000', '2011', '2001', '1998', '2006', '1983',
       '2003', '1990', '2005', '2009', '2007', '1988', '1989', '1994',
       '1986', '1992', '1995', '2004', '1991', '1993', '1996', '1999'],
      dtype=object)

In [40]:
house_df.loc[house_df['year_built'] == 'Under Construction'].shape

(700, 15)

In [41]:
house_df.loc[house_df['year_built'].isnull()].shape

(3392, 15)

In [42]:
# flat condition is very much dependent on year_built feature, also the number of null values is highest in year_built feature, so currently halted working on flat_condition feature as I will first deal with year_built column

# In order to fill missing values in year_built column I will use Random Forest Regression, but in order to efficiently use Random Forest Regression I will first deal with missing values in other columns(num_of_bhk, furnishing_state)

In [43]:
# Droping the following feature
# ownership, direction_facing
columns = ['ownership', 'direction_facing']
house_df.drop(columns, inplace=True, axis=1)

# dataframe after deleting above columns
house_df.head()

Unnamed: 0,sale_price,num_of_bhk,total_sqft,livable_sqft,num_bedrooms,num_bathrooms,flat_condition,flooring_type,parking,furnishing_state,property_on,year_built,location
0,1.1 Crores,1BHK,600.0,420.0,1.0,2.0,New Sale,,No,,Ground floor,,Borivali East
1,75 Lakhs,2BHK,955.0,620.0,2.0,2.0,Resale,,No,,Ground floor,,Mira Road
2,73 Lakhs,1BHK,489.0,342.0,1.0,2.0,New Sale,,No,,Ground floor,,Dahisar East
3,2.74 Crores,3BHK,1330.0,950.0,3.0,3.0,New Sale,Marble Flooring,Yes,semi furnished,5th floor,Under Construction,Andheri West
4,80 Lakhs,1BHK,575.0,375.0,1.0,2.0,Resale,,No,,Ground floor,,Kandivali West


In [44]:
# lets check again the missing values count for each column
house_df.isnull().sum().sort_values()

sale_price             0
total_sqft             0
livable_sqft           0
num_bedrooms           0
num_bathrooms          0
parking                0
property_on            0
location               0
flat_condition        18
num_of_bhk          1289
furnishing_state    1751
flooring_type       2581
year_built          3392
dtype: int64

In [45]:
house_df.loc[house_df['num_of_bhk'].isnull()].head()
# house_df.loc[house_df['num_of_bhk'].isnull()]['num_bedrooms'].unique()

Unnamed: 0,sale_price,num_of_bhk,total_sqft,livable_sqft,num_bedrooms,num_bathrooms,flat_condition,flooring_type,parking,furnishing_state,property_on,year_built,location
46,32 Lakhs,,375.0,263.0,1.0,1.0,Resale,,Yes,semi furnished,3th floor,2002.0,Bhayandar West
66,1.75 Crores,,1010.0,723.0,2.0,2.0,Resale,Vitrified Tiles,No,fully furnished,2nd floor,2008.0,Sakinaka
83,5.9 Crores,,1100.0,770.0,2.0,3.0,New Sale,Vitrified Tiles,Yes,semi furnished,1st floor,2012.0,Bandra West
86,34 Lakhs,,825.0,578.0,1.0,1.0,Resale,Normal Tiles,Yes,not furnished,6th floor,2017.0,Badlapur West
99,40 Lakhs,,540.0,378.0,1.0,1.0,New Sale,,No,,Ground floor,,Bhayander East


In [46]:
# from above and by common sense, we know that num_bedrooms is directly correlate with num_of_bhk
# e.g. if num_bedroom = 1, then 1BHK and so on

house_df.loc[house_df['num_bedrooms'] == 1 & house_df['num_of_bhk'].isnull()].head()

Unnamed: 0,sale_price,num_of_bhk,total_sqft,livable_sqft,num_bedrooms,num_bathrooms,flat_condition,flooring_type,parking,furnishing_state,property_on,year_built,location
46,32 Lakhs,,375.0,263.0,1.0,1.0,Resale,,Yes,semi furnished,3th floor,2002,Bhayandar West
86,34 Lakhs,,825.0,578.0,1.0,1.0,Resale,Normal Tiles,Yes,not furnished,6th floor,2017,Badlapur West
99,40 Lakhs,,540.0,378.0,1.0,1.0,New Sale,,No,,Ground floor,,Bhayander East
110,23 Lakhs,,600.0,420.0,1.0,1.0,Resale,Vitrified Tiles,Yes,not furnished,4th floor,2014,Badlapur East
159,39.96 Lakhs,,685.0,441.0,1.0,1.0,New Sale,Marbonite,Yes,semi furnished,9th floor,Under Construction,Dombivli East


In [47]:
house_df.loc[(house_df['num_bedrooms'] == 4) & (house_df['num_of_bhk'].isnull())].shape

(30, 13)

In [48]:
map_data = {
    1: "1BHK",
    2: "2BHK",
    3: "3BHK",
    4: "4BHK",
}


for key in map_data:
    house_df.loc[(house_df['num_bedrooms'] == key) & (house_df['num_of_bhk'].isnull()), 'num_of_bhk'] = house_df.loc[(house_df['num_bedrooms'] == key) & (house_df['num_of_bhk'].isnull()), 'num_of_bhk'].fillna(map_data[key])
#     house_df['num_of_bhk'] = temp_df['num_of_bhk'].fillna(map_data[key])

In [49]:
house_df.iloc[66]

sale_price              1.75 Crores
num_of_bhk                     2BHK
total_sqft                     1010
livable_sqft                    723
num_bedrooms                      2
num_bathrooms                     2
flat_condition               Resale
flooring_type       Vitrified Tiles
parking                          No
furnishing_state    fully furnished
property_on               2nd floor
year_built                     2008
location                   Sakinaka
Name: 66, dtype: object

In [50]:
# lets check again the missing values count for each column
house_df.isnull().sum().sort_values()

sale_price             0
num_of_bhk             0
total_sqft             0
livable_sqft           0
num_bedrooms           0
num_bathrooms          0
parking                0
property_on            0
location               0
flat_condition        18
furnishing_state    1751
flooring_type       2581
year_built          3392
dtype: int64

In [51]:
# lets deal with missing values in flooring type feature.
house_df[house_df['flooring_type'].isnull()].head()

Unnamed: 0,sale_price,num_of_bhk,total_sqft,livable_sqft,num_bedrooms,num_bathrooms,flat_condition,flooring_type,parking,furnishing_state,property_on,year_built,location
0,1.1 Crores,1BHK,600.0,420.0,1.0,2.0,New Sale,,No,,Ground floor,,Borivali East
1,75 Lakhs,2BHK,955.0,620.0,2.0,2.0,Resale,,No,,Ground floor,,Mira Road
2,73 Lakhs,1BHK,489.0,342.0,1.0,2.0,New Sale,,No,,Ground floor,,Dahisar East
4,80 Lakhs,1BHK,575.0,375.0,1.0,2.0,Resale,,No,,Ground floor,,Kandivali West
14,52.8 Lakhs,2BHK,880.0,750.0,2.0,2.0,Resale,,No,,Ground floor,,Virar West


In [52]:
# I will try a different approache, lets check what are is mode value for flooring with respective to num_of_bhk
print(house_df.loc[house_df['num_of_bhk'] == '1BHK','flooring_type'].mode())
print(house_df.loc[house_df['num_of_bhk'] == '2BHK','flooring_type'].mode())
print(house_df.loc[house_df['num_of_bhk'] == '3BHK','flooring_type'].mode())
print(house_df.loc[house_df['num_of_bhk'] == '4BHK','flooring_type'].mode())

0    Vitrified Tiles
dtype: object
0    Vitrified Tiles
dtype: object
0    Vitrified Tiles
dtype: object
0    Vitrified Tiles
dtype: object


In [53]:
house_df['flooring_type'].mode()

0    Vitrified Tiles
dtype: object

In [54]:
# from above to scenarios, I can conclude that for each bhk type and the complete flooring_type mode correspondence to only 1 value 'Vitrified Tiles'
# so lets add 'Vitrified Tiles' value for every null values in flooring_type column.
house_df['flooring_type'] = house_df['flooring_type'].fillna('Vitrified Tiles')

In [55]:
# lets check again the missing values count for each column
house_df.isnull().sum().sort_values()

sale_price             0
num_of_bhk             0
total_sqft             0
livable_sqft           0
num_bedrooms           0
num_bathrooms          0
flooring_type          0
parking                0
property_on            0
location               0
flat_condition        18
furnishing_state    1751
year_built          3392
dtype: int64

In [56]:
# Now there are only 3 columns that needs to be deal with 
# From this 3 I will first work with year_built as other 2 dependesn on it
house_df[house_df['year_built'].isnull()].tail()

Unnamed: 0,sale_price,num_of_bhk,total_sqft,livable_sqft,num_bedrooms,num_bathrooms,flat_condition,flooring_type,parking,furnishing_state,property_on,year_built,location
6022,60 Lakhs,1BHK,350.0,245.0,1.0,1.0,New Sale,Vitrified Tiles,No,Semi-Furnished furnished,Ground floor,,Nagari Niwara Post Office
6025,46 Lakhs,2BHK,785.0,550.0,2.0,2.0,New Sale,Vitrified Tiles,No,Semi-Furnished furnished,Ground floor,,Nalasopara East
6026,65 Lakhs,1BHK,275.0,193.0,1.0,1.0,New Sale,Vitrified Tiles,No,Semi-Furnished furnished,Ground floor,,Grant Road East
6027,56 Lakhs,2BHK,750.0,525.0,2.0,2.0,New Sale,Vitrified Tiles,No,Semi-Furnished furnished,Ground floor,,Mira Road
6033,24 Lakhs,0.5BHK,369.0,258.0,1.0,1.0,New Sale,Vetrified,Yes,Not furnished,9th floor,,Ambernath West


In [57]:
house_df.loc[house_df['year_built'] == 'Under Construction'].head()

Unnamed: 0,sale_price,num_of_bhk,total_sqft,livable_sqft,num_bedrooms,num_bathrooms,flat_condition,flooring_type,parking,furnishing_state,property_on,year_built,location
3,2.74 Crores,3BHK,1330.0,950.0,3.0,3.0,New Sale,Marble Flooring,Yes,semi furnished,5th floor,Under Construction,Andheri West
33,39 Lakhs,1BHK,722.0,441.0,1.0,1.0,New Sale,Vitrified Tiles,Yes,semi furnished,9th floor,Under Construction,Dombivli East
34,1.71 Crores,2BHK,928.0,650.0,2.0,2.0,New Sale,Vitrified Tiles,Yes,not furnished,11th floor,Under Construction,Mulund West
35,2.05 Crores,3BHK,1256.0,879.0,3.0,3.0,New Sale,Vitrified Tiles,Yes,not furnished,9th floor,Under Construction,Jogeshwari East
36,2.2 Crores,2BHK,1017.0,712.0,2.0,2.0,New Sale,Vitrified Tiles,Yes,not furnished,9th floor,Under Construction,Andheri West


In [58]:
# lets chage the value to '2018' for year_built equal to Under Construction
# house_df.loc[house_df['year_built'] == 'Under Construction','year_built'] = '2018'

In [59]:
house_df.iloc[3]

sale_price                 2.74 Crores
num_of_bhk                        3BHK
total_sqft                        1330
livable_sqft                       950
num_bedrooms                         3
num_bathrooms                        3
flat_condition                New Sale
flooring_type          Marble Flooring
parking                            Yes
furnishing_state        semi furnished
property_on                  5th floor
year_built          Under Construction
location                  Andheri West
Name: 3, dtype: object

In [60]:
# lets change the data type of the year_built column from string to int
# house_df['year_built'] = house_df['year_built'].fillna(-1)
# house_df['year_built'] = house_df['year_built'].astype(np.int64)
# house_df['year_built'] = house_df['year_built'].replace(-1, np.nan)

In [61]:
house_df['year_built'].unique()

array([nan, 'Under Construction', '2018', '2014', '2016', '2017', '2013',
       '2010', '2002', '2008', '2012', '2015', '2000', '2011', '2001',
       '1998', '2006', '1983', '2003', '1990', '2005', '2009', '2007',
       '1988', '1989', '1994', '1986', '1992', '1995', '2004', '1991',
       '1993', '1996', '1999'], dtype=object)

In [62]:
house_df['year_built'].describe()

count                   2270
unique                    33
top       Under Construction
freq                     700
Name: year_built, dtype: object

In [63]:
# raise KeyboardInterrupt

In [64]:
# As we have see above that year_built has the maximum number of null values, simply dropping the null values row won't make sense as we will have insufficient data for training, also year_built column is very usefull so can't drop the column also
# So to deal with large missing values in year_built feature I have decided to use random forest regressor for predicting the values.

# import the RandomForestRegressor Class
from sklearn.ensemble import RandomForestClassifier

#Feature set
year_df = house_df[['year_built','total_sqft','livable_sqft','num_bedrooms','num_bathrooms']]

# Split sets into train and test
train  = year_df.loc[ (house_df.year_built.notnull()) ]# known year_built values
test = year_df.loc[ (house_df.year_built.isnull()) ]# null year_built values


print(train.shape)
print(test.shape)

# All year_built values are stored in a target array
y = train.values[:, 0]

# All the other values are stored in the feature array
X = train.values[:, 1::]


# Create and fit a model
# rtr = RandomForestRegressor(max_depth=5, n_estimators=2000, n_jobs=-1)
rtr = RandomForestClassifier(
        n_estimators=1000,
        max_depth=6,
        min_samples_leaf=9,
        max_features=0.1,
        n_jobs=-1,
        random_state=0
)
rtr.fit(X, y)

# Use the fitted model to predict the missing values
predictedYearBuilt = rtr.predict(test.values[:, 1::])

# Assign those predictions to the full data set
house_df.loc[ (house_df.year_built.isnull()), 'year_built' ] = predictedYearBuilt

(2270, 5)
(3392, 5)


In [65]:
# lets check again the missing values count for each column
house_df.isnull().sum().sort_values()

sale_price             0
num_of_bhk             0
total_sqft             0
livable_sqft           0
num_bedrooms           0
num_bathrooms          0
flooring_type          0
parking                0
property_on            0
year_built             0
location               0
flat_condition        18
furnishing_state    1751
dtype: int64

In [66]:
house_df.iloc[-1]

sale_price                    24 Lakhs
num_of_bhk                      0.5BHK
total_sqft                         369
livable_sqft                       258
num_bedrooms                         1
num_bathrooms                        1
flat_condition                New Sale
flooring_type                Vetrified
parking                            Yes
furnishing_state         Not furnished
property_on                  9th floor
year_built          Under Construction
location                Ambernath West
Name: 6033, dtype: object

In [67]:
house_df.loc[6027]

sale_price                          56 Lakhs
num_of_bhk                              2BHK
total_sqft                               750
livable_sqft                             525
num_bedrooms                               2
num_bathrooms                              2
flat_condition                      New Sale
flooring_type                Vitrified Tiles
parking                                   No
furnishing_state    Semi-Furnished furnished
property_on                     Ground floor
year_built                Under Construction
location                           Mira Road
Name: 6027, dtype: object

In [68]:
# lets check again the missing values count for each column
house_df.isnull().sum().sort_values()

sale_price             0
num_of_bhk             0
total_sqft             0
livable_sqft           0
num_bedrooms           0
num_bathrooms          0
flooring_type          0
parking                0
property_on            0
year_built             0
location               0
flat_condition        18
furnishing_state    1751
dtype: int64

In [69]:
# house_df[house_df['furnishing_state'].isnull()].tail()
house_df.loc[(house_df['year_built'] == '2005') & (house_df['furnishing_state'].isnull())].head()

Unnamed: 0,sale_price,num_of_bhk,total_sqft,livable_sqft,num_bedrooms,num_bathrooms,flat_condition,flooring_type,parking,furnishing_state,property_on,year_built,location
431,2.6 Crores,2BHK,1000.0,750.0,2.0,2.0,Resale,Marble Flooring,Yes,,5th floor,2005,Andheri West


In [70]:
# lets deal with missing values in furnishing state
# I will use the following logic
# if year_built is greater than 2015 - not furnished, and for rest we will use most frequent from 'fully furnished' & 'semi furnished'


def filling_missing_furnishing_state(curr_row):
    if curr_row['year_built'] in ['2016','2017','2018','Under Construction']:
        return 'Not furnished'
    elif int(curr_row['year_built']) > 2008:
        return 'semi furnished'
    else:
        return 'fully furnished'
    
house_df.loc[house_df['furnishing_state'].isnull(),'furnishing_state'] = house_df[house_df['furnishing_state'].isnull()].apply(filling_missing_furnishing_state, axis=1)

In [71]:
house_df.loc[431]

sale_price               2.6 Crores
num_of_bhk                     2BHK
total_sqft                     1000
livable_sqft                    750
num_bedrooms                      2
num_bathrooms                     2
flat_condition               Resale
flooring_type       Marble Flooring
parking                         Yes
furnishing_state    fully furnished
property_on               5th floor
year_built                     2005
location               Andheri West
Name: 431, dtype: object

In [72]:
# lets check again the missing values count for each column
house_df.isnull().sum().sort_values()

sale_price           0
num_of_bhk           0
total_sqft           0
livable_sqft         0
num_bedrooms         0
num_bathrooms        0
flooring_type        0
parking              0
furnishing_state     0
property_on          0
year_built           0
location             0
flat_condition      18
dtype: int64

In [73]:
# lets deal with missing values in flat_condition
# I will use the following logic
# if year_built is greater than 2015 or 'Under Construction' then its a new sale else its resale.

house_df[house_df['flat_condition'].isnull()].head()

Unnamed: 0,sale_price,num_of_bhk,total_sqft,livable_sqft,num_bedrooms,num_bathrooms,flat_condition,flooring_type,parking,furnishing_state,property_on,year_built,location
2077,66 Lakhs,1BHK,265.0,212.0,1.0,1.0,,Vitrified Tiles,No,Unfurnished furnished,Ground floor,Under Construction,Ghatkopar West
2078,2.25 Crores,2BHK,1000.0,800.0,2.0,2.0,,Vitrified Tiles,No,Unfurnished furnished,Ground floor,Under Construction,Bandra East
2079,1 Lakh,1BHK,500.0,400.0,1.0,1.0,,Vitrified Tiles,No,Unfurnished furnished,Ground floor,2008,Andheri West
2080,37.5 Lakhs,2BHK,850.0,680.0,2.0,2.0,,Vitrified Tiles,No,Fully Furnished furnished,Ground floor,2014,Bhayandar East
2081,2 Crores,3BHK,996.0,797.0,3.0,3.0,,Vitrified Tiles,No,Unfurnished furnished,Ground floor,Under Construction,Ghatkopar West


In [74]:
def filling_missing_flat_condition(curr_row):
    if curr_row['year_built'] in ['2016','2017','2018','Under Construction']:
        return 'New Sale'
    else:
        return 'Resale'
    
house_df.loc[house_df['flat_condition'].isnull(),'flat_condition'] = house_df[house_df['flat_condition'].isnull()].apply(filling_missing_flat_condition, axis=1)

In [75]:
house_df.loc[2080]

sale_price                         37.5 Lakhs
num_of_bhk                               2BHK
total_sqft                                850
livable_sqft                              680
num_bedrooms                                2
num_bathrooms                               2
flat_condition                         Resale
flooring_type                 Vitrified Tiles
parking                                    No
furnishing_state    Fully Furnished furnished
property_on                      Ground floor
year_built                               2014
location                       Bhayandar East
Name: 2080, dtype: object

In [76]:
# lets check again the missing values count for each column
house_df.isnull().sum().sort_values()

sale_price          0
num_of_bhk          0
total_sqft          0
livable_sqft        0
num_bedrooms        0
num_bathrooms       0
flat_condition      0
flooring_type       0
parking             0
furnishing_state    0
property_on         0
year_built          0
location            0
dtype: int64

In [77]:
# Completed missing value section
# Lets perform Feature Engineering & Converting Features

In [78]:
# convert sale price to numerical data
# house_df[~house_df['sale_price'].astype(str).str.isdigit()].head()

# but before that I have observed there is a row which has irrevelant sale_price value, so lets remove that remove from the dataset

def filter_sale_price(curr_row):
    curr_sale_price_value = curr_row['sale_price'].lower()
    
#     print(curr_sale_price_value)
#     quit()
    
    if any(x in curr_sale_price_value for x in ['crores','crore','lakhs','lakh']):
        return False
    else:
        return True

    
house_df['sale_filter'] = house_df.apply(filter_sale_price,axis=1)

In [79]:
# house_df.head()
# house_df.loc[house_df['sale_filter'] == True].head()
house_df.drop(house_df.loc[house_df['sale_filter'] == True].index, inplace=True)

In [80]:
# drop the temp sale_filter column
house_df.drop('sale_filter', inplace=True, axis=1)

In [81]:
# convert sale price to numerical data
def convert_sale_price_numerical(curr_sale_price_value):
    curr_sale_price_value = curr_sale_price_value.lower()
    sale_price_split = curr_sale_price_value.split(" ")
    
    if any(x in curr_sale_price_value for x in ['lakhs','lakh']):
        new_sale_price = int(float(sale_price_split[0]) * 100000)

    elif any(x in curr_sale_price_value for x in ['crores','crore']):
        new_sale_price = int(float(sale_price_split[0]) * 10000000)
        
        
    return new_sale_price
   

In [82]:
house_df['sale_price_num'] = house_df['sale_price'].apply(convert_sale_price_numerical)

# min_house_df = house_df[0:5]
# min_house_df.head()
# min_house_df.loc[min_house_df['sale_price'], 'sale_price_num'] = min_house_df['sale_price'].apply(convert_sale_price_numerical)
# output = min_house_df['sale_price'].apply(convert_sale_price_numerical)

In [83]:
# house_df.head()
# lets drop the sale_price column as it is not needed
house_df.drop('sale_price', inplace=True, axis=1)

In [84]:
house_df.head()

Unnamed: 0,num_of_bhk,total_sqft,livable_sqft,num_bedrooms,num_bathrooms,flat_condition,flooring_type,parking,furnishing_state,property_on,year_built,location,sale_price_num
0,1BHK,600.0,420.0,1.0,2.0,New Sale,Vitrified Tiles,No,Not furnished,Ground floor,Under Construction,Borivali East,11000000
1,2BHK,955.0,620.0,2.0,2.0,Resale,Vitrified Tiles,No,Not furnished,Ground floor,Under Construction,Mira Road,7500000
2,1BHK,489.0,342.0,1.0,2.0,New Sale,Vitrified Tiles,No,Not furnished,Ground floor,Under Construction,Dahisar East,7300000
3,3BHK,1330.0,950.0,3.0,3.0,New Sale,Marble Flooring,Yes,semi furnished,5th floor,Under Construction,Andheri West,27400000
4,1BHK,575.0,375.0,1.0,2.0,Resale,Vitrified Tiles,No,Not furnished,Ground floor,Under Construction,Kandivali West,8000000


In [85]:
# Converting num_of_bhk feature

bhk_mapping = {"4BHK":4, "4+BHK":4, "3BHK":3, "2BHK":2, "1BHK":1, "0.5BHK":1}

# convert num_of_bhk into numbers
house_df['num_of_bhk'] = house_df['num_of_bhk'].map(bhk_mapping)

In [86]:
# Converting flat_condition feature
flat_condition_mapping = {"New Sale":1, "Resale":0}

# convert num_of_bhk into numbers
house_df['flat_condition'] = house_df['flat_condition'].map(flat_condition_mapping)

In [87]:
# converting parking feature
parking_mapping = {"No": 0, "Yes": 1}
house_df['parking'] = house_df['parking'].map(parking_mapping)

In [88]:
house_df.head()

Unnamed: 0,num_of_bhk,total_sqft,livable_sqft,num_bedrooms,num_bathrooms,flat_condition,flooring_type,parking,furnishing_state,property_on,year_built,location,sale_price_num
0,1,600.0,420.0,1.0,2.0,1,Vitrified Tiles,0,Not furnished,Ground floor,Under Construction,Borivali East,11000000
1,2,955.0,620.0,2.0,2.0,0,Vitrified Tiles,0,Not furnished,Ground floor,Under Construction,Mira Road,7500000
2,1,489.0,342.0,1.0,2.0,1,Vitrified Tiles,0,Not furnished,Ground floor,Under Construction,Dahisar East,7300000
3,3,1330.0,950.0,3.0,3.0,1,Marble Flooring,1,semi furnished,5th floor,Under Construction,Andheri West,27400000
4,1,575.0,375.0,1.0,2.0,0,Vitrified Tiles,0,Not furnished,Ground floor,Under Construction,Kandivali West,8000000


In [89]:
# convert flooring_type feature
# house_df['flooring_type'].unique()

mapping_data = {'Vitrified Tiles', 'Marble Flooring', 'Ceramic', 'Marbonite',
       'Wooden', 'Granite', 'Normal Tiles', 'Vetrified', 'Mosaic Tiles'}

In [90]:
# convert furnishing_state feature
# for this feature there are certain values which correspond to the same meaning.
# e.g. 'Unfurnished furnished' and 'not furnished' means the same, so I will classify both as 'not furnished'
# similar I will do that for other values

house_df['furnishing_state'] = house_df['furnishing_state'].replace(['Fully furnished','fully furnished','Fully Furnished furnished'], 'fully furnished')

house_df['furnishing_state'] = house_df['furnishing_state'].replace(['Unfurnished furnished','not furnished','Not furnished'], 'not furnished')

house_df['furnishing_state'] = house_df['furnishing_state'].replace(['semi furnished','Semi-Furnished furnished','Semi furnished'], 'semi furnished')

# house_df['furnishing_state'].unique()

In [91]:
# convert property_on feature
# for property_on feature will extract the numeric part of the value and change Ground Floor to 0 as it 
# better denotes the data.


house_df['property_on'] = house_df['property_on'].replace('Ground floor',0)

In [92]:
import re
def convert_floor_position(curr_value):
    return re.search(r'\d+', str(curr_value)).group()

house_df['property_on_num'] = house_df['property_on'].apply(convert_floor_position)

# drop property_on column
house_df.drop('property_on', inplace=True, axis=1)

# convert the data type of the column from object to int
house_df['property_on_num'] = house_df['property_on_num'].astype(np.int64)

In [93]:
house_df['property_on_num'].unique()
# house_df.head()

Unnamed: 0,num_of_bhk,total_sqft,livable_sqft,num_bedrooms,num_bathrooms,flat_condition,flooring_type,parking,furnishing_state,year_built,location,sale_price_num,property_on_num
0,1,600.0,420.0,1.0,2.0,1,Vitrified Tiles,0,not furnished,Under Construction,Borivali East,11000000,0
1,2,955.0,620.0,2.0,2.0,0,Vitrified Tiles,0,not furnished,Under Construction,Mira Road,7500000,0
2,1,489.0,342.0,1.0,2.0,1,Vitrified Tiles,0,not furnished,Under Construction,Dahisar East,7300000,0
3,3,1330.0,950.0,3.0,3.0,1,Marble Flooring,1,semi furnished,Under Construction,Andheri West,27400000,5
4,1,575.0,375.0,1.0,2.0,0,Vitrified Tiles,0,not furnished,Under Construction,Kandivali West,8000000,0


In [94]:
house_df['furnishing_state'].unique()

array(['not furnished', 'semi furnished', 'fully furnished'], dtype=object)

In [95]:
house_df['furnishing_state'].value_counts()

not furnished      3786
semi furnished     1455
fully furnished     420
Name: furnishing_state, dtype: int64

In [96]:
# lets assign numerical values to furnishing_state feature as
# 0 - not furnished, 1 - semi furnished, 2 - fully furnished
mapping_data = {"furnishing_state": {"not furnished": 0,"semi furnished": 1,"fully furnished": 2}}

house_df.replace(mapping_data, inplace=True)
house_df.head()

Unnamed: 0,num_of_bhk,total_sqft,livable_sqft,num_bedrooms,num_bathrooms,flat_condition,flooring_type,parking,furnishing_state,year_built,location,sale_price_num,property_on_num
0,1,600.0,420.0,1.0,2.0,1,Vitrified Tiles,0,0,Under Construction,Borivali East,11000000,0
1,2,955.0,620.0,2.0,2.0,0,Vitrified Tiles,0,0,Under Construction,Mira Road,7500000,0
2,1,489.0,342.0,1.0,2.0,1,Vitrified Tiles,0,0,Under Construction,Dahisar East,7300000,0
3,3,1330.0,950.0,3.0,3.0,1,Marble Flooring,1,1,Under Construction,Andheri West,27400000,5
4,1,575.0,375.0,1.0,2.0,0,Vitrified Tiles,0,0,Under Construction,Kandivali West,8000000,0


In [97]:
# lets deal with location feature
len(house_df['location'].unique())

# We need to apply label encoding to location feature.

258

In [98]:
# columns that will be label encoded
# flooring_type, year_built, location

In [99]:
from sklearn.preprocessing import LabelEncoder

feature_set = ['flooring_type','year_built','location'] # production code

# feature_set = ['flooring_type'] # testing code

for curr_feature in feature_set:
    # Create a new label (category) encoder object for each feature
    le_obj = LabelEncoder()
    
    # Fit the encoder to the pandas column
    le_obj.fit(house_df[curr_feature])
    
    # Transform Categories Into Integers
    # Apply the fitted encoder to the pandas column
    house_df[curr_feature] = le_obj.transform(house_df[curr_feature])
       
#     print("Curr Feature Mapping: ", curr_feature)
#     le_feature_mapping = dict(zip(le_obj.classes_, le_obj.transform(le_obj.classes_)))
#     print(le_feature_mapping)  
    

house_df.head()

Unnamed: 0,num_of_bhk,total_sqft,livable_sqft,num_bedrooms,num_bathrooms,flat_condition,flooring_type,parking,furnishing_state,year_built,location,sale_price_num,property_on_num
0,1,600.0,420.0,1.0,2.0,1,7,0,0,32,38,11000000,0
1,2,955.0,620.0,2.0,2.0,0,7,0,0,32,153,7500000,0
2,1,489.0,342.0,1.0,2.0,1,7,0,0,32,60,7300000,0
3,3,1330.0,950.0,3.0,3.0,1,2,1,1,32,12,27400000,5
4,1,575.0,375.0,1.0,2.0,0,7,0,0,32,106,8000000,0


In [100]:
house_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5661 entries, 0 to 6033
Data columns (total 13 columns):
num_of_bhk          5661 non-null int64
total_sqft          5661 non-null float64
livable_sqft        5661 non-null float64
num_bedrooms        5661 non-null float64
num_bathrooms       5661 non-null float64
flat_condition      5661 non-null int64
flooring_type       5661 non-null int64
parking             5661 non-null int64
furnishing_state    5661 non-null int64
year_built          5661 non-null int64
location            5661 non-null int64
sale_price_num      5661 non-null int64
property_on_num     5661 non-null int64
dtypes: float64(4), int64(9)
memory usage: 619.2 KB


In [101]:
house_df['property_on_num'].unique()

array([ 0,  5,  8, 11,  6,  9,  3,  2,  7, 12, 13, 14,  4, 18, 10, 35, 16,
        1, 15, 22, 21, 20, 23, 25, 30, 34, 19, 17, 37, 39, 32, 26, 29, 36])

In [102]:
# change the year_built feature, as user can provide any year between

In [103]:
# completed data preparation part, lets start with building model for prediction

In [104]:
# Training with different models
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor


# define scoring method
scoring = 'accuracy'

# Define models to train
names = ["Gradient Boosting", "Linear Regressor","AdaBoost","Random Forest","Nearest Neighbors"]
# names = ["Gradient Boosting"]


classifiers = [
    GradientBoostingRegressor(n_estimators=1000, learning_rate=0.1, max_depth=6, min_samples_leaf=9, max_features=0.1, loss='huber'),
    LinearRegression(normalize=True),
    AdaBoostRegressor(),
    RandomForestRegressor(n_jobs=-1, n_estimators=50, verbose=3),
    KNeighborsRegressor(n_neighbors = 5),
]

models = dict(zip(names, classifiers))

In [105]:
# splitting input data
X_train = house_df.drop('sale_price_num', axis=1)
y_train = house_df['sale_price_num']

In [106]:
# Lets determine the accuracy score of each classifier
models_accuracy_score = []

for name in models:
    model = models[name]
    model.fit(X_train, y_train)
    predictions = model.predict(X_train)
    models_accuracy_score.append((name, round(model.score(X_train, y_train) * 100, 2)))

building tree 1 of 50
building tree 2 of 50
building tree 3 of 50
building tree 4 of 50building tree 5 of 50
building tree 6 of 50

building tree 7 of 50
building tree 8 of 50
building tree 9 of 50building tree 10 of 50

building tree 11 of 50
building tree 12 of 50
building tree 13 of 50
building tree 14 of 50
building tree 15 of 50
building tree 16 of 50
building tree 17 of 50
building tree 18 of 50
building tree 19 of 50
building tree 20 of 50building tree 21 of 50

building tree 22 of 50
building tree 23 of 50
building tree 24 of 50building tree 25 of 50

building tree 26 of 50
building tree 27 of 50
building tree 28 of 50
building tree 29 of 50
building tree 30 of 50
building tree 31 of 50
building tree 32 of 50
building tree 33 of 50
building tree 34 of 50
building tree 35 of 50
building tree 36 of 50
building tree 37 of 50
building tree 38 of 50
building tree 39 of 50
building tree 40 of 50
building tree 41 of 50
building tree 42 of 50
building tree 43 of 50
building tree 44 of 

[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.2s finished
[Parallel(n_jobs=8)]: Done  16 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    0.0s finished
[Parallel(n_jobs=8)]: Done  16 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    0.0s finished


In [107]:
results = pd.DataFrame({
    'Model': names,
    'Score': [curr_model_score[1] for curr_model_score in models_accuracy_score]})
result_df = results.sort_values(by='Score', ascending=False)
result_df = result_df.set_index('Score')
result_df

Unnamed: 0_level_0,Model
Score,Unnamed: 1_level_1
95.67,Random Forest
87.11,Gradient Boosting
74.45,Nearest Neighbors
58.01,Linear Regressor
-18.86,AdaBoost


In [108]:
# Find score for each model by performing K-fold validation

# import K-fold class
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# define seed for reproducibility
seed = 1


models_kfold_accuracy_score = []

for name in models:    
    kfold = KFold(n_splits=10, random_state = seed)
    model = models[name]
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold)
    models_kfold_accuracy_score.append((name, round(np.mean(cv_results),2)))
    
    print("Model Name: ", name)
    print("Scores:", cv_results)
    print("Mean:", cv_results.mean())
    print("Standard Deviation:", cv_results.std())

Model Name:  Gradient Boosting
Scores: [0.79021307 0.75085967 0.59909575 0.37444551 0.67603392 0.7647479
 0.64581582 0.88276711 0.81032573 0.6798629 ]
Mean: 0.6974167378462184
Standard Deviation: 0.13435380166969743
Model Name:  Linear Regressor
Scores: [ 0.65812407  0.65558105  0.44270594  0.30070885  0.42929109  0.58945451
  0.37657462 -0.04353176  0.68184737 -3.34368163]
Mean: 0.07470741008476334
Standard Deviation: 1.1581921174877317
Model Name:  AdaBoost
Scores: [ 0.68610675  0.51662074  0.03958072  0.29630026 -1.3175638  -0.49983534
 -0.90880919 -9.45323089  0.54483512 -0.64799902]
Mean: -1.0743994654621396
Standard Deviation: 2.866630846450356
building tree 1 of 50
building tree 2 of 50
building tree 3 of 50
building tree 4 of 50
building tree 5 of 50
building tree 6 of 50
building tree 7 of 50building tree 8 of 50

building tree 9 of 50
building tree 10 of 50
building tree 11 of 50
building tree 12 of 50
building tree 13 of 50
building tree 14 of 50
building tree 15 of 50
build

[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.1s finished
[Parallel(n_jobs=8)]: Done  16 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    0.0s finished


building tree 1 of 50
building tree 2 of 50
building tree 3 of 50
building tree 4 of 50
building tree 5 of 50
building tree 6 of 50
building tree 7 of 50
building tree 8 of 50
building tree 9 of 50
building tree 10 of 50
building tree 11 of 50building tree 12 of 50

building tree 13 of 50
building tree 14 of 50building tree 15 of 50

building tree 16 of 50
building tree 17 of 50
building tree 18 of 50
building tree 19 of 50
building tree 20 of 50
building tree 21 of 50
building tree 22 of 50
building tree 23 of 50
building tree 24 of 50
building tree 25 of 50building tree 26 of 50
building tree 27 of 50

building tree 28 of 50
building tree 29 of 50building tree 30 of 50

building tree 31 of 50
building tree 32 of 50
building tree 33 of 50
building tree 34 of 50
building tree 35 of 50
building tree 36 of 50
building tree 37 of 50
building tree 38 of 50
building tree 39 of 50
building tree 40 of 50
building tree 41 of 50
building tree 42 of 50
building tree 43 of 50
building tree 44 of 

[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.1s finished
[Parallel(n_jobs=8)]: Done  16 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    0.0s finished


building tree 1 of 50
building tree 2 of 50
building tree 3 of 50
building tree 4 of 50
building tree 5 of 50
building tree 6 of 50
building tree 7 of 50
building tree 8 of 50
building tree 9 of 50
building tree 10 of 50
building tree 11 of 50
building tree 12 of 50
building tree 13 of 50
building tree 14 of 50
building tree 15 of 50
building tree 16 of 50
building tree 17 of 50
building tree 18 of 50
building tree 19 of 50building tree 20 of 50

building tree 21 of 50
building tree 22 of 50
building tree 23 of 50
building tree 24 of 50
building tree 25 of 50
building tree 26 of 50building tree 27 of 50

building tree 28 of 50
building tree 29 of 50
building tree 30 of 50
building tree 31 of 50
building tree 32 of 50
building tree 33 of 50
building tree 34 of 50building tree 35 of 50

building tree 36 of 50
building tree 37 of 50
building tree 38 of 50building tree 39 of 50

building tree 40 of 50
building tree 41 of 50
building tree 42 of 50
building tree 43 of 50
building tree 44 of 

[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.1s finished
[Parallel(n_jobs=8)]: Done  16 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    0.0s finished


building tree 1 of 50
building tree 2 of 50
building tree 3 of 50
building tree 4 of 50
building tree 5 of 50
building tree 6 of 50
building tree 7 of 50
building tree 8 of 50
building tree 9 of 50
building tree 10 of 50building tree 11 of 50

building tree 12 of 50
building tree 13 of 50
building tree 14 of 50
building tree 15 of 50
building tree 16 of 50
building tree 17 of 50
building tree 18 of 50
building tree 19 of 50
building tree 20 of 50
building tree 21 of 50
building tree 22 of 50
building tree 23 of 50
building tree 24 of 50
building tree 25 of 50
building tree 26 of 50
building tree 27 of 50
building tree 28 of 50
building tree 29 of 50
building tree 30 of 50
building tree 31 of 50
building tree 32 of 50
building tree 33 of 50
building tree 34 of 50
building tree 35 of 50
building tree 36 of 50
building tree 37 of 50
building tree 38 of 50
building tree 39 of 50
building tree 40 of 50
building tree 41 of 50building tree 42 of 50

building tree 43 of 50
building tree 44 of 

[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.1s finished
[Parallel(n_jobs=8)]: Done  16 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    0.0s finished


building tree 1 of 50
building tree 2 of 50
building tree 3 of 50
building tree 4 of 50
building tree 5 of 50
building tree 6 of 50
building tree 7 of 50
building tree 8 of 50
building tree 9 of 50
building tree 10 of 50
building tree 11 of 50
building tree 12 of 50
building tree 13 of 50
building tree 14 of 50
building tree 15 of 50
building tree 16 of 50
building tree 17 of 50
building tree 18 of 50
building tree 19 of 50
building tree 20 of 50building tree 21 of 50

building tree 22 of 50
building tree 23 of 50
building tree 24 of 50
building tree 25 of 50
building tree 26 of 50
building tree 27 of 50
building tree 28 of 50
building tree 29 of 50building tree 30 of 50

building tree 31 of 50
building tree 32 of 50
building tree 33 of 50
building tree 34 of 50
building tree 35 of 50
building tree 36 of 50
building tree 37 of 50
building tree 38 of 50
building tree 39 of 50
building tree 40 of 50
building tree 41 of 50
building tree 42 of 50
building tree 43 of 50
building tree 44 of 

[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.1s finished
[Parallel(n_jobs=8)]: Done  16 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    0.0s finished


building tree 1 of 50
building tree 2 of 50
building tree 3 of 50
building tree 4 of 50
building tree 5 of 50
building tree 6 of 50
building tree 7 of 50
building tree 8 of 50
building tree 9 of 50
building tree 10 of 50
building tree 11 of 50
building tree 12 of 50
building tree 13 of 50
building tree 14 of 50
building tree 15 of 50
building tree 16 of 50
building tree 17 of 50
building tree 18 of 50
building tree 19 of 50
building tree 20 of 50
building tree 21 of 50
building tree 22 of 50
building tree 23 of 50
building tree 24 of 50
building tree 25 of 50
building tree 26 of 50building tree 27 of 50

building tree 28 of 50
building tree 29 of 50
building tree 30 of 50
building tree 31 of 50
building tree 32 of 50
building tree 33 of 50
building tree 34 of 50
building tree 35 of 50
building tree 36 of 50building tree 37 of 50

building tree 38 of 50
building tree 39 of 50
building tree 40 of 50
building tree 41 of 50
building tree 42 of 50
building tree 43 of 50
building tree 44 of 

[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.1s finished
[Parallel(n_jobs=8)]: Done  16 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    0.0s finished


building tree 1 of 50
building tree 2 of 50
building tree 3 of 50
building tree 4 of 50
building tree 5 of 50
building tree 6 of 50
building tree 7 of 50
building tree 8 of 50
building tree 9 of 50
building tree 10 of 50
building tree 11 of 50
building tree 12 of 50building tree 13 of 50

building tree 14 of 50
building tree 15 of 50
building tree 16 of 50
building tree 17 of 50
building tree 18 of 50
building tree 19 of 50
building tree 20 of 50building tree 21 of 50

building tree 22 of 50
building tree 23 of 50
building tree 24 of 50
building tree 25 of 50
building tree 26 of 50
building tree 27 of 50
building tree 28 of 50
building tree 29 of 50
building tree 30 of 50
building tree 31 of 50
building tree 32 of 50
building tree 33 of 50
building tree 34 of 50
building tree 35 of 50
building tree 36 of 50
building tree 37 of 50
building tree 38 of 50
building tree 39 of 50
building tree 40 of 50
building tree 41 of 50
building tree 42 of 50
building tree 43 of 50
building tree 44 of 

[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.1s finished
[Parallel(n_jobs=8)]: Done  16 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    0.0s finished


building tree 1 of 50
building tree 2 of 50
building tree 3 of 50
building tree 4 of 50
building tree 5 of 50
building tree 6 of 50
building tree 7 of 50
building tree 8 of 50
building tree 9 of 50
building tree 10 of 50
building tree 11 of 50
building tree 12 of 50
building tree 13 of 50
building tree 14 of 50
building tree 15 of 50building tree 16 of 50

building tree 17 of 50
building tree 18 of 50
building tree 19 of 50building tree 20 of 50building tree 21 of 50


building tree 22 of 50
building tree 23 of 50
building tree 24 of 50
building tree 25 of 50
building tree 26 of 50
building tree 27 of 50
building tree 28 of 50
building tree 29 of 50
building tree 30 of 50
building tree 31 of 50
building tree 32 of 50
building tree 33 of 50
building tree 34 of 50
building tree 35 of 50
building tree 36 of 50
building tree 37 of 50
building tree 38 of 50
building tree 39 of 50
building tree 40 of 50
building tree 41 of 50
building tree 42 of 50building tree 43 of 50

building tree 44 of 

[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.1s finished
[Parallel(n_jobs=8)]: Done  16 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    0.0s finished


building tree 1 of 50
building tree 2 of 50
building tree 3 of 50
building tree 4 of 50
building tree 5 of 50
building tree 6 of 50
building tree 7 of 50
building tree 8 of 50
building tree 9 of 50
building tree 10 of 50
building tree 11 of 50
building tree 12 of 50building tree 13 of 50

building tree 14 of 50
building tree 15 of 50
building tree 16 of 50building tree 17 of 50

building tree 18 of 50
building tree 19 of 50
building tree 20 of 50
building tree 21 of 50
building tree 22 of 50
building tree 23 of 50
building tree 24 of 50
building tree 25 of 50
building tree 26 of 50
building tree 27 of 50
building tree 28 of 50
building tree 29 of 50
building tree 30 of 50
building tree 31 of 50
building tree 32 of 50
building tree 33 of 50
building tree 34 of 50
building tree 35 of 50
building tree 36 of 50
building tree 37 of 50
building tree 38 of 50
building tree 39 of 50
building tree 40 of 50
building tree 41 of 50
building tree 42 of 50
building tree 43 of 50
building tree 44 of 

[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.1s finished
[Parallel(n_jobs=8)]: Done  16 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    0.0s finished


building tree 1 of 50
building tree 2 of 50
building tree 3 of 50
building tree 4 of 50
building tree 5 of 50
building tree 6 of 50
building tree 7 of 50
building tree 8 of 50
building tree 9 of 50
building tree 10 of 50
building tree 11 of 50
building tree 12 of 50
building tree 13 of 50
building tree 14 of 50
building tree 15 of 50
building tree 16 of 50
building tree 17 of 50building tree 18 of 50

building tree 19 of 50
building tree 20 of 50
building tree 21 of 50
building tree 22 of 50
building tree 23 of 50building tree 24 of 50

building tree 25 of 50
building tree 26 of 50
building tree 27 of 50
building tree 28 of 50
building tree 29 of 50
building tree 30 of 50
building tree 31 of 50
building tree 32 of 50
building tree 33 of 50
building tree 34 of 50building tree 35 of 50

building tree 36 of 50
building tree 37 of 50
building tree 38 of 50
building tree 39 of 50
building tree 40 of 50
building tree 41 of 50
building tree 42 of 50
building tree 43 of 50
building tree 44 of 

[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.1s finished
[Parallel(n_jobs=8)]: Done  16 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    0.0s finished


Model Name:  Random Forest
Scores: [0.81596013 0.76857252 0.62921791 0.3560823  0.63390507 0.75186047
 0.33927374 0.88921521 0.80272674 0.44927342]
Mean: 0.6436087501203474
Standard Deviation: 0.1886900067210211
Model Name:  Nearest Neighbors
Scores: [0.75808079 0.66316357 0.56058182 0.26845626 0.37542503 0.44370227
 0.16197322 0.70913979 0.74786242 0.15881642]
Mean: 0.4847201591048148
Standard Deviation: 0.2242114845560714


In [109]:
results = pd.DataFrame({
    'Model': names,
    'Score': [curr_model_score[1] for curr_model_score in models_kfold_accuracy_score]})
result_df = results.sort_values(by='Score', ascending=False)
result_df = result_df.set_index('Score')
result_df

Unnamed: 0_level_0,Model
Score,Unnamed: 1_level_1
0.7,Gradient Boosting
0.64,Random Forest
0.48,Nearest Neighbors
0.07,Linear Regressor
-1.07,AdaBoost


In [110]:
# From above result we will choose Gradient Boosting Algo for our modeling as we can see it topped 2nd in prediction and top in kfold validation.

In [111]:
# now lets find optimal parameters for our Gradient Boosting Algo

# Applying Grid Search to find the best parameters
# from sklearn.model_selection import GridSearchCV

# model = GradientBoostingRegressor()

# parameters = {
#     'n_estimators': [500, 1000, 3000],
#     'max_depth': [4, 6],
#     'min_samples_leaf': [3, 5, 9, 17],
#     'learning_rate': [0.1, 0.05, 0.02, 0.01],
#     'max_features': [1.0, 0.3, 0.1],
#     'loss': ['ls', 'lad', 'huber']
# }

# grid_search = GridSearchCV(estimator = model,
#                            param_grid = parameters,
#                            cv = 10,
#                            n_jobs = -1)

# grid_search = grid_search.fit(X_train, y_train)
# best_accuracy = grid_search.best_score_
# best_parameters = grid_search.best_params_


# print("Accuracy: ", round(best_accuracy * 100, 2))
# print("Best Params: ", best_parameters)

In [112]:
# Grid Search CV result 
''' Accuracy:  75.28
Best Params:  {'learning_rate': 0.05, 'loss': 'huber', 'max_depth': 6, 'max_features': 1.0, 'min_samples_leaf': 5, 'n_estimators': 3000}
'''

" Accuracy:  75.28\nBest Params:  {'learning_rate': 0.05, 'loss': 'huber', 'max_depth': 6, 'max_features': 1.0, 'min_samples_leaf': 5, 'n_estimators': 3000}\n"

In [113]:
# Let us test the GradientBoosting regressor again with the new optimal parameters
model = GradientBoostingRegressor(
                n_estimators=3000,
                learning_rate=0.05,
                max_depth=6,
                min_samples_leaf=5,
                max_features=0.1,
                loss='huber'   
            )
model.fit(X_train, y_train)
y_train_prediction = model.predict(X_train)
accuracy = model.score(X_train, y_train)

print("Model accuracy after tuning parameters: ", round(accuracy * 100, 2))

# Compute error between our train predictions and the actual values.
# Import the scikit-learn function to compute error.
from sklearn.metrics import mean_squared_error
print("Mean Squared Error: " , mean_squared_error(y_train_prediction, y_train) * 100, 2)

Model accuracy after tuning parameters:  90.02
Mean Squared Error:  4218518229974479.0 2


In [114]:
print(y_train[0:5])
print(y_train_prediction[0:5])

0    11000000
1     7500000
2     7300000
3    27400000
4     8000000
Name: sale_price_num, dtype: int64
[ 9398559.99437121  8025475.59926871  6272638.9378832  26796259.47474248
  8682123.77434929]


In [115]:
# Save the trained model to a file so we can use it in other programs
from sklearn.externals import joblib
joblib.dump(model, 'trained_house_prediction_model.pkl')

['trained_house_prediction_model.pkl']

In [116]:
def format_house_value(value):
    # first calculate the length of the house
    value_len = len(str(value))
    
    new_value = 0
    
    if value_len > 7: 
        new_value = str(round(value / 10000000, 2)) + " Crores"
    elif value_len == 7:
        new_value = str(round(value / 1000000, 2)) + " Lakhs"
    elif value_len == 6:
        new_value = str(round(value / 100000, 2)) + " Lakh"
    
    return new_value

In [148]:
# lets predict for your new houses

house_value = [
  2, # num_of_bhk
  1780, # total_sqft
  1112, # livable_sqft
  2, # num_bedrooms
  1,  # num_bathrooms
  0, # flat_condition
  2, # flooring_type
  0, # parking
  1, # furnishing_state
  20, # year_built
  125, # location
  3, # property_on_num 
]

# scikit-learn assumes you want to predict the values for lots of houses at once, so it expects an array.
# We just want to look at a single house, so it will be the only item in our array.
homes_to_value = [
    house_value
]

# multiple houses to predict

# new_house = house_df.loc[5]
# new_house = new_house[0:-1]

# print("new house")
# print(new_house)

# homes_to_value = [
#     new_house
# ]

# Load the model we trained previously
model = joblib.load('trained_house_prediction_model.pkl')

# Run the model and make a prediction for each house in the homes_to_value array
predicted_home_values = model.predict(homes_to_value)

In [149]:
# Since we are only predicting the price of one house, just look at the first prediction returned
predicted_value = int(round(predicted_home_values[0]))

print("This house has an estimated value of {}".format(format_house_value(predicted_value)))

This house has an estimated value of 2.18 Crores


In [119]:
# Visualizing Data.