Importing Libraries

In [1]:
import pandas as pd  
import numpy as np 
from matplotlib import pyplot as pyplot
%matplotlib inline
import matplotlib
matplotlib.rcParams["figure.figsize"] = (20, 10)

import warnings

# Ignore all UserWarnings
warnings.simplefilter("ignore", UserWarning)


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
data = pd.read_csv("Bengaluru_House_Data.csv")
data.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [3]:
data.shape

(13320, 9)

In [4]:
data.isnull().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


In [6]:
data['area_type'].value_counts()

area_type
Super built-up  Area    8790
Built-up  Area          2418
Plot  Area              2025
Carpet  Area              87
Name: count, dtype: int64

Drop the society columns because the amount of missing value there is more than 20%

In [7]:
data1= data.drop(['society'], axis='columns')
data1.shape



(13320, 8)

Handling Missing values

In [8]:
# Replace missing values with the mode of the  location categorical columns 


data1['location'] = data1['location'].fillna(data1['location'].mode()[0])

null_count = data1['location'].isnull().sum()
print("Number of null values in 'location' column:", null_count)

Number of null values in 'location' column: 0


In [9]:
# Replace missing values with the mode of the  size categorical columns 
data1['size'] = data1['size'].fillna(data1['size'].mode().iloc[0])
data1['bath'] = data1['bath'].fillna(data1['bath'].mean())
data1['balcony'] = data1['balcony'].fillna(data1['balcony'].mean())

data1.isnull().sum()

area_type       0
availability    0
location        0
size            0
total_sqft      0
bath            0
balcony         0
price           0
dtype: int64

In [10]:
data['area_type'].value_counts()

area_type
Super built-up  Area    8790
Built-up  Area          2418
Plot  Area              2025
Carpet  Area              87
Name: count, dtype: int64

Feature Engineering 

In [11]:
data1['bedroom_size'] = data1['size'].apply(lambda x: int(x.split(' ')[0]))
data2 = data1.drop(['size'], axis='columns')
data2.shape

(13320, 8)

In [12]:
data2.describe(include=object)

Unnamed: 0,area_type,availability,location,total_sqft
count,13320,13320,13320,13320
unique,4,81,1305,2117
top,Super built-up Area,Ready To Move,Whitefield,1200
freq,8790,10581,541,843


In [13]:
data2.describe()

Unnamed: 0,bath,balcony,price,bedroom_size
count,13320.0,13320.0,13320.0,13320.0
mean,2.69261,1.584376,112.565627,2.802778
std,1.337777,0.79836,148.971674,1.294496
min,1.0,0.0,8.0,1.0
25%,2.0,1.0,50.0,2.0
50%,2.0,2.0,72.0,3.0
75%,3.0,2.0,120.0,3.0
max,40.0,3.0,3600.0,43.0


In [14]:
def is_float(x):
    try:
        float(x)
    except:
        return False
    return True

In [15]:
data2[~data2['total_sqft'].apply(is_float)].head(10)

Unnamed: 0,area_type,availability,location,total_sqft,bath,balcony,price,bedroom_size
30,Super built-up Area,19-Dec,Yelahanka,2100 - 2850,4.0,0.0,186.0,4
56,Built-up Area,20-Feb,Devanahalli,3010 - 3410,2.69261,1.584376,192.0,4
81,Built-up Area,18-Oct,Hennur Road,2957 - 3450,2.69261,1.584376,224.5,4
122,Super built-up Area,18-Mar,Hebbal,3067 - 8156,4.0,0.0,477.0,4
137,Super built-up Area,19-Mar,8th Phase JP Nagar,1042 - 1105,2.0,0.0,54.005,2
165,Super built-up Area,18-Dec,Sarjapur,1145 - 1340,2.0,0.0,43.49,2
188,Super built-up Area,Ready To Move,KR Puram,1015 - 1540,2.0,0.0,56.8,2
224,Super built-up Area,19-Dec,Devanahalli,1520 - 1740,2.69261,1.584376,74.82,3
410,Super built-up Area,Ready To Move,Kengeri,34.46Sq. Meter,1.0,0.0,18.5,1
549,Super built-up Area,18-Sep,Hennur Road,1195 - 1440,2.0,0.0,63.77,2


In [16]:
def convert_sqft_to_num(x):
    tokens = x.split('-')
    if len(tokens) ==2:
        return (float(tokens[0])+float(tokens[1]))/2
    try:
        return float(x)
    except:
        None

data3 = data2.copy()
data3.total_sqft = data3.total_sqft.apply(convert_sqft_to_num)   


In [17]:
data3.isnull().sum()

area_type        0
availability     0
location         0
total_sqft      46
bath             0
balcony          0
price            0
bedroom_size     0
dtype: int64

In [18]:
data3['total_sqft'] = data3['total_sqft'].fillna(data3['total_sqft'].mean())

Feature Engineering

In [19]:
data4 = data3.copy()
data4['price_per_sqft'] = data4['price']*100000/data4['total_sqft']
data4.head()

Unnamed: 0,area_type,availability,location,total_sqft,bath,balcony,price,bedroom_size,price_per_sqft
0,Super built-up Area,19-Dec,Electronic City Phase II,1056.0,2.0,1.0,39.07,2,3699.810606
1,Plot Area,Ready To Move,Chikka Tirupathi,2600.0,5.0,3.0,120.0,4,4615.384615
2,Built-up Area,Ready To Move,Uttarahalli,1440.0,2.0,3.0,62.0,3,4305.555556
3,Super built-up Area,Ready To Move,Lingadheeranahalli,1521.0,3.0,1.0,95.0,3,6245.890861
4,Super built-up Area,Ready To Move,Kothanur,1200.0,2.0,1.0,51.0,2,4250.0


In [20]:
data3.isnull().sum()


area_type       0
availability    0
location        0
total_sqft      0
bath            0
balcony         0
price           0
bedroom_size    0
dtype: int64

In [21]:
location_stats = data4['location'].value_counts(ascending = False)
location_stats

location
Whitefield                        541
Sarjapur  Road                    399
Electronic City                   302
Kanakpura Road                    273
Thanisandra                       234
                                 ... 
Bapuji Layout                       1
1st Stage Radha Krishna Layout      1
BEML Layout 5th stage               1
singapura paradise                  1
Abshot Layout                       1
Name: count, Length: 1305, dtype: int64

In [22]:
location_stats.values.sum()

13320

In [23]:
len(location_stats[location_stats>10])

241

In [24]:
len(location_stats[location_stats<=10])

1064

In [25]:
location_stats_less_than_10 = location_stats[location_stats<=10]

In [26]:
data4.location = data4.location.apply(lambda x: 'other' if x in location_stats_less_than_10 else x)
len(data4.location.unique())


242

In [27]:
data4.head(10)

Unnamed: 0,area_type,availability,location,total_sqft,bath,balcony,price,bedroom_size,price_per_sqft
0,Super built-up Area,19-Dec,Electronic City Phase II,1056.0,2.0,1.0,39.07,2,3699.810606
1,Plot Area,Ready To Move,Chikka Tirupathi,2600.0,5.0,3.0,120.0,4,4615.384615
2,Built-up Area,Ready To Move,Uttarahalli,1440.0,2.0,3.0,62.0,3,4305.555556
3,Super built-up Area,Ready To Move,Lingadheeranahalli,1521.0,3.0,1.0,95.0,3,6245.890861
4,Super built-up Area,Ready To Move,Kothanur,1200.0,2.0,1.0,51.0,2,4250.0
5,Super built-up Area,Ready To Move,Whitefield,1170.0,2.0,1.0,38.0,2,3247.863248
6,Super built-up Area,18-May,Old Airport Road,2732.0,4.0,1.584376,204.0,4,7467.057101
7,Super built-up Area,Ready To Move,Rajaji Nagar,3300.0,4.0,1.584376,600.0,4,18181.818182
8,Super built-up Area,Ready To Move,Marathahalli,1310.0,3.0,1.0,63.25,3,4828.244275
9,Plot Area,Ready To Move,other,1020.0,6.0,1.584376,370.0,6,36274.509804


Use one hot encoding

In [28]:
data5= data4.drop(['area_type', 'availability'], axis='columns')
data5.shape


(13320, 7)

In [29]:
dummies = pd.get_dummies(data5.location)
dummies.head(3)

Unnamed: 0,Devarachikkanahalli,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,5th Phase JP Nagar,6th Phase JP Nagar,7th Phase JP Nagar,8th Phase JP Nagar,...,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur,other
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [30]:
data6 = pd.concat([data4,dummies.drop('other', axis='columns')],axis='columns')
data7= data6.drop(['area_type', 'balcony', 'availability','price_per_sqft', 'location'], axis='columns')
data7.head()

Unnamed: 0,total_sqft,bath,price,bedroom_size,Devarachikkanahalli,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,...,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
0,1056.0,2.0,39.07,2,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,2600.0,5.0,120.0,4,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,1440.0,2.0,62.0,3,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,1521.0,3.0,95.0,3,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,1200.0,2.0,51.0,2,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [31]:
x = data7.drop(['price'], axis='columns')
y = data7['price']
x.isnull().sum()

total_sqft              0
bath                    0
bedroom_size            0
 Devarachikkanahalli    0
1st Block Jayanagar     0
                       ..
Yelachenahalli          0
Yelahanka               0
Yelahanka New Town      0
Yelenahalli             0
Yeshwanthpur            0
Length: 244, dtype: int64

In [32]:
y.head(3)

0     39.07
1    120.00
2     62.00
Name: price, dtype: float64

In [33]:
len(y)

13320

Model building

In [34]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=10)

In [35]:
x.head()

Unnamed: 0,total_sqft,bath,bedroom_size,Devarachikkanahalli,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,5th Phase JP Nagar,...,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
0,1056.0,2.0,2,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,2600.0,5.0,4,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,1440.0,2.0,3,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,1521.0,3.0,3,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,1200.0,2.0,2,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [36]:
from sklearn.linear_model import LinearRegression
lr_clf = LinearRegression()
lr_clf.fit(x_train,y_train)
lr_clf.score(x_test,y_test)

0.23692744883318517

Using K-fold cross validation to measure the accuracy of the model 

In [37]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)

cross_val_score(LinearRegression(), x, y, cv=cv)

array([ 0.39189338,  0.52422921, -0.38413242,  0.45564687,  0.27033769])

In [38]:
x.columns

Index(['total_sqft', 'bath', 'bedroom_size', ' Devarachikkanahalli',
       '1st Block Jayanagar', '1st Phase JP Nagar',
       '2nd Phase Judicial Layout', '2nd Stage Nagarbhavi',
       '5th Block Hbr Layout', '5th Phase JP Nagar',
       ...
       'Vijayanagar', 'Vishveshwarya Layout', 'Vishwapriya Layout',
       'Vittasandra', 'Whitefield', 'Yelachenahalli', 'Yelahanka',
       'Yelahanka New Town', 'Yelenahalli', 'Yeshwanthpur'],
      dtype='object', length=244)

In [39]:
np.where(x.columns=='1st Phase JP Nagar')[0][0]

5

In [40]:
def predict_price(total_sqft, bath, bedroom_size, location): 

    loc_index = np.where(x.columns == location)
    
    if len(loc_index[0]) == 0:
        print(f"Location '{location}' not found in DataFrame columns.")
        return
    
    loc_index = loc_index[0][0]
    
    X = np.zeros(len(x.columns))
    X[0] = total_sqft
    X[1] = bath
    X[2] = bedroom_size
    if loc_index >= 0:
        X[loc_index] = 1

    return lr_clf.predict([X])[0]

In [41]:
predict_price(1000, 2, 2, '1st Phase JP Nagar')

101.42668477380728

In [42]:
import pickle
with open('banglore_home_prices_model.pickle','wb') as f:
    pickle.dump(lr_clf,f)

In [43]:
import json
columns = {
    'data_columns' : [col.lower() for col in x.columns]
}
with open("columns.json","w") as f:
    f.write(json.dumps(columns))