In [29]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import matplotlib
import plotly.express as px

In [30]:
# Source: https://www.kaggle.com/datasets/amitabhajoy/bengaluru-house-price-data
# NB: the dataset ships without a data dictionary describing its columns
# load the raw CSV (path is relative to the working directory)
data_path = 'Bengaluru_House_Data.csv'
df = pd.read_csv(data_path)

In [31]:
# dimensions of the raw data as (rows, columns)
df.shape

(13320, 9)

In [32]:
# per-column dtype and non-null count, to spot missing values and columns stored as object
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


In [33]:
# preview the first rows to see what the raw values look like
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [34]:
# list the column names before deciding which ones to keep
df.columns

Index(['area_type', 'availability', 'location', 'size', 'society',
       'total_sqft', 'bath', 'balcony', 'price'],
      dtype='object')

In [35]:
# Keep the model simple: discard the columns that will not be used here.
# (A full project would investigate and make use of every column.)
unused_columns = ['area_type', 'availability', 'society', 'balcony']
df = df.drop(columns=unused_columns)

In [36]:
# confirm that only the retained columns remain
df.head()

Unnamed: 0,location,size,total_sqft,bath,price
0,Electronic City Phase II,2 BHK,1056,2.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0
2,Uttarahalli,3 BHK,1440,2.0,62.0
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0
4,Kothanur,2 BHK,1200,2.0,51.0


In [37]:
# count the missing entries in each column
df.isna().sum()

location       1
size          16
total_sqft     0
bath          73
price          0
dtype: int64

In [38]:
# With ~13,300 rows and comparatively few missing values, discarding the
# incomplete rows is unlikely to adversely affect model performance.
df = df.dropna(axis=0, ignore_index=True)
# verify that no missing values remain
df.isna().sum()

location      0
size          0
total_sqft    0
bath          0
price         0
dtype: int64

# size column - exploration & cleaning

In [39]:
# percentage share of each distinct size label
df['size'].value_counts(normalize=True).mul(100)

size
2 BHK         39.242035
3 BHK         32.356938
4 Bedroom      6.175449
4 BHK          4.356032
3 Bedroom      4.121999
1 BHK          4.008757
2 Bedroom      2.483769
5 Bedroom      2.234637
6 Bedroom      1.441945
1 Bedroom      0.792692
8 Bedroom      0.634154
7 Bedroom      0.626604
5 BHK          0.430319
9 Bedroom      0.347275
6 BHK          0.226483
7 BHK          0.128341
1 RK           0.098143
10 Bedroom     0.090593
9 BHK          0.060396
8 BHK          0.037747
11 BHK         0.015099
11 Bedroom     0.015099
10 BHK         0.015099
14 BHK         0.007549
13 BHK         0.007549
12 Bedroom     0.007549
27 BHK         0.007549
43 Bedroom     0.007549
16 BHK         0.007549
19 BHK         0.007549
18 Bedroom     0.007549
Name: proportion, dtype: float64

In [40]:
# In Indian real estate, BHK stands for 'bedroom, hall, kitchen': an 'N BHK' property has N bedrooms plus a hall (living room) and a kitchen.

# So this column mixes two labelling conventions: some rows give the size as 'N Bedroom', others as 'N BHK'. This looks like an inconsistency in labelling rather than in meaning, since in both cases the leading number is the bedroom count.

In [41]:
# create a numerical size column, called num_rooms

def extract_numeric_size(x):
    """Return the leading integer of a size label such as '2 BHK' or '4 Bedroom'.

    Parameters
    ----------
    x : str
        Size label whose first whitespace-separated token is an integer.

    Returns
    -------
    int
        The integer parsed from the first token of ``x``.

    Raises
    ------
    ValueError
        If ``x`` is empty or blank, or its first token is not an integer.
    """
    # split() with no argument splits on any run of whitespace and ignores
    # leading/trailing whitespace, so ' 2 BHK' and '2\tBHK' are parsed too;
    # split(' ') would give '' or '2\tBHK' as the first piece and int() would fail.
    tokens = x.split()
    if not tokens:
        # keep ValueError (what int('') raised before) rather than an IndexError
        raise ValueError(f"no numeric size found in {x!r}")

    return int(tokens[0])


# derive the integer count from each size label; the function is passed
# directly, no lambda wrapper needed
df['num_rooms'] = df['size'].apply(extract_numeric_size)

# percentage share of each num_rooms value
df['num_rooms'].value_counts(normalize=True).mul(100)

num_rooms
2     41.725804
3     36.478937
4     10.531481
1      4.899592
5      2.664955
6      1.668428
7      0.754945
8      0.671901
9      0.407670
10     0.105692
11     0.030198
27     0.007549
19     0.007549
16     0.007549
43     0.007549
14     0.007549
12     0.007549
13     0.007549
18     0.007549
Name: proportion, dtype: float64

In [44]:
# Sanity-check the largest num_rooms values against the floor area.
# 8,000 sqft is about 740 square metres, so 27 rooms would average
# 740/27 = 27 square metres each, which is plausible.
#
# 2,400 sqft is about 220 square metres, so 43 rooms would average
# roughly 5 square metres each, which is not plausible.
df.loc[df['num_rooms'].ge(20)]

Unnamed: 0,location,size,total_sqft,bath,price,num_rooms
1705,2Electronic City Phase II,27 BHK,8000,27.0,230.0,27
4654,Munnekollal,43 Bedroom,2400,40.0,660.0,43


In [46]:
# 8,000 sqft converted to square metres (0.092903 square metres per square foot)
8000*0.092903

743.224

In [47]:
# 2,400 sqft converted to square metres (0.092903 square metres per square foot)
2400*0.092903

222.9672