# Predicting The House Prices of Banglore Using LinearRegression

In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression


# Reading Data

In [20]:
data=pd.read_csv("/Users/jaypanchal/aiml/data/Bengaluru_House_Data.csv")

In [21]:
data.shape

(13320, 9)

In [22]:
data.head(5)

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [23]:
data.columns

Index(['area_type', 'availability', 'location', 'size', 'society',
       'total_sqft', 'bath', 'balcony', 'price'],
      dtype='object')

In [24]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


In [25]:
data.isnull().sum() # Contains many null values lets analyze and try to fill those

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

In [26]:
data['location'].value_counts() #Most Common Location is Whitefield so we'll fill it in where Null values

location
Whitefield                        540
Sarjapur  Road                    399
Electronic City                   302
Kanakpura Road                    273
Thanisandra                       234
                                 ... 
Bapuji Layout                       1
1st Stage Radha Krishna Layout      1
BEML Layout 5th stage               1
singapura paradise                  1
Abshot Layout                       1
Name: count, Length: 1305, dtype: int64

In [27]:
data['location']=data['location'].fillna('Whitefield')

In [28]:
data['size'].value_counts()

size
2 BHK         5199
3 BHK         4310
4 Bedroom      826
4 BHK          591
3 Bedroom      547
1 BHK          538
2 Bedroom      329
5 Bedroom      297
6 Bedroom      191
1 Bedroom      105
8 Bedroom       84
7 Bedroom       83
5 BHK           59
9 Bedroom       46
6 BHK           30
7 BHK           17
1 RK            13
10 Bedroom      12
9 BHK            8
8 BHK            5
11 BHK           2
11 Bedroom       2
10 BHK           2
14 BHK           1
13 BHK           1
12 Bedroom       1
27 BHK           1
43 Bedroom       1
16 BHK           1
19 BHK           1
18 Bedroom       1
Name: count, dtype: int64

In [29]:
data['size']=data['size'].fillna('2 BHK')

In [30]:
data['bath'].value_counts()


bath
2.0     6908
3.0     3286
4.0     1226
1.0      788
5.0      524
6.0      273
7.0      102
8.0       64
9.0       43
10.0      13
12.0       7
13.0       3
11.0       3
16.0       2
27.0       1
40.0       1
15.0       1
14.0       1
18.0       1
Name: count, dtype: int64

In [31]:
data['bath']=data['bath'].fillna(data['bath'].median) 

In [33]:
data['balcony'].value_counts()

balcony
2.0    5113
1.0    4897
3.0    1672
0.0    1029
Name: count, dtype: int64

In [38]:
data['balcony'] = data['balcony'].fillna(data['balcony'].mode()[0])

In [39]:
data=data.drop(columns=['availability','society']) #Dropping This Columns because they seems irrelevant

KeyError: "['availability', 'society'] not found in axis"

In [40]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   area_type   13320 non-null  object 
 1   location    13320 non-null  object 
 2   size        13320 non-null  object 
 3   total_sqft  13320 non-null  object 
 4   bath        13320 non-null  object 
 5   balcony     13320 non-null  float64
 6   price       13320 non-null  float64
dtypes: float64(2), object(5)
memory usage: 728.6+ KB


In [41]:
data.describe()

Unnamed: 0,balcony,price
count,13320.0,13320.0
mean,1.603378,112.565627
std,0.803067,148.971674
min,0.0,8.0
25%,1.0,50.0
50%,2.0,72.0
75%,2.0,120.0
max,3.0,3600.0


In [42]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   area_type   13320 non-null  object 
 1   location    13320 non-null  object 
 2   size        13320 non-null  object 
 3   total_sqft  13320 non-null  object 
 4   bath        13320 non-null  object 
 5   balcony     13320 non-null  float64
 6   price       13320 non-null  float64
dtypes: float64(2), object(5)
memory usage: 728.6+ KB


In [43]:
data['balcony'].value_counts()

balcony
2.0    5722
1.0    4897
3.0    1672
0.0    1029
Name: count, dtype: int64

In [44]:
data.head()

Unnamed: 0,area_type,location,size,total_sqft,bath,balcony,price
0,Super built-up Area,Electronic City Phase II,2 BHK,1056,2.0,1.0,39.07
1,Plot Area,Chikka Tirupathi,4 Bedroom,2600,5.0,3.0,120.0
2,Built-up Area,Uttarahalli,3 BHK,1440,2.0,3.0,62.0
3,Super built-up Area,Lingadheeranahalli,3 BHK,1521,3.0,1.0,95.0
4,Super built-up Area,Kothanur,2 BHK,1200,2.0,1.0,51.0


In [46]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   area_type   13320 non-null  object 
 1   location    13320 non-null  object 
 2   size        13320 non-null  object 
 3   total_sqft  13320 non-null  object 
 4   bath        13320 non-null  object 
 5   balcony     13320 non-null  float64
 6   price       13320 non-null  float64
dtypes: float64(2), object(5)
memory usage: 728.6+ KB
