# Predicting The House Prices of Banglore Using LinearRegression

In [53]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression


# Reading Data

In [54]:
data=pd.read_csv("/Users/jaypanchal/aiml/data/Bengaluru_House_Data.csv")

In [55]:
print(data.shape)
print(data.info())

(13320, 9)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB
None


In [56]:
data.head(5)

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [57]:
data.isnull().sum() # Contains many null values lets analyze and try to fill those

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

In [58]:
data=data.drop(columns=['availability','society'])

In [59]:
for col in ["location","size","bath","balcony"]:
   print(f"{col} : {data[col].value_counts()}")

location : location
Whitefield                        540
Sarjapur  Road                    399
Electronic City                   302
Kanakpura Road                    273
Thanisandra                       234
                                 ... 
Bapuji Layout                       1
1st Stage Radha Krishna Layout      1
BEML Layout 5th stage               1
singapura paradise                  1
Abshot Layout                       1
Name: count, Length: 1305, dtype: int64
size : size
2 BHK         5199
3 BHK         4310
4 Bedroom      826
4 BHK          591
3 Bedroom      547
1 BHK          538
2 Bedroom      329
5 Bedroom      297
6 Bedroom      191
1 Bedroom      105
8 Bedroom       84
7 Bedroom       83
5 BHK           59
9 Bedroom       46
6 BHK           30
7 BHK           17
1 RK            13
10 Bedroom      12
9 BHK            8
8 BHK            5
11 BHK           2
11 Bedroom       2
10 BHK           2
14 BHK           1
13 BHK           1
12 Bedroom       1
27 BHK          

In [60]:

def fill_na(data,data_cols):
    for col in data_cols:
        if col=="location":
            data[col]=data[col].fillna('Whitefield')
        elif col=="size":
            data[col]=data[col].fillna('2 BHK')
        elif col=="bath":
            data[col]=data[col].fillna(data[col].median) 
        elif col=="balcony":
            data[col] = data[col].fillna(data[col].mode()[0])
    return data
            
data_cols=["location","size","bath","balcony"]
data=fill_na(data,data_cols)

In [61]:
data.isnull().sum()

area_type     0
location      0
size          0
total_sqft    0
bath          0
balcony       0
price         0
dtype: int64

In [62]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   area_type   13320 non-null  object 
 1   location    13320 non-null  object 
 2   size        13320 non-null  object 
 3   total_sqft  13320 non-null  object 
 4   bath        13320 non-null  object 
 5   balcony     13320 non-null  float64
 6   price       13320 non-null  float64
dtypes: float64(2), object(5)
memory usage: 728.6+ KB


In [66]:
data.describe()

Unnamed: 0,total_sqft,balcony,price
count,13274.0,13274.0,13274.0
mean,1559.626694,1.604565,112.453654
std,1238.405258,0.802569,149.070368
min,1.0,0.0,8.0
25%,1100.0,1.0,50.0
50%,1276.0,2.0,72.0
75%,1680.0,2.0,120.0
max,52272.0,3.0,3600.0


In [63]:
data.tail(5)

Unnamed: 0,area_type,location,size,total_sqft,bath,balcony,price
13315,Built-up Area,Whitefield,5 Bedroom,3453,4.0,0.0,231.0
13316,Super built-up Area,Richards Town,4 BHK,3600,5.0,2.0,400.0
13317,Built-up Area,Raja Rajeshwari Nagar,2 BHK,1141,2.0,1.0,60.0
13318,Super built-up Area,Padmanabhanagar,4 BHK,4689,4.0,1.0,488.0
13319,Super built-up Area,Doddathoguru,1 BHK,550,1.0,1.0,17.0


In [64]:
for i in data.columns:
    print(data[i].unique())

['Super built-up  Area' 'Plot  Area' 'Built-up  Area' 'Carpet  Area']
['Electronic City Phase II' 'Chikka Tirupathi' 'Uttarahalli' ...
 '12th cross srinivas nagar banshankari 3rd stage' 'Havanur extension'
 'Abshot Layout']
['2 BHK' '4 Bedroom' '3 BHK' '4 BHK' '6 Bedroom' '3 Bedroom' '1 BHK'
 '1 RK' '1 Bedroom' '8 Bedroom' '2 Bedroom' '7 Bedroom' '5 BHK' '7 BHK'
 '6 BHK' '5 Bedroom' '11 BHK' '9 BHK' '9 Bedroom' '27 BHK' '10 Bedroom'
 '11 Bedroom' '10 BHK' '19 BHK' '16 BHK' '43 Bedroom' '14 BHK' '8 BHK'
 '12 Bedroom' '13 BHK' '18 Bedroom']
['1056' '2600' '1440' ... '1133 - 1384' '774' '4689']
[2.0 5.0 3.0 4.0 6.0 1.0 9.0 <bound method Series.median of 0        2.0
                             1        5.0
                             2        2.0
                             3        3.0
                             4        2.0
                                     ...
                             13315    4.0
                             13316    5.0
                             13317 

In [67]:
def convert_sqft(x):
    try:
        if '-' in x:
            tokens = x.split('-')
            return (float(tokens[0]) + float(tokens[1])) / 2
        return float(x)
    except:
        return np.nan

data['total_sqft'] = data['total_sqft'].apply(convert_sqft)
data = data.dropna(subset=['total_sqft'])

In [None]:
data['bhk'] = data['size'].apply(lambda x: int(x.split(' ')[0]))
data = data.drop('size', axis=1)

In [None]:
data['bath'] = pd.to_numeric(data['bath'], errors='coerce')
data = data.dropna(subset=['bath'])
data['bath'] = data['bath'].astype(int)


In [None]:
data['balcony'] = data['balcony'].fillna(data['balcony'].median())
data['balcony'] = data['balcony'].astype(int)


In [None]:
data['location'] = data['location'].str.strip()
location_counts = data['location'].value_counts()
data['location'] = data['location'].apply(lambda x: 'other' if location_counts[x] <= 10 else x)
data = pd.get_dummies(data, columns=['area_type', 'location'], drop_first=True)

In [None]:
data = data[(data['total_sqft'] >= 300) & (data['total_sqft'] <= 10000)]

In [None]:
data = data[(data['price'] >= 10) & (data['price'] <= 500)]


In [None]:
data.head()

In [None]:
data.describe()

In [None]:
X = data.drop(['price'], axis=1)
y = data['price']