In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import matplotlib 
matplotlib.rcParams["figure.figsize"] = (20,10)

In [2]:
df = pd.read_csv("Bengaluru_House_Data.csv")
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [3]:
df.shape

(13320, 9)

In [4]:
df.columns

Index(['area_type', 'availability', 'location', 'size', 'society',
       'total_sqft', 'bath', 'balcony', 'price'],
      dtype='object')

In [5]:
df["area_type"].unique()

array(['Super built-up  Area', 'Plot  Area', 'Built-up  Area',
       'Carpet  Area'], dtype=object)

In [6]:
df["area_type"].value_counts()

area_type
Super built-up  Area    8790
Built-up  Area          2418
Plot  Area              2025
Carpet  Area              87
Name: count, dtype: int64

## Drop The Features that no Required to build a model

In [8]:
df2 = df.drop(["area_type","availability","balcony","society"],axis="columns")
df2.head()

Unnamed: 0,location,size,total_sqft,bath,price
0,Electronic City Phase II,2 BHK,1056,2.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0
2,Uttarahalli,3 BHK,1440,2.0,62.0
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0
4,Kothanur,2 BHK,1200,2.0,51.0


In [9]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13319 non-null  object 
 1   size        13304 non-null  object 
 2   total_sqft  13320 non-null  object 
 3   bath        13247 non-null  float64
 4   price       13320 non-null  float64
dtypes: float64(2), object(3)
memory usage: 520.4+ KB


In [10]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13319 non-null  object 
 1   size        13304 non-null  object 
 2   total_sqft  13320 non-null  object 
 3   bath        13247 non-null  float64
 4   price       13320 non-null  float64
dtypes: float64(2), object(3)
memory usage: 520.4+ KB


## Data cleaning handle NA Values

In [12]:
df2.isnull().sum()

location       1
size          16
total_sqft     0
bath          73
price          0
dtype: int64

In [13]:
df2.shape

(13320, 5)

In [14]:
df3 = df2.dropna()
df3.shape

(13246, 5)

In [15]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13246 entries, 0 to 13319
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13246 non-null  object 
 1   size        13246 non-null  object 
 2   total_sqft  13246 non-null  object 
 3   bath        13246 non-null  float64
 4   price       13246 non-null  float64
dtypes: float64(2), object(3)
memory usage: 620.9+ KB


## Feature Engineering


### add new feature(integer) for BHK(Bathroom Hall Kitchen)

In [18]:
df3["bhk"] = df3["size"].apply(lambda x: int(x.split(' ')[0]))
df3["bhk"].unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3["bhk"] = df3["size"].apply(lambda x: int(x.split(' ')[0]))


array([ 2,  4,  3,  6,  1,  8,  7,  5, 11,  9, 27, 10, 19, 16, 43, 14, 12,
       13, 18])

### Explore total _sqft Feature

In [20]:
def is_float(x):
    try:
        float(x)
    except:
        return False
    return True

In [21]:
df3[~df["total_sqft"].apply(is_float)].sample(20)

  df3[~df["total_sqft"].apply(is_float)].sample(20)


Unnamed: 0,location,size,total_sqft,bath,price,bhk
772,Banashankari Stage VI,2 BHK,1160 - 1195,2.0,59.935,2
12560,Hosa Road,3 BHK,142.84Sq. Meter,3.0,110.0,3
13059,Harlur,2 BHK,1200 - 1470,2.0,72.76,2
11407,Whitefield,3 BHK,1520 - 1759,3.0,92.63,3
3984,Uttarahalli,3 BHK,1125 - 1500,3.0,51.19,3
11490,Sarjapur,4 Bedroom,2580 - 2591,4.0,115.0,4
9434,Bannerghatta Road,2 BHK,1160 - 1315,2.0,49.49,2
1019,Marathi Layout,1 Bedroom,5.31Acres,1.0,110.0,1
11373,Hoskote,2 BHK,929 - 1078,2.0,28.095,2
5631,Thanisandra,3 BHK,204Sq. Meter,4.0,185.0,3


### the data is so dirty, in the total sqft any data using a range eg(1270 - 1275...) and using many unit of length, to make it simple i will replace the data with range to mean of that range and drop the data which using many of length except sqft

In [23]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13246 entries, 0 to 13319
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13246 non-null  object 
 1   size        13246 non-null  object 
 2   total_sqft  13246 non-null  object 
 3   bath        13246 non-null  float64
 4   price       13246 non-null  float64
 5   bhk         13246 non-null  int64  
dtypes: float64(2), int64(1), object(3)
memory usage: 724.4+ KB


In [24]:
def convert_sqft_to_num(x):
    tokens = x.split('-')
    if len(tokens) == 2:
        return (float(tokens[0])+float(tokens[1]))/2
    try:
        return float(x)
    except:
        return None   

In [25]:
df4 = df3.copy()
df4.total_sqft = df4.total_sqft.apply(convert_sqft_to_num)
df4 = df4[df4.total_sqft.notnull()]
df4.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3
4,Kothanur,2 BHK,1200.0,2.0,51.0,2


## Feature Engineering

### add new feature called price per square feet

In [28]:
df5 = df4.copy()
df5["price_per_sqft"] = df5["price"]*100000/df5["total_sqft"]
df5.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2,3699.810606
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4,4615.384615
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3,4305.555556
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3,6245.890861
4,Kothanur,2 BHK,1200.0,2.0,51.0,2,4250.0


In [29]:
df5_stats = df5.describe()
df5_stats

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,13200.0,13200.0,13200.0,13200.0,13200.0
mean,1555.302783,2.691136,112.276178,2.800833,7920.759
std,1237.323445,1.338915,149.175995,1.292843,106727.2
min,1.0,1.0,8.0,1.0,267.8298
25%,1100.0,2.0,50.0,2.0,4267.701
50%,1275.0,2.0,71.85,3.0,5438.331
75%,1672.0,3.0,120.0,3.0,7317.073
max,52272.0,40.0,3600.0,43.0,12000000.0


In [61]:
df5

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2,3699.810606
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.00,4,4615.384615
2,Uttarahalli,3 BHK,1440.0,2.0,62.00,3,4305.555556
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.00,3,6245.890861
4,Kothanur,2 BHK,1200.0,2.0,51.00,2,4250.000000
...,...,...,...,...,...,...,...
13315,Whitefield,5 Bedroom,3453.0,4.0,231.00,5,6689.834926
13316,Richards Town,4 BHK,3600.0,5.0,400.00,4,11111.111111
13317,Raja Rajeshwari Nagar,2 BHK,1141.0,2.0,60.00,2,5258.545136
13318,Padmanabhanagar,4 BHK,4689.0,4.0,488.00,4,10407.336319


In [54]:
df5.to_csv("bhp.csv",index=False)

### Examine the location which the categorical variablie, we need to apply into dimensionally reduction here to reduce number of the location

In [67]:
df5["location"] = df5["location"].apply(lambda x: x.strip())
location_stats = df5["location"].value_counts()
location_stats

location
Whitefield                   533
Sarjapur  Road               392
Electronic City              304
Kanakpura Road               264
Thanisandra                  235
                            ... 
Rajanna Layout                 1
Subramanyanagar                1
Lakshmipura Vidyaanyapura      1
Malur Hosur Road               1
Abshot Layout                  1
Name: count, Length: 1287, dtype: int64

In [73]:
location_stats.sum()

13200

In [75]:
len(location_stats[location_stats>10])

240

In [77]:
len(location_stats)

1287

In [79]:
len(location_stats[location_stats<=10])

1047

In [83]:
location_stats_less_than_10 = location_stats[location_stats<=10]
location_stats_less_than_10

location
BTM 1st Stage                10
Gunjur Palya                 10
Nagappa Reddy Layout         10
Sector 1 HSR Layout          10
Thyagaraja Nagar             10
                             ..
Rajanna Layout                1
Subramanyanagar               1
Lakshmipura Vidyaanyapura     1
Malur Hosur Road              1
Abshot Layout                 1
Name: count, Length: 1047, dtype: int64

In [85]:
len(df5.location.unique())

1287

In [91]:
df5.location = df5.location.apply(lambda x: "Other" if x in location_stats_less_than_10 else x)
df5.sample(10)

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
9529,Electronic City,2 BHK,1100.0,2.0,41.0,2,3727.272727
10236,Kundalahalli,2 BHK,1260.0,2.0,58.0,2,4603.174603
2085,Ramamurthy Nagar,2 BHK,950.0,2.0,50.79,2,5346.315789
4809,Raja Rajeshwari Nagar,2 BHK,1165.0,2.0,45.0,2,3862.660944
12489,Yeshwanthpur,2 BHK,1160.0,2.0,64.08,2,5524.137931
3807,Whitefield,3 BHK,1306.0,3.0,54.65,3,4184.532925
5172,Yelahanka,2 BHK,1060.0,2.0,40.0,2,3773.584906
6282,Other,4 Bedroom,1350.0,4.0,225.0,4,16666.666667
8417,Other,2 BHK,1180.0,2.0,39.0,2,3305.084746
12727,Whitefield,5 Bedroom,4144.0,5.0,331.0,5,7987.451737


### As a data scientist when you have a conversation with your business manager (who has expertise in real estate), he will tell you that normally square ft per bedroom is 300 (i.e. 2 bhk apartment is minimum 600 sqft. If you have for example 400 sqft apartment with 2 bhk than that seems suspicious and can be removed as an outlier. We will remove such outliers by keeping our minimum thresold per bhk to be 300 sqft