In [1]:
# this project is based on bengaluru house data extracted from kaggle

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib
matplotlib.rcParams["figure.figsize"] = (20,10)

In [3]:
# reading data..
path= 'Bengaluru_House_Data.csv'
beng_data = pd.read_csv(path)
beng_data.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [4]:
beng_data.shape

(13320, 9)

In [5]:
print(beng_data.area_type.value_counts(),"\n\n")
print(beng_data.availability.value_counts(),"\n\n")
print(beng_data.society.value_counts(),"\n\n")
print(beng_data.balcony.value_counts())

Super built-up  Area    8790
Built-up  Area          2418
Plot  Area              2025
Carpet  Area              87
Name: area_type, dtype: int64 


Ready To Move    10581
18-Dec             307
18-May             295
18-Apr             271
18-Aug             200
                 ...  
15-Aug               1
17-Jan               1
16-Nov               1
16-Jan               1
14-Jul               1
Name: availability, Length: 81, dtype: int64 


GrrvaGr    80
PrarePa    76
Sryalan    59
Prtates    59
GMown E    56
           ..
Amionce     1
JaghtDe     1
Jauraht     1
Brity U     1
RSntsAp     1
Name: society, Length: 2688, dtype: int64 


2.0    5113
1.0    4897
3.0    1672
0.0    1029
Name: balcony, dtype: int64


In [6]:
# the above columnns are not very important in determinig the price... hence we will drop them
cols_to_drop=['society','availability','area_type','balcony']
new_df=beng_data.drop(cols_to_drop,axis='columns')
new_df.shape

(13320, 5)

In [7]:
new_df.head()

Unnamed: 0,location,size,total_sqft,bath,price
0,Electronic City Phase II,2 BHK,1056,2.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0
2,Uttarahalli,3 BHK,1440,2.0,62.0
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0
4,Kothanur,2 BHK,1200,2.0,51.0


In [8]:
# data cleaning
# checking how many rows have null values in each column
new_df.isnull().sum()

location       1
size          16
total_sqft     0
bath          73
price          0
dtype: int64

In [9]:
# dropping rows with null values
new_df=new_df.dropna()
print(new_df.shape)
print("\n\n",new_df.isnull().sum())

(13246, 5)


 location      0
size          0
total_sqft    0
bath          0
price         0
dtype: int64


In [10]:
# data types of each column in the data
new_df.dtypes

location       object
size           object
total_sqft     object
bath          float64
price         float64
dtype: object

In [11]:
# analysing the size column....
new_df['size'].unique()

array(['2 BHK', '4 Bedroom', '3 BHK', '4 BHK', '6 Bedroom', '3 Bedroom',
       '1 BHK', '1 RK', '1 Bedroom', '8 Bedroom', '2 Bedroom',
       '7 Bedroom', '5 BHK', '7 BHK', '6 BHK', '5 Bedroom', '11 BHK',
       '9 BHK', '9 Bedroom', '27 BHK', '10 Bedroom', '11 Bedroom',
       '10 BHK', '19 BHK', '16 BHK', '43 Bedroom', '14 BHK', '8 BHK',
       '12 Bedroom', '13 BHK', '18 Bedroom'], dtype=object)

In [12]:
# create a new column with the number of bedrooms
new_df['BHK']=new_df['size'].apply(lambda p:int(p.split(' ')[0]))
new_df.head()

Unnamed: 0,location,size,total_sqft,bath,price,BHK
0,Electronic City Phase II,2 BHK,1056,2.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0,4
2,Uttarahalli,3 BHK,1440,2.0,62.0,3
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0,3
4,Kothanur,2 BHK,1200,2.0,51.0,2


In [13]:
# checking for null values
new_df.isnull().sum()

location      0
size          0
total_sqft    0
bath          0
price         0
BHK           0
dtype: int64

In [14]:
new_df.BHK.unique()

array([ 2,  4,  3,  6,  1,  8,  7,  5, 11,  9, 27, 10, 19, 16, 43, 14, 12,
       13, 18], dtype=int64)

In [15]:
new_df[new_df.BHK>=20]

Unnamed: 0,location,size,total_sqft,bath,price,BHK
1718,2Electronic City Phase II,27 BHK,8000,27.0,230.0,27
4684,Munnekollal,43 Bedroom,2400,40.0,660.0,43


In [16]:
new_df.total_sqft.unique()

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

In [17]:
# trying to differentiate values in total square feet that are in ranges..
def float_no(x):
    try:
        float(x)
    except:
        return False
    return True


In [18]:
# applying the function to the total sqr feet column to visualize data in ranges
print(new_df[~new_df.total_sqft.apply(float_no)].head(15),"\n\n")
# the number of rows with such entries...
new_df[~new_df.total_sqft.apply(float_no)].shape[0]

                  location       size      total_sqft  bath    price  BHK
30               Yelahanka      4 BHK     2100 - 2850   4.0  186.000    4
122                 Hebbal      4 BHK     3067 - 8156   4.0  477.000    4
137     8th Phase JP Nagar      2 BHK     1042 - 1105   2.0   54.005    2
165               Sarjapur      2 BHK     1145 - 1340   2.0   43.490    2
188               KR Puram      2 BHK     1015 - 1540   2.0   56.800    2
410                Kengeri      1 BHK  34.46Sq. Meter   1.0   18.500    1
549            Hennur Road      2 BHK     1195 - 1440   2.0   63.770    2
648                Arekere  9 Bedroom       4125Perch   9.0  265.000    9
661              Yelahanka      2 BHK     1120 - 1145   2.0   48.130    2
672           Bettahalsoor  4 Bedroom     3090 - 5002   4.0  445.000    4
772  Banashankari Stage VI      2 BHK     1160 - 1195   2.0   59.935    2
775           Basavanagara      1 BHK   1000Sq. Meter   2.0   93.000    1
850      Bannerghatta Road      2 BHK 

190

In [19]:
# returning the avg when we encounter ranges
def convt_range_to_float(x):
    token=x.split ('-')
    if len(token) == 2:
        avg= (float(token[0])+float(token[1]))/2
        return avg
    try:
        return float(x)
    except:
        return None

In [20]:
new_df.total_sqft = new_df.total_sqft.apply(convt_range_to_float)
new_df.total_sqft.unique()

array([1056. , 2600. , 1440. , ..., 1258.5,  774. , 4689. ])

In [21]:
new_df.total_sqft.dtype

dtype('float64')

In [22]:
new_df.isnull().sum()

location       0
size           0
total_sqft    46
bath           0
price          0
BHK            0
dtype: int64

In [23]:
new_df=new_df.dropna()
new_df.isnull().sum()

location      0
size          0
total_sqft    0
bath          0
price         0
BHK           0
dtype: int64

In [24]:
new_df.head()

Unnamed: 0,location,size,total_sqft,bath,price,BHK
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3
4,Kothanur,2 BHK,1200.0,2.0,51.0,2


In [25]:
# converting the price from lakh rupees to dollars
# first mul by 100,000 to rupees then divide by 77.84 to dollars
def convt_to_dollars(x):
    x=round(x*100000/77.84,2)
    return x
new_df['price']=new_df.price.apply(convt_to_dollars)

In [26]:
new_df['price_per_sqft']=round(new_df['price']/new_df['total_sqft'],2)
new_df.head()

Unnamed: 0,location,size,total_sqft,bath,price,BHK,price_per_sqft
0,Electronic City Phase II,2 BHK,1056.0,2.0,50192.7,2,47.53
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,154162.38,4,59.29
2,Uttarahalli,3 BHK,1440.0,2.0,79650.57,3,55.31
3,Lingadheeranahalli,3 BHK,1521.0,3.0,122045.22,3,80.24
4,Kothanur,2 BHK,1200.0,2.0,65519.01,2,54.6


In [27]:
# analyzing the location column
# striping extra spaces in the data..
new_df.location=new_df.location.apply(lambda x : x.strip())
loc_stats = new_df.groupby('location')['location'].agg('count').sort_values(ascending=False)
print(loc_stats)

location
Whitefield               533
Sarjapur  Road           392
Electronic City          304
Kanakpura Road           264
Thanisandra              235
                        ... 
1 Giri Nagar               1
Kanakapura Road,           1
Kanakapura main  Road      1
Kannur                     1
whitefiled                 1
Name: location, Length: 1287, dtype: int64


In [28]:
# locations with less than 10 data points...
loc_stat_less_ten=loc_stats[loc_stats<=10]
loc_stat_less_ten

location
Sadashiva Nagar          10
Naganathapura            10
Basapura                 10
Nagadevanahalli          10
Kalkere                  10
                         ..
1 Giri Nagar              1
Kanakapura Road,          1
Kanakapura main  Road     1
Kannur                    1
whitefiled                1
Name: location, Length: 1047, dtype: int64

In [29]:
# change their loc to 'other'
new_df.location= new_df.location.apply(lambda y: 'other' if y in loc_stat_less_ten else y)
len(new_df.location.unique())

241

In [30]:
new_df.head(10)

Unnamed: 0,location,size,total_sqft,bath,price,BHK,price_per_sqft
0,Electronic City Phase II,2 BHK,1056.0,2.0,50192.7,2,47.53
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,154162.38,4,59.29
2,Uttarahalli,3 BHK,1440.0,2.0,79650.57,3,55.31
3,Lingadheeranahalli,3 BHK,1521.0,3.0,122045.22,3,80.24
4,Kothanur,2 BHK,1200.0,2.0,65519.01,2,54.6
5,Whitefield,2 BHK,1170.0,2.0,48818.09,2,41.72
6,Old Airport Road,4 BHK,2732.0,4.0,262076.05,4,95.93
7,Rajaji Nagar,4 BHK,3300.0,4.0,770811.92,4,233.58
8,Marathahalli,3 BHK,1310.0,3.0,81256.42,3,62.03
9,other,6 Bedroom,1020.0,6.0,475334.02,6,466.01


In [31]:
# outlier detection
# first we check using no of bedrooms and total square feet
# we assume that threshhold is 350 sqft per bedroom
print(new_df[(new_df.total_sqft/new_df.BHK)<=350])
print(new_df.shape)

               location       size  total_sqft  bath      price  BHK  \
9                 other  6 Bedroom      1020.0   6.0  475334.02    6   
26      Electronic City      2 BHK       660.0   1.0   29676.26    2   
29      Electronic City      3 BHK      1025.0   2.0   60380.27    3   
35       Kanakpura Road      2 BHK       700.0   2.0   46248.72    2   
45           HSR Layout  8 Bedroom       600.0   9.0  256937.31    8   
...                 ...        ...         ...   ...        ...  ...   
13281   Margondanahalli  5 Bedroom      1375.0   5.0  160585.82    5   
13300     Hosakerehalli  5 Bedroom      1500.0   6.0  186279.55    5   
13303    Vidyaranyapura  5 Bedroom       774.0   5.0   89928.06    5   
13306             other  4 Bedroom      1200.0   5.0  417523.12    4   
13311  Ramamurthy Nagar  7 Bedroom      1500.0   9.0  321171.63    7   

       price_per_sqft  
9              466.01  
26              44.96  
29              58.91  
35              66.07  
45             

In [32]:
# filter out such data from the data frame
new_df=new_df[(new_df.total_sqft/new_df.BHK)>300]
new_df.shape

(12274, 7)

In [33]:
# outliers based on price_per_sqft
new_df.price_per_sqft.describe()

count    12274.000000
mean        79.803158
std         52.071129
min          3.440000
25%         53.960000
50%         67.620000
75%         87.687500
max       2267.090000
Name: price_per_sqft, dtype: float64

In [34]:
# removing rows where price per sqrft is more than a std deviation abv or below the mean
def price_per_sqft_out(y):
    newdf=pd.DataFrame()
    for k,df in y.groupby('location'):
        mn=np.mean(df.price_per_sqft)
        std_=np.std(df.price_per_sqft)
        red_df=df[(df.price_per_sqft>(mn-std_)) & (df.price_per_sqft<=(mn+std_))]
        newdf = pd.concat([newdf,red_df],ignore_index=True)
    return newdf
df1=price_per_sqft_out(new_df)
df1.shape

(10018, 7)

In [36]:
# removing data points where 2BHK cost more than 3BHK in the same area
def remov_bdrm_outlier(df):
    exclude_indices=np.array([])
    for loc,loc_df in df.groupby('location'):
        bdrm_stats = {}
        for bdr, bdr_df in loc_df.groupby('BHK'):
            bdrm_stats[bdr]={
                'mean': np.mean(bdr_df.price_per_sqft),
                'std_': np.std(bdr_df.price_per_sqft),
                'count':bdr_df.shape[0]
            }
        for bdr, bdr_df in loc_df.groupby('BHK'):
            stats=bdrm_stats.get(bdr-1)
            if stats and stats['count']>5:
                exclude_indices=np.append(exclude_indices,bdr_df[bdr_df.price_per_sqft<(stats['mean'])].index.values)
    return df.drop(exclude_indices, axis='index')

df2 = remov_bdrm_outlier(df1)
df2.shape

(7153, 7)

In [37]:
# exploring the bathroom property
df2.bath.unique()

array([ 4.,  3.,  2.,  8.,  5.,  1.,  6.,  7.,  9., 12., 16., 13.])

In [38]:
df2[df2.bath>=10]

Unnamed: 0,location,size,total_sqft,bath,price,BHK,price_per_sqft
5138,Neeladri Nagar,10 BHK,4000.0,12.0,205549.85,10,51.39
8304,other,10 BHK,12000.0,12.0,674460.43,10,56.21
8389,other,16 BHK,10000.0,16.0,706577.6,16,70.66
9104,other,11 BHK,6000.0,12.0,192702.98,11,32.12
9429,other,13 BHK,5425.0,13.0,353288.8,13,65.12


In [47]:
df2[(df2.bath>=10) & (df2.total_sqft<10000)]

Unnamed: 0,location,size,total_sqft,bath,price,BHK,price_per_sqft
5138,Neeladri Nagar,10 BHK,4000.0,12.0,205549.85,10,51.39
9104,other,11 BHK,6000.0,12.0,192702.98,11,32.12
9429,other,13 BHK,5425.0,13.0,353288.8,13,65.12


In [51]:
df2=df2[~((df2.bath>=10) & (df2.total_sqft<10000))]
df2.shape

(7150, 7)

In [53]:
# properties with bathrooms greater than bedroom plus 2
df2[df2.bath>df2.BHK+2]

Unnamed: 0,location,size,total_sqft,bath,price,BHK,price_per_sqft
1565,Chikkabanavar,4 Bedroom,2460.0,7.0,102774.92,4,41.78
5099,Nagasandra,4 Bedroom,7000.0,8.0,578108.94,4,82.59
6562,Thanisandra,3 BHK,1806.0,6.0,149023.64,3,82.52
8229,other,6 BHK,11338.0,9.0,1284686.54,6,113.31


In [54]:
# removing them...
df3=df2[df2.bath <= df2.BHK+2]
df3.shape

(7146, 7)

In [55]:
# drop some of the features that may not be necessary for training data
df4=df3.drop(['size','price_per_sqft'],axis='columns')
df4.head(7)

Unnamed: 0,location,total_sqft,bath,price,BHK
0,1st Block Jayanagar,2850.0,4.0,549845.84,4
1,1st Block Jayanagar,1630.0,3.0,249229.19,3
2,1st Block Jayanagar,1875.0,2.0,301901.34,3
3,1st Block Jayanagar,1200.0,2.0,167009.25,3
4,1st Block Jayanagar,1235.0,2.0,190133.61,2
5,1st Block Jayanagar,2750.0,4.0,530575.54,4
6,1st Block Jayanagar,2450.0,4.0,472764.65,4


In [57]:
# we use one hot encoding to rep location
dummies = pd.get_dummies(df4.location)
dummies.head()
# we can drop the last column (other)- it will be rep by zeros in all other columns
dummies.drop('other',axis='columns')

Unnamed: 0,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,5th Phase JP Nagar,6th Phase JP Nagar,7th Phase JP Nagar,8th Phase JP Nagar,9th Phase JP Nagar,...,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10009,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10010,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10013,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10014,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [59]:
# concat dummies cols to the main data frame
df5=pd.concat([df4, dummies],axis='columns')
df5.head()

Unnamed: 0,location,total_sqft,bath,price,BHK,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,...,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur,other
0,1st Block Jayanagar,2850.0,4.0,549845.84,4,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1st Block Jayanagar,1630.0,3.0,249229.19,3,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1st Block Jayanagar,1875.0,2.0,301901.34,3,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1st Block Jayanagar,1200.0,2.0,167009.25,3,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1st Block Jayanagar,1235.0,2.0,190133.61,2,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [63]:
# setting up variables for training
X = df5.drop(['location','price'],axis='columns')
y=df5['price']
print(X.shape)
print(y.shape)

(7146, 244)
(7146,)


In [69]:
# spliting the data...
from sklearn.model_selection import train_test_split
# first training and testing data
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.15,random_state=1)
print("Training data")
print(X_train.shape,y_train.shape)
print("Testing data")
print(X_test.shape,y_test.shape)

Training data
(6074, 244) (6074,)
Testing data
(1072, 244) (1072,)


In [70]:
# the model - linear regression model
from sklearn.linear_model import LinearRegression
lr_clf = LinearRegression()
lr_clf.fit(X_train,y_train)
lr_clf.score(X_test,y_test)

0.9002690878669248

In [74]:
print(lr_clf.predict(X_test.head()))
print(y_test.head().tolist())

[ 47593.91015625  38536.4765625  144474.171875   150183.83203125
 341113.1484375 ]
[56526.21, 59095.58, 109198.36, 179856.12, 346865.36]
