# Project name: Beagaluru House Price Prediction
Dataset link : https://www.kaggle.com/amitabhajoy/bengaluru-house-price-data

In [1]:
#Importing Neccesarry Library
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
# pd.set_option("display.max_columns",9)
# pd.set_option("display.max_rows",13320)

In [2]:
# Reading the data from CSV file
df = pd.read_csv('Bengaluru_House_Data.csv')
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [3]:
df.shape

(13320, 9)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


In [5]:
df.describe()

Unnamed: 0,bath,balcony,price
count,13247.0,12711.0,13320.0
mean,2.69261,1.584376,112.565627
std,1.341458,0.817263,148.971674
min,1.0,0.0,8.0
25%,2.0,1.0,50.0
50%,2.0,2.0,72.0
75%,3.0,2.0,120.0
max,40.0,3.0,3600.0


In [6]:
# Lets see the value counts of each category are presented in all features
for feature in df.columns:
    print(feature)
    print(len(df[feature].value_counts()),"\n")
    
#     print("*"*5)

area_type
4 

availability
81 

location
1305 

size
31 

society
2688 

total_sqft
2117 

bath
19 

balcony
4 

price
1994 



In [7]:
# Lest see how many missing values are presented in each columns
df.isnull().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

In [8]:
#Lets drop some unimpotant or higher missing values container feature
df.drop(columns=['area_type','availability','society','balcony'],inplace=True)

In [9]:
df.head()

Unnamed: 0,location,size,total_sqft,bath,price
0,Electronic City Phase II,2 BHK,1056,2.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0
2,Uttarahalli,3 BHK,1440,2.0,62.0
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0
4,Kothanur,2 BHK,1200,2.0,51.0


# Handling missing values

In [10]:
df.isnull().sum()

location       1
size          16
total_sqft     0
bath          73
price          0
dtype: int64

In [11]:
df['location'].value_counts()

Whitefield                           540
Sarjapur  Road                       399
Electronic City                      302
Kanakpura Road                       273
Thanisandra                          234
                                    ... 
Ananthapura, T C palaya Main Road      1
M.G Road                               1
Banashankari3rd stage bigbazar         1
Dhanalakshmi Layout                    1
Akash Nagar                            1
Name: location, Length: 1305, dtype: int64

In [12]:
df['location'] = df['location'].fillna('Sarjapur  Road')

In [13]:
df['size'].value_counts()

2 BHK         5199
3 BHK         4310
4 Bedroom      826
4 BHK          591
3 Bedroom      547
1 BHK          538
2 Bedroom      329
5 Bedroom      297
6 Bedroom      191
1 Bedroom      105
8 Bedroom       84
7 Bedroom       83
5 BHK           59
9 Bedroom       46
6 BHK           30
7 BHK           17
1 RK            13
10 Bedroom      12
9 BHK            8
8 BHK            5
10 BHK           2
11 Bedroom       2
11 BHK           2
19 BHK           1
14 BHK           1
18 Bedroom       1
27 BHK           1
16 BHK           1
43 Bedroom       1
13 BHK           1
12 Bedroom       1
Name: size, dtype: int64

In [14]:
df['size'] = df['size'].fillna('2 BHK')

In [15]:
#Bathroom contains 73 missing value so we can fill it by median
df['bath'] = df['bath'].fillna(df['bath'].median())

In [16]:
df.isnull().sum()

location      0
size          0
total_sqft    0
bath          0
price         0
dtype: int64

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13320 non-null  object 
 1   size        13320 non-null  object 
 2   total_sqft  13320 non-null  object 
 3   bath        13320 non-null  float64
 4   price       13320 non-null  float64
dtypes: float64(2), object(3)
memory usage: 520.4+ KB


# Feature Engineering

In [18]:
# Lets grab the number of bhk from size
df['bhk'] = df['size'].str.split().str.get(0).astype(int)

In [19]:
df.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk
0,Electronic City Phase II,2 BHK,1056,2.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0,4
2,Uttarahalli,3 BHK,1440,2.0,62.0,3
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0,3
4,Kothanur,2 BHK,1200,2.0,51.0,2


In [20]:
df[df.bhk>20]

Unnamed: 0,location,size,total_sqft,bath,price,bhk
1718,2Electronic City Phase II,27 BHK,8000,27.0,230.0,27
4684,Munnekollal,43 Bedroom,2400,40.0,660.0,43


In [21]:
# Lets fix the issue in total_sqft column like '1133 - 1384' this problem
df['total_sqft'].unique()

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

In [22]:
# Fix issue function of total_sqft
def convertRange(x):
    temp = x.split("-")
    if len(temp) == 2:
        return (float(temp[0]) + float(temp[1]))/2
    try:
        return float(x)
    except:
        return None
    

In [23]:
df['total_sqft'] = df['total_sqft'].apply(convertRange)

In [24]:
df.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3
4,Kothanur,2 BHK,1200.0,2.0,51.0,2


In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   location    13320 non-null  object 
 1   size        13320 non-null  object 
 2   total_sqft  13274 non-null  float64
 3   bath        13320 non-null  float64
 4   price       13320 non-null  float64
 5   bhk         13320 non-null  int32  
dtypes: float64(3), int32(1), object(2)
memory usage: 572.5+ KB


#### price per squre_feet

In [26]:
df['price_per_sqft'] = df['price']*100000 / df['total_sqft']

In [27]:
df.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2,3699.810606
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4,4615.384615
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3,4305.555556
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3,6245.890861
4,Kothanur,2 BHK,1200.0,2.0,51.0,2,4250.0


In [28]:
df.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,13274.0,13320.0,13320.0,13320.0,13274.0
mean,1559.626694,2.688814,112.565627,2.802778,7907.501
std,1238.405258,1.338754,148.971674,1.294496,106429.6
min,1.0,1.0,8.0,1.0,267.8298
25%,1100.0,2.0,50.0,2.0,4266.865
50%,1276.0,2.0,72.0,3.0,5434.306
75%,1680.0,3.0,120.0,3.0,7311.746
max,52272.0,40.0,3600.0,43.0,12000000.0


In [29]:
# Now lets see the location column
df['location'].value_counts()

Whitefield                           540
Sarjapur  Road                       400
Electronic City                      302
Kanakpura Road                       273
Thanisandra                          234
                                    ... 
Ananthapura, T C palaya Main Road      1
M.G Road                               1
Banashankari3rd stage bigbazar         1
Dhanalakshmi Layout                    1
Akash Nagar                            1
Name: location, Length: 1305, dtype: int64

In [30]:
df['location']

0        Electronic City Phase II
1                Chikka Tirupathi
2                     Uttarahalli
3              Lingadheeranahalli
4                        Kothanur
                   ...           
13315                  Whitefield
13316               Richards Town
13317       Raja Rajeshwari Nagar
13318             Padmanabhanagar
13319                Doddathoguru
Name: location, Length: 13320, dtype: object

In [31]:
df['location'] = df['location'].apply(lambda x: x.strip())

In [32]:
location_count = df['location'].value_counts()

In [33]:
location_count

Whitefield                                      541
Sarjapur  Road                                  400
Electronic City                                 304
Kanakpura Road                                  273
Thanisandra                                     237
                                               ... 
Sampige Layout                                    1
Prasanna layout Herohalli                         1
Lakshminarayanapura, Electronic City Phase 2      1
K G Colony                                        1
Chokkahalli                                       1
Name: location, Length: 1294, dtype: int64

In [34]:
location_count_less_than_10 = location_count[location_count<=10]
location_count_less_than_10

BTM 1st Stage                                   10
Dairy Circle                                    10
Basapura                                        10
Sadashiva Nagar                                 10
Gunjur Palya                                    10
                                                ..
Sampige Layout                                   1
Prasanna layout Herohalli                        1
Lakshminarayanapura, Electronic City Phase 2     1
K G Colony                                       1
Chokkahalli                                      1
Name: location, Length: 1053, dtype: int64

In [35]:
df['location'] = df['location'].apply(lambda x: 'other' if x in location_count_less_than_10 else x)

In [36]:
df.head(100)

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2,3699.810606
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.00,4,4615.384615
2,Uttarahalli,3 BHK,1440.0,2.0,62.00,3,4305.555556
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.00,3,6245.890861
4,Kothanur,2 BHK,1200.0,2.0,51.00,2,4250.000000
...,...,...,...,...,...,...,...
95,Domlur,3 BHK,1540.0,3.0,90.00,3,5844.155844
96,Kengeri,4 Bedroom,2894.0,4.0,245.00,4,8465.791292
97,Sarjapura - Attibele Road,3 BHK,1330.0,2.0,48.00,3,3609.022556
98,other,2 BHK,1200.0,2.0,65.00,2,5416.666667


In [37]:
df['location'].value_counts()

other                        2885
Whitefield                    541
Sarjapur  Road                400
Electronic City               304
Kanakpura Road                273
                             ... 
Tindlu                         11
Banjara Layout                 11
LB Shastri Nagar               11
Thyagaraja Nagar               11
2nd Phase Judicial Layout      11
Name: location, Length: 242, dtype: int64

# Outliers detection and remove

In [38]:
df.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,13274.0,13320.0,13320.0,13320.0,13274.0
mean,1559.626694,2.688814,112.565627,2.802778,7907.501
std,1238.405258,1.338754,148.971674,1.294496,106429.6
min,1.0,1.0,8.0,1.0,267.8298
25%,1100.0,2.0,50.0,2.0,4266.865
50%,1276.0,2.0,72.0,3.0,5434.306
75%,1680.0,3.0,120.0,3.0,7311.746
max,52272.0,40.0,3600.0,43.0,12000000.0


In [39]:
(df['total_sqft']/df['bhk']).describe()

count    13274.000000
mean       575.074878
std        388.205175
min          0.250000
25%        473.333333
50%        552.500000
75%        625.000000
max      26136.000000
dtype: float64

In [40]:
df = df[((df['total_sqft']/df['bhk']) >= 300)]

In [41]:
df.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,12530.0,12530.0,12530.0,12530.0,12530.0
mean,1594.564544,2.559537,111.382401,2.650838,6303.979357
std,1261.271296,1.077938,152.077329,0.976678,4162.237981
min,300.0,1.0,8.44,1.0,267.829813
25%,1116.0,2.0,49.0,2.0,4210.526316
50%,1300.0,2.0,70.0,3.0,5294.117647
75%,1700.0,3.0,115.0,3.0,6916.666667
max,52272.0,16.0,3600.0,16.0,176470.588235


In [42]:
df.shape

(12530, 7)

In [43]:
df['price_per_sqft'].describe()

count     12530.000000
mean       6303.979357
std        4162.237981
min         267.829813
25%        4210.526316
50%        5294.117647
75%        6916.666667
max      176470.588235
Name: price_per_sqft, dtype: float64

In [44]:
# removing outliers for price_per_sqft
def remove_oulier_sqft(df):
    df_output = pd.DataFrame()
    for key, subdf in df.groupby('location'):
        m = np.mean(subdf.price_per_sqft)
        st = np.std(subdf.price_per_sqft)
        
        gen_df = subdf[(subdf.price_per_sqft > (m-st)) & (subdf.price_per_sqft <= (m+st))]
        df_output = pd.concat([df_output,gen_df], ignore_index= True)
    return df_output

data = remove_oulier_sqft(df)
data.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,10301.0,10301.0,10301.0,10301.0,10301.0
mean,1508.440608,2.471702,91.286372,2.574896,5659.062876
std,880.694214,0.979449,86.342786,0.897649,2265.774749
min,300.0,1.0,10.0,1.0,1250.0
25%,1110.0,2.0,49.0,2.0,4244.897959
50%,1286.0,2.0,67.0,2.0,5175.600739
75%,1650.0,3.0,100.0,3.0,6428.571429
max,30400.0,16.0,2200.0,16.0,24509.803922


In [45]:
def bhk_outlier_remove(df):
    exclude_indices = np.array([])
    for location, location_df in df.groupby('location'):
        bhk_stats = {}

        for bhk, bhk_df in location_df.groupby('bhk'):
            bhk_stats[bhk] = {
                "mean": np.mean(bhk_df.price_per_sqft),
                "std": np.std(bhk_df.price_per_sqft),
                "count": bhk_df.shape[0]
            }
        for bhk, bhk_df in location_df.groupby('bhk'):
            stats = bhk_stats.get(bhk-1)
            if stats and stats['count']>5:
                exclude_indices = np.append(exclude_indices,bhk_df[bhk_df.price_per_sqft<(stats['mean'])].index.values)
    return df.drop(exclude_indices,axis='index')         

In [46]:
data = bhk_outlier_remove(data)

In [47]:
data.shape

(7360, 7)

In [48]:
data.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
0,1st Block Jayanagar,4 BHK,2850.0,4.0,428.0,4,15017.54386
1,1st Block Jayanagar,3 BHK,1630.0,3.0,194.0,3,11901.840491
2,1st Block Jayanagar,3 BHK,1875.0,2.0,235.0,3,12533.333333
3,1st Block Jayanagar,3 BHK,1200.0,2.0,130.0,3,10833.333333
4,1st Block Jayanagar,2 BHK,1235.0,2.0,148.0,2,11983.805668


In [49]:
data.drop(columns=['price_per_sqft','size'],inplace=True)

In [50]:
data.head()

Unnamed: 0,location,total_sqft,bath,price,bhk
0,1st Block Jayanagar,2850.0,4.0,428.0,4
1,1st Block Jayanagar,1630.0,3.0,194.0,3
2,1st Block Jayanagar,1875.0,2.0,235.0,3
3,1st Block Jayanagar,1200.0,2.0,130.0,3
4,1st Block Jayanagar,1235.0,2.0,148.0,2


In [51]:
data.to_csv('Cleaned_data.csv',index=False)

In [52]:
x = data.drop(columns=['price'])
y = data['price']

In [53]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score

# Train- Test spliting

In [54]:
X_train, X_test, y_train, y_test = train_test_split(x,y, test_size = 0.2, random_state = 0)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(5888, 4)
(1472, 4)
(5888,)
(1472,)


# Applying Linear Regression

In [55]:
col_transform = make_column_transformer((OneHotEncoder(sparse=False),['location']),
                                        remainder= 'passthrough')

In [56]:
scaler = StandardScaler()

In [57]:
lr = LinearRegression(normalize=True)

In [58]:
pipe = make_pipeline(col_transform,scaler,lr)

In [59]:
pipe.fit(X_train,y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(sparse=False),
                                                  ['location'])])),
                ('standardscaler', StandardScaler()),
                ('linearregression', LinearRegression(normalize=True))])

In [60]:
y_pred_lr = pipe.predict(X_test)

In [61]:
print("Accuracy Score:")
r2_score(y_test,y_pred_lr)

Accuracy Score:


0.8296235626764411

# Applying Lasso

In [62]:
lasso = Lasso()

In [63]:
pipe = make_pipeline(col_transform,scaler,lasso)

In [64]:
pipe.fit(X_train,y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(sparse=False),
                                                  ['location'])])),
                ('standardscaler', StandardScaler()), ('lasso', Lasso())])

In [65]:
y_pres_lasso = pipe.predict(X_test)

In [66]:
print("Accuracy Score:")
r2_score(y_test,y_pres_lasso)

Accuracy Score:


0.8199181874762704

# Applying Ridge

In [67]:
ridge = Ridge()

In [68]:
pipe = make_pipeline(col_transform,scaler,ridge)

In [69]:
pipe.fit(X_train,y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(sparse=False),
                                                  ['location'])])),
                ('standardscaler', StandardScaler()), ('ridge', Ridge())])

In [70]:
y_pred_ridge = pipe.predict(X_test)

In [71]:
print("Accuracy Score:")
r2_score(y_test,y_pred_ridge)

Accuracy Score:


0.8296651410179635

In [72]:
# All Model Accuracy
print("Linear Regression:" , r2_score(y_test,y_pred_lr))
print("Lasso Regression:" , r2_score(y_test,y_pres_lasso))
print("Ridge Regression:" , r2_score(y_test,y_pred_ridge))

Linear Regression: 0.8296235626764411
Lasso Regression: 0.8199181874762704
Ridge Regression: 0.8296651410179635


### Here linear regression and ridge regression both are doing same performance i am gonna dump the Ridge regression model in pickle

In [73]:
import pickle

In [74]:
pickle.dump(pipe, open("Ridge_model.pkl",'wb'))