<h1>Real Estate Price Prediction Model<h1>

In [1]:
#importing necessary libraries
import warnings
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [2]:
warnings.filterwarnings('ignore')

In [3]:
#Loading housing dataset  
data_raw = pd.read_csv("C:\\Users\Joel2\OneDrive\Desktop\ML Projects\Real Estate Price Predictor\Bengaluru_House_Data.csv")
data_raw.sample(5)

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
5332,Super built-up Area,Ready To Move,Volagerekallahalli,2 BHK,PSterAs,1070,2.0,2.0,33.15
9091,Built-up Area,Ready To Move,Basavangudi,3 BHK,,1485,3.0,1.0,140.0
11247,Super built-up Area,Ready To Move,Bannerghatta Road,3 BHK,SNity S,1660,3.0,2.0,85.0
10733,Super built-up Area,Ready To Move,Basavangudi,3 BHK,,1850,3.0,2.0,150.0
3013,Super built-up Area,Ready To Move,Raja Rajeshwari Nagar,2 BHK,GrrvaGr,1196,2.0,2.0,51.15


<h1></h1>

<h5>Exploring and cleaning the data </h5>

In [4]:
data_raw.columns

Index(['area_type', 'availability', 'location', 'size', 'society',
       'total_sqft', 'bath', 'balcony', 'price'],
      dtype='object')

In [5]:
#Dropping unnecessary columns
trimmed_data = data_raw.drop(['area_type','society','balcony','availability'],axis='columns')
trimmed_data.sample(5)

Unnamed: 0,location,size,total_sqft,bath,price
8008,2nd Stage Nagarbhavi,5 Bedroom,1200,4.0,240.0
11207,Bellandur,2 BHK,1200,2.0,62.0
9400,Hosa Road,3 BHK,1730,3.0,72.6
13310,Rachenahalli,2 BHK,1050,2.0,52.71
11674,3rd Block Banashankari,5 Bedroom,2400,4.0,325.0


In [6]:
trimmed_data.isnull().sum()

location       1
size          16
total_sqft     0
bath          73
price          0
dtype: int64

In [7]:
#Removing null values
trimmed_data = trimmed_data.dropna()

In [8]:
trimmed_data.isnull().sum()

location      0
size          0
total_sqft    0
bath          0
price         0
dtype: int64

In [9]:
 trimmed_data["bhk"] = trimmed_data["size"].apply(lambda x: x.split(' ')[0])

In [10]:
trimmed_data.sample(5)

Unnamed: 0,location,size,total_sqft,bath,price,bhk
11426,Channasandra Layout,3 BHK,2400,2.0,50.0,3
8492,Anugrah Layout,3 BHK,1567,3.0,100.0,3
6905,Billekahalli,2 BHK,1350,3.0,55.0,2
211,Kammasandra,3 BHK,1595,3.0,65.0,3
4107,Balagere,2 BHK,1007,2.0,65.0,2


In [11]:
trimmed_data.bhk.unique()

array(['2', '4', '3', '6', '1', '8', '7', '5', '11', '9', '27', '10',
       '19', '16', '43', '14', '12', '13', '18'], dtype=object)

In [12]:
trimmed_data.total_sqft.unique()

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

In [13]:
 def is_float(x):
        try:
            float(x)
        except:
            return False
        return True

In [14]:
trimmed_data[~trimmed_data['total_sqft'].apply(is_float)].head(10)

Unnamed: 0,location,size,total_sqft,bath,price,bhk
30,Yelahanka,4 BHK,2100 - 2850,4.0,186.0,4
122,Hebbal,4 BHK,3067 - 8156,4.0,477.0,4
137,8th Phase JP Nagar,2 BHK,1042 - 1105,2.0,54.005,2
165,Sarjapur,2 BHK,1145 - 1340,2.0,43.49,2
188,KR Puram,2 BHK,1015 - 1540,2.0,56.8,2
410,Kengeri,1 BHK,34.46Sq. Meter,1.0,18.5,1
549,Hennur Road,2 BHK,1195 - 1440,2.0,63.77,2
648,Arekere,9 Bedroom,4125Perch,9.0,265.0,9
661,Yelahanka,2 BHK,1120 - 1145,2.0,48.13,2
672,Bettahalsoor,4 Bedroom,3090 - 5002,4.0,445.0,4


In [15]:
def conv_to_num(x):
    tokens = x.split('-')
    if len(tokens) == 2:
        return(float(tokens[0])+float(tokens[1]))/2
    try:
        return float(x)
    except:
        return None

In [16]:
#converting squarefoot ranges to single values
trimmed_data2 = trimmed_data.copy()
trimmed_data2['total_sqft'] = trimmed_data2['total_sqft'].apply(conv_to_num)
trimmed_data2.head(3)

Unnamed: 0,location,size,total_sqft,bath,price,bhk
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3


<h1></h1>

<h5>Applying some feature engineering techniques</h5>

In [17]:
 mod_data = trimmed_data2.copy()

In [18]:
mod_data['price_per_sqft'] = mod_data['price']*100000/mod_data['total_sqft']
mod_data.head(5)

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2,3699.810606
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4,4615.384615
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3,4305.555556
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3,6245.890861
4,Kothanur,2 BHK,1200.0,2.0,51.0,2,4250.0


In [19]:
mod_data['price_per_sqft'] = mod_data['price_per_sqft'].round(2)

In [20]:
mod_data['price_per_sqft']

0         3699.81
1         4615.38
2         4305.56
3         6245.89
4         4250.00
           ...   
13315     6689.83
13316    11111.11
13317     5258.55
13318    10407.34
13319     3090.91
Name: price_per_sqft, Length: 13246, dtype: float64

In [21]:
len(mod_data.location.unique())

1304

In [22]:
mod_data.location = mod_data.location.apply(lambda x : x.strip())

location_stats = mod_data.groupby('location')['location'].agg('count').sort_values(ascending=False)
location_stats

location
Whitefield               535
Sarjapur  Road           392
Electronic City          304
Kanakpura Road           266
Thanisandra              236
                        ... 
1 Giri Nagar               1
Kanakapura Road,           1
Kanakapura main  Road      1
Karnataka Shabarimala      1
whitefiled                 1
Name: location, Length: 1293, dtype: int64

In [23]:
location_below_10 = location_stats[location_stats <= 10]
location_below_10

location
Basapura                 10
1st Block Koramangala    10
Gunjur Palya             10
Kalkere                  10
Sector 1 HSR Layout      10
                         ..
1 Giri Nagar              1
Kanakapura Road,          1
Kanakapura main  Road     1
Karnataka Shabarimala     1
whitefiled                1
Name: location, Length: 1052, dtype: int64

In [24]:
mod_data.location = mod_data.location.apply(lambda x: 'other' if x in location_below_10 else x)

mod_data.sample(5)

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
8809,Whitefield,2 BHK,1195.0,2.0,62.38,2,5220.08
6012,Kengeri,3 Bedroom,600.0,3.0,90.0,3,15000.0
10309,Ramamurthy Nagar,4 Bedroom,1900.0,4.0,185.0,4,9736.84
2139,Prithvi Layout,2 BHK,1352.0,2.0,87.5,2,6471.89
6216,Yelahanka,3 BHK,1556.0,3.0,86.0,3,5526.99


 <h5></h5>

<h5>Outlier detection and removal</h5>

In [25]:
#Converting specific columns to float type

mod_data["total_sqft"] = mod_data["total_sqft"].astype('float64',copy=False)

mod_data["bhk"] = mod_data["bhk"].astype('float64',copy=False)

In [26]:
mod_data2 = mod_data[~(mod_data.total_sqft/mod_data.bhk<300)]

mod_data2.sample(3)

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
7162,other,2 Bedroom,1225.0,2.0,65.0,2.0,5306.12
8137,Sarjapur Road,4 BHK,2425.0,5.0,201.0,4.0,8288.66
6302,Thanisandra,2 BHK,934.0,2.0,55.0,2.0,5888.65


In [27]:
mod_data2.describe()

Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft
count,12456.0,12502.0,12502.0,12502.0,12456.0
mean,1590.189927,2.56479,111.311915,2.650696,6308.502836
std,1260.404795,1.084946,152.089966,0.981698,4168.127366
min,300.0,1.0,9.0,1.0,267.83
25%,1115.0,2.0,49.0,2.0,4210.53
50%,1300.0,2.0,70.0,3.0,5294.12
75%,1700.0,3.0,115.0,3.0,6916.67
max,52272.0,16.0,3600.0,16.0,176470.59


In [28]:
def remove_outliers(df):
    df_out = pd.DataFrame()
    for key, subdf in df.groupby('location'):
        m = np.mean(subdf.price_per_sqft)
        st = np.std(subdf.price_per_sqft)
        reduced_df = subdf[(subdf.price_per_sqft > (m-st)) & (subdf.price_per_sqft<=(m+st))]
        df_out = pd.concat([df_out,reduced_df],ignore_index=True)
    return df_out


data_mod = remove_outliers(mod_data2)
    

In [29]:
def remove_bhk_outliers(df):
    exclude_indices = np.array([])
    for location, location_df in df,groupby('bhk'):
        bhk_stats = {}
        for bhk, bhk_df in location_df.groupby('bhk'):
            bhk_stats[bhk] = {
                'mean' : np.mean(bhk_df.price_per_sqft),
                'std' : np.std(bhk_df.price_per_sqft),
                'count' : bhk_df.shape[0]
            }
            
        for bhk, bhk_df in location_df.groupby('bhk'):
            stats = bhk_sats.get(bhk-1)
            if stats and stats['count'] > 5:
                exclude_indices = np.append(exclude_indices, bhk_df[bhk_df.price_per_sqft < (stats['mean'])].index.values)
                
    return df.drop(exclude_indices, axis='index')

In [30]:
data_mod2 = data_mod[data_mod.bath < data_mod.bhk+2]

In [31]:
data_mod3 = data_mod2.drop(['size','price_per_sqft'],axis='columns')
data_mod3.sample(3)

Unnamed: 0,location,total_sqft,bath,price,bhk
1042,Begur Road,1240.0,2.0,39.06,2.0
9635,other,1630.0,3.0,131.0,3.0
8984,other,920.0,2.0,46.0,2.0


In [32]:
dummies = pd.get_dummies(data_mod3.location) 
dummies.head(3)

Unnamed: 0,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,5th Phase JP Nagar,6th Phase JP Nagar,7th Phase JP Nagar,8th Phase JP Nagar,9th Phase JP Nagar,...,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur,other
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
data_mod4 = pd.concat([data_mod3,dummies],axis='columns')
data_mod4.head(3)

Unnamed: 0,location,total_sqft,bath,price,bhk,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,...,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur,other
0,1st Block Jayanagar,2850.0,4.0,428.0,4.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1st Block Jayanagar,1630.0,3.0,194.0,3.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1st Block Jayanagar,1875.0,2.0,235.0,3.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [34]:
data_mod5 = data_mod4.drop('location',axis='columns')
data_mod5.head(3)

Unnamed: 0,total_sqft,bath,price,bhk,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,5th Phase JP Nagar,...,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur,other
0,2850.0,4.0,428.0,4.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1630.0,3.0,194.0,3.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1875.0,2.0,235.0,3.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [35]:
X = data_mod5.drop('price',axis='columns')
X.head()

Unnamed: 0,total_sqft,bath,bhk,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,5th Phase JP Nagar,6th Phase JP Nagar,...,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur,other
0,2850.0,4.0,4.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1630.0,3.0,3.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1875.0,2.0,3.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1200.0,2.0,3.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1235.0,2.0,2.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [36]:
y = data_mod5.price
y.head()

0    428.0
1    194.0
2    235.0
3    130.0
4    148.0
Name: price, dtype: float64

In [37]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=10)

In [38]:
from sklearn.linear_model import LinearRegression
lr_clf = LinearRegression()
lr_clf.fit(X_train,y_train)
lr_clf.score(X_test,y_test)

0.7919477337586649

In [39]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)

cross_val_score(LinearRegression(), X, y, cv=cv)

array([0.81155684, 0.77261421, 0.80214981, 0.80426801, 0.79827292])

In [40]:
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor

In [41]:
def find_best_model_using_gridsearchcv(X,y):
    algos = {
        'linear_regression' : {
            'model' : LinearRegression(),
            'params' : {
                'normalize' : [True, False]
            }
        },
        'lasso' : {
            'model' : Lasso(),
            'params' : {
                'alpha' : [1,2],
                'selection' : ['random','cyclic']
            }
        },
        'decision_tree' : {
            'model' : DecisionTreeRegressor(),
            'params' : {
                'criterion' : ['mse','friednan_nse'],
                'splitter' : ['best','random']
            }
        }       
    }
    
    scores = []
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    for algo_name, config in algos.items():
        gs = GridSearchCV(config['model'], config['params'], cv=cv, return_train_score=False)
        gs.fit(X,y)
        scores.append({
            'model' : algo_name,
            'best_score' : gs.best_score_,
            'best_params' : gs.best_params_
        })
        
        
    return pd.DataFrame(scores, columns=['model','best_score','best_params'])

find_best_model_using_gridsearchcv(X,y)

Unnamed: 0,model,best_score,best_params
0,linear_regression,0.797772,{'normalize': False}
1,lasso,0.665145,"{'alpha': 1, 'selection': 'cyclic'}"
2,decision_tree,0.71854,"{'criterion': 'mse', 'splitter': 'best'}"


In [42]:
def predict_price(location, sqft, bath, bhk):
    loc_index = np.where(X.columns==location)[0][0]
    
    x = np.zeros(len(X.columns))
    x[0] = sqft
    x[1] = bath
    x[2] = bhk
    
    if loc_index >= 0:
        x[loc_index] = 1
        
    return lr_clf.predict([x])[0]

In [43]:
predict_price('2nd Phase Judicial Layout',3200,4,4)

192.14836808948095

In [44]:
import pickle
with open('banglore_home_prices_model.pickle','wb') as f:
    pickle.dump(lr_clf,f)
    

In [45]:
import json
columns = {
    'data_columns' : [col.lower() for col in X.columns]
}
with open("columns.json",'w') as f:
    f.write(json.dumps(columns))
     