Real State Price Prediction System

# 1. Importing necessary Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib
matplotlib.rcParams['figure.figsize'] = (20,10)
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
import pickle
import json

# 2. Reading Data
We will use pandas `read_csv` function to read the data in csv file.

In [2]:
filepath = r'C:\Users\#stormy\Desktop\python\notebook\project\Bengaluru_House_Data.csv'    #filepath of the dataset
data = pd.read_csv(filepath)
data.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price in lakhs
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


# 3. Data Preprcessing
Data preprocessing is a data mining technique that involves transforming raw data into an understandable format.

In [3]:
data.shape

(13320, 9)

In [4]:
data.groupby('area_type')['area_type'].agg('count')    # taking the 'area_type' column as a group and counting its values

area_type
Built-up  Area          2418
Carpet  Area              87
Plot  Area              2025
Super built-up  Area    8790
Name: area_type, dtype: int64

`groupby()` function is used to split the data into groups based on some criteria
and
`agg()` function abbreviation of aggregate is used to define what we want to do with the grouped data.


In [5]:
data2 = data.drop(['area_type','society','balcony','availability'], axis='columns')     # dropping useless columns
data2.head()

Unnamed: 0,location,size,total_sqft,bath,price in lakhs
0,Electronic City Phase II,2 BHK,1056,2.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0
2,Uttarahalli,3 BHK,1440,2.0,62.0
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0
4,Kothanur,2 BHK,1200,2.0,51.0


In [6]:
data2.isnull().sum()     # finding total empty values in each column

location           1
size              16
total_sqft         0
bath              73
price in lakhs     0
dtype: int64

In [7]:
data3 = data2.dropna()    # dropping 'NA' values
data3.isnull().sum()

location          0
size              0
total_sqft        0
bath              0
price in lakhs    0
dtype: int64

In [8]:
data3.shape

(13246, 5)

In [9]:
data3['size'].unique()    # checking unique values in size column

array(['2 BHK', '4 Bedroom', '3 BHK', '4 BHK', '6 Bedroom', '3 Bedroom',
       '1 BHK', '1 RK', '1 Bedroom', '8 Bedroom', '2 Bedroom',
       '7 Bedroom', '5 BHK', '7 BHK', '6 BHK', '5 Bedroom', '11 BHK',
       '9 BHK', '9 Bedroom', '27 BHK', '10 Bedroom', '11 Bedroom',
       '10 BHK', '19 BHK', '16 BHK', '43 Bedroom', '14 BHK', '8 BHK',
       '12 Bedroom', '13 BHK', '18 Bedroom'], dtype=object)

In [10]:
data3['bhk'] = data3['size'].apply(lambda x: int(x.split(' ')[0]))    # getting the number of bedrooms from size column

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data3['bhk'] = data3['size'].apply(lambda x: int(x.split(' ')[0]))    # getting the number of bedrooms from size column


In [11]:
data3.head()

Unnamed: 0,location,size,total_sqft,bath,price in lakhs,bhk
0,Electronic City Phase II,2 BHK,1056,2.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0,4
2,Uttarahalli,3 BHK,1440,2.0,62.0,3
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0,3
4,Kothanur,2 BHK,1200,2.0,51.0,2


In [12]:
data3['bhk'].unique()

array([ 2,  4,  3,  6,  1,  8,  7,  5, 11,  9, 27, 10, 19, 16, 43, 14, 12,
       13, 18], dtype=int64)

In [13]:
data3[data3.bhk>20]    

Unnamed: 0,location,size,total_sqft,bath,price in lakhs,bhk
1718,2Electronic City Phase II,27 BHK,8000,27.0,230.0,27
4684,Munnekollal,43 Bedroom,2400,40.0,660.0,43


Finding rows with extraordinary values! (Outliers)

In [14]:
data3.total_sqft.unique()

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

In [15]:
def is_float(x):
    '''
    Function to convert values into float dtype.
    '''
    try:
        float(x)
    except:
        return False
    return True

In [16]:
data3[~data3['total_sqft'].apply(is_float)].head(20)    # finding values those not got converted

Unnamed: 0,location,size,total_sqft,bath,price in lakhs,bhk
30,Yelahanka,4 BHK,2100 - 2850,4.0,186.0,4
122,Hebbal,4 BHK,3067 - 8156,4.0,477.0,4
137,8th Phase JP Nagar,2 BHK,1042 - 1105,2.0,54.005,2
165,Sarjapur,2 BHK,1145 - 1340,2.0,43.49,2
188,KR Puram,2 BHK,1015 - 1540,2.0,56.8,2
410,Kengeri,1 BHK,34.46Sq. Meter,1.0,18.5,1
549,Hennur Road,2 BHK,1195 - 1440,2.0,63.77,2
648,Arekere,9 Bedroom,4125Perch,9.0,265.0,9
661,Yelahanka,2 BHK,1120 - 1145,2.0,48.13,2
672,Bettahalsoor,4 Bedroom,3090 - 5002,4.0,445.0,4


In [17]:
def convert_sqft_to_sum(x):
    '''
    Function to convert those unusual format of data
    '''
    tokens = x.split('-')
    if len(tokens) == 2:
        return (float(tokens[0]) + float(tokens[1])) / 2
    try:
        return float(x)
    except:
        return None

In [18]:
convert_sqft_to_sum('2100 - 2850')    # usage of the function

2475.0

In [19]:
data4 = data3.copy()
data4['total_sqft'] = data4['total_sqft'].apply(convert_sqft_to_sum)
data4.head()

Unnamed: 0,location,size,total_sqft,bath,price in lakhs,bhk
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3
4,Kothanur,2 BHK,1200.0,2.0,51.0,2


In [20]:
data4.loc[30]    # loc function is used to see data row-wise

location          Yelahanka
size                  4 BHK
total_sqft             2475
bath                      4
price in lakhs          186
bhk                       4
Name: 30, dtype: object

In [21]:
data5 = data4.copy()    # copy function is used to copy the whole dataframe
data5.head()

Unnamed: 0,location,size,total_sqft,bath,price in lakhs,bhk
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3
4,Kothanur,2 BHK,1200.0,2.0,51.0,2


In [22]:
data5['price_per_sqft'] = data5['price'] * 100000/data5['total_sqft']    # making a new column in the dataframe named `price_per_sqft` and see the logic to create it
data5.head()

KeyError: 'price'

In [None]:
len(data5.location.unique())

In [None]:
data5.location = data5.location.apply(lambda x : x.strip())    # strip is used to remove the white spaces around the data points

location_stats = data5.groupby('location')['location'].agg('count').sort_values(ascending=False)    # sorting the location column in descending order
location_stats

In [None]:
len(location_stats[location_stats<=10])    # totaling the minor locations

In [None]:
location_stats_less_than_10 = location_stats[location_stats<=10]
location_stats_less_than_10

In [None]:
data5.location = data5.location.apply(lambda x: 'other' if x in location_stats_less_than_10 else x)    # changing minor locations into `other`

In [None]:
len(data5.location.unique())

In [None]:
data5[data5.total_sqft / data5.bhk < 300].head()    # checking for outliners; like a house with 1407 sq. area can't have 6 bedrooms

In [None]:
data5.shape

In [None]:
data6 = data5[~(data5.total_sqft/data5.bhk<300)]

In [None]:
data6.price_per_sqft.describe()

In [None]:
data6.shape

# 4. Data Cleaning
In this section, we will find outliers and try to remove them.

In [None]:
def remove_pps_outliers(df):
    '''
    Function to clear stuff (outliers) in the price_per_sqft column so that we don't live in a hypothetical dataset. 😄
    '''
    df_out = pd.DataFrame()
    for key, subdf in df.groupby('location'):
        m = np.mean(subdf.price_per_sqft)
        st = np.std(subdf.price_per_sqft)
        reduced_df = subdf[(subdf.price_per_sqft>(m-st)) & (subdf.price_per_sqft<=(m+st))]
        df_out = pd.concat([df_out,reduced_df],ignore_index=True)
    return df_out
        

In [None]:
data7 = remove_pps_outliers(data6)
data7.shape

In [None]:
data7.head()

In [None]:
def remove_bhk_outliers(df):
    '''
    Function to clear stuff (outliers) in the bhk column so that we don't live in a hypothetical dataset. 😄
    '''
    exclude_indices = np.array([])
    for location, location_df in df.groupby('location'):
        bhk_stats = {}
        for bhk, bhk_df in location_df.groupby('bhk'):
            bhk_stats[bhk] = {
                'mean': np.mean(bhk_df.price_per_sqft),
                'std': np.std(bhk_df.price_per_sqft),
                'count': bhk_df.shape[0]
            }
        for bhk, bhk_df in location_df.groupby('bhk'):
            stats = bhk_stats.get(bhk-1)
            if stats and stats['count']>5:
                exclude_indices = np.append(exclude_indices, bhk_df[bhk_df.price_per_sqft<(stats['mean'])].index.values)
    return df.drop(exclude_indices,axis='index')

data8 = remove_bhk_outliers(data7)
data8.shape

# 5. Data Visualization
Time to visualize our data

In [None]:
def plot_scatter_chart(df,location):
    '''
    Function that will help us to visualize the data of the different locations 
    '''
    bhk2 = df[(df.location==location) & (df.bhk==2)]
    bhk3 = df[(df.location==location) & (df.bhk==3)]
    matplotlib.rcParams['figure.figsize'] = (15, 10)
    plt.scatter(bhk2.total_sqft, bhk2.price,color='blue', label='2 BHK', s=50)
    plt.scatter(bhk3.total_sqft, bhk3.price,marker='+',color='green', label='3 BHK', s=50)
    plt.xlabel('Total Square Feet Area')
    plt.ylabel('Price Per Square Feet')
    plt.title(location)
    plt.legend()

In [None]:
plot_scatter_chart(data7, 'Whitefield')

In [None]:
plot_scatter_chart(data7,"Hebbal")

In [None]:
plt.hist(data8.price_per_sqft, rwidth=0.8)    # visualization the price_per_sqft column
plt.xlabel('Price per Square Feet')
plt.ylabel('Count')

In [None]:
data8.bath.unique()

In [None]:
data8[data8.bath>10]    # Again some idiot outliers😆

In [None]:
plt.hist(data8.bath, rwidth=0.8)
plt.xlabel('Number of bathrooms')
plt.ylabel('Count')

In [None]:
data8[data8.bath > data8.bhk + 2]   

In [None]:
data9 = data8[data8.bath < data8.bhk + 2]
data9.shape

In [None]:
data9.sample()    # sample return a random row from the dataset

In [None]:
data10 = data9.drop(['size',"price_per_sqft"],axis='columns')    # removing or dropping 'size' and 'prize_per_sqft' as we don't require them any more
data10.head()

# 6. Creating Dummies
We will use pandas' `get_dummies()` to create dummies variables.
It is used for data manipulation. It converts categorical data into dummy or indicator variables.

In [None]:
dummies = pd.get_dummies(data10.location)     
dummies.head()

In [None]:
data11 = pd.concat([data10, dummies.drop('other', axis='columns')], axis='columns')    # joining the dummy values again with the dataset except 'other' column
data11.head()

In [None]:
data12 = data11.drop('location', axis='columns')    # dropping original location as now we have dummmies in its place.
data12.head()

# 7. Splitting Data for Training and Testing
Before Training the model, it is required to split the data into train and test data. For this we will use, sklearn's `train_test_split`


In [None]:
X = data12.drop('price', axis='columns')    # dropping price column as we don't want it in our train dataset
X.head()

In [None]:
X.shape

In [None]:
y = data12.price    # taking the price column as our target to predict

In [None]:
y.shape

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)    # splitting the the data into train and test data as 80 : 20 ratio

In [None]:
X_train.shape

# 8. Training the Models
Preciesly, we will be trying more than one model, therefore, Training the `Models`

In [None]:
lr_clf = LinearRegression()     # first trying training with LinearRegression 
lr_clf.fit(X_train, y_train) #Training the dataset
lr_clf.score(X_test, y_test) 

In [None]:
cv = ShuffleSplit(n_splits = 5, test_size = 0.2, random_state = 0)     # ShuffleSplit is just a another type of splitting data
cross_val_score(LinearRegression(), X, y, cv=cv)     

Cross Validation is mainly used for the comparison of different models. For each model, you may get the average generalization error on the k validation sets. Then you will be able to choose the model with the lowest average generation error as your optimal model.

In [None]:
def find_best_model_using_gridsearchcv(X,y):
    '''
    Function to try different models at once of the data with different parameters to find the best ones.
    '''
    algos = {
        'linear_regression':{
            'model': LinearRegression(),
            'params':{
                'normalize':[True,False]
            }
        },
        'lasso':{
            'model': Lasso(),
            'params':{
                'alpha' : [1,2],
                'selection':['random','cyclic']
            }
        },
        'decision_tree':{
            'model': DecisionTreeRegressor(),
            'params':{
                'criterion':['mse','friedman_mse'],
                'splitter':['best','random']
            }
        }
    }
    scores = []
    cv = ShuffleSplit(n_splits=5,test_size=0.2,random_state=0)
    for algo_name, config in algos.items():
        gs = GridSearchCV(config['model'],config['params'], cv=cv, return_train_score=False)     # GridSearchCV is the main focus as it helps to try out the different parameters for the different models.
        gs.fit(X,y)
        scores.append({
            'model': algo_name,
            'best_score':gs.best_score_,
            'best_params':gs.best_params_
        })
    return pd.DataFrame(scores,columns=['model','best_score','best_params'])     # At last binding the results of the models with best params. into a DataFrame.

In [None]:
find_best_model_using_gridsearchcv(X, y)

As we can see the LinearRegression model performed the best, so we are going to use as it for prediction.

# 9. Prediction Time
Predicting the prices using LinearRegression in Lakhs.

In [None]:
def predict_price(location,sqft,bath,bhk):
    '''
    Function which helps to actually predict the prices.
    '''
    loc_index = np.where(X.columns==location)[0][0]     # np.where() function returns the indices of elements in an input array where the given condition is satisfied.
    
    x = np.zeros(len(X.columns))    # np.zeros() function returns a new array of given shape and type, with zeros.
    x[0] = sqft
    x[1] = bath
    x[2] = bhk
    if loc_index >= 0:
        x[loc_index] = 1
        
    return lr_clf.predict([x])[0]

In [None]:
print(predict_price('1st Phase JP Nagar', 1000, 2, 3).round(3),'Lakhs')


In [None]:
print(predict_price('1st Block Jayanagar',2900,4,4))

# 10. Saving Model
For saving the model, we will be using `pickle` module and `json` module for saving the locations' names.

In [None]:
with open('BHP_model.pickle','wb') as f:
    pickle.dump(lr_clf,f)

In [None]:
columns = {
    'data_columns' : [col.lower() for col in X.columns]
}
with open('columns.json','w') as f:
    f.write(json.dumps(columns))


## Thank You!😊