In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the dataset
data = pd.read_csv("pune_house_data.csv")
data.head()

Unnamed: 0,area_type,availability,size,society,total_sqft,bath,balcony,price,site_location
0,Super built-up Area,19-Dec,2 BHK,Coomee,1056,2.0,1.0,39.07,Alandi Road
1,Plot Area,Ready To Move,4 Bedroom,Theanmp,2600,5.0,3.0,120.0,Ambegaon Budruk
2,Built-up Area,Ready To Move,3 BHK,,1440,2.0,3.0,62.0,Anandnagar
3,Super built-up Area,Ready To Move,3 BHK,Soiewre,1521,3.0,1.0,95.0,Aundh
4,Super built-up Area,Ready To Move,2 BHK,,1200,2.0,1.0,51.0,Aundh Road


In [3]:
data.shape

(13320, 9)

In [4]:
data.describe

<bound method NDFrame.describe of                   area_type   availability       size  society total_sqft  \
0      Super built-up  Area         19-Dec      2 BHK  Coomee        1056   
1                Plot  Area  Ready To Move  4 Bedroom  Theanmp       2600   
2            Built-up  Area  Ready To Move      3 BHK      NaN       1440   
3      Super built-up  Area  Ready To Move      3 BHK  Soiewre       1521   
4      Super built-up  Area  Ready To Move      2 BHK      NaN       1200   
...                     ...            ...        ...      ...        ...   
13315        Built-up  Area  Ready To Move  5 Bedroom  ArsiaEx       3453   
13316  Super built-up  Area  Ready To Move      4 BHK      NaN       3600   
13317        Built-up  Area  Ready To Move      2 BHK  Mahla T       1141   
13318  Super built-up  Area         18-Jun      4 BHK  SollyCl       4689   
13319  Super built-up  Area  Ready To Move      1 BHK      NaN        550   

       bath  balcony   price         site

In [5]:
data.isnull().sum()

area_type           0
availability        0
size               16
society          5502
total_sqft          0
bath               73
balcony           609
price               0
site_location       1
dtype: int64

# Data Cleaning

In [6]:
#handle null values
data = data.dropna()

In [7]:
data.area_type.unique()

array(['Super built-up  Area', 'Plot  Area', 'Built-up  Area',
       'Carpet  Area'], dtype=object)

In [8]:
#drop availability because it's not required 
data = data.drop('availability',axis=1)

In [9]:
data.site_location.unique()

array(['Alandi Road', 'Ambegaon Budruk', 'Aundh', 'Balaji Nagar',
       'Bopodi', 'Budhwar Peth', 'Camp', 'Chandan Nagar', 'Dapodi',
       'Deccan Gymkhana', 'Dehu Road', 'Dhole Patil Road', 'Erandwane',
       'Fergusson College Road', 'Ganeshkhind', 'other', 'Gokhale Nagar',
       'Gultekdi', 'Guruwar peth', 'Hadapsar',
       'Hadapsar Industrial Estate', 'Jangali Maharaj Road',
       'Karve Nagar', 'Khadaki', 'Khadki', 'Kondhwa', 'Kondhwa Khurd',
       'Koregaon Park', 'Laxmi Road', 'Lulla Nagar',
       'Mahatma Gandhi Road', 'Mangalwar peth', 'Manik Bagh',
       'Market yard', 'Mukund Nagar', 'Narayan Peth', 'Narayangaon',
       'Navi Peth', 'Pashan', 'Rasta Peth', 'Raviwar Peth',
       'Sadashiv Peth', 'Sahakar Nagar', 'Satara Road', 'Shivaji Nagar',
       'Sinhagad Road', 'Swargate', 'Vadgaon Budruk', 'Wadgaon Sheri',
       'Vishrant Wadi', 'Warje', 'Aundh Road', 'Baner', 'Baner road',
       'Bhandarkar Road', 'Bund Garden Road', 'Dhankawadi',
       'Dhayari Phata',

In [10]:
len(data.site_location.unique())

97

In [11]:
location_count = data.groupby('site_location').size().sort_values(ascending=False)
location_count

site_location
Koregaon Park       92
Rasta Peth          90
Viman Nagar         89
Kondhwa             89
Dhole Patil Road    87
                    ..
Katraj              68
Salunke Vihar       67
Model colony        65
Aundh Road          62
other                1
Length: 97, dtype: int64

In [12]:
len(location_count[location_count<=10])

1

In [13]:
location_less_10 = location_count[location_count<=10]

In [14]:
#if the location have less than 10 or 10 houses than that location are known as other location
data.site_location = data.site_location.apply(lambda x: 'other' if x in location_less_10 else x)

In [15]:
len(data.site_location.unique())

97

In [16]:
data['bhk'] = data['size'].apply(lambda x: int(x.split(' ')[0]))
data.bhk.unique() 

array([ 2,  4,  3,  1,  5, 11,  9,  6,  7], dtype=int64)

In [17]:
def convert_total_sqft_to_num(x):
    tokens = x.split('-')
    if len(tokens) == 2:
        return (float(tokens[0])+float(tokens[1]))/2
    try:
        return float(x)
    except:
        return None

In [18]:
data.total_sqft = data.total_sqft.apply(convert_total_sqft_to_num)
data = data[data.total_sqft.notnull()]
data.head()

Unnamed: 0,area_type,size,society,total_sqft,bath,balcony,price,site_location,bhk
0,Super built-up Area,2 BHK,Coomee,1056.0,2.0,1.0,39.07,Alandi Road,2
1,Plot Area,4 Bedroom,Theanmp,2600.0,5.0,3.0,120.0,Ambegaon Budruk,4
3,Super built-up Area,3 BHK,Soiewre,1521.0,3.0,1.0,95.0,Aundh,3
5,Super built-up Area,2 BHK,DuenaTa,1170.0,2.0,1.0,38.0,Balaji Nagar,2
11,Plot Area,4 Bedroom,Prrry M,2785.0,5.0,3.0,295.0,Bopodi,4


In [19]:
#drop society column because it's not required
data = data.drop('society',axis=1)

In [20]:
data.head()

Unnamed: 0,area_type,size,total_sqft,bath,balcony,price,site_location,bhk
0,Super built-up Area,2 BHK,1056.0,2.0,1.0,39.07,Alandi Road,2
1,Plot Area,4 Bedroom,2600.0,5.0,3.0,120.0,Ambegaon Budruk,4
3,Super built-up Area,3 BHK,1521.0,3.0,1.0,95.0,Aundh,3
5,Super built-up Area,2 BHK,1170.0,2.0,1.0,38.0,Balaji Nagar,2
11,Plot Area,4 Bedroom,2785.0,5.0,3.0,295.0,Bopodi,4


In [21]:
#create new column price_sqft that contain price per sqft
data['price_sqft'] = data.price*100000/data.total_sqft

In [22]:
#remove outlier using standard deviation
upper_limit = data.price_sqft.mean() + data.price_sqft.std()
lower_limit = data.price_sqft.mean() - data.price_sqft.std()
data = data[data.price_sqft < upper_limit] 
data = data[data.price_sqft > lower_limit]

In [23]:
#remove 2BHK house that price per sqft is less than the mean of 1bhk house price
def remove_bhk_outliers(data):
    exclude_indices = np.array([])
    for location, location_df in data.groupby('site_location'):
        bhk_stats = {}
        for bhk, bhk_df in location_df.groupby('bhk'):
            bhk_stats[bhk] = {
                'mean': np.mean(bhk_df.price_sqft),
                'std': np.std(bhk_df.price_sqft),
                'count': bhk_df.shape[0]
            }
        for bhk, bhk_df in location_df.groupby('bhk'):
            stats = bhk_stats.get(bhk-1)
            if stats and stats['count']>5:
                exclude_indices = np.append(exclude_indices, bhk_df[bhk_df.price_sqft<(stats['mean'])].index.values)
    return data.drop(exclude_indices,axis='index')
data = remove_bhk_outliers(data)

In [24]:
data = data.drop('price_sqft',axis=1)

In [25]:
#apply one hot encoding in location column
dummies = pd.get_dummies(data.site_location)
site_location_cat = pd.get_dummies(data.site_location)
dummies.head()

Unnamed: 0,Alandi Road,Ambegaon Budruk,Anandnagar,Aundh,Aundh Road,Balaji Nagar,Baner,Baner road,Bhandarkar Road,Bhavani Peth,...,Vadgaon Budruk,Viman Nagar,Vishrant Wadi,Wadgaon Sheri,Wagholi,Wakadewadi,Wanowrie,Warje,Yerawada,other
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
data = pd.concat([data,dummies.drop('Ambegaon Budruk',axis='columns')],axis='columns')
data.head()

Unnamed: 0,area_type,size,total_sqft,bath,balcony,price,site_location,bhk,Alandi Road,Anandnagar,...,Vadgaon Budruk,Viman Nagar,Vishrant Wadi,Wadgaon Sheri,Wagholi,Wakadewadi,Wanowrie,Warje,Yerawada,other
0,Super built-up Area,2 BHK,1056.0,2.0,1.0,39.07,Alandi Road,2,1,0,...,0,0,0,0,0,0,0,0,0,0
3,Super built-up Area,3 BHK,1521.0,3.0,1.0,95.0,Aundh,3,0,0,...,0,0,0,0,0,0,0,0,0,0
5,Super built-up Area,2 BHK,1170.0,2.0,1.0,38.0,Balaji Nagar,2,0,0,...,0,0,0,0,0,0,0,0,0,0
11,Plot Area,4 Bedroom,2785.0,5.0,3.0,295.0,Bopodi,4,0,0,...,0,0,0,0,0,0,0,0,0,0
12,Super built-up Area,2 BHK,1000.0,2.0,1.0,38.0,Budhwar Peth,2,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
#apply one hot encoding in area column
dummies = pd.get_dummies(data.area_type)
dummies.head()

Unnamed: 0,Built-up Area,Carpet Area,Plot Area,Super built-up Area
0,0,0,0,1
3,0,0,0,1
5,0,0,0,1
11,0,0,1,0
12,0,0,0,1


In [28]:
data = pd.concat([data,dummies],axis='columns')
data.head()

Unnamed: 0,area_type,size,total_sqft,bath,balcony,price,site_location,bhk,Alandi Road,Anandnagar,...,Wagholi,Wakadewadi,Wanowrie,Warje,Yerawada,other,Built-up Area,Carpet Area,Plot Area,Super built-up Area
0,Super built-up Area,2 BHK,1056.0,2.0,1.0,39.07,Alandi Road,2,1,0,...,0,0,0,0,0,0,0,0,0,1
3,Super built-up Area,3 BHK,1521.0,3.0,1.0,95.0,Aundh,3,0,0,...,0,0,0,0,0,0,0,0,0,1
5,Super built-up Area,2 BHK,1170.0,2.0,1.0,38.0,Balaji Nagar,2,0,0,...,0,0,0,0,0,0,0,0,0,1
11,Plot Area,4 Bedroom,2785.0,5.0,3.0,295.0,Bopodi,4,0,0,...,0,0,0,0,0,0,0,0,1,0
12,Super built-up Area,2 BHK,1000.0,2.0,1.0,38.0,Budhwar Peth,2,0,0,...,0,0,0,0,0,0,0,0,0,1


In [29]:
data = data.drop(['site_location','size','area_type'] , axis=1)

In [30]:
data.head()

Unnamed: 0,total_sqft,bath,balcony,price,bhk,Alandi Road,Anandnagar,Aundh,Aundh Road,Balaji Nagar,...,Wagholi,Wakadewadi,Wanowrie,Warje,Yerawada,other,Built-up Area,Carpet Area,Plot Area,Super built-up Area
0,1056.0,2.0,1.0,39.07,2,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,1521.0,3.0,1.0,95.0,3,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
5,1170.0,2.0,1.0,38.0,2,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
11,2785.0,5.0,3.0,295.0,4,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
12,1000.0,2.0,1.0,38.0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [31]:
#split data into dependent feature & independent feature
X = data.drop('price',axis=1)

In [32]:
y=data['price']

# Model Building

# Linear Regression

In [40]:
#split data into train-test split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
regressor = cross_val_score(LinearRegression(), X, y, cv=cv)
regressor

array([0.84982164, 0.83011645, 0.83543534, 0.83085598, 0.85272815])

In [41]:
np.average(regressor)

0.8397915141584764

In [42]:
from sklearn.model_selection import train_test_split
x_train , x_test , y_train , y_test = train_test_split(X,y,test_size=0.3,random_state=20)

In [44]:
from sklearn.linear_model import LinearRegression
model = LinearRegression(normalize=True)
model.fit(x_train, y_train)

LinearRegression(normalize=True)

In [46]:
# prediction on Training data 
training_data_prediction = model.predict(x_train) 

In [48]:
print("The accuracy for the training dataset is:",training_data_prediction)

The accuracy for the training dataset is: [ 67.28594954  47.99878865  53.99995149 ...  63.92722698 115.64148847
  95.67881286]


In [49]:
y_pred = model.predict(x_test)

In [50]:
print("The accuracy for the test data is",y_pred)

The accuracy for the test data is [ 97.881687   165.93870089  80.81084434 ... 113.86825645  78.88280414
  15.8755363 ]


In [53]:
# Importing r2_square
from sklearn.metrics import r2_score

# Checking the R-squared value
r_squared = r2_score(y_test, y_pred)
r_squared

print("The accuracy for the Test data is:",r_squared)

The accuracy for the Test data is: 0.8557970048876309


In [54]:
# Making Predictions of y_value
y_train_pred = model.predict(x_train)
y_test_pred = model.predict(x_test)

# Comparing the r2 value of both train and test data
print(r2_score(y_train,y_train_pred))
print(r2_score(y_test,y_test_pred))

0.8577036891940111
0.8557970048876309


In [55]:
from sklearn import metrics
print(metrics.mean_absolute_error(y_test,y_pred))
print(metrics.mean_squared_error(y_test,y_pred))
print(np.sqrt(metrics.mean_squared_error(y_test,y_pred)))

21.89201948927694
1202.767416315423
34.6809373621219


# XG BOOST

In [56]:
import xgboost
classifier=xgboost.XGBRegressor()
classifier.fit(x_train,y_train)

XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, ...)

In [57]:
Y_pred_train=classifier.predict(x_train)
y_pred = classifier.predict(x_test)

In [58]:
def rmsle(y_pred,y_test) :
    error = np.square(np.log10(y_pred +1) - np.log10(y_test +1)).mean() ** 0.5
    Acc = 1 - error
    return Acc

print("Accuracy attained on Training Set = ",rmsle(Y_pred_train, y_train))
print("Accuracy attained on Test Set = ",rmsle(y_pred,y_test))


Accuracy attained on Training Set =  0.9182718991889975
Accuracy attained on Test Set =  0.8855422663101676


# Decision Tree

In [59]:
from sklearn.metrics import explained_variance_score
from sklearn.tree import DecisionTreeRegressor

tr_regressor = DecisionTreeRegressor(random_state=0)
tr_regressor.fit(x_train,y_train)
tr_regressor.score(x_test,y_test)
pred_tr = tr_regressor.predict(x_test)
decision_score=tr_regressor.score(x_test,y_test)
expl_tr = explained_variance_score(pred_tr,y_test)
print("Accuracy attained on Test Set = ",decision_score)

Accuracy attained on Test Set =  0.7654053292457118


# Multiple Linear Regression

In [61]:
mlr = LinearRegression()
mlr.fit(x_train,y_train)
mlr_score = mlr.score(x_test,y_test)
pred_mlr = mlr.predict(x_test)
expl_mlr = explained_variance_score(pred_mlr,y_test)
print("Accuracy attained on Test Set = ",mlr_score)

Accuracy attained on Test Set =  0.8557970048876298


# Random Forest

In [62]:
from sklearn.ensemble import RandomForestRegressor

rf_regressor = RandomForestRegressor(n_estimators=28,random_state=0)
rf_regressor.fit(x_train,y_train)
rf_regressor.score(x_test,y_test)
rf_pred =rf_regressor.predict(x_test)
rf_score=rf_regressor.score(x_test,y_test)
expl_rf = explained_variance_score(rf_pred,y_test)
print("Accuracy attained on Test Set = ",rf_score)

Accuracy attained on Test Set =  0.8441892691948252


In [63]:
print("Xg Boost Regression Model Score is",round(rmsle(y_pred,y_test)*100),"%")
print("Linear Regression Model Score is",round(model.score(x_test, y_test)*100),"%")
print("Multiple Linear Regression Model Score is ",round(mlr.score(x_test,y_test)*100),"%")
print("Decision tree  Regression Model Score is ",round(tr_regressor.score(x_test,y_test)*100),"%")
print("Random Forest Regression Model Score is ",round(rf_regressor.score(x_test,y_test)*100),"%")

Xg Boost Regression Model Score is 89 %
Linear Regression Model Score is 86 %
Multiple Linear Regression Model Score is  86 %
Decision tree  Regression Model Score is  77 %
Random Forest Regression Model Score is  84 %
