In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score

In [15]:
data=pd.read_csv("Bengaluru_House_Data_Cleaned_no_index.csv")

In [16]:
#Splitting data into feature and response matrix
#X is feature matrix 
#y is response matrix

X=data.drop(columns=["price"])
y=data["price"]

In [17]:
X

Unnamed: 0,total_sqft,bath,Availability,BHK,Locations
0,2850.0,4,Not Ready To Move,4,1st Block Jayanagar
1,1630.0,3,Not Ready To Move,3,1st Block Jayanagar
2,1875.0,2,Ready To Move,3,1st Block Jayanagar
3,1200.0,2,Not Ready To Move,3,1st Block Jayanagar
4,1235.0,2,Not Ready To Move,2,1st Block Jayanagar
...,...,...,...,...,...
9882,1256.0,2,Ready To Move,2,other
9883,1353.0,2,Ready To Move,2,other
9884,812.0,1,Not Ready To Move,1,other
9885,1440.0,2,Not Ready To Move,3,other


In [18]:
y

0       428.00
1       194.00
2       235.00
3       130.00
4       148.00
         ...  
9882     65.00
9883    110.00
9884     26.00
9885     63.93
9886     48.00
Name: price, Length: 9887, dtype: float64

In [20]:
#Giving size to feature and response matrix. Ie we want 20% of values to be tested upon by 80% values used to train model

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)
print(X_test.shape)
print(X_train.shape)

(1978, 5)
(7909, 5)


In [37]:
#OneHotEncoder is a dummy encoding system which binarizes the categorical data in numpy array of numerical data
#IE Sex: Male/Female is converted to [0,1] or[1,0] if you are Female and Male respectively

#make_colunm_transformer is used to apply preprocessing steps like OneHotEncoder only on certain colunms
#while passing by the rest of them

column_trans=make_column_transformer((OneHotEncoder(sparse=False,),
                                      ["Availability","Locations"]),remainder='passthrough')

column_trans.fit_transform(X)

array([[1.000e+00, 0.000e+00, 1.000e+00, ..., 2.850e+03, 4.000e+00,
        4.000e+00],
       [1.000e+00, 0.000e+00, 1.000e+00, ..., 1.630e+03, 3.000e+00,
        3.000e+00],
       [0.000e+00, 1.000e+00, 1.000e+00, ..., 1.875e+03, 2.000e+00,
        3.000e+00],
       ...,
       [1.000e+00, 0.000e+00, 0.000e+00, ..., 8.120e+02, 1.000e+00,
        1.000e+00],
       [1.000e+00, 0.000e+00, 0.000e+00, ..., 1.440e+03, 2.000e+00,
        3.000e+00],
       [0.000e+00, 1.000e+00, 0.000e+00, ..., 1.075e+03, 2.000e+00,
        2.000e+00]])

In [38]:
#StandardScaler removes the mean and scales each feature/variable to unit variance. 
#This operation is performed feature-wise in an independent way. 
scaler=StandardScaler()

In [None]:
#THis concludes setting basic stuff now we will pass through pipelines for different models

USING LINEAR REGRESSION MODEL APPROACH

In [39]:
#Establishing model which is Linear regression in this case
lr=LinearRegression(normalize=True)

In [40]:
#Making Pipeline-A structured manner of following steps

#So in this case first the data will be preproscessed and categorical converts into numerical
#Then it is passed through Scaler where the data is scaled
#And now lastly it is passed through our ML model of linear regression

#This whole proscess happens in a pipe as a sequential manner

pipe=make_pipeline(column_trans,scaler,lr)

#Fitting training data into pipe as compared to the simpler fitting data to model
pipe.fit(X_train,y_train)

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)




Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(sparse=False),
                                                  ['Availability',
                                                   'Locations'])])),
                ('standardscaler', StandardScaler()),
                ('linearregression', LinearRegression(normalize=True))])

In [41]:
y_pred_lr=pipe.predict(X_test)

#Checking accuracy by finding r2score
r2_score(y_test,y_pred_lr)

0.8556147128434505

In [None]:
#We see 85.5% accuracy in predictions

USING LASSO MODEL APPROACH

In [43]:
lasso=Lasso()
pipe1=make_pipeline(column_trans,scaler,lasso)

pipe1.fit(X_train,y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(sparse=False),
                                                  ['Availability',
                                                   'Locations'])])),
                ('standardscaler', StandardScaler()), ('lasso', Lasso())])

In [44]:
y_pred_lasso=pipe1.predict(X_test)

#Checking accuracy by finding r2score
r2_score(y_test,y_pred_lasso)

0.8454328482610449

USING RIDGE APPROACH

In [45]:
ridge=Ridge()
pipe2=make_pipeline(column_trans,scaler,ridge)

pipe2.fit(X_train,y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(sparse=False),
                                                  ['Availability',
                                                   'Locations'])])),
                ('standardscaler', StandardScaler()), ('ridge', Ridge())])

In [46]:
y_pred_ridge=pipe2.predict(X_test)

#Checking accuracy by finding r2score
r2_score(y_test,y_pred_ridge)

0.8571773918265543

In [48]:
#We see that the RIDGE approach gives us best accuracy
#So we will pickle the ridge approach and store it

import pickle

In [49]:
pickle.dump(pipe2, open("Ridge_Model.pkl","wb"))

In [None]:
#Testing whether Pipeline predicts correctly

In [52]:
input=pd.DataFrame([["5th Phase JP Nagar","3","4","Ready To Move","2000"]],
                       columns=['Locations','bath','BHK','Availability','total_sqft'])
input

Unnamed: 0,Locations,bath,BHK,Availability,total_sqft
0,5th Phase JP Nagar,3,4,Ready To Move,2000


In [55]:
prediction=pipe.predict(input)[0]
prediction

119.9778190644226

In [57]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9887 entries, 0 to 9886
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   total_sqft    9887 non-null   float64
 1   bath          9887 non-null   int64  
 2   price         9887 non-null   float64
 3   Availability  9887 non-null   object 
 4   BHK           9887 non-null   int64  
 5   Locations     9887 non-null   object 
dtypes: float64(2), int64(2), object(2)
memory usage: 463.6+ KB


In [60]:
output=data(data["Locations"]=="5th Phase JP Nagar" & data["bath"]==3 &
                data["BHK"]==4 & data["Availability"]=="Ready To Move" &
                data["total_sqft"]==2000)
output

TypeError: 'DataFrame' object is not callable