In [1]:
#importing required modules

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score

In [2]:
# Load the cleaned dataset and drop the 'Unnamed: 0' column
# (presumably an index column written out when the CSV was saved -- TODO confirm)
data = pd.read_csv("final_cleaned_data.csv").drop(columns=['Unnamed: 0'])

In [3]:
# Preview the first five rows to sanity-check the loaded columns
data.head(n=5)

Unnamed: 0,location,total_sqft,bath,price,bhk
0,1st Block Jayanagar,2850.0,4.0,428.0,4
1,1st Block Jayanagar,1630.0,3.0,194.0,3
2,1st Block Jayanagar,1875.0,2.0,235.0,3
3,1st Block Jayanagar,1200.0,2.0,130.0,3
4,1st Block Jayanagar,1235.0,2.0,148.0,2


In [4]:
# y is the dependent variable ('price'); x holds every other column as the
# explanatory variables
y = data['price']
x = data.drop(columns=['price'])

In [5]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split reproducible across runs
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [6]:
# Check the (rows, columns) shape of the train and test feature frames
print(x_train.shape, x_test.shape, sep="\n")

(5888, 4)
(1473, 4)


### Applying Linear Regression

In [7]:
# One-hot encode the categorical 'location' column and pass every other
# column through unchanged.
# handle_unknown='ignore' makes a location that was not present when the
# encoder was fitted encode as an all-zero row instead of raising at
# transform/predict time (the default is handle_unknown='error').
# NOTE(review): `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4 -- switch the keyword when the environment is upgraded.
column_trans = make_column_transformer(
    (OneHotEncoder(sparse=False, handle_unknown='ignore'), ['location']),
    remainder='passthrough',
)

In [8]:
# Feature standardizer; used as the second step of each pipeline built below,
# i.e. it runs on the output of the column transformer.
scaler = StandardScaler()

In [9]:
# Plain least-squares linear model (the "No Regularization" baseline)
lr = LinearRegression()

In [10]:
# Chain encoding, scaling and the linear model into a single estimator
pipe = make_pipeline(
    column_trans,
    scaler,
    lr,
)

In [11]:
# Fit every pipeline step on the training split
pipe.fit(X=x_train, y=y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(sparse=False),
                                                  ['location'])])),
                ('standardscaler', StandardScaler()),
                ('linearregression', LinearRegression())])

In [12]:
# Predict prices for the held-out rows with the linear-regression pipeline
y_pred_lr = pipe.predict(X=x_test)

In [13]:
# R^2 on the test split (argument order matters: true values first)
r2_score(y_true=y_test, y_pred=y_pred_lr)

0.8643330241436922

### Applying Lasso

In [14]:
# L1-regularized linear model; alpha=1.0 is the library default, spelled out
# here so the regularization strength is visible
lasso = Lasso(alpha=1.0)

In [15]:
# Rebind `pipe` to a pipeline ending in Lasso; the same column_trans and
# scaler objects are passed in again
pipe = make_pipeline(
    column_trans,
    scaler,
    lasso,
)

In [16]:
# Train the Lasso pipeline on the training split
pipe.fit(X=x_train, y=y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(sparse=False),
                                                  ['location'])])),
                ('standardscaler', StandardScaler()), ('lasso', Lasso())])

In [17]:
# Test-split predictions from the Lasso pipeline
y_pred_lasso = pipe.predict(X=x_test)

In [18]:
# R^2 of the Lasso predictions against the true test prices
r2_score(y_true=y_test, y_pred=y_pred_lasso)

0.852100326451466

### Applying Ridge

In [19]:
# L2-regularized linear model; alpha=1.0 is the library default, spelled out
# here so the regularization strength is visible
ridge = Ridge(alpha=1.0)

In [20]:
# Rebind `pipe` to a pipeline ending in Ridge; this is the object that the
# final cell pickles
pipe = make_pipeline(
    column_trans,
    scaler,
    ridge,
)

In [21]:
# Train the Ridge pipeline on the training split
pipe.fit(X=x_train, y=y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(sparse=False),
                                                  ['location'])])),
                ('standardscaler', StandardScaler()), ('ridge', Ridge())])

In [22]:
# Test-split predictions from the Ridge pipeline
y_pred_ridge = pipe.predict(X=x_test)

In [23]:
# R^2 of the Ridge predictions against the true test prices
r2_score(y_true=y_test, y_pred=y_pred_ridge)

0.8644804708201209

In [24]:
# Side-by-side test-set R^2 for the three models
for label, predictions in (
    ("No Regularization:", y_pred_lr),
    ("Lasso:", y_pred_lasso),
    ("Ridge:", y_pred_ridge),
):
    print(label, r2_score(y_test, predictions))

No Regularization: 0.8643330241436922
Lasso: 0.852100326451466
Ridge: 0.8644804708201209


In [25]:
# Persist the fitted pipeline to disk. `pipe` is whichever pipeline was
# assigned most recently; in this notebook that is the Ridge pipeline.

import pickle

# A context manager guarantees the file handle is flushed and closed, instead
# of leaving an open handle behind as a bare open(...) argument would.
with open('ridge_model.pkl', 'wb') as model_file:
    pickle.dump(pipe, model_file)