In [1]:
#importing required modules

import pickle

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score, mean_absolute_error, mean_absolute_percentage_error

In [2]:
#load the cleaned dataset and discard the 'Unnamed: 0' column in one chained expression
#(presumably a saved index column from an earlier export - confirm against the CSV)

data = pd.read_csv("final_cleaned_data.csv").drop(columns='Unnamed: 0')

In [3]:
#Preview the first five rows to check which columns remain after loading

data.head()

Unnamed: 0,location,total_sqft,bath,price,bhk
0,1st Block Jayanagar,2850.0,4.0,428.0,4
1,1st Block Jayanagar,1630.0,3.0,194.0,3
2,1st Block Jayanagar,1875.0,2.0,235.0,3
3,1st Block Jayanagar,1200.0,2.0,130.0,3
4,1st Block Jayanagar,1235.0,2.0,148.0,2


In [4]:
#y is the dependent variable (the 'price' column); x holds every remaining column as explanatory variables

y = data['price']
x = data.drop('price', axis=1)

In [5]:
#hold out 20% of the rows as a test set; random_state is fixed so the split is the same on every run

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [6]:
#sanity check: (rows, columns) of the training and test feature frames, one per line

print(x_train.shape, x_test.shape, sep="\n")

(5888, 4)
(1473, 4)


### Applying Linear Regression

In [7]:
#one-hot encode the categorical 'location' column; every other column is passed through unchanged
#handle_unknown='ignore' encodes a location that was not present at fit time as all zeros
#instead of raising, so predict() keeps working on the test split and on new data
#NOTE: `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed in 1.4;
#switch the keyword when upgrading scikit-learn
column_trans = make_column_transformer(
    (OneHotEncoder(sparse=False, handle_unknown='ignore'), ['location']),
    remainder='passthrough',
)

In [8]:
#standardiser used as the second step of the pipelines built below (after the column transformer)
scaler = StandardScaler()

In [9]:
#plain linear regression with no arguments passed (the "No Regularization" baseline in the comparison)
lr =  LinearRegression()

In [10]:
#chain the steps: one-hot encoding -> standardisation -> linear regression
pipe = make_pipeline(column_trans, scaler, lr)

In [11]:
#fit every step of the pipeline (encoder, scaler, regressor) on the training split only
pipe.fit(x_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(sparse=False),
                                                  ['location'])])),
                ('standardscaler', StandardScaler()),
                ('linearregression', LinearRegression())])

In [12]:
#predictions of the linear regression pipeline on the held-out test features
y_pred_lr = pipe.predict(x_test)

In [13]:
#R^2 of the linear regression predictions against the true test prices
r2_score(y_test, y_pred_lr)

0.8643330241436922

#### What is One Hot Encoding?

Machine learning and deep learning algorithms operate on numbers and cannot work with text directly, so categorical variables must be converted to a numerical form before they are passed to a model. One hot encoding is a common way of doing this preprocessing: it creates a new binary feature for each possible category and assigns a value of 1 to the feature that corresponds to the sample's original category (and 0 to all the others). Encoding categorical data this way can improve the predictions and the classification accuracy of a model.

One hot encoding is an essential part of the feature engineering process. For example, if we had a variable such as colour with the labels “red,” “green,” and “blue,” we could encode each of these labels as a three-element binary vector: Red: [1, 0, 0], Green: [0, 1, 0], Blue: [0, 0, 1]. Categorical data must be converted to a numerical form like this before it can be processed.

#### sparse = False

`sparse` : bool, default=True
Returns a sparse matrix if set to True; otherwise returns a dense array. (In scikit-learn 1.2 and later this parameter is named `sparse_output`.)

#### remainder = 'passthrough'

Setting remainder='passthrough' will mean that all columns not specified in the list of “transformers” will be passed through without transformation, instead of being dropped.

#### What is standard scaler Sklearn?

The idea behind StandardScaler is that it will transform your data such that its distribution will have a mean value 0 and standard deviation of 1. In case of multivariate data, this is done feature-wise (in other words independently for each column of the data).

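Note that in this notebook the StandardScaler sits inside the pipeline, so it standardises the input features only; the target (`price`) is left on its original scale. Scaling the target can also help in some regression settings, but it is not done here. Scaling the features puts them on a comparable scale, which makes the problem easier for a model to learn and matters in particular for the penalised models used below (Lasso and Ridge), whose penalty depends on the size of the coefficients.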

### Applying Lasso

In [14]:
#Lasso regression with scikit-learn's default settings (no arguments passed)
lasso = Lasso()

In [15]:
#same column_trans and scaler objects as before, now with Lasso as the final estimator
#`pipe` is rebound here, so the previous pipeline is no longer reachable under that name
pipe = make_pipeline(column_trans, scaler, lasso)

In [16]:
#fit the Lasso pipeline on the training split
pipe.fit(x_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(sparse=False),
                                                  ['location'])])),
                ('standardscaler', StandardScaler()), ('lasso', Lasso())])

In [17]:
#predictions of the Lasso pipeline on the held-out test features
y_pred_lasso = pipe.predict(x_test)

In [18]:
#R^2 of the Lasso predictions against the true test prices
r2_score(y_test, y_pred_lasso)

0.852100326451466

### Applying Ridge

In [19]:
#Ridge regression with scikit-learn's default settings (no arguments passed)
ridge = Ridge()

In [20]:
#same column_trans and scaler objects as before, now with Ridge as the final estimator
#`pipe` is not reassigned after this cell, so this is the pipeline pickled at the end of the notebook
pipe = make_pipeline(column_trans, scaler, ridge)

In [21]:
#fit the Ridge pipeline on the training split
pipe.fit(x_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(sparse=False),
                                                  ['location'])])),
                ('standardscaler', StandardScaler()), ('ridge', Ridge())])

In [22]:
#predictions of the Ridge pipeline on the held-out test features
y_pred_ridge = pipe.predict(x_test)

In [23]:
#R^2 of the Ridge predictions against the true test prices
r2_score(y_test, y_pred_ridge)

0.8644804708201209

In [24]:
#side-by-side R^2 on the test split for the three fitted models

for label, predictions in (
    ("No Regularization:", y_pred_lr),
    ("Lasso:", y_pred_lasso),
    ("Ridge:", y_pred_ridge),
):
    print(label, r2_score(y_test, predictions))

No Regularization: 0.8643330241436922
Lasso: 0.852100326451466
Ridge: 0.8644804708201209


#### R or Coefficient of Correlation

It is the degree of relationship between two variables say x and y. It can go between -1 and 1.  
1 indicates that the two variables are moving in unison. They rise and fall together and have perfect correlation.
-1 means that the two variables are in perfect opposites. One goes up and other goes down, in perfect negative way. 
Any two variables in this universe can be argued to have a correlation value. 
If they are not correlated then the correlation value can still be computed which would be 0.

#### R square (R^2) or Coeff. of Determination 

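It shows the proportion of the variation in y that is explained by all the x variables together.
Higher is better, and the best possible value is 1. For an ordinary least-squares fit with an intercept, evaluated on its own training data, it lies between 0 and 1. In general, however, it is not literally the square of a number: it is computed as 1 − (residual sum of squares / total sum of squares), so it can be negative when a model predicts worse than always predicting the mean of y. This applies in particular to `r2_score` evaluated on held-out test data, as done in this notebook.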

In [25]:
#error metrics for the plain linear regression predictions on the test split
#note: scikit-learn reports MAPE as a fraction (e.g. 0.21 means 21%), not as a percentage

mae_lr = mean_absolute_error(y_test, y_pred_lr)
mape_lr = mean_absolute_percentage_error(y_test, y_pred_lr)
print("Mean Absolute Error(MAE) : ", mae_lr)
print("Mean Absolute Percentage Error(MAPE) : ", mape_lr)

Mean Absolute Error(MAE) :  17.575578073287218
Mean Absolute Percentage Error(MAPE) :  0.21333334735268608


#### Mean Absolute Error(MAE)

MAE = 10 implies that, on average, the Prediction's distance from the true value is 10 
(e.g true value is 100 and forecast is 90 or true value is 100 and forecast is 110 would be a distance of 10). 

#### Mean Absolute Percentage Error(MAPE)

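MAPE = 10% implies that, on average, the forecast's distance from the true value is 10% of the true value
(e.g. a true value of 100 with a forecast of 90, or a true value of 100 with a forecast of 110, is a distance of 10%).
Note that scikit-learn's `mean_absolute_percentage_error` returns a fraction rather than a percentage, so the value of about 0.213 printed above corresponds to a MAPE of about 21.3%.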

In [26]:
#persist the fitted pipeline to disk; `pipe` was last assigned to the Ridge pipeline,
#so the file contains the preprocessing steps and the Ridge model together
#the `with` block guarantees the file handle is flushed and closed even if dumping fails
#(pickle is imported in the imports cell at the top of the notebook)

with open('ridge_model.pkl', 'wb') as model_file:
    pickle.dump(pipe, model_file)