Class 12 Linear Regression With Multiple Inputs

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import os
import pandas as pd

%matplotlib notebook
#plt.style.use('../test/deeplearing.mpstyle')

Define the task: House price prediction with multiple inputs using linear regression

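$y_{pred} = w \cdot X + b = w_1 x_1 + w_2 x_2 + \dots + w_n x_n + b$, where $X = [x_1, x_2, x_3, \dots, x_n]$ is the vector of input features, $w = [w_1, w_2, \dots, w_n]$ holds one weight per feature, and $b$ is the bias.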

Objective:
    1. Define the task
    2. Data Cleaning and processing
    3. Data splitting 
    4. Model Training

In [3]:
# Project root that contains the `data/` folder. The location is machine-specific,
# so it can be overridden through the ML_ROOT_DIR environment variable; when the
# variable is unset, the original hard-coded location is used.
ROOT_DIR = os.environ.get("ML_ROOT_DIR", "/home/dipu/Desktop/BongoDev/MachineLearning")
DATA_DIR = os.path.join(ROOT_DIR, "data")
DATASET_PATH = os.path.join(DATA_DIR, "Housing.csv")

# Load the raw housing table and preview its first rows.
housing_dataset = pd.read_csv(DATASET_PATH)
housing_dataset.head()


Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


Data cleaning and preprocessing

In [4]:
# Summarise the frame: per-column non-null counts and dtypes, plus memory usage.
housing_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   price             545 non-null    int64 
 1   area              545 non-null    int64 
 2   bedrooms          545 non-null    int64 
 3   bathrooms         545 non-null    int64 
 4   stories           545 non-null    int64 
 5   mainroad          545 non-null    object
 6   guestroom         545 non-null    object
 7   basement          545 non-null    object
 8   hotwaterheating   545 non-null    object
 9   airconditioning   545 non-null    object
 10  parking           545 non-null    int64 
 11  prefarea          545 non-null    object
 12  furnishingstatus  545 non-null    object
dtypes: int64(6), object(7)
memory usage: 55.5+ KB


In [5]:
# Count the missing values in each column.
housing_dataset.isna().sum()

price               0
area                0
bedrooms            0
bathrooms           0
stories             0
mainroad            0
guestroom           0
basement            0
hotwaterheating     0
airconditioning     0
parking             0
prefarea            0
furnishingstatus    0
dtype: int64

In [6]:
# Split the column labels by dtype: numeric columns vs. object (string) columns.
numerical_cols = housing_dataset.select_dtypes(include="number").columns
categorical_cols = housing_dataset.select_dtypes(include="object").columns

# Show both groups so the split can be checked by eye.
for label, columns in (
    ("Numerical Columns: ", numerical_cols),
    ("Categorical Columns: ", categorical_cols),
):
    print(label, columns)

Numerical Columns:  Index(['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'parking'], dtype='object')
Categorical Columns:  Index(['mainroad', 'guestroom', 'basement', 'hotwaterheating',
       'airconditioning', 'prefarea', 'furnishingstatus'],
      dtype='object')


Standardization of numerical columns

In [7]:
# Z-score standardisation: subtract each numeric column's mean and divide by its
# standard deviation. `mean` and `std` stay available for later cells.
# NOTE(review): the statistics are computed over every row of housing_dataset and
# numerical_cols is taken as-is, so `price` is scaled together with the inputs --
# confirm this is intended once the data is split into train/test sets.
numeric_features = housing_dataset[numerical_cols]
mean = numeric_features.mean()
std = numeric_features.std()
housing_dataset[numerical_cols] = numeric_features.sub(mean).div(std)
housing_dataset[numerical_cols].head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,parking
0,4.562174,1.045766,1.402131,1.420507,1.376952,1.516299
1,4.000809,1.755397,1.402131,5.400847,2.5297,2.67695
2,4.000809,2.216196,0.047235,1.420507,0.224204,1.516299
3,3.982096,1.08263,1.402131,1.420507,0.224204,2.67695
4,3.551716,1.045766,1.402131,-0.569663,0.224204,1.516299


If the order (ranking) of the categories matters, use label encoding; otherwise, use one-hot encoding.

In [8]:
# List the distinct furnishingstatus values and how many rows carry each one.
housing_dataset['furnishingstatus'].value_counts()

furnishingstatus
semi-furnished    227
unfurnished       178
furnished         140
Name: count, dtype: int64

In [9]:
# Ordinal encoding: each furnishing level gets a fixed integer code.
FURNISHING_CODES = {"furnished": 0, "semi-furnished": 1, "unfurnished": 2}

# Encode only while the column still holds labels, so re-running this cell is a no-op.
# map() with an explicit dict is used instead of replace(): swapping labels for
# integers through replace() relies on the silent object-dtype downcast that
# pandas 2.2 deprecates.
if not pd.api.types.is_numeric_dtype(housing_dataset["furnishingstatus"]):
    furnishing_encoded = housing_dataset["furnishingstatus"].map(FURNISHING_CODES)
    # map() turns any label missing from the mapping into NaN; fail loudly rather
    # than hand an un-encoded value to the following steps.
    if furnishing_encoded.isna().any():
        unknown_labels = housing_dataset.loc[
            furnishing_encoded.isna(), "furnishingstatus"
        ].unique()
        raise ValueError(f"Unexpected furnishingstatus values: {list(unknown_labels)}")
    housing_dataset["furnishingstatus"] = furnishing_encoded

# Verify the encoding: the counts per code should match the counts per label.
housing_dataset["furnishingstatus"].value_counts()

furnishingstatus
1    227
2    178
0    140
Name: count, dtype: int64

In [10]:
# Replace every column listed in categorical_cols with the integer codes that
# pd.Categorical assigns to its values (in the displayed result "no" is 0 and
# "yes" is 1).
for column_name in categorical_cols:
    housing_dataset[column_name] = pd.Categorical(housing_dataset[column_name]).codes
housing_dataset.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,4.562174,1.045766,1.402131,1.420507,1.376952,1,0,0,0,1,1.516299,1,0
1,4.000809,1.755397,1.402131,5.400847,2.5297,1,0,0,0,1,2.67695,0,0
2,4.000809,2.216196,0.047235,1.420507,0.224204,1,0,1,0,0,1.516299,1,1
3,3.982096,1.08263,1.402131,1.420507,0.224204,1,0,1,0,1,2.67695,1,0
4,3.551716,1.045766,1.402131,-0.569663,0.224204,1,1,1,0,1,1.516299,0,0
