Class 12 Linear Regression With Multiple Inputs

In [83]:
import numpy as np
import matplotlib.pyplot as plt
import os
import pandas as pd

%matplotlib notebook
#plt.style.use('../test/deeplearing.mpstyle')

Define the task: House price prediction with multiple inputs using linear regression

y_pred = w*X + b where X = [x1, x2, x3 .......]

Objective:
    1. Define the task
    2. Data Cleaning and processing
    3. Data splitting 
    4. Model Training

In [84]:
ROOT_DIR = "/home/dipu/Desktop/BongoDev/MachineLearning"
DATA_DIR = os.path.join(ROOT_DIR, "data")
DATASET_PATH = os.path.join(DATA_DIR, "Housing.csv")

housing_dataset = pd.read_csv(DATASET_PATH)
housing_dataset.head()


Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


DATA cleaning and preprocessing

In [85]:
housing_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   price             545 non-null    int64 
 1   area              545 non-null    int64 
 2   bedrooms          545 non-null    int64 
 3   bathrooms         545 non-null    int64 
 4   stories           545 non-null    int64 
 5   mainroad          545 non-null    object
 6   guestroom         545 non-null    object
 7   basement          545 non-null    object
 8   hotwaterheating   545 non-null    object
 9   airconditioning   545 non-null    object
 10  parking           545 non-null    int64 
 11  prefarea          545 non-null    object
 12  furnishingstatus  545 non-null    object
dtypes: int64(6), object(7)
memory usage: 55.5+ KB


In [86]:
housing_dataset.isnull().sum()

price               0
area                0
bedrooms            0
bathrooms           0
stories             0
mainroad            0
guestroom           0
basement            0
hotwaterheating     0
airconditioning     0
parking             0
prefarea            0
furnishingstatus    0
dtype: int64

In [87]:
numerical_cols = housing_dataset.select_dtypes(include=[np.number]).columns
categorical_cols = housing_dataset.select_dtypes(include=[object]).columns

print("Numerical Columns: ", numerical_cols)
print("Categorical Columns: ", categorical_cols)

Numerical Columns:  Index(['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'parking'], dtype='object')
Categorical Columns:  Index(['mainroad', 'guestroom', 'basement', 'hotwaterheating',
       'airconditioning', 'prefarea', 'furnishingstatus'],
      dtype='object')


Standardization of numerical columns

In [88]:
mean = housing_dataset[numerical_cols].mean()
std = housing_dataset[numerical_cols].std()
housing_dataset[numerical_cols] = (housing_dataset[numerical_cols] - mean) / std
housing_dataset[numerical_cols].head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,parking
0,4.562174,1.045766,1.402131,1.420507,1.376952,1.516299
1,4.000809,1.755397,1.402131,5.400847,2.5297,2.67695
2,4.000809,2.216196,0.047235,1.420507,0.224204,1.516299
3,3.982096,1.08263,1.402131,1.420507,0.224204,2.67695
4,3.551716,1.045766,1.402131,-0.569663,0.224204,1.516299


if ranking matter the label encoding else Onehotencoding

In [89]:
housing_dataset['furnishingstatus'].value_counts()

furnishingstatus
semi-furnished    227
unfurnished       178
furnished         140
Name: count, dtype: int64

In [90]:
housing_dataset[['furnishingstatus']] = housing_dataset[['furnishingstatus']].replace(['furnished', 'semi-furnished', 'unfurnished'], [0, 1, 2])
housing_dataset['furnishingstatus'].value_counts()

furnishingstatus
1    227
2    178
0    140
Name: count, dtype: int64

In [91]:
housing_dataset[categorical_cols] = housing_dataset[categorical_cols].apply(
    lambda col: pd.Categorical(col).codes
)
housing_dataset.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,4.562174,1.045766,1.402131,1.420507,1.376952,1,0,0,0,1,1.516299,1,0
1,4.000809,1.755397,1.402131,5.400847,2.5297,1,0,0,0,1,2.67695,0,0
2,4.000809,2.216196,0.047235,1.420507,0.224204,1,0,1,0,0,1.516299,1,1
3,3.982096,1.08263,1.402131,1.420507,0.224204,1,0,1,0,1,2.67695,1,0
4,3.551716,1.045766,1.402131,-0.569663,0.224204,1,1,1,0,1,1.516299,0,0


##Splitting the data

In [92]:
seed = 42
np.random.seed(seed)

def split_dataset(dataset, training_ratio = 0.8, val_ratio = 0.1):
    random_indices = np.random.permutation(len(dataset))
    
    train_size = int(len(dataset) * training_ratio)
    val_size = int(len(dataset) * val_ratio)

    train_indices = random_indices[:train_size]
    val_indices = random_indices[train_size:train_size + val_size]
    test_indices = random_indices[train_size + val_size:]

    train_ds = dataset.iloc[train_indices]
    val_ds = dataset.iloc[val_indices]
    test_ds = dataset.iloc[test_indices]    

    train_X = train_ds.iloc[:, :-1]
    train_y = train_ds.iloc[:, -1]

    val_X = val_ds.iloc[:, :-1]
    val_y = val_ds.iloc[:, -1]

    test_X = test_ds.iloc[:, :-1]
    test_y = test_ds.iloc[:, -1]

    return train_X, train_y, val_X, val_y, test_X, test_y

train_X, train_y, val_X, val_y, test_X, test_y = split_dataset(housing_dataset)

Model training

In [93]:
def get_house_price(X, w, b):
    return np.dot(X, w) + b

In [94]:
w = np.random.randint(low=100, high=1000, size=(train_X.columns.size))
b = np.random.randint(low=100, high=1000)

print(f"w = {w}, b = {b}")
y_pred = get_house_price(train_X, w, b)

w = [716 295 925 600 725 592 174 512 475 519 828 376], b = 860


Cost Function

In [95]:
def cost_function(x, y_true, w, b):
    y_pred = get_house_price(x, w, b)
    mse = np.mean((y_pred - y_true) ** 2)
    return mse

mse  = cost_function(train_X, train_y, w, b)
print(f"Mean Squared Error: {mse} (Parameters not learned yet)")

Mean Squared Error: 12025352.400294252 (Parameters not learned yet)


###Gradient Descent

In [96]:
def compute_gradient(x, y_true, w, b):
    delta = 1e-9

    cost_1 = cost_function(x, y_true, w, b)
    cost_2 = cost_function(x, y_true, w + delta, b)
    cost_3 = cost_function(x, y_true, w, b + delta)
    dw = (cost_2 - cost_1) / delta
    db = (cost_3 - cost_1) / delta
    return dw, db  


###Train the model

In [97]:
w = np.zeros(train_X.columns.size)
b = 0
learning_rate = 0.003

epoch = 10000

for epochs in range(epoch):


    """Compute Lofis"""
    train_loss = cost_function(train_X, train_y, w, b)
    val_loss = cost_function(val_X, val_y, w, b)
    print(f"Train Loss: {train_loss}")
    print(f"Validation Loss: {val_loss}")


    """"Compute Gradient Descent"""
    dw, db = compute_gradient(train_X, train_y, w, b)


    """Update Parameters"""
    w = w - learning_rate * dw
    b = b - learning_rate * db

    if epochs % 100 == 0:
        print(f"Epoch: {epoch}, Train Loss: {train_loss:0.2f}, Validation Loss: {val_loss:0.2f}")
        print("=====================================")

Train Loss: 1.6857798165137614
Validation Loss: 1.8333333333333333
Epoch: 10000, Train Loss: 1.69, Validation Loss: 1.83
Train Loss: 1.654465814864641
Validation Loss: 1.8143716680828585
Train Loss: 1.628934369228102
Validation Loss: 1.7991627856809866
Train Loss: 1.6076463114215382
Validation Loss: 1.7864467454428932
Train Loss: 1.589478863868599
Validation Loss: 1.7753449778864374
Train Loss: 1.5736129220064619
Validation Loss: 1.7652510256738874
Train Loss: 1.559450867019634
Validation Loss: 1.7557517731469514


Train Loss: 1.5465566018480645
Validation Loss: 1.7465707423281884
Train Loss: 1.5346118355679248
Validation Loss: 1.7375273935436661
Train Loss: 1.5233841914851796
Validation Loss: 1.7285079853134175
Train Loss: 1.5127039598051513
Validation Loss: 1.7194448041784236
Train Loss: 1.5024471313952512
Validation Loss: 1.7103014075381162
Train Loss: 1.4925230304251236
Validation Loss: 1.7010622071258814
Train Loss: 1.4828652982304353
Validation Loss: 1.6917251554638202
Train Loss: 1.4734253061102207
Validation Loss: 1.6822966653787543
Train Loss: 1.4641673606259389
Validation Loss: 1.6727881150281017
Train Loss: 1.4550652059135971
Validation Loss: 1.663213470626568
Train Loss: 1.4460994718948958
Validation Loss: 1.6535877199660616
Train Loss: 1.4372558075358948
Validation Loss: 1.6439258363734626
Train Loss: 1.4285235281582833
Validation Loss: 1.6342421619974628
Train Loss: 1.4198946259263379
Validation Loss: 1.6245500330393297
Train Loss: 1.411363043727909
Validation Loss: 1.61486159320603

In [98]:
print(f"Final Parameters: w = {w}, b = {b}")

Final Parameters: w = [-0.04665697 -0.04665697 -0.04665697 -0.04665697 -0.04665697 -0.04665697
 -0.04665697 -0.04665697 -0.04665697 -0.04665697 -0.04665697 -0.04665697], b = 1.146310115163196


Evaluate

In [99]:
test_loss = cost_function(test_X, test_y, w, b)
print(f"MSE is  {test_loss}")

MSE is  0.48620323140031224


"""The Phenomenon of overfitting occurs when the model learns the training data too well, including its noise and outliers, leading to poor generalization on unseen data. In this case, the model performs well on the training set but fails to predict accurately on the validation and test sets. To mitigate overfitting, techniques such as regularization, dropout, and early stopping can be employed. Regularization adds a penalty term to the loss function to discourage complex models, while dropout randomly drops units during training to prevent co-adaptation. Early stopping monitors validation loss and halts training when it starts to increase, ensuring the model retains its ability to generalize. By implementing these strategies, we can improve the model's performance on unseen data and reduce overfitting."""