In [1]:
import numpy as np  
import pandas as pd  
import matplotlib.pyplot as plt  
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import MinMaxScaler 
from sklearn.linear_model import LinearRegression  
from sklearn.metrics import r2_score 

import warnings
# Silence every Python warning for the rest of the session. This also hides
# pandas/scikit-learn warnings that later cells might otherwise emit.
warnings.filterwarnings("ignore")

In [2]:
# Load the housing dataset from a path relative to the notebook's working directory.
data = pd.read_csv('./data/Housing.csv')  
# Preview the first five rows.
data.head() 

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


Understanding our data

In [3]:
# data.shape is a (n_rows, n_columns) tuple.
print("Rows and Columns of the dataset :- ",data.shape)

Rows and Columns of the dataset :-  (545, 13)


In [4]:
# Per-column dtype and non-null count, plus the frame's memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   price             545 non-null    int64 
 1   area              545 non-null    int64 
 2   bedrooms          545 non-null    int64 
 3   bathrooms         545 non-null    int64 
 4   stories           545 non-null    int64 
 5   mainroad          545 non-null    object
 6   guestroom         545 non-null    object
 7   basement          545 non-null    object
 8   hotwaterheating   545 non-null    object
 9   airconditioning   545 non-null    object
 10  parking           545 non-null    int64 
 11  prefarea          545 non-null    object
 12  furnishingstatus  545 non-null    object
dtypes: int64(6), object(7)
memory usage: 55.5+ KB


In [5]:
# List the column labels (described in the section that follows).
data.columns

Index(['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'mainroad',
       'guestroom', 'basement', 'hotwaterheating', 'airconditioning',
       'parking', 'prefarea', 'furnishingstatus'],
      dtype='object')

### Columns or Features Description
- 'price': The price of the house (**target variable**).
- 'area': The area or size of the house in square feet.
- 'bedrooms': The number of bedrooms in the house.
- 'bathrooms': The number of bathrooms in the house.
- 'stories': The number of stories or floors in the house.
- 'mainroad': Categorical variable indicating whether the house is located near the main road or not.
- 'guestroom': Categorical variable indicating whether the house has a guest room or not.
- 'basement': Categorical variable indicating whether the house has a basement or not.
- 'hotwaterheating': Categorical variable indicating whether the house has hot water heating or not.
- 'airconditioning': Categorical variable indicating whether the house has air conditioning or not.
- 'parking': The number of parking spaces available with the house.
- 'prefarea': Categorical variable indicating whether the house is in a preferred area or not.
- 'furnishingstatus': The furnishing status of the house (one of: furnished, semi-furnished, unfurnished).

In [6]:
# include='all' summarizes numeric and object columns together; statistics that
# do not apply to a column's type are shown as NaN.
data.describe(include ='all')

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
count,545.0,545.0,545.0,545.0,545.0,545,545,545,545,545,545.0,545,545
unique,,,,,,2,2,2,2,2,,2,3
top,,,,,,yes,no,no,no,no,,no,semi-furnished
freq,,,,,,468,448,354,520,373,,417,227
mean,4766729.0,5150.541284,2.965138,1.286239,1.805505,,,,,,0.693578,,
std,1870440.0,2170.141023,0.738064,0.50247,0.867492,,,,,,0.861586,,
min,1750000.0,1650.0,1.0,1.0,1.0,,,,,,0.0,,
25%,3430000.0,3600.0,2.0,1.0,1.0,,,,,,0.0,,
50%,4340000.0,4600.0,3.0,1.0,2.0,,,,,,0.0,,
75%,5740000.0,6360.0,3.0,2.0,2.0,,,,,,1.0,,


In [7]:
# Numeric summary, transposed so each feature is a row, with a colour gradient
# applied to make large and small values easier to spot.
data.describe().T.style.background_gradient(cmap='coolwarm')

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
price,545.0,4766729.247706,1870439.615657,1750000.0,3430000.0,4340000.0,5740000.0,13300000.0
area,545.0,5150.541284,2170.141023,1650.0,3600.0,4600.0,6360.0,16200.0
bedrooms,545.0,2.965138,0.738064,1.0,2.0,3.0,3.0,6.0
bathrooms,545.0,1.286239,0.50247,1.0,1.0,1.0,2.0,4.0
stories,545.0,1.805505,0.867492,1.0,1.0,2.0,2.0,4.0
parking,545.0,0.693578,0.861586,0.0,0.0,0.0,1.0,3.0


Checking NULL Values

In [8]:
# Count missing values per column.
data.isnull().sum()

price               0
area                0
bedrooms            0
bathrooms           0
stories             0
mainroad            0
guestroom           0
basement            0
hotwaterheating     0
airconditioning     0
parking             0
prefarea            0
furnishingstatus    0
dtype: int64

In [9]:
# Classify every column by dtype: 'object' columns are treated as categorical,
# all remaining columns as numerical. Column order is preserved.
is_object_col = {name: data[name].dtype == 'object' for name in data.columns}

categoricalColList = [name for name, is_obj in is_object_col.items() if is_obj]
numericalColList = [name for name, is_obj in is_object_col.items() if not is_obj]

# The counts are simply the lengths of the two lists.
categoricalCol = len(categoricalColList)
numericalCol = len(numericalColList)

print("Numerical columns = ", numericalCol, " : ", numericalColList)
print("Categorical columns = ", categoricalCol, " : ", categoricalColList)

Numerical columns =  6  :  ['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'parking']
Categorical columns =  7  :  ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea', 'furnishingstatus']


### Data Preprocessing: Encoding Categorical Variables

In [10]:
def binary_map(x):
    """
    Encode a yes/no column as integers.

    Parameters:
    x (pandas Series): Series whose values are the strings 'yes' or 'no'.

    Returns:
    pandas Series: Same index as ``x`` with 'yes' replaced by 1 and 'no' by 0.
    Any value that is neither 'yes' nor 'no' becomes NaN, because it has no
    entry in the lookup table.
    """
    yes_no_codes = {'yes': 1, 'no': 0}
    return x.map(yes_no_codes)

**1. Handling Binary (Yes/No) Categorical Variables**

In [11]:
# The six yes/no columns; 'furnishingstatus' has three levels and is handled separately.
categorical_col = ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea']
# Apply binary_map column by column and write the encoded values back into `data`.
# NOTE(review): this cell is not idempotent. On a second run the columns hold
# 0/1 rather than 'yes'/'no', so the mapping would turn every value into NaN.
data[categorical_col] = data[categorical_col].apply(binary_map)
data[categorical_col]

Unnamed: 0,mainroad,guestroom,basement,hotwaterheating,airconditioning,prefarea
0,1,0,0,0,1,1
1,1,0,0,0,1,0
2,1,0,1,0,0,1
3,1,0,1,0,1,1
4,1,1,1,0,1,0
...,...,...,...,...,...,...
540,1,0,1,0,0,0
541,0,0,0,0,0,0
542,1,0,0,0,0,0
543,0,0,0,0,0,0


In [12]:
data.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,furnished
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,furnished
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,semi-furnished
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,furnished
4,11410000,7420,4,1,2,1,1,1,0,1,2,0,furnished


**2. Handling Categorical Data with Dummy Variables**

In [13]:
# One indicator column per furnishing level. This is a preview only: dummy_col
# is recomputed with drop_first=True in the following cell.
dummy_col = pd.get_dummies(data['furnishingstatus'])
dummy_col.head()

Unnamed: 0,furnished,semi-furnished,unfurnished
0,True,False,False
1,True,False,False
2,False,True,False
3,True,False,False
4,True,False,False


In [14]:
# drop_first=True omits the first level ('furnished'), which becomes the
# baseline: a row with both remaining indicators False is a furnished house.
dummy_col = pd.get_dummies(data['furnishingstatus'], drop_first=True)
dummy_col.head()

Unnamed: 0,semi-furnished,unfurnished
0,False,False
1,False,False
2,True,False
3,False,False
4,False,False


In [15]:
# Append the indicator columns to the right of the existing columns (axis=1).
data = pd.concat([data, dummy_col], axis=1)
data.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus,semi-furnished,unfurnished
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,furnished,False,False
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,furnished,False,False
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,semi-furnished,True,False
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,furnished,False,False
4,11410000,7420,4,1,2,1,1,1,0,1,2,0,furnished,False,False


In [16]:
# Remove the original text column now that it is represented by the
# 'semi-furnished' and 'unfurnished' indicator columns.
# Reassigning the result (rather than mutating with inplace=True) keeps the
# step explicit and avoids modifying a frame that earlier cells displayed.
data = data.drop(columns=['furnishingstatus'])
data.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,semi-furnished,unfurnished
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,False,False
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,False,False
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,True,False
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,False,False
4,11410000,7420,4,1,2,1,1,1,0,1,2,0,False,False


### Splitting data into Training and Testing data

In [17]:
# Seed NumPy's legacy global RNG. The split below passes its own integer
# random_state, which is what fixes the row assignment.
np.random.seed(0)
# 70% of rows go to training, 30% to testing.
df_train, df_test = train_test_split(data, train_size=0.7, test_size=0.3, random_state=100)

In [18]:
df_train.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,semi-furnished,unfurnished
359,3710000,3600,3,1,1,1,0,0,0,0,1,0,False,True
19,8855000,6420,3,2,2,1,0,0,0,1,1,1,True,False
159,5460000,3150,3,2,1,1,1,1,0,1,0,0,False,False
35,8080940,7000,3,2,4,1,0,0,0,1,2,0,False,False
28,8400000,7950,5,2,2,1,0,1,1,0,2,0,False,True


In [19]:
df_train.shape

(381, 14)

In [20]:
df_test.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,semi-furnished,unfurnished
265,4403000,2880,3,1,2,1,0,0,0,0,0,1,True,False
54,7350000,6000,3,2,2,1,1,0,0,1,1,0,True,False
171,5250000,10269,3,1,1,1,0,0,0,0,1,1,True,False
244,4550000,5320,3,1,2,1,1,1,0,0,0,1,True,False
268,4382000,4950,4,1,2,1,0,0,0,1,0,0,True,False


In [21]:
df_test.shape

(164, 14)

### Scaling Training Data: MinMaxScaler

In [25]:
# NOTE: this list was built before encoding, so it still contains
# 'furnishingstatus', which has since been dropped from `data`.
categoricalColList

['mainroad',
 'guestroom',
 'basement',
 'hotwaterheating',
 'airconditioning',
 'prefarea',
 'furnishingstatus']

In [26]:
# Numeric columns identified earlier; note that the target 'price' is among them.
numericalColList

['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'parking']

In [22]:
# Min-max scaler with default settings; it learns nothing until it is fitted.
scaler = MinMaxScaler()

In [27]:
# Columns to rescale. This is an alias of numericalColList (same list object,
# not a copy), so it includes the target 'price'.
col_to_scale = numericalColList

In [28]:
# Learn each column's min and max from the training rows and rescale those
# columns in place. The target 'price' is rescaled as well, because it is
# listed in col_to_scale.
df_train[col_to_scale] = scaler.fit_transform(df_train[col_to_scale])

### Training the model

In [29]:
# pop() removes 'price' from df_train and returns it as the target Series.
y_train = df_train.pop('price')
# x_train is the same object as df_train (no copy), now without the target.
x_train = df_train

In [31]:
# Ordinary least squares regression fitted on the scaled training data.
linear_regression = LinearRegression()
linear_regression.fit(x_train, y_train)

In [32]:
# One fitted coefficient per feature column of x_train.
coefficients = linear_regression.coef_
print(coefficients)

[ 0.23466354  0.04673453  0.19082319  0.10851563  0.05044144  0.03042826
  0.02159488  0.08486327  0.06688093  0.06073533  0.05942788  0.00092052
 -0.03100561]


In [33]:
# R^2 on the training data itself (an in-sample fit measure, not a test score).
score = linear_regression.score(x_train, y_train)
print(score)

0.6814893088451202


In [34]:
# Rescale the test columns with the min/max already learned from the training
# data. Calling fit_transform here would re-fit the scaler on the test set, so
# test features and the test target would be on a different scale from the one
# the model was trained on, and the test R^2 would no longer be valid.
df_test[col_to_scale] = scaler.transform(df_test[col_to_scale])

In [35]:
# pop() removes 'price' from df_test and returns it as the target Series.
y_test = df_test.pop('price')
# x_test is the same object as df_test (no copy), now without the target.
x_test = df_test

In [36]:
# Predicted (scaled) prices for the held-out rows.
prediction = linear_regression.predict(x_test)

In [37]:
# R^2 of the predictions against the scaled test target. The value is only
# stored here; this line does not display it.
r2 = r2_score(y_test, prediction)