In [29]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler
from sklearn.metrics import mean_squared_error, r2_score

In [30]:
df = pd.read_csv('Housing.csv')
print(df)

        price  area  bedrooms  bathrooms  stories mainroad guestroom basement  \
0    13300000  7420         4          2        3      yes        no       no   
1    12250000  8960         4          4        4      yes        no       no   
2    12250000  9960         3          2        2      yes        no      yes   
3    12215000  7500         4          2        2      yes        no      yes   
4    11410000  7420         4          1        2      yes       yes      yes   
..        ...   ...       ...        ...      ...      ...       ...      ...   
540   1820000  3000         2          1        1      yes        no      yes   
541   1767150  2400         3          1        1       no        no       no   
542   1750000  3620         2          1        1      yes        no       no   
543   1750000  2910         3          1        1       no        no       no   
544   1750000  3850         3          1        2      yes        no       no   

    hotwaterheating aircond

In [31]:
data = df.where(pd.notnull(df), '')

In [32]:
data.head(10)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished
5,10850000,7500,3,3,1,yes,no,yes,no,yes,2,yes,semi-furnished
6,10150000,8580,4,3,4,yes,no,no,no,yes,2,yes,semi-furnished
7,10150000,16200,5,3,2,yes,no,no,no,no,0,no,unfurnished
8,9870000,8100,4,1,2,yes,yes,yes,no,yes,2,yes,furnished
9,9800000,5750,3,2,4,yes,yes,no,no,yes,1,yes,unfurnished


In [33]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   price             545 non-null    int64 
 1   area              545 non-null    int64 
 2   bedrooms          545 non-null    int64 
 3   bathrooms         545 non-null    int64 
 4   stories           545 non-null    int64 
 5   mainroad          545 non-null    object
 6   guestroom         545 non-null    object
 7   basement          545 non-null    object
 8   hotwaterheating   545 non-null    object
 9   airconditioning   545 non-null    object
 10  parking           545 non-null    int64 
 11  prefarea          545 non-null    object
 12  furnishingstatus  545 non-null    object
dtypes: int64(6), object(7)
memory usage: 40.5+ KB


In [34]:
data.shape

(545, 13)

In [35]:
data.loc[data['furnishingstatus'] == 'furnished', 'furnishingstatus',] = 1
data.loc[data['furnishingstatus'] == 'unfurnished', 'furnishingstatus',] = 0
data.loc[data['furnishingstatus'] == 'semi-furnished', 'furnishingstatus',] = 0.5

In [36]:
data.head(10)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,1.0
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,1.0
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,0.5
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,1.0
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,1.0
5,10850000,7500,3,3,1,yes,no,yes,no,yes,2,yes,0.5
6,10150000,8580,4,3,4,yes,no,no,no,yes,2,yes,0.5
7,10150000,16200,5,3,2,yes,no,no,no,no,0,no,0.0
8,9870000,8100,4,1,2,yes,yes,yes,no,yes,2,yes,1.0
9,9800000,5750,3,2,4,yes,yes,no,no,yes,1,yes,0.0


In [37]:
data.loc[data['mainroad'] == 'yes', 'mainroad',] = 1
data.loc[data['mainroad'] == 'no', 'mainroad',] = 0

In [38]:
data.loc[data['guestroom'] == 'yes', 'guestroom',] = 1
data.loc[data['guestroom'] == 'no', 'guestroom',] = 0

In [39]:
data.loc[data['basement'] == 'yes', 'basement',] = 1
data.loc[data['basement'] == 'no', 'basement',] = 0

In [40]:
data.loc[data['hotwaterheating'] == 'yes', 'hotwaterheating',] = 1
data.loc[data['hotwaterheating'] == 'no', 'hotwaterheating',] = 0

In [41]:
data.loc[data['airconditioning'] == 'yes', 'airconditioning',] = 1
data.loc[data['airconditioning'] == 'no', 'airconditioning',] = 0

In [42]:
data.loc[data['hotwaterheating'] == 'yes', 'hotwaterheating',] = 1
data.loc[data['hotwaterheating'] == 'no', 'hotwaterheating',] = 0

In [43]:
data.loc[data['prefarea'] == 'yes', 'prefarea',] = 1
data.loc[data['prefarea'] == 'no', 'prefarea',] = 0

In [44]:
data.head(10)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,1.0
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,1.0
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,0.5
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,1.0
4,11410000,7420,4,1,2,1,1,1,0,1,2,0,1.0
5,10850000,7500,3,3,1,1,0,1,0,1,2,1,0.5
6,10150000,8580,4,3,4,1,0,0,0,1,2,1,0.5
7,10150000,16200,5,3,2,1,0,0,0,0,0,0,0.0
8,9870000,8100,4,1,2,1,1,1,0,1,2,1,1.0
9,9800000,5750,3,2,4,1,1,0,0,1,1,1,0.0


In [45]:
X = data[['area','bedrooms', 'bathrooms','stories','mainroad','guestroom','basement','hotwaterheating','airconditioning','parking','prefarea','furnishingstatus',]]
Y = data['price']
print(X)

     area  bedrooms  bathrooms  stories mainroad guestroom basement  \
0    7420         4          2        3        1         0        0   
1    8960         4          4        4        1         0        0   
2    9960         3          2        2        1         0        1   
3    7500         4          2        2        1         0        1   
4    7420         4          1        2        1         1        1   
..    ...       ...        ...      ...      ...       ...      ...   
540  3000         2          1        1        1         0        1   
541  2400         3          1        1        0         0        0   
542  3620         2          1        1        1         0        0   
543  2910         3          1        1        0         0        0   
544  3850         3          1        2        1         0        0   

    hotwaterheating airconditioning  parking prefarea furnishingstatus  
0                 0               1        2        1                1  
1

In [84]:
# # #standardizing input features before applying linear regression
scaler = MaxAbsScaler()
columns_to_scale = ['area', 'bedrooms', 'bathrooms', 'stories', 'mainroad', 'guestroom',
                     'basement', 'hotwaterheating', 'airconditioning', 'parking',
                     'prefarea', 'furnishingstatus']
X_scaled = scaler.fit_transform(data[columns_to_scale])
X_scaled = pd.DataFrame(X_scaled, columns=columns_to_scale)
print(X_scaled)

         area  bedrooms  bathrooms  stories  mainroad  guestroom  basement  \
0    0.458025  0.666667       0.50     0.75       1.0        0.0       0.0   
1    0.553086  0.666667       1.00     1.00       1.0        0.0       0.0   
2    0.614815  0.500000       0.50     0.50       1.0        0.0       1.0   
3    0.462963  0.666667       0.50     0.50       1.0        0.0       1.0   
4    0.458025  0.666667       0.25     0.50       1.0        1.0       1.0   
..        ...       ...        ...      ...       ...        ...       ...   
540  0.185185  0.333333       0.25     0.25       1.0        0.0       1.0   
541  0.148148  0.500000       0.25     0.25       0.0        0.0       0.0   
542  0.223457  0.333333       0.25     0.25       1.0        0.0       0.0   
543  0.179630  0.500000       0.25     0.25       0.0        0.0       0.0   
544  0.237654  0.500000       0.25     0.50       1.0        0.0       0.0   

     hotwaterheating  airconditioning   parking  prefarea  furn

In [85]:
print(X.head())

       area  bedrooms  bathrooms   stories mainroad guestroom basement  \
0  0.458025  0.340992   0.263016  0.544314        1         0        0   
1  0.553086  0.340992   1.000000  1.000000        1         0        0   
2  0.614815  0.011487   0.263016  0.088629        1         0        1   
3  0.462963  0.340992   0.263016  0.088629        1         0        1   
4  0.458025  0.340992  -0.105477  0.088629        1         1        1   

  hotwaterheating airconditioning   parking prefarea furnishingstatus  
0               0               1  0.566428        1                1  
1               0               1  1.000000        0                1  
2               0               0  0.566428        1              0.5  
3               0               1  1.000000        1                1  
4               0               1  0.566428        0                1  


In [86]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1, random_state=3)

In [87]:
print(X.shape)
print(X_train.shape)
print(X_test.shape)

(545, 12)
(490, 12)
(55, 12)


In [88]:
print(Y.shape)
print(Y_train.shape)
print(Y_test.shape)

(545,)
(490,)
(55,)


In [89]:
model = LinearRegression()

In [90]:
model.fit(X_train, Y_train)

In [91]:
prediction_on_training_data = model.predict(X_train)

In [92]:
r_sqaure_train = r2_score(Y_train, prediction_on_training_data)

In [93]:
print(f"R-squared value for training data = {r_sqaure_train}")

R-squared value for training data = 0.6812635398623481


In [94]:
prediction_on_testing_data = model.predict(X_test)
r_square_test = r2_score(Y_test, prediction_on_testing_data)
print(f"R-squared value for testing data = {r_sqaure_train}")

R-squared value for testing data = 0.6812635398623481
