# **Data Collection**


In [None]:
#making a root directory kaggle
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
#giving access to the directory
!chmod 600 ~/.kaggle/kaggle.json

mkdir: cannot create directory ‘/root/.kaggle’: File exists


In [None]:
# Download the housing-price dataset archive (housing-price-prediction.zip)
# into the working directory with the Kaggle CLI.
# Needs the API token copied to ~/.kaggle/kaggle.json beforehand.
!kaggle datasets download -d harishkumardatalab/housing-price-prediction

Dataset URL: https://www.kaggle.com/datasets/harishkumardatalab/housing-price-prediction
License(s): CC0-1.0
housing-price-prediction.zip: Skipping, found more recently modified local copy (use --force to force download)


In [None]:
#unzip the dataset
!unzip housing-price-prediction.zip

Archive:  housing-price-prediction.zip
replace Housing.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

# **Data Preprocessing**

In [None]:
#importing necessary packages
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression



In [None]:
# Read the extracted CSV into a DataFrame and preview its first five rows.
df_houseprice = pd.read_csv(filepath_or_buffer="Housing.csv")
df_houseprice.head(n=5)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [None]:
# Preview the last five rows to check the end of the file loaded cleanly.
df_houseprice.tail()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
540,1820000,3000,2,1,1,yes,no,yes,no,no,2,no,unfurnished
541,1767150,2400,3,1,1,no,no,no,no,no,0,no,semi-furnished
542,1750000,3620,2,1,1,yes,no,no,no,no,0,no,unfurnished
543,1750000,2910,3,1,1,no,no,no,no,no,0,no,furnished
544,1750000,3850,3,1,2,yes,no,no,no,no,0,no,unfurnished


In [None]:
# (number of rows, number of columns) of the loaded data.
df_houseprice.shape

(545, 13)

In [None]:
# Column dtypes and non-null counts: shows which columns are numeric and
# which are still text (object) and therefore need encoding.
df_houseprice.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545 entries, 0 to 544
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   price             545 non-null    int64 
 1   area              545 non-null    int64 
 2   bedrooms          545 non-null    int64 
 3   bathrooms         545 non-null    int64 
 4   stories           545 non-null    int64 
 5   mainroad          545 non-null    object
 6   guestroom         545 non-null    object
 7   basement          545 non-null    object
 8   hotwaterheating   545 non-null    object
 9   airconditioning   545 non-null    object
 10  parking           545 non-null    int64 
 11  prefarea          545 non-null    object
 12  furnishingstatus  545 non-null    object
dtypes: int64(6), object(7)
memory usage: 55.5+ KB


In [None]:
# Count the missing values in each column.
df_houseprice.isna().sum()

price               0
area                0
bedrooms            0
bathrooms           0
stories             0
mainroad            0
guestroom           0
basement            0
hotwaterheating     0
airconditioning     0
parking             0
prefarea            0
furnishingstatus    0
dtype: int64

In [None]:
# One-hot encode the furnishingstatus column into three indicator columns
# (furnishingstatus_furnished / _semi-furnished / _unfurnished).
# The guard keeps the cell re-runnable: once the original column has been
# replaced, calling get_dummies on it again would raise a KeyError.
if 'furnishingstatus' in df_houseprice.columns:
    df_houseprice = pd.get_dummies(df_houseprice, columns=['furnishingstatus'])
df_houseprice

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus_furnished,furnishingstatus_semi-furnished,furnishingstatus_unfurnished
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,True,False,False
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,True,False,False
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,False,True,False
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,True,False,False
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,1820000,3000,2,1,1,yes,no,yes,no,no,2,no,False,False,True
541,1767150,2400,3,1,1,no,no,no,no,no,0,no,False,True,False
542,1750000,3620,2,1,1,yes,no,no,no,no,0,no,False,False,True
543,1750000,2910,3,1,1,no,no,no,no,no,0,no,True,False,False


In [None]:
# Lookup tables used to turn categorical values into 0/1 integers.
# 'yes'/'no' answers become 1/0 ...
yes_no_map = {
    'yes': 1,
    'no': 0,
}
# ... and boolean indicator values become 1/0 as well.
true_false_map = {
    True: 1,
    False: 0,
}

In [None]:
# Convert the yes/no flag columns and the one-hot furnishing columns to 0/1
# integers so every feature is numeric.
yes_no_columns = [
    'mainroad', 'guestroom', 'basement',
    'hotwaterheating', 'airconditioning', 'prefarea',
]
furnishing_columns = [
    'furnishingstatus_furnished',
    'furnishingstatus_semi-furnished',
    'furnishingstatus_unfurnished',
]

for column in yes_no_columns:
    # Only map columns that still hold 'yes'/'no' text. Series.map returns NaN
    # for values missing from the mapping, so re-running this cell on the
    # already-encoded 0/1 values would otherwise wipe the column out.
    if not pd.api.types.is_numeric_dtype(df_houseprice[column]):
        df_houseprice[column] = df_houseprice[column].map(yes_no_map)

for column in furnishing_columns:
    # Safe to repeat: 1 and 0 compare and hash equal to True and False, so
    # already-encoded values map to themselves.
    df_houseprice[column] = df_houseprice[column].map(true_false_map)

In [None]:
# Display the frame after encoding to confirm every column is now numeric.
df_houseprice

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus_furnished,furnishingstatus_semi-furnished,furnishingstatus_unfurnished
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,1,0,0
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,1,0,0
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,0,1,0
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,1,0,0
4,11410000,7420,4,1,2,1,1,1,0,1,2,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
540,1820000,3000,2,1,1,1,0,1,0,0,2,0,0,0,1
541,1767150,2400,3,1,1,0,0,0,0,0,0,0,0,1,0
542,1750000,3620,2,1,1,1,0,0,0,0,0,0,0,0,1
543,1750000,2910,3,1,1,0,0,0,0,0,0,0,1,0,0


In [None]:
# Separate the target from the predictors:
# y is the price column, X is every remaining column.
y = df_houseprice['price']
X = df_houseprice.drop(columns='price')




In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize the features using the mean/std of the TRAINING rows only.
# Fitting the scaler on every row would let test-set statistics leak into
# preprocessing. X is still returned as one scaled array covering all rows,
# so the train/test split cell can consume it as before.
X_values = np.asarray(X, dtype=float)

# Recover the row positions that end up in the training set. For a fixed
# number of rows the shuffle depends only on test_size and random_state, so
# these two values MUST stay identical to the ones used in the
# train_test_split(X, y, ...) cell.
train_rows, _ = train_test_split(
    np.arange(len(X_values)), test_size=0.2, random_state=42
)

scaler = StandardScaler()
scaler.fit(X_values[train_rows])
X = scaler.transform(X_values)
X

array([[ 1.04672629,  1.40341936,  1.42181174, ...,  1.70084013,
        -0.84488844, -0.6964292 ],
       [ 1.75700953,  1.40341936,  5.40580863, ...,  1.70084013,
        -0.84488844, -0.6964292 ],
       [ 2.21823241,  0.04727831,  1.42181174, ..., -0.58794474,
         1.18358821, -0.6964292 ],
       ...,
       [-0.70592066, -1.30886273, -0.57018671, ..., -0.58794474,
        -0.84488844,  1.43589615],
       [-1.03338891,  0.04727831, -0.57018671, ...,  1.70084013,
        -0.84488844, -0.6964292 ],
       [-0.5998394 ,  0.04727831, -0.57018671, ..., -0.58794474,
        -0.84488844,  1.43589615]])

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# shuffle, and therefore the split, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)



# **Training the Linear Regression Model**

In [None]:
# Create an ordinary least-squares linear regression estimator with
# scikit-learn's default settings (not yet fitted).
lr_model = LinearRegression()


In [None]:
# Fit the model on the training split only; the test split stays unseen
# until evaluation.
lr_model.fit(X_train, y_train)


In [None]:
# Predict prices for the held-out test features; these predictions are what
# the evaluation metrics are computed from.
y_pred = lr_model.predict(X_test)

In [None]:
# R^2 on the TRAINING data (goodness of fit, not generalization); compare it
# with the test-set R^2 computed under Model Evaluation.
lr_model.score(X_train,y_train)



0.6859438988560158

# **Model Evaluation**

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Score the test-set predictions against the true prices.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
# RMSE is the square root of the MSE, which puts the error back in the same
# units as price.
rmse = mse ** 0.5


In [None]:
# Report the evaluation metrics, one per line, rounded to two decimals.
print(
    f'MAE: {mae:.2f}',
    f'MSE: {mse:.2f}',
    f'RMSE: {rmse:.2f}',
    f'R2: {r2:.2f}',
    sep='\n',
)


MAE: 970043.40
MSE: 1754318687330.67
RMSE: 1324506.96
R2: 0.65
