In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
import kagglehub
import pickle

  from .autonotebook import tqdm as notebook_tqdm


# Dataset Preparation

**About Dataset**

https://www.kaggle.com/datasets/denkuznetz/housing-prices-regression

This task involves predicting the price of real estate properties based on various features that influence the value of a property. The dataset contains several attributes of real estate properties such as square footage, the number of bedrooms, bathrooms, floors, the year the property was built, whether the property has a garden or pool, the size of the garage, the location score, and the distance from the city center.

The goal is to build a regression model that can predict the Price of a property based on the provided features.

Dataset Columns:

| Column Name | Description |
|---|---|
| ID | A unique identifier for each property. |
| Square_Feet | The area of the property in square meters (note: the unit is square meters even though the column is named `Square_Feet`). |
| Num_Bedrooms | The number of bedrooms in the property. |
| Num_Bathrooms | The number of bathrooms in the property. |
| Num_Floors | The number of floors in the property. |
| Year_Built | The year the property was built. |
| Has_Garden | Indicates whether the property has a garden (1 for yes, 0 for no). |
| Has_Pool | Indicates whether the property has a pool (1 for yes, 0 for no). |
| Garage_Size | The size of the garage in square meters. |
| Location_Score | A score from 0 to 10 indicating the quality of the neighborhood (higher scores indicate better neighborhoods). |
| Distance_to_Center | The distance from the property to the city center in kilometers. |
| Price | The target variable that represents the price of the property. This is the value we aim to predict. |


**Objective**:
The goal of this task is to develop a regression model that predicts the Price of a real estate property using the other features as inputs. The model should learn the relationship between these features and the price so that it can provide accurate predictions for unseen data.

In [2]:
# Download the Kaggle dataset and load its CSV file into a DataFrame.
dataset_name = "real_estate_dataset.csv"
dataset_path = kagglehub.dataset_download("denkuznetz/housing-prices-regression")
# dataset_download() returns the directory holding the files; append the CSV name.
csv_path = f"{dataset_path}/{dataset_name}"
df = pd.read_csv(csv_path, index_col=False)  # index_col=False: no column is used as the index
df.head()



Unnamed: 0,ID,Square_Feet,Num_Bedrooms,Num_Bathrooms,Num_Floors,Year_Built,Has_Garden,Has_Pool,Garage_Size,Location_Score,Distance_to_Center,Price
0,1,143.63503,1,3,3,1967,1,1,48,8.297631,5.935734,602134.816747
1,2,287.678577,1,2,1,1949,0,1,37,6.061466,10.827392,591425.135386
2,3,232.998485,1,3,2,1923,1,0,14,2.911442,6.904599,464478.69688
3,4,199.664621,5,2,2,1918,0,0,17,2.070949,8.284019,583105.655996
4,5,89.00466,4,3,3,1999,1,0,34,1.523278,14.648277,619879.142523


## Data Cleaning

In [3]:
# Clean the frame in one chained expression:
#   1. drop "ID" (only a row identifier, not a feature),
#   2. drop rows that are exact duplicates of an earlier row,
#   3. drop rows containing any missing value.
# errors="ignore" keeps the cell re-runnable: the result is assigned back to
# `df`, so without it a second run raises KeyError because "ID" is already gone.
df = (
    df.drop(columns=["ID"], errors="ignore")
    .drop_duplicates()
    .dropna()
)
# Summary of remaining columns, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Square_Feet         500 non-null    float64
 1   Num_Bedrooms        500 non-null    int64  
 2   Num_Bathrooms       500 non-null    int64  
 3   Num_Floors          500 non-null    int64  
 4   Year_Built          500 non-null    int64  
 5   Has_Garden          500 non-null    int64  
 6   Has_Pool            500 non-null    int64  
 7   Garage_Size         500 non-null    int64  
 8   Location_Score      500 non-null    float64
 9   Distance_to_Center  500 non-null    float64
 10  Price               500 non-null    float64
dtypes: float64(4), int64(7)
memory usage: 43.1 KB


In [24]:
# Save the current DataFrame to disk as CSV.
# NOTE(review): with pandas' default index=True the row index is written as an
# unnamed first column — confirm that whatever reads this file expects it.
cleaned_csv_path = "dataset.csv"
df.to_csv(cleaned_csv_path)

In [4]:
# Split the frame into the feature matrix X (every column except the target)
# and the target vector y (the target column only), both as NumPy arrays.
target_column = "Price"
X = df.drop(columns=[target_column]).to_numpy()
y = df[target_column].to_numpy()

In [14]:
# Min-max scale every feature column (MinMaxScaler's default range is [0, 1]).
# The scaler is fitted on the raw feature columns taken straight from `df`
# instead of on the current value of `X`. Fitting on `X` and then overwriting
# `X` made the cell unsafe to re-run: the second run would fit the scaler on
# already-scaled data, turning it into an identity transform that would then be
# saved and silently mis-scale raw inputs later.
scaler = MinMaxScaler()
X = scaler.fit_transform(df.drop(columns=["Price"]).to_numpy())

In [25]:
# Serialize the fitted scaler to disk (presumably so the same scaling can be
# re-applied to new inputs at prediction time — confirm against the consumer).
# The context manager guarantees the file is flushed and closed; the previous
# bare open() left the handle to be closed whenever the garbage collector ran.
scalerfile = 'scaler.sav'
with open(scalerfile, 'wb') as scaler_fh:
    pickle.dump(scaler, scaler_fh)

# Modeling

In [15]:
# Fit an ordinary least-squares linear regression of y on X.
reg = LinearRegression()
reg.fit(X, y)
# R^2 computed on the same data the model was fitted on, i.e. an in-sample
# score rather than a held-out evaluation.
reg.score(X, y)

0.9759766171790658

In [21]:
# Spot check: predict the price for the row at position 4 of X.
# Indexing with a list keeps the 2-D (1, n_features) shape that predict() expects.
reg.predict(X[[4]])

array([632796.56637811])

In [23]:
# Serialize the fitted regression model to disk.
# The context manager guarantees the file is flushed and closed; the previous
# bare open() left the handle to be closed whenever the garbage collector ran.
# NOTE: only unpickle this file from a trusted location — pickle.load can
# execute arbitrary code.
filename = 'model.sav'
with open(filename, 'wb') as model_fh:
    pickle.dump(reg, model_fh)