In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Render every float in DataFrame/Series output with exactly three decimals.
pd.set_option("display.float_format", "{:.3f}".format)


# Read Data

This notebook uses the Kaggle USA Real Estate dataset to predict house prices. The dataset contains the following columns:

- **brokered_by** (categorically encoded agency/broker)
- **status** (Housing status - a. ready for sale or b. ready to build)
- **price** (Housing price, it is either the current listing price or recently sold price if the house is sold recently)
- **bed** (# of beds)
- **bath** (# of bathrooms)
- **acre_lot** (Property / Land size in acres)
- **street** (categorically encoded street address)
- **city** (city name)
- **state** (state name)
- **zip_code** (postal code of the area)
- **house_size** (house area/size/living space in square feet)
- **prev_sold_date** (Previously sold date)
- **prev_sold_price** (Previously sold price; note that this column does not appear in the CSV loaded below)

The goal is to predict the price of the house based on the given features.

URL: https://www.kaggle.com/datasets/ahmedshahriarsakib/usa-real-estate-dataset/code

In [2]:
import os

# Build the path with os.path.join so it works on any OS path separator.
csv_path = os.path.join("data", "usa-house-data.csv")

# Load the raw listings and preview the first rows.
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,brokered_by,status,price,bed,bath,acre_lot,street,city,state,zip_code,house_size,prev_sold_date
0,103378.0,for_sale,105000.0,3.0,2.0,0.12,1962661.0,Adjuntas,Puerto Rico,601.0,920.0,
1,52707.0,for_sale,80000.0,4.0,2.0,0.08,1902874.0,Adjuntas,Puerto Rico,601.0,1527.0,
2,103379.0,for_sale,67000.0,2.0,1.0,0.15,1404990.0,Juana Diaz,Puerto Rico,795.0,748.0,
3,31239.0,for_sale,145000.0,4.0,2.0,0.1,1947675.0,Ponce,Puerto Rico,731.0,1800.0,
4,34632.0,for_sale,65000.0,6.0,2.0,0.05,331151.0,Mayaguez,Puerto Rico,680.0,,


# Data pre-processing

## Data cleaning

In [3]:
# Outlier filter: fewer than 10 beds and baths, price below 10M, and a
# house size above 10. Comparisons against NaN are False, so rows missing
# any of these four values are removed by this mask as well.
plausible = (
    df["bed"].lt(10)
    & df["bath"].lt(10)
    & df["price"].lt(10000000)
    & df["house_size"].gt(10)
)

df = (
    df.loc[plausible]
    # Rows must have a price ...
    .dropna(subset=["price"])
    # ... and that price must be strictly positive.
    .loc[lambda frame: frame["price"] > 0]
    # Rows must also have bed, bath, house_size and acre_lot filled in.
    .dropna(subset=["bed", "bath", "house_size", "acre_lot"])
)

# Summary statistics of the cleaned numeric columns.
df.describe()

Unnamed: 0,brokered_by,price,bed,bath,acre_lot,street,zip_code,house_size
count,1351907.0,1354142.0,1354142.0,1354142.0,1354142.0,1350179.0,1354055.0,1354142.0
mean,53574.308,536161.494,3.348,2.507,12.43,931701.796,55272.663,2090.818
std,30460.767,638705.289,1.012,1.091,797.7,533250.237,29424.713,4141.133
min,0.0,1.0,1.0,1.0,0.0,63.0,601.0,100.0
25%,24829.5,239000.0,3.0,2.0,0.14,468477.5,30252.0,1358.0
50%,53008.0,375900.0,3.0,2.0,0.21,934059.0,55391.0,1808.0
75%,79221.0,599900.0,4.0,3.0,0.46,1392525.5,83330.0,2464.0
max,110142.0,9999999.0,9.0,9.0,100000.0,2001321.0,99999.0,1560780.0


In [4]:
# Columns that are not used as model features.
# "street" and "zip_code" are intentionally kept in the frame.
unused_columns = ["prev_sold_date", "brokered_by", "acre_lot"]

df = df.drop(columns=unused_columns)

In [5]:
# The target is the house price, so only listings whose status is "sold"
# are kept.
sold_only = df.loc[df["status"].eq("sold")]

# After filtering, "status" holds a single value and carries no
# information, so it is dropped.
df = sold_only.drop(columns=["status"])

df.head()

Unnamed: 0,price,bed,bath,street,city,state,zip_code,house_size
1414374,524900.0,3.0,2.0,1850809.0,Aguada,Puerto Rico,602.0,2200.0
1414376,90000.0,3.0,2.0,141421.0,Aguadilla,Puerto Rico,603.0,1421.0
1414377,22500.0,2.0,1.0,1889894.0,Anasco,Puerto Rico,610.0,850.0
1414378,168000.0,6.0,4.0,1157317.0,Anasco,Puerto Rico,610.0,3422.0
1414379,200000.0,3.0,1.0,1855411.0,Arecibo,Puerto Rico,612.0,580.0


## Data encoding

### Label encoding
https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html

In [6]:
from sklearn.preprocessing import LabelEncoder

# Replace each distinct city name with an integer code.
# NOTE(review): the codes impose an arbitrary ordering on cities; confirm
# this is acceptable for the models trained later in the notebook.
city_encoder = LabelEncoder()
df["city"] = city_encoder.fit_transform(df["city"])

df.head(10)

Unnamed: 0,price,bed,bath,street,city,state,zip_code,house_size
1414374,524900.0,3.0,2.0,1850809.0,56,Puerto Rico,602.0,2200.0
1414376,90000.0,3.0,2.0,141421.0,57,Puerto Rico,603.0,1421.0
1414377,22500.0,2.0,1.0,1889894.0,219,Puerto Rico,610.0,850.0
1414378,168000.0,6.0,4.0,1157317.0,219,Puerto Rico,610.0,3422.0
1414379,200000.0,3.0,1.0,1855411.0,305,Puerto Rico,612.0,580.0
1414380,380000.0,2.0,2.0,1857001.0,305,Puerto Rico,612.0,1210.0
1414382,250000.0,5.0,2.0,1946165.0,305,Puerto Rico,612.0,1210.0
1414387,130000.0,3.0,1.0,1911843.0,1370,Puerto Rico,623.0,847.0
1414390,165000.0,2.0,1.0,1856948.0,1370,Puerto Rico,623.0,696.0
1414398,120000.0,4.0,2.0,1921908.0,8879,Puerto Rico,637.0,1188.0


### One hot encoding

#### Pandas
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.get_dummies.html

In [7]:
# One-hot encode "state": one boolean column per state replaces the
# original column. The prefix "state_" is joined to the state name with
# the default "_" separator, hence the "state__<name>" column names.
data = pd.get_dummies(
    df,
    columns=["state"],
    prefix=["state_"],
)

data.head()

Unnamed: 0,price,bed,bath,street,city,zip_code,house_size,state__Alabama,state__Arizona,state__Arkansas,...,state__Tennessee,state__Texas,state__Utah,state__Vermont,state__Virgin Islands,state__Virginia,state__Washington,state__West Virginia,state__Wisconsin,state__Wyoming
1414374,524900.0,3.0,2.0,1850809.0,56,602.0,2200.0,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1414376,90000.0,3.0,2.0,141421.0,57,603.0,1421.0,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1414377,22500.0,2.0,1.0,1889894.0,219,610.0,850.0,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1414378,168000.0,6.0,4.0,1157317.0,219,610.0,3422.0,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1414379,200000.0,3.0,1.0,1855411.0,305,612.0,580.0,False,False,False,...,False,False,False,False,False,False,False,False,False,False


## Feature normalization

In [8]:
# Shuffle the rows. A fixed random_state makes the row order, and every
# table displayed from here on, reproducible after Restart & Run All.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

# Split data into features and label.
# `label` keeps the target ("price"); `data` keeps only the feature columns.
# Both share the fresh 0..n-1 index created by reset_index above.
label = data["price"]
data = data.drop(columns=["price"])

In [9]:
from sklearn import preprocessing

# Min-max normalization: rescale every feature column to the [0, 1] range.
# NOTE(review): the scaler is fit on all rows, before the train/test split
# performed in a later cell - confirm this is intended.
scaler = preprocessing.MinMaxScaler()

# Missing feature values are replaced with 0 before scaling.
dataset = data.fillna(0)

# fit_transform returns a plain array, so re-wrap it with the column names.
scaled_values = scaler.fit_transform(dataset)
dataset = pd.DataFrame(scaled_values, columns=dataset.columns)

dataset.head()


Unnamed: 0,bed,bath,street,city,zip_code,house_size,state__Alabama,state__Arizona,state__Arkansas,state__California,...,state__Tennessee,state__Texas,state__Utah,state__Vermont,state__Virgin Islands,state__Virginia,state__Washington,state__West Virginia,state__Wisconsin,state__Wyoming
0,0.25,0.375,0.014,0.129,0.303,0.003,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.25,0.125,0.675,0.025,0.705,0.002,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.25,0.0,0.9,0.075,0.354,0.001,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.375,0.125,0.343,0.641,0.557,0.002,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.916,0.788,0.947,0.001,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Dimensionality reduction

In [10]:
from sklearn.feature_selection import VarianceThreshold

# Shape before feature selection.
print(dataset.shape)

# Keep only the features whose variance exceeds 0.01.
# NOTE(review): in the recorded output `house_size` is among the removed
# columns. Presumably a few extreme house sizes compress the min-max
# scaled values towards 0 - verify before relying on this selection.
selector = VarianceThreshold(threshold=0.01)
reduced_values = selector.fit_transform(dataset)

# get_support() is a boolean mask over the original columns.
kept_columns = dataset.columns[selector.get_support()]
dataset = pd.DataFrame(reduced_values, columns=kept_columns)

# Shape after feature selection.
print(dataset.shape)

dataset.head()

(608528, 57)
(608528, 27)


Unnamed: 0,bed,bath,street,city,zip_code,state__Arizona,state__California,state__Florida,state__Georgia,state__Illinois,...,state__New York,state__North Carolina,state__Ohio,state__Oklahoma,state__Oregon,state__Pennsylvania,state__Texas,state__Virginia,state__Washington,state__Wisconsin
0,0.25,0.375,0.014,0.129,0.303,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.25,0.125,0.675,0.025,0.705,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.25,0.0,0.9,0.075,0.354,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.375,0.125,0.343,0.641,0.557,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.916,0.788,0.947,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Prediction

## Generate train and test datasets

In [11]:
# Generate train and test datasets
from sklearn.model_selection import train_test_split

# Features and target under the conventional X / y names.
X, y = dataset, label

# Hold out 20% of the rows for testing; the seed keeps the split stable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=55)
X_train, X_test, y_train, y_test = split_parts

X_train.head()

Unnamed: 0,bed,bath,street,city,zip_code,state__Arizona,state__California,state__Florida,state__Georgia,state__Illinois,...,state__New York,state__North Carolina,state__Ohio,state__Oklahoma,state__Oregon,state__Pennsylvania,state__Texas,state__Virginia,state__Washington,state__Wisconsin
521633,0.25,0.125,0.251,0.202,0.78,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
413918,0.125,0.0,0.247,0.781,0.635,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
174184,0.25,0.125,0.158,0.554,0.331,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
520957,0.25,0.25,0.053,0.996,0.238,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
575908,0.375,0.25,0.822,0.787,0.946,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Preview the first five target prices.
y.head(5)

0   527900.000
1   219000.000
2   120000.000
3   225000.000
4   825000.000
Name: price, dtype: float64

# Train and evaluate models

In [13]:
# Baseline: ordinary least-squares linear regression.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import (
    explained_variance_score,
    mean_absolute_error,
    mean_squared_error,
    r2_score,
)

model1 = LinearRegression()
model1.fit(X_train, y_train)

# Predict on the held-out test rows.
y_pred = model1.predict(X_test)

# Report each metric on the test set, one per line.
for metric_label, metric_fn in (
    ("Mean Squared Error: ", mean_squared_error),
    ("R2 Score: ", r2_score),
    ("Mean Absolute Error: ", mean_absolute_error),
    ("Explained Variance Score: ", explained_variance_score),
):
    print(metric_label, metric_fn(y_test, y_pred))


Mean Squared Error:  177012562825.09674
R2 Score:  0.397906615493161
Mean Absolute Error:  222646.35233273127
Explained Variance Score:  0.39791074751750466


In [14]:
# Train an XGBoost regressor (XGBRegressor with its default settings).
from xgboost import XGBRegressor
from sklearn.metrics import (
    explained_variance_score,
    mean_absolute_error,
    mean_squared_error,
    r2_score,
)

model2 = XGBRegressor()
model2.fit(X_train, y_train)

# Predict on the held-out test rows.
y_pred = model2.predict(X_test)

# Report each metric on the test set, one per line.
for metric_label, metric_fn in (
    ("Mean Squared Error: ", mean_squared_error),
    ("R2 Score: ", r2_score),
    ("Mean Absolute Error: ", mean_absolute_error),
    ("Explained Variance Score: ", explained_variance_score),
):
    print(metric_label, metric_fn(y_test, y_pred))


Mean Squared Error:  87300263593.86206
R2 Score:  0.7030554761951887
Mean Absolute Error:  141382.34277066123
Explained Variance Score:  0.7030555830444891


In [15]:
# Train a multi-layer perceptron regressor.
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import (
    explained_variance_score,
    mean_absolute_error,
    mean_squared_error,
    r2_score,
)

mlp_params = {
    "hidden_layer_sizes": (100, 50),  # Two hidden layers: 100 then 50 neurons
    "activation": "relu",  # ReLU activation in the hidden layers
    "solver": "adam",  # Adam optimizer for the weights
    "max_iter": 500,  # Upper bound on training iterations (epochs)
    "random_state": 42,  # Fixed seed for reproducibility
}
model3 = MLPRegressor(**mlp_params)
model3.fit(X_train, y_train)

# Predict on the held-out test rows.
y_pred = model3.predict(X_test)

# Report each metric on the test set, one per line.
for metric_label, metric_fn in (
    ("Mean Squared Error: ", mean_squared_error),
    ("R2 Score: ", r2_score),
    ("Mean Absolute Error: ", mean_absolute_error),
    ("Explained Variance Score: ", explained_variance_score),
):
    print(metric_label, metric_fn(y_test, y_pred))

Mean Squared Error:  146837689403.4055
R2 Score:  0.500543916346678
Mean Absolute Error:  191049.58211100174
Explained Variance Score:  0.5005449879961086




In [16]:
# Compare y_pred and y_test side by side.
# `y_pred` holds the predictions of whichever model cell ran last (the
# neural network when the notebook is run top to bottom).
#
# The table gets its own name so the cleaned dataset `df` is not
# overwritten, and `y_test` / `y_pred` are left untouched so the model
# cells above can be re-run without surprises.
comparison = pd.DataFrame(
    {
        # Use positional values so both columns line up row by row on a
        # fresh 0..n-1 index, regardless of y_test's shuffled index.
        "y_test": y_test.to_numpy(),
        "y_pred": np.asarray(y_pred),
    }
)

# Signed error: positive means the model under-predicted the price.
comparison["diff"] = comparison["y_test"] - comparison["y_pred"]

comparison.head(10)


Unnamed: 0,y_test,y_pred,diff
0,495000.0,802883.885,-307883.885
1,200000.0,193971.9,6028.1
2,429000.0,358423.457,70576.543
3,425000.0,456439.477,-31439.477
4,5988000.0,3514992.594,2473007.406
5,259900.0,165754.739,94145.261
6,305000.0,428931.944,-123931.944
7,295000.0,309298.541,-14298.541
8,363500.0,555800.513,-192300.513
9,369990.0,422074.854,-52084.854


## Save the model

In [17]:
# Persist the trained XGBoost model (model2) to a JSON file.
# NOTE(review): only the model is saved here; the scaler, feature selector
# and city encoder fitted above are not written to disk in this notebook.
model_path = "model.json"
model2.save_model(model_path)


## Load the model

In [18]:
# Restore the XGBoost model from the JSON file into a fresh estimator.
model_path = "model.json"

model2 = XGBRegressor()
model2.load_model(model_path)
