## Use our trained XGB model to predict house prices in Amsterdam

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import pickle


# Columns kept for the rest of the notebook; 'pc4' is derived below.
used_cols = ['pc4', 'area', 'room', 'price']

# Read the listings, lower-case the column headers, take the first
# whitespace-separated token of `zip` as 'pc4' (e.g. "1092"), and keep only
# the columns listed above.
df = (
    pd.read_csv("HousingPrices-Amsterdam-August-2021.csv")
    .rename(columns=str.lower)
    .assign(pc4=lambda listings: listings["zip"].str.split().str[0])
    .loc[:, used_cols]
)

# df.head(1)

In [2]:
# Show row 100 as a one-row DataFrame (the list indexer keeps the result 2-D).
# Expected values: pc4=1092, area=67, room=3, price=500000.0
df.iloc[[100]]


Unnamed: 0,pc4,area,room,price
100,1092,67,3,500000.0


In [3]:
# Restore the saved pipeline bundle from the local pickle file.
# NOTE: unpickling can execute arbitrary code, so only load files you trust.
with open("Ams_xgb_pipeline.pkl", "rb") as pipeline_file:
    loaded = pickle.load(pipeline_file)

# The bundle is keyed by name: "dv" is the vectorizer, "model" the predictor.
dv, model = loaded["dv"], loaded["model"]


In [4]:
# helper function to return price from values entered

def predict_price(pc4, area, room, dv, model):
    """Predict a house price from its postcode area, floor area and room count.

    Parameters
    ----------
    pc4 : str, int or float
        First part of the postcode (e.g. ``"1092"``). It is a categorical
        feature, so it is always vectorized as a string. Integral floats
        such as ``1092.0`` are rendered as ``"1092"`` and surrounding
        whitespace is stripped, so the value matches the plain-string form
        used elsewhere in this notebook.
    area : number
        Floor area; converted to ``float``.
    room : number
        Number of rooms; converted to ``float``.
    dv : object
        Vectorizer exposing ``transform`` and ``feature_names_`` (the
        DictVectorizer loaded together with the model).
    model : object
        Predictor whose ``predict`` accepts an ``xgb.DMatrix`` and returns
        predictions on the log1p scale.

    Returns
    -------
    number
        ``np.expm1`` of the model's first (and only) prediction, i.e. the
        price back on its original scale.
    """
    # A postcode that passed through a float column or a mixed-dtype row
    # arrives as 1092.0; str() would then yield "1092.0", a category the
    # vectorizer has never seen, and the location would silently be ignored.
    if isinstance(pc4, (float, np.floating)) and float(pc4).is_integer():
        pc4 = int(pc4)

    # 1. Build a record shaped like the training data.
    record = {
        "pc4": str(pc4).strip(),   # pc4 is categorical → string
        "area": float(area),
        "room": float(room),
    }

    # 2. Transform with the DictVectorizer.
    X = dv.transform([record])

    # 3. Predict (log scale).
    dmatrix = xgb.DMatrix(X, feature_names=dv.feature_names_)
    y_log = model.predict(dmatrix)[0]

    # 4. Convert log1p → price.
    return np.expm1(y_log)


In [5]:
# Sanity check: run the helper on dataset row 100 and compare with the real price.
row = df.iloc[100]
pc4, area, room, real = (row[name] for name in ("pc4", "area", "room", "price"))

pred = predict_price(pc4, area, room, dv, model)

report = (
    ("Real price:     ", real),
    ("Predicted price:", pred),
    ("Error:          ", pred - real),
)
for label, value in report:
    print(label, value)
# Recorded run: error of -42163.1875, i.e. the prediction is about 8% below the real price.

Real price:      500000.0
Predicted price: 457836.8
Error:           -42163.1875


## Appendix

In [6]:
# # dv, model, list(dv.get_feature_names_out()) # unpickling works
# # Let's try to predict manually:

# # if we use df.iloc[[100]].copy() we will get a dataframe
# # row = df.iloc[[100]].copy()
# # real = int(row.price.values[0])

# # if we use single brackets we get a pandas Series object
# row = df.iloc[100]
# # type(row) # pandas.core.series.Series
# pc4  = row['pc4']
# area = row['area']
# room = row['room']
# real = row['price']
# # pc4, area, room, real
# record = {
#     "pc4": str(pc4),   # pc4 is categorical → string
#     "area": float(area),
#     "room": float(room)
# }
# # record

# # Transform with DictVectorizer
# X = dv.transform([record])
# # Predict (log scale)
# dmatrix = xgb.DMatrix(X, feature_names=dv.feature_names_)
# y_log = model.predict(dmatrix)[0]
# print(y_log)