In [6]:
# Basic imports
import pandas as pd
import numpy as np

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

# Machine Learning libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import  r2_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

from sklearn.base import BaseEstimator, TransformerMixin

In [7]:
# Loading of the dataset via pandas
kc_data = pd.read_csv("../data/King_County_House_prices_dataset.csv")

In [5]:
kc_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21597 entries, 0 to 21596
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             21597 non-null  int64  
 1   date           21597 non-null  object 
 2   price          21597 non-null  float64
 3   bedrooms       21597 non-null  int64  
 4   bathrooms      21597 non-null  float64
 5   sqft_living    21597 non-null  int64  
 6   sqft_lot       21597 non-null  int64  
 7   floors         21597 non-null  float64
 8   waterfront     19221 non-null  float64
 9   view           21534 non-null  float64
 10  condition      21597 non-null  int64  
 11  grade          21597 non-null  int64  
 12  sqft_above     21597 non-null  int64  
 13  sqft_basement  21597 non-null  object 
 14  yr_built       21597 non-null  int64  
 15  yr_renovated   17755 non-null  float64
 16  zipcode        21597 non-null  int64  
 17  lat            21597 non-null  float64
 18  long  

## Data Cleaning


#### Bedrooms and bathrooms

In [8]:
def bath_bed_ratio_outlier(df):
    """Add a bathrooms-per-bedroom ratio and remove rows with an implausible ratio.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``bathrooms`` and ``bedrooms``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the additional column ``bath_bed_ratio``. Rows whose
        ratio is ``>= 2`` or ``<= 0.10`` are removed. The input frame is left
        untouched.
    """
    result = df.copy()
    result["bath_bed_ratio"] = result["bathrooms"] / result["bedrooms"]
    # A boolean mask filters by the real index labels, so this also works when
    # the index has gaps (e.g. after single rows were dropped beforehand).
    # NaN ratios (0 / 0) compare False on both sides and are therefore kept.
    is_outlier = (result["bath_bed_ratio"] >= 2) | (result["bath_bed_ratio"] <= 0.10)
    return result.loc[~is_outlier].copy()

In [9]:
class BathBedRoomTransformer(BaseEstimator, TransformerMixin):
    """Pipeline step that adds ``bath_bed_ratio`` and removes rows with an implausible ratio."""

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return a filtered copy of ``X`` with the extra column ``bath_bed_ratio``.

        ``X`` must contain the columns ``bathrooms`` and ``bedrooms``. Rows whose
        ratio ``bathrooms / bedrooms`` is ``>= 2`` or ``<= 0.10`` are removed.
        The input frame is not modified.
        """
        result = X.copy()
        result["bath_bed_ratio"] = result["bathrooms"] / result["bedrooms"]
        # Filter with a boolean mask instead of dropping by position: positions
        # and index labels differ as soon as the index has gaps.
        # NaN ratios (0 / 0) compare False on both sides and are therefore kept.
        is_outlier = (result["bath_bed_ratio"] >= 2) | (result["bath_bed_ratio"] <= 0.10)
        return result.loc[~is_outlier].copy()

In [5]:
# Remove the single row with index label 15856.
# NOTE(review): the reason this row is considered invalid is not documented here — confirm.
# Re-binding (instead of inplace=True) together with errors="ignore" keeps the
# cell re-runnable: a second execution no longer raises a KeyError.
kc_data = kc_data.drop(index=15856, errors="ignore")

#### Sqft Basement

In [10]:
def sqft_basement(df):
    """Recompute the basement area as living area minus above-ground area.

    The values previously stored in ``sqft_basement`` (including the ``"?"``
    placeholders for unknown values) are discarded, because the column is
    recomputed for every row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``sqft_living`` and ``sqft_above``.

    Returns
    -------
    pandas.DataFrame
        A new frame whose ``sqft_basement`` column holds
        ``sqft_living - sqft_above`` as float. The input frame is left untouched.
    """
    result = df.copy()
    # Cast to float so the result has the same dtype as SqftBasementTransformer's.
    result["sqft_basement"] = (result["sqft_living"] - result["sqft_above"]).astype(float)
    return result

In [11]:
class SqftBasementTransformer(BaseEstimator, TransformerMixin):
    """Pipeline step that recomputes ``sqft_basement`` from the living and above-ground areas."""

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with ``sqft_basement = sqft_living - sqft_above`` as float.

        The values previously stored in ``sqft_basement`` (including the ``"?"``
        placeholders) are discarded, because the column is recomputed for every
        row. The input frame is not modified.
        """
        result = X.copy()
        result["sqft_basement"] = (result["sqft_living"] - result["sqft_above"]).astype(float)
        return result

In [8]:
# Recompute the basement area for the whole data set
basement_transformer = SqftBasementTransformer()
kc_data = basement_transformer.fit_transform(kc_data)

#### Missing Values

In [12]:
def fill_missings_view_wf(df):
    """Replace missing values in ``view`` and ``waterfront`` with 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``view`` and ``waterfront``.

    Returns
    -------
    pandas.DataFrame
        A new frame with both columns filled. The input frame is left untouched.
    """
    # assign() builds a new frame, so the caller's data keeps its missing values.
    return df.assign(
        view=df["view"].fillna(0),
        waterfront=df["waterfront"].fillna(0),
    )

In [13]:
class ViewWaterfrontTransformer(BaseEstimator, TransformerMixin):
    """Pipeline step that fills missing ``view`` and ``waterfront`` values with 0."""

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` in which missing ``view``/``waterfront`` values are 0.

        The input frame is not modified.
        """
        result = X.copy()
        result["view"] = result["view"].fillna(0)
        result["waterfront"] = result["waterfront"].fillna(0)
        return result

In [11]:
# Fill the missing view / waterfront values for the whole data set
missing_transformer = ViewWaterfrontTransformer()
kc_data = missing_transformer.fit_transform(kc_data)

#### Last Known Change

In [14]:
def calculate_last_change(df):
    """Combine ``yr_renovated`` and ``yr_built`` into one column ``last_known_change``.

    For every row the renovation year is used if it is known and non-zero,
    otherwise the year the house was built.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``yr_renovated`` and ``yr_built``.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``yr_renovated`` and ``yr_built`` and with the
        integer column ``last_known_change`` appended. The input frame is left
        untouched.
    """
    renovated = df["yr_renovated"]
    # Missing (NaN) and 0 both mean "no known renovation".
    has_renovation = renovated.notna() & (renovated != 0)
    result = df.drop(columns=["yr_renovated", "yr_built"])
    result["last_known_change"] = renovated.where(has_renovation, df["yr_built"]).astype("int64")
    return result

In [15]:
class LastKnownChangeTransformer(BaseEstimator, TransformerMixin):
    """Pipeline step that merges ``yr_renovated`` and ``yr_built`` into ``last_known_change``."""

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return a new frame with ``last_known_change`` instead of the two year columns.

        For every row the renovation year is used if it is known and non-zero,
        otherwise the year the house was built. ``yr_renovated`` and
        ``yr_built`` are not part of the result. The input frame is not modified.
        """
        renovated = X["yr_renovated"]
        # Missing (NaN) and 0 both mean "no known renovation".
        has_renovation = renovated.notna() & (renovated != 0)
        result = X.drop(columns=["yr_renovated", "yr_built"])
        result["last_known_change"] = renovated.where(has_renovation, X["yr_built"]).astype("int64")
        return result

In [10]:
# Derive the year of the last known change for the whole data set
last_change_transformer = LastKnownChangeTransformer()
kc_data = last_change_transformer.fit_transform(kc_data)

NameError: name 'LastKnownChangeTransformer' is not defined

In [16]:
from sklearn.pipeline import Pipeline

# The four cleaning transformers, chained in the order in which they are applied
data_cleaning_pipeline = Pipeline(steps=[
    ('bathroom_bedroom_ratio', BathBedRoomTransformer()),
    ('sqft_basement', SqftBasementTransformer()),
    ('view_and_waterfront', ViewWaterfrontTransformer()),
    ('last_known_change', LastKnownChangeTransformer()),
])

## Feature Engineering

In [17]:
class SqftPriceCreator(BaseEstimator, TransformerMixin):
    """Pipeline step that adds the price per square foot of living plus lot area."""

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the extra column ``sqft_price``.

        ``sqft_price`` is ``price / (sqft_living + sqft_lot)``, rounded to two
        decimals. The input frame is not modified.
        """
        result = X.copy()
        result['sqft_price'] = (result['price'] / (result['sqft_living'] + result['sqft_lot'])).round(2)
        return result

In [23]:
# Add the price per square foot to the whole data set
sqft_price_transformer = SqftPriceCreator()
kc_data = sqft_price_transformer.fit_transform(kc_data)

#### Distance to the centre

In [18]:
class CenterDistanceCreator(BaseEstimator, TransformerMixin):
    """Pipeline step that adds the distance of every house to a fixed centre point."""

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with ``delta_lat``, ``delta_long`` and ``center_distance``.

        ``X`` must contain the columns ``lat`` and ``long``. The centre is the
        point (47.62774, -122.24194). The input frame is not modified.
        """
        result = X.copy()
        # Absolute difference of latitude between centre and property
        result['delta_lat'] = np.absolute(47.62774 - result['lat'])
        # Absolute difference of longitude between centre and property
        result['delta_long'] = np.absolute(-122.24194 - result['long'])
        # Distance between centre and property: the longitude difference is
        # scaled by cos(latitude), then degrees are converted via 2*pi*6378/360.
        # NOTE(review): the cosine uses latitude 47.6219 while the centre latitude
        # above is 47.62774 — kept unchanged so that the results stay the same.
        result['center_distance'] = ((result['delta_long'] * np.cos(np.radians(47.6219)))**2
                                     + result['delta_lat']**2)**(1/2)*2*np.pi*6378/360
        return result

In [25]:
# Add the distance to the centre to the whole data set
center_distance_transformer = CenterDistanceCreator()
kc_data = center_distance_transformer.fit_transform(kc_data)


#### Distance to the beach promenade

In [19]:
# Helper used to measure how far a house is from a reference house (e.g. one on the waterfront).
def dist(long, lat, ref_long, ref_lat):
    """Approximate distance in km between a location and a reference location.

    Parameters: ``long`` and ``lat`` describe the location of interest,
    ``ref_long`` and ``ref_lat`` the reference location (all in degrees).
    """
    lat_offset = lat - ref_lat
    # The longitude offset is scaled by the cosine of the reference latitude
    long_offset = (long - ref_long) * np.cos(np.radians(ref_lat))
    squared_offsets = long_offset**2 + lat_offset**2
    # Square root of the summed squares, then conversion from degrees to km
    return squared_offsets**(1/2)*2*np.pi*6378/360

In [20]:
class WaterDistanceCreator(BaseEstimator, TransformerMixin):
    """Pipeline step that adds the distance of every house to the nearest waterfront house."""

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the extra column ``water_distance``.

        ``X`` must contain the columns ``waterfront``, ``long`` and ``lat``.
        The reference points are the rows of ``X`` itself with
        ``waterfront == 1``. The input frame is not modified.

        Raises
        ------
        ValueError
            If ``X`` has rows but none of them has ``waterfront == 1``.
        """
        result = X.copy()
        waterfront_houses = result.query('waterfront == 1')
        if len(result) > 0 and waterfront_houses.empty:
            raise ValueError(
                "water_distance cannot be computed: no row with waterfront == 1"
            )

        house_longs = result['long'].to_numpy()
        house_lats = result['lat'].to_numpy()
        # Keep a running minimum over the waterfront houses: one vectorised
        # distance computation per reference house instead of one Python call
        # per (house, reference house) pair.
        nearest = np.full(len(result), np.inf)
        for ref_long, ref_lat in zip(waterfront_houses['long'].to_numpy(),
                                     waterfront_houses['lat'].to_numpy()):
            nearest = np.minimum(nearest, dist(house_longs, house_lats, ref_long, ref_lat))
        result['water_distance'] = nearest
        return result

In [42]:
# Add the distance to the nearest waterfront house to the whole data set
water_distance_transformer = WaterDistanceCreator()
kc_data = water_distance_transformer.fit_transform(kc_data)

NameError: name 'WaterDistanceCreator' is not defined

In [21]:
class DropNoPredictionValues(BaseEstimator, TransformerMixin):
    """Pipeline step that removes helper columns which must not be used for prediction."""

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` without the excluded columns; all other columns keep their order."""
        excluded = ('sqft_price', 'date', 'delta_lat', 'delta_long', 'bath_bed_ratio')
        kept_columns = list(filter(lambda column: column not in excluded, X.columns))
        return X[kept_columns]
        

In [22]:
from sklearn.pipeline import Pipeline

# The feature-engineering transformers, chained in the order in which they are applied
feature_engineering_pipeline = Pipeline(steps=[
    ('sqft_price', SqftPriceCreator()),
    ('center_distance', CenterDistanceCreator()),
    ('water_distance', WaterDistanceCreator()),
    ('no_pred_values', DropNoPredictionValues()),
])

In [37]:
pd.DataFrame([['sqft_basement', '?']])

Unnamed: 0,0,1
0,sqft_basement,?


In [23]:
feature_engineering_pipeline

### Preprocessor Pipeline

In [24]:
# Complete preprocessing: data cleaning first, feature engineering second
preprocessor_pipe = Pipeline(steps=[
    ('data_cleaning', data_cleaning_pipeline),
    ('feature_engineering', feature_engineering_pipeline),
])

In [25]:
preprocessor_pipe

In [34]:
import sys
sys.path.insert(1,'../pipeline/')
from preprocessing_king_county import PreprocessingKingCountyData

In [35]:
preprocessor_pipe_skript = PreprocessingKingCountyData()

In [28]:
kc_data_test = pd.read_csv("../data/King_County_House_prices_dataset.csv")

In [33]:
kc_data_test

Unnamed: 0,id,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,...,sqft_above,sqft_basement,zipcode,lat,long,sqft_living15,sqft_lot15,last_known_change,center_distance,water_distance
0,7129300520,221900.0,3,1.00,1180,5650,1.0,0.0,0.0,3,...,1180,0.0,98178,47.5112,-122.257,1340,5650,1955,13.022012,0.678977
1,6414100192,538000.0,3,2.25,2570,7242,2.0,0.0,0.0,3,...,2170,400.0,98125,47.7210,-122.319,1690,7639,1991,11.882906,2.910551
2,5631500400,180000.0,2,1.00,770,10000,1.0,0.0,0.0,3,...,770,0.0,98028,47.7379,-122.233,2720,8062,1933,12.281023,2.327626
3,2487200875,604000.0,4,3.00,1960,5000,1.0,0.0,0.0,5,...,1050,910.0,98136,47.5208,-122.393,1360,5000,1965,16.436889,0.467532
4,1954400510,510000.0,3,2.00,1680,8080,1.0,0.0,0.0,3,...,1680,0.0,98074,47.6168,-122.045,1800,7503,1987,14.826499,1.726771
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21592,263000018,360000.0,3,2.50,1530,1131,3.0,0.0,0.0,3,...,1530,0.0,98103,47.6993,-122.346,1530,1509,2009,11.154088,5.346761
21593,6600060120,400000.0,4,2.50,2310,5813,2.0,0.0,0.0,3,...,2310,0.0,98146,47.5107,-122.362,1830,7200,2014,15.839476,1.724718
21594,1523300141,402101.0,2,0.75,1020,1350,2.0,0.0,0.0,3,...,1020,0.0,98144,47.5944,-122.299,1020,2007,2009,5.665915,0.925361
21595,291310100,400000.0,3,2.50,1600,2388,2.0,0.0,0.0,3,...,1600,0.0,98027,47.5345,-122.069,1410,1287,2004,16.616144,2.402901


In [32]:
kc_data_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 21584 entries, 0 to 21596
Data columns (total 21 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 21584 non-null  int64  
 1   price              21584 non-null  float64
 2   bedrooms           21584 non-null  int64  
 3   bathrooms          21584 non-null  float64
 4   sqft_living        21584 non-null  int64  
 5   sqft_lot           21584 non-null  int64  
 6   floors             21584 non-null  float64
 7   waterfront         21584 non-null  float64
 8   view               21584 non-null  float64
 9   condition          21584 non-null  int64  
 10  grade              21584 non-null  int64  
 11  sqft_above         21584 non-null  int64  
 12  sqft_basement      21584 non-null  float64
 13  zipcode            21584 non-null  int64  
 14  lat                21584 non-null  float64
 15  long               21584 non-null  float64
 16  sqft_living15      21584 no

In [29]:
kc_data_test = preprocessor_pipe_skript.preprocess_fit_transform(kc_data_test)

In [21]:
kc_data = preprocessor_pipe.fit_transform(kc_data)

In [1]:
kc_data.info()

NameError: name 'kc_data' is not defined

# Modeling


Now that we have looked at the data in more depth, we can start to forecast the prices of new houses.

Our goal is to determine the price for which new houses will sell.

In this scenario, we realise that we do not yet know the true sales price until the house has been successfully sold. We can only test our model if we have such new and unknown data. However, we do not have this data at the moment.

Therefore, we only work with part of the data to train our model and keep another part as test data. We consider this test set as new house sales that we want to estimate with our model. We can then compare the results of the model with the true sales prices and determine how well our model determines the true sales prices.
To separate the dataset into training and testing data, we use a function of Scikit-Learn: `train_test_split`. Here we specify what our independent variables are (X) and what our predicted dependent variable is (y). We also specify what percentage of the data the test set should be. Another important parameter we pass is called `random_state`. The data is shuffled before it is split into the train and the test set, and `random_state` fixes the seed of this shuffling. The shuffling is important because the order of the data should not affect the prices of the houses (imagine if we sorted the data by price and then cut the bottom 30% for our test set, this would mean that our model would never have seen houses in that price category and would therefore be very difficult to apply to the new data). By assigning a number to the random state, we ensure that our data is mixed but always identically separated so that the results are reproducible.

Before we make this important split, we remove the columns in our table that have no predictive value. It is also important to store our dependent variable (y) separately and remove any variables that could cause data leakage.

In [None]:
# We have to remove these columns because of Data Leakage: price or because they do not provide prognostic information.
drop_lst = ['price', 'sqft_price', 'date', 'delta_lat', 'delta_long',]

In [None]:
# we would like to consider all variables except the ones mentioned above
all_features = [x for x in kc_data.columns if x not in drop_lst]

In [None]:
# X contains all descriptive variables defined above
X = kc_data[all_features]

In [None]:
# we define y (our dependent variable): we take the price
y = kc_data.price

In [None]:
# We separate our data into train and test data. In the process, 30 % of the data is used for the subsequent testing of the prognostic quality.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
# we can look at how much data is in each dataset
print("X_train (features for the model to learn from): ", X_train.shape)
print("y_train (labels for the model to learn from): ", y_train.shape)
print("X_test (features to test the model's accuracy against): ", X_test.shape)
print("y_test (labels to test the model's accuracy with): ", y_test.shape)

In [None]:
# If we look at the first 5 lines of our training data, we see that the index is no longer sorted, it has been shuffled.
X_train.head()

After these preparations, we now come to modelling. 
For this we will continue to use the scikit-learn library, in which many different algorithms are implemented.
The procedure is always the same:

- we import the algorithm from scikit-learn which we want to use.

- we determine the model, often there are additional hyperparameters we have to specify
- we determine which variables to pass to the model
- we train the model (we call the method `.fit(X_train, y_train)` on our model)
- we test the model with our test data and get the adjusted R^2 as metric (we call the method `.score(X_test, y_test)` on our trained model, which returns R^2, and then adjust this score for the number of variables used).


In [None]:
# We determine the model, there must be 2 round brackets behind the model name!
model_lin_reg = LinearRegression()

Next, we determine which variables we pass to our model to determine the price of the houses.
The simplest model calculates the price using only one variable: for example, the "grade" of the house.

In [None]:
# We determine which variables we pass to the model
variables = ['grade',]

In [None]:
# Training of the model
model_lin_reg.fit(X_train[variables], y_train)

In [None]:
# We look at how well our model performs on the test data
# adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - k - 1) for n test rows and k variables
r2 = model_lin_reg.score(X_test[variables], y_test)
n_rows, n_vars = X_test.shape[0], len(variables)
# The built-in round() works for a plain Python float as well as for a NumPy scalar,
# whereas calling .round(2) fails if .score() returns a plain Python float
print('adj. R^2:', round(1 - (1 - r2) * (n_rows - 1) / (n_rows - n_vars - 1), 2))

The adjusted R^2 indicates the percentage of variance of the target variable (price) explained by the model. Adjusted R² is a modified version of R² that has been adjusted with the number of explanatory variables. It penalises the addition of unnecessary variables and allows comparison of regression models with different numbers of explanatory variables.
The value 1 means 100 % of the variance of the target variable could be explained by the model. The value 0 means 0 % of the variance of the target variable could be explained by the model. 
This means for our case: The variable "grade" can explain 43 % of the variance in the price of the houses in our test set.
Perhaps more variables could explain more variance. 
We can look again at what variables we have:

In [None]:
# Names of the variables in the data set
X_train.columns

We can use all these variables to predict the price.
Maybe the age of the house could also have a big influence.
Therefore, we will now try a new linear regression with 2 variables: "grade" and "last_known_change".

In [None]:
# We determine the model, there must be 2 round brackets behind the model name!
model_lin_reg = LinearRegression()

In [None]:
# We determine which variables we pass to the model
variables = ['grade','last_known_change']

In [None]:
# we train the model
model_lin_reg.fit(X_train[variables], y_train)

In [None]:
# We look at how well our model performs on the test data
# adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - k - 1) for n test rows and k variables
r2 = model_lin_reg.score(X_test[variables], y_test)
n_rows, n_vars = X_test.shape[0], len(variables)
# The built-in round() works for a plain Python float as well as for a NumPy scalar,
# whereas calling .round(2) fails if .score() returns a plain Python float
print('adj. R^2:', round(1 - (1 - r2) * (n_rows - 1) / (n_rows - n_vars - 1), 2))

We see that with this additional variable, 48% of the variance in the price could be explained.

So far we have only looked at the linear relationships between the variables and the price. However, it is possible that the relationship is not linear, but rather quadratic. 
We can easily extend our model by squaring our variables. Thus, instead of:

$price = b*(grade) + c$

we can also use 

$price=a*(grade)^{2}+b*(grade)+c$

to describe the price.
This is a type of feature engineering. We will apply it to our complete data set and see if we can improve our model even further.

In [None]:
# We want to create only polynomial variables of second order (^2)
poly = PolynomialFeatures(2)

In [None]:
# create a copy of the Train and Test data
X_train_poly = X_train.copy()
X_test_poly = X_test.copy()

# drop the id column
X_train_poly = X_train_poly.drop(columns=['id'])
X_test_poly = X_test_poly.drop(columns=['id'])

In [None]:
# We create new variables by calling poly
X_train_sq = poly.fit_transform(X_train_poly)

# We have to do the same for our test data, of course
X_test_sq = poly.transform(X_test_poly)

In [None]:
# We determine the model, there must be 2 round brackets behind the model name!
model_lin_reg = LinearRegression()

In [None]:
# We also train the model with squared variables
model_lin_reg.fit(X_train_sq, y_train)

In [None]:
# We look at how well our model performs on the test data
# adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - k - 1) for n test rows and k columns of X_test_sq
r2_sq = model_lin_reg.score(X_test_sq, y_test)
n_rows_sq, n_vars_sq = X_test_sq.shape
# The built-in round() works for a plain Python float as well as for a NumPy scalar,
# whereas calling .round(2) fails if .score() returns a plain Python float
print('adjusted R^2:', round(1 - (1 - r2_sq) * (n_rows_sq - 1) / (n_rows_sq - n_vars_sq - 1), 2))

With the additional squared variables, we were able to improve our result a bit more.


With the adjusted R^2 value we have a possibility to evaluate the quality of our model, but it may be worthwhile to have a look at the real errors of the model graphically. This may help to identify systematic errors.
For ease of interpretation, we choose the percentage price difference between our forecast and the true values.

We see a few outliers here. We can take a closer look at the highest one. 

In [None]:
# Error analysis
# In order to better analyse the errors of our model, we create a new dataframe with the
# columns "price" (the real price), as well as the latitudes and longitudes
y_predictions = model_lin_reg.predict(X_test_sq)
df_error = pd.DataFrame(y_test)
df_error['latitude'] = X_test['lat']
df_error['longitude'] = X_test['long']
df_error['id'] = X_test['id']
df_error.head(2)

In [None]:
# To add the predicted price as a column as well, we must first reset the index
df_error.reset_index(inplace=True, drop=True)
df_error.head(2)

In [None]:
# Now we can also add the predicted price as a column and calculate the difference
df_error['price_prediction'] = y_predictions.round(2)
df_error['price_difference'] = (df_error['price_prediction'] - df_error['price']).round(2)
df_error['price_difference_procent'] = ((df_error['price_difference']/df_error['price'])*100).round(2)
df_error.head(2)

In [None]:
fig = px.scatter_mapbox(df_error,
                        lat="latitude",
                        lon="longitude",
                        hover_data=["price", "price_prediction", 'id'],
                        color='price_difference_procent',
                        color_continuous_scale=['green', 'yellow', 'red'],
                        zoom=7.7,
                        height=400)
fig.update_layout(mapbox_style="open-street-map")
fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})
fig.show()

In [None]:
df_error[df_error['price_difference_procent']==df_error['price_difference_procent'].max()]

In [None]:
X_test[X_test['id']==9272202260]

We want to take a closer look at these houses. King County also provides very good information on this. On [this page](https://localscape.property/#kingcountyassessor/My-Property) you can search for houses by their ID and get both the neighbourhood on a map and a picture of the house.
In the field at the top left, change the selection "Address" to "Parcel ID" and add the "id" of our outlier.

Under "Basic Property Characteristics" and on the map under "KC Aerial Images" we see that there is no longer a house on this property. Therefore, our data is misleading and our model estimates a much higher price.


## Regularisation and hyperparameter tuning of linear regression

In addition to our variables, we have also passed the squared variables to our last linear model. So we have passed a lot of variables to our model. Some may have no effect on the price at all. However, models try to extract some information from all variables. This leads to random variance in the data also being recognised as a pattern. This phenomenon is called "overfitting" the model to the data.
For each algorithm there are ways to prevent this overfitting.

In the case of linear regression, we force the model not to use every variable for forecasting. We "regularise" the model. But instead of us telling the model which variables not to use, we let the model learn which variables offer the least added value and remove those variables (in the case of linear regression, variables are no longer considered if the learned coefficient (b) is zero). How much we regularise is up to us.
[Here](https://scikit-learn.org/stable/modules/linear_model.html#elastic-net) you can find information on the ElasticNet model used.
To find out how much we should regularise our model, we can test different values for the regularisation parameters and see which will give us the best model. For this purpose we can use [GridSearch](https://scikit-learn.org/stable/modules/grid_search.html#grid-search) with [Cross-Validation](https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation).



In [None]:
# We specify which hyperparameters we want to change:
# alpha: specifies how much we regularise:
param_grid = {'alpha':[0.1,.5,1,5,10,],
             'l1_ratio': [1,0.5,0]}

# We determine the model, there must be 2 round brackets behind the model name!
elastic = ElasticNet(max_iter=50000, tol=0.2)

# Passing the model to a so-called parameter grid with 5-fold cross-validation
elastic_cv= GridSearchCV(elastic,param_grid,cv=5, verbose=True,n_jobs=-1)

# We train the model and optimise it via GridSearch
elastic_cv.fit(X_train_sq,y_train)

In [None]:
# Output of the best parameters found by the GridSearch.
elastic_cv.best_params_

In [None]:
# We train the model with the optimal hyperparameters
elastic = ElasticNet(max_iter=50000, tol=0.2,**elastic_cv.best_params_)

elastic.fit(X_train_sq, y_train)

In [None]:
# We look at how well our model performs on the test data
adj_r2 = (1-(1-elastic.score(X_test_sq, y_test))*(X_test_sq.shape[0]- 1)/(X_test_sq.shape[0]-(X_test_sq.shape[1]-sum(elastic.coef_== 0))-1)).round(2)
print('adjusted R^2:',adj_r2 )

We were able to further improve our model through hyperparameter tuning and regularisation. 
As mentioned above, regularisation eliminates variables from the forecast. This is done by giving the coefficients of the linear regression a value of zero. With the following code we can look at the learned coefficients (here only a section of the first 5 coefficients) and see that for the first variable a coefficient of zero was calculated. This variable was therefore removed by our regularisation.

In [None]:
# Output of the first 5 learned coefficients of linear regression
elastic.coef_[0:5]

In [None]:
# Error analysis
# In order to better analyse the errors of our model, we create a new dataframe with the
# columns "price" (the real price), as well as the latitudes and longitudes
y_predictions = elastic.predict(X_test_sq)
df_error = pd.DataFrame(y_test)
df_error['latitude'] = X_test['lat']
df_error['longitude'] = X_test['long']
df_error['id'] = X_test['id']
df_error.head(2)

In [None]:
# To add the predicted price as a column as well, we must first reset the index
df_error.reset_index(inplace=True, drop=True)
df_error.head(2)

In [None]:
# Now we can also add the predicted price as a column and calculate the difference 
df_error['price_prediction'] = y_predictions.round(2)
df_error['price_difference'] = (df_error['price_prediction'] - df_error['price']).round(2)
df_error['price_difference_procent'] = ((df_error['price_difference']/df_error['price'])*100).round(2)
df_error.head(2)

In [None]:
fig = px.scatter_mapbox(df_error,
                        lat="latitude",
                        lon="longitude",
                        hover_data=["price", "price_prediction", 'id'],
                        color='price_difference_procent',
                        color_continuous_scale=['green', 'yellow', 'red'],
                        zoom=7.7,
                        height=400)
fig.update_layout(mapbox_style="open-street-map")
fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})
fig.show()

In [None]:
df_error[df_error['price_difference_procent']==df_error['price_difference_procent'].max()]

In [None]:
X_test[X_test['id']==5111400086]

Let's take a closer look at this house. King County also provides very good information about it. On [this page](https://localscape.property/#kingcountyassessor/My-Property) you can search for houses by their ID and get both the neighbourhood on a map and a picture of the house.
In the box at the top left, change the "Address" selection to "Parcel ID" and add the "id" of our outlier there.

This house sold in 2018 for a price of 295,000 USD. But King County appraisers also assessed a higher price in 2014: under "Historical Value" you can see that in 2014 the "Total Assessed Value" was 212,000 USD. So this house sold for about half the appraised price!



## Save the model

We have now trained a model that can predict the price of a house in King County. We can now save this model using [skops](https://skops.readthedocs.io/en/stable/modules/classes.html#module-skops.io).

In [None]:
from pathlib import Path

import skops.io as sio

# open() does not create missing folders, so make sure the target folder exists first
model_path = Path('model/model.bin')
model_path.parent.mkdir(parents=True, exist_ok=True)

# NOTE: only the fitted ElasticNet is stored here. It was trained on X_train_sq, so new
# data must go through the same preprocessing and the fitted PolynomialFeatures step
# before this model can be used for predictions.
with open(model_path, 'wb') as f_out:
    sio.dump(elastic, f_out)

### Conclusion
Despite the outliers, we succeeded in creating a model that explains about 76% of the variance in the house prices of our test set (adjusted R^2). We found that the creation of new variables, but also the squaring of these variables and the regularisation of the model play an important role in the quality of the prediction. 