In [25]:
import json
import pathlib
import pickle
from typing import List
from typing import Tuple

import pandas
from sklearn import model_selection
from sklearn import neighbors
from sklearn import pipeline
from sklearn import preprocessing

In [26]:
SALES_PATH = "data/kc_house_data.csv"  # path to CSV with home sale data
DEMOGRAPHICS_PATH = "data/kc_house_data.csv"  # path to CSV with demographics
# List of columns (subset) that will be taken from home sale data
SALES_COLUMN_SELECTION = [
    'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
    'sqft_above', 'sqft_basement', 'zipcode'
]
OUTPUT_DIR = "model"  # Directory where output artifacts will be saved

In [27]:
def load_data(
    sales_path: str, demographics_path: str, sales_column_selection: List[str]
) -> Tuple[pandas.DataFrame, pandas.Series]:
    """Load the target and feature data by merging sales and demographics.

    Args:
        sales_path: path to CSV file with home sale data
        demographics_path: path to CSV file with home sale data
        sales_column_selection: list of columns from sales data to be used as
            features

    Returns:
        Tuple containg with two elements: a DataFrame and a Series of the same
        length.  The DataFrame contains features for machine learning, the
        series contains the target variable (home sale price).

    """
    data = pandas.read_csv(sales_path,
                           usecols=sales_column_selection,
                           dtype={'zipcode': str})
    demographics = pandas.read_csv("data/zipcode_demographics.csv",
                                   dtype={'zipcode': str})

    merged_data = data.merge(demographics, how="left",
                             on="zipcode").drop(columns="zipcode")
    data_unfileterd = pandas.read_csv(sales_path, dtype={'zipcode': str})
    # data_unfileterd = data.merge(demographics, how="left",
    #                             on="zipcode").drop(columns="zipcode")
    
    # Remove the target variable from the dataframe, features will remain
    y = merged_data.pop('price')
    x = merged_data

    return x, y, data_unfileterd

In [28]:
x, y, data_unfiltered = load_data(SALES_PATH, DEMOGRAPHICS_PATH, SALES_COLUMN_SELECTION)

In [29]:
x.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21613 entries, 0 to 21612
Data columns (total 33 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   bedrooms                21613 non-null  int64  
 1   bathrooms               21613 non-null  float64
 2   sqft_living             21613 non-null  int64  
 3   sqft_lot                21613 non-null  int64  
 4   floors                  21613 non-null  float64
 5   sqft_above              21613 non-null  int64  
 6   sqft_basement           21613 non-null  int64  
 7   ppltn_qty               21613 non-null  float64
 8   urbn_ppltn_qty          21613 non-null  float64
 9   sbrbn_ppltn_qty         21613 non-null  float64
 10  farm_ppltn_qty          21613 non-null  float64
 11  non_farm_qty            21613 non-null  float64
 12  medn_hshld_incm_amt     21613 non-null  float64
 13  medn_incm_per_prsn_amt  21613 non-null  float64
 14  hous_val_amt            21613 non-null

In [30]:
x.describe()

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,sqft_above,sqft_basement,ppltn_qty,urbn_ppltn_qty,sbrbn_ppltn_qty,...,per_sbrbn,per_farm,per_non_farm,per_less_than_9,per_9_to_12,per_hsd,per_some_clg,per_assoc,per_bchlr,per_prfsnl
count,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,...,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0
mean,3.370842,2.114757,2079.899736,15106.97,1.494309,1788.390691,291.509045,28980.783001,28008.3044,147.21029,...,1.703188,0.049137,4.622542,1.836996,5.077638,14.394392,18.678295,5.131634,19.41364,8.838754
std,0.930062,0.770163,918.440897,41420.51,0.539989,828.090978,442.575043,10926.187336,11858.536684,1153.766217,...,10.842697,0.275455,13.241714,2.156718,2.47525,4.437998,2.789687,0.780764,7.048811,5.094015
min,0.0,0.0,290.0,520.0,1.0,290.0,0.0,3037.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,5.0,11.0,2.0,6.0,2.0
25%,3.0,1.75,1427.0,5040.0,1.0,1190.0,0.0,22269.0,20815.0,0.0,...,0.0,0.0,0.0,1.0,3.0,11.0,17.0,5.0,13.0,4.0
50%,3.0,2.25,1910.0,7618.0,1.5,1560.0,0.0,26819.0,25593.0,0.0,...,0.0,0.0,0.0,1.0,5.0,15.0,19.0,5.0,19.0,7.5
75%,4.0,2.5,2550.0,10688.0,2.0,2210.0,560.0,37695.0,35624.0,0.0,...,0.0,0.0,0.0,2.0,7.0,17.0,20.0,6.0,25.0,12.0
max,33.0,8.0,13540.0,1651359.0,3.5,9410.0,4820.0,64181.0,64181.0,11176.0,...,81.0,2.0,98.0,11.0,11.0,25.0,34.0,7.0,39.0,24.0


In [31]:
data_unfiltered.columns

Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15'],
      dtype='object')

In [32]:
# Missing cols - Geo data? I would use a geohash to encode the lat/long - embedding?
[col for col in data_unfiltered.columns if col not in x.columns]

['id',
 'date',
 'price',
 'waterfront',
 'view',
 'condition',
 'grade',
 'yr_built',
 'yr_renovated',
 'zipcode',
 'lat',
 'long',
 'sqft_living15',
 'sqft_lot15']

In [33]:
print("condition", data_unfiltered['condition'].unique())
print("waterfront", data_unfiltered['waterfront'].unique())
print("view", data_unfiltered['view'].unique())

condition [3 5 4 1 2]
waterfront [0 1]
view [0 3 4 2 1]


In [34]:
# geohash encoding
# https://pypi.org/project/pygeohash/

In [35]:
import pygeohash as pgh

# encode lat/long

data_unfiltered['geohash'] = data_unfiltered.apply(lambda x: pgh.encode(x['lat'], x['long'], precision=10), axis=1)
data_unfiltered['geohash'].nunique()



20832

In [36]:
# import folium
# from folium.plugins import HeatMap

# # Create a map centered at the mean latitude and longitude
# m = folium.Map(location=[data_unfiltered['lat'].mean(), data_unfiltered['long'].mean()], zoom_start=10)

# # Add a heatmap to the map
# HeatMap(data=data_unfiltered[['lat', 'long']], radius=10).add_to(m)

# # Display the map
# # m   


In [37]:
# load pipeline model from /model/model.pkl and the model feautures from model/model_features.json
import joblib
import json


model = joblib.load("model/model.pkl")
model_features = json.load(open("model/model_features.json", "r"))
predictions = model.predict(x[model_features])





Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



In [38]:
# eval regression model with rmse mae and r2

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

rmse = mean_squared_error(y, predictions, squared=False)
mae = mean_absolute_error(y, predictions)
r2 = r2_score(y, predictions)

print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"R2: {r2}")





RMSE: 160017.47981660964
MAE: 82689.78274186833
R2: 0.8100138984242224


In [40]:
model_features

['bedrooms',
 'bathrooms',
 'sqft_living',
 'sqft_lot',
 'floors',
 'sqft_above',
 'sqft_basement',
 'ppltn_qty',
 'urbn_ppltn_qty',
 'sbrbn_ppltn_qty',
 'farm_ppltn_qty',
 'non_farm_qty',
 'medn_hshld_incm_amt',
 'medn_incm_per_prsn_amt',
 'hous_val_amt',
 'edctn_less_than_9_qty',
 'edctn_9_12_qty',
 'edctn_high_schl_qty',
 'edctn_some_clg_qty',
 'edctn_assoc_dgre_qty',
 'edctn_bchlr_dgre_qty',
 'edctn_prfsnl_qty',
 'per_urbn',
 'per_sbrbn',
 'per_farm',
 'per_non_farm',
 'per_less_than_9',
 'per_9_to_12',
 'per_hsd',
 'per_some_clg',
 'per_assoc',
 'per_bchlr',
 'per_prfsnl']

In [36]:
import json
model_features = json.load(open("model/model_features.json", "r"))

In [11]:
x, _ , _ = load_data(SALES_PATH, DEMOGRAPHICS_PATH, SALES_COLUMN_SELECTION)

In [21]:
dtypes = x.dtypes

In [22]:
# save dtypes as dict to model/model_dtypes.json

dtypes.to_json("model/model_dtypes.json", default_handler=str)


In [15]:
data_unseen.columns

Index(['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'waterfront', 'view', 'condition', 'grade', 'sqft_above',
       'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long',
       'sqft_living15', 'sqft_lot15'],
      dtype='object')

In [17]:
x.columns

Index(['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'sqft_above', 'sqft_basement', 'ppltn_qty', 'urbn_ppltn_qty',
       'sbrbn_ppltn_qty', 'farm_ppltn_qty', 'non_farm_qty',
       'medn_hshld_incm_amt', 'medn_incm_per_prsn_amt', 'hous_val_amt',
       'edctn_less_than_9_qty', 'edctn_9_12_qty', 'edctn_high_schl_qty',
       'edctn_some_clg_qty', 'edctn_assoc_dgre_qty', 'edctn_bchlr_dgre_qty',
       'edctn_prfsnl_qty', 'per_urbn', 'per_sbrbn', 'per_farm', 'per_non_farm',
       'per_less_than_9', 'per_9_to_12', 'per_hsd', 'per_some_clg',
       'per_assoc', 'per_bchlr', 'per_prfsnl'],
      dtype='object')

In [16]:
for key, value in dtypes_loaded.items():
    data_unseen[key] = data_unseen[key].astype(value)
    

KeyError: 'ppltn_qty'

In [30]:
data_unseen.dtypes

bedrooms           int64
bathrooms        float64
sqft_living        int64
sqft_lot           int64
floors           float64
waterfront         int64
view               int64
condition          int64
grade              int64
sqft_above         int64
sqft_basement      int64
yr_built           int64
yr_renovated       int64
zipcode            int64
lat              float64
long             float64
sqft_living15      int64
sqft_lot15         int64
dtype: object