In [1]:
import json
from shapely.geometry import shape, GeometryCollection, Point
import pandas as pd



### GeoJson
Let's figure out if we can determine the neighborhood of a set of coordinates.

In [2]:
with open("../data/neighbourhoods.geojson", "r") as f:
    js = json.load(f)

js.keys()

dict_keys(['type', 'features'])

In [3]:
listing_df = pd.read_csv("../data/listings.csv")
listing_df.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,1944,cafeheaven Pberg/Mitte,2164,Lulah,Mitte,Brunnenstr. Nord,52.54425,13.39749,Private room,21,60,18,2018-11-11,0.24,1,251
1,3176,Fabulous Flat in great Location,3718,Britta,Pankow,Prenzlauer Berg Südwest,52.535,13.41758,Entire home/apt,90,62,145,2019-06-27,1.14,1,344
2,3309,BerlinSpot Schöneberg near KaDeWe,4108,Jana,Tempelhof - Schöneberg,Schöneberg-Nord,52.49885,13.34906,Private room,28,7,27,2019-05-31,0.35,1,317
3,6883,Stylish East Side Loft in Center with AC & 2 b...,16149,Steffen,Friedrichshain-Kreuzberg,Frankfurter Allee Süd FK,52.51171,13.45477,Entire home/apt,125,3,128,2019-10-21,1.08,1,20
4,7071,BrightRoom with sunny greenview!,17391,BrightRoom,Pankow,Helmholtzplatz,52.54316,13.41509,Private room,33,3,266,2019-11-09,2.13,2,30


In [4]:
for col in listing_df.columns:
    print(col)

id
name
host_id
host_name
neighbourhood_group
neighbourhood
latitude
longitude
room_type
price
minimum_nights
number_of_reviews
last_review
reviews_per_month
calculated_host_listings_count
availability_365


In [5]:
num_rows = 5
sample = listing_df.head(num_rows)

In [6]:


for ix in range(5):
    lat, long, n = sample[["latitude", "longitude", "neighbourhood"]].iloc[ix]
    p = Point(long, lat)
    for neighbourhood in js["features"]:
        polygon = shape(neighbourhood["geometry"])
        if polygon.contains(p):
            print("found neighbourhood!")
            print(n)
            print(neighbourhood["properties"])


found neighbourhood!
Brunnenstr. Nord
{'neighbourhood': 'Brunnenstr. Nord', 'neighbourhood_group': 'Mitte'}
found neighbourhood!
Prenzlauer Berg Südwest
{'neighbourhood': 'Prenzlauer Berg Südwest', 'neighbourhood_group': 'Pankow'}
found neighbourhood!
Schöneberg-Nord
{'neighbourhood': 'Schöneberg-Nord', 'neighbourhood_group': 'Tempelhof - Schöneberg'}
found neighbourhood!
Frankfurter Allee Süd FK
{'neighbourhood': 'Frankfurter Allee Süd FK', 'neighbourhood_group': 'Friedrichshain-Kreuzberg'}
found neighbourhood!
Helmholtzplatz
{'neighbourhood': 'Helmholtzplatz', 'neighbourhood_group': 'Pankow'}


Success!

### Explore the listings data

In [7]:
import os

dfs = [
    pd.read_csv("../data/" + f) for f in os.listdir("../data")
    if f.startswith("listings") and f.endswith(".csv")
]

listings_df = pd.concat(dfs)
listings_df.shape

(482969, 16)

In [8]:
listings_df.isnull().sum()

id                                    0
name                               1149
host_id                               0
host_name                           641
neighbourhood_group                   0
neighbourhood                         0
latitude                              0
longitude                             0
room_type                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
last_review                       86906
reviews_per_month                 86975
calculated_host_listings_count        0
availability_365                      0
dtype: int64

In [9]:
listings_df.describe()

Unnamed: 0,id,host_id,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
count,482969.0,482969.0,482969.0,482969.0,482969.0,482969.0,482969.0,395994.0,482969.0,482969.0
mean,16806080.0,57972570.0,52.509931,13.406321,67.231367,6.518605,18.054121,1.081348,1.991583,74.631525
std,9547035.0,64276160.0,0.031018,0.058233,200.052182,32.02052,38.268796,1.48083,3.97941,114.264903
min,1944.0,1581.0,52.3458,13.084345,0.0,1.0,0.0,0.01,1.0,0.0
25%,8442867.0,9601373.0,52.48911,13.375949,30.0,2.0,1.0,0.17,1.0,0.0
50%,17661620.0,32661390.0,52.509223,13.41689,48.0,2.0,4.0,0.51,1.0,3.0
75%,23439890.0,86759240.0,52.53279,13.43949,70.0,4.0,16.0,1.4,1.0,114.0
max,40086300.0,308937600.0,52.66389,13.760739,9000.0,5000.0,596.0,94.0,55.0,365.0


In [10]:
listings_df.describe(exclude="number")

Unnamed: 0,name,host_name,neighbourhood_group,neighbourhood,room_type,last_review
count,481820,482328,482969,482969,482969,396063
unique,53615,10458,12,138,4,1831
top,Gemütliches Zimmer in Neukölln,Anna,Friedrichshain-Kreuzberg,Frankfurter Allee Süd FK,Entire home/apt,2019-01-02
freq,183,4254,117613,28236,238964,3000


In [11]:
import numpy as np
from datetime import datetime

listings_df["last_review_utc"] = pd.to_datetime(listings_df["last_review"])
listings_df["last_review_seconds_ago"] = (datetime.now() - listings_df["last_review_utc"]).astype(np.int64)

### Establish Baseline

In [12]:
from sklearn.metrics import mean_absolute_error

baseline = [listings_df["price"].mean()] * len(listings_df)
mae = mean_absolute_error(baseline, listings_df["price"])
print(f"baseline mae: {mae}")

baseline mae: 40.37990203771003


In [13]:
from sklearn.model_selection import train_test_split

train, val_test = train_test_split(listings_df, test_size=0.3)
val, test = train_test_split(val_test, test_size=0.5)

target = "price"
features = listings_df.columns.drop([target] + ["name", "host_name", "neighbourhood_group", "last_review_utc", "last_review", "id", "host_id", "latitude", "longitude"])

X_train = train[features]
X_val = val[features]
X_test = test[features]

y_train = train[target]
y_val = val[target]
y_test = test[target]

In [15]:
import category_encoders as ce
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline

pipe = make_pipeline(
    ce.OneHotEncoder(use_cat_names=True),
    SimpleImputer(),
    RandomForestRegressor(
        n_estimators=150,
        n_jobs=-1,
        max_depth=75,
    )
)

pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('onehotencoder',
                 OneHotEncoder(cols=['neighbourhood', 'room_type'],
                               drop_invariant=False, handle_missing='value',
                               handle_unknown='value', return_df=True,
                               use_cat_names=True, verbose=0)),
                ('simpleimputer',
                 SimpleImputer(add_indicator=False, copy=True, fill_value=None,
                               missing_values=nan, strategy='mean',
                               verbose=0)),
                ('randomforestregressor',
                 RandomForestRegressor(bootstrap=True, criterion='mse',
                                       max_depth=75, max_features='auto',
                                       max_leaf_nodes=None,
                                       min_impurity_decrease=0.0,
                                       min_impurity_split=None,
                                       min_samples_leaf=1, m

In [16]:
from sklearn.metrics import mean_absolute_error

pred = pipe.predict(X_val)

r2 = 0 # r2_score(pred, y_val)
mae = mean_absolute_error(pred, y_val)

print(f"r2: {r2}, mae: {mae}")

r2: 0, mae: 12.164486937270233


In [22]:
from xgboost import XGBRegressor

xgb_pipe = make_pipeline(
    ce.OneHotEncoder(use_cat_names=True),
    SimpleImputer(),
    XGBRegressor(
        n_estimators=150,
        n_jobs=-1,
        learning_rate=0.25,
        max_depth=30,
    ),
)

xgb_pipe.fit(X_train, y_train)

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \




Pipeline(memory=None,
         steps=[('onehotencoder',
                 OneHotEncoder(cols=['neighbourhood', 'room_type'],
                               drop_invariant=False, handle_missing='value',
                               handle_unknown='value', return_df=True,
                               use_cat_names=True, verbose=0)),
                ('simpleimputer',
                 SimpleImputer(add_indicator=False, copy=True, fill_value=None,
                               missing_values=nan, strategy='mean',
                               verbose=0)),
                ('xgbregressor',
                 XGB...
                              colsample_bylevel=1, colsample_bynode=1,
                              colsample_bytree=1, gamma=0,
                              importance_type='gain', learning_rate=0.25,
                              max_delta_step=0, max_depth=30,
                              min_child_weight=1, missing=None,
                              n_estimators=150, n_j

In [23]:
from sklearn.metrics import mean_absolute_error

pred = xgb_pipe.predict(X_val)

r2 = 0 # r2_score(pred, y_val)
mae = mean_absolute_error(pred, y_val)

print(f"r2: {r2}, mae: {mae}")

r2: 0, mae: 10.936560460586744


### Result

Beats baseline by about \$33.

In [None]:
print(features)

In [25]:
import pickle

with open("../pickles/primitive_model.pickle", "wb") as f:
    pickle.dump(xgb_pipe, f)

In [None]:
import joblib

with open("../pickles/primitive_model.joblib", "wb") as f:
    joblib.dump(pipe, f)