#### Prepared for Gabor's Data Analysis

### Data Analysis for Business, Economics, and Policy
by Gabor Bekes and Gabor Kezdi
 
Cambridge University Press 2021

**[gabors-data-analysis.com ](https://gabors-data-analysis.com/)**

 License: Free to share, modify and use for educational purposes. 
 Not to be used for commercial purposes.

### Chapter 16
**CH16A Predicting apartment prices with random forest**

using the airbnb dataset

version 0.92 2021-07-05

In [1]:
import os
import re
import sys
import warnings
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd

warnings.filterwarnings("ignore")


In [2]:
path = Path(os.getcwd())

base_dir = str(path.parent.parent)

data_in = os.path.join(base_dir, "da_data_repo/airbnb/clean/")
data_out = os.path.join(base_dir, "da_case_studies/ch16-airbnb-random-forest/")
output = os.path.join(base_dir, "da_case_studies/ch16-airbnb-random-forest/output/")
func = os.path.join(base_dir, "da_case_studies/ch00-tech-prep/")

sys.path.append(func)
from py_helper_functions import *


-------------------------------------------------------
### Import data

In [3]:
area = "london"
data = pd.read_csv(data_in + "airbnb_" + area + "_cleaned_book.csv", index_col=0)


In [4]:
data["property_type"].value_counts()


Apartment             38270
House                 13055
Bed & Breakfast        1066
Townhouse               372
Other                   267
Loft                    254
Dorm                    127
Guesthouse               81
Boat                     69
Serviced apartment       65
Condominium              56
Bungalow                 47
Boutique hotel           35
Hostel                   32
Cabin                    32
Villa                    12
Camper/RV                 9
Chalet                    9
Yurt                      4
Hut                       3
Castle                    3
Tent                      2
Parking Space             2
Ryokan (Japan)            1
Lighthouse                1
Igloo                     1
Cave                      1
Name: property_type, dtype: int64

In [5]:
# keep if property type is Apartment, House or Townhouse
data = data.loc[lambda x: x["property_type"].isin(["Apartment", "House", "Townhouse"])]


In [6]:
# rename Townhouse to House

data["property_type"] = np.where(
    data["property_type"] == "Townhouse", "House", data["property_type"]
)
data["f_property_type"] = data["property_type"].astype("category")


In [7]:
data["room_type"].value_counts()


Entire home/apt    26742
Private room       24415
Shared room          540
Name: room_type, dtype: int64

In [8]:
# Room type as factor

data["f_room_type"] = data["room_type"].astype("category")


In [9]:
# Rename roomt type because it is too long
data["f_room_type2"] = data["f_room_type"].map(
    {
        "Entire home/apt": "Entire/Apt",
        "Private room": "Private",
        "Shared room": "Shared",
    }
)


In [10]:
# cancellation policy as factor
data["cancellation_policy"].value_counts()


strict             21287
flexible           18435
moderate           11959
super_strict_30       15
super_strict_60        1
Name: cancellation_policy, dtype: int64

In [11]:
# if cancellation policy is super strict 30 or 60, rename it as strict
data["cancellation_policy"] = np.where(
    (data["cancellation_policy"] == "super_strict_30")
    | (data["cancellation_policy"] == "super_strict_60"),
    "strict",
    data["cancellation_policy"],
)
data["f_cancellation_policy"] = data["cancellation_policy"].astype("category")


In [12]:
# bed_type and neighbourhood_cleansed as factors

data["bed_type"] = np.where(
    data["bed_type"].isin(["Futon", "Pull-out Sofa", "Airbed"]),
    "Couch",
    data["bed_type"],
)

data["f_bed_type"] = data["bed_type"].astype("category")
data["f_neighbourhood_cleansed"] = data["neighbourhood_cleansed"].astype("category")


---------

### Create Numerical variables

In [13]:
data["usd_price_day"] = data["price"]
data["p_host_response_rate"] = data["host_response_rate"].fillna(0).astype(int)
# rename cleaning_fee column

data = data.rename(columns={"cleaning_fee": "usd_cleaning_fee"})


In [14]:
# add new numeric columns from certain columns

numericals = [
    "accommodates",
    "bathrooms",
    "review_scores_rating",
    "number_of_reviews",
    "guests_included",
    "reviews_per_month",
    "extra_people",
    "minimum_nights",
    "beds",
]

for col in numericals:
    data["n_" + col] = pd.to_numeric(data[col], errors="coerce")


In [15]:
# create days since first review

data["n_days_since"] = (
    data.calendar_last_scraped.apply(lambda x: datetime.strptime(x, "%Y-%m-%d"))
    - data.first_review.fillna("1950-01-01").apply(
        lambda x: datetime.strptime(x, "%Y-%m-%d")
    )
).dt.days

data["n_days_since"] = np.where(data.first_review.isnull(), np.nan, data.n_days_since)


In [16]:
# create dummy vars
dummies = data.columns[71:121]

for col in dummies:
    data["d_" + (re.sub("/|\s|-", "", col)).replace("(s)", "s").lower()] = data[col]


In [17]:
# keep columns if contain d_, n_,f_, p_, usd_ and some others

data = pd.concat(
    [
        data.filter(
            regex="^d_.*|^n_.*|^f_.*|^p_.*|^usd_.*",
        ),
        data[
            [
                "price",
                "id",
                "neighbourhood_cleansed",
                "cancellation_policy",
                "room_type",
                "property_type",
            ]
        ],
    ],
    axis=1,
)


In [18]:
#####################
### look at price ###
#####################

data["price"] = data["price"].str.replace(",", "").astype(float)

data = data.loc[lambda x: x.price < 1000]


In [19]:
# Squares and further values to create
data = data.assign(
    n_accommodates2=lambda x: x["n_accommodates"] ** 2,
    ln_accommodates=lambda x: np.log(x["n_accommodates"]),
    ln_accommodates2=lambda x: np.log(x["n_accommodates"]) ** 2,
    ln_beds=lambda x: np.log(x["n_beds"]),
    ln_number_of_reviews=lambda x: np.log(x["n_number_of_reviews"] + 1),
)


In [20]:
# Pool accomodations with 0,1,2,10 bathrooms

bins = pd.IntervalIndex.from_tuples([(0, 1), (1, 2), (2, 10)], closed="left")
f_bath = pd.cut(data["n_bathrooms"].to_list(), bins, labels=["0", "1", "2"])
f_bath.categories = [0, 1, 2]
data["f_bathroom"] = f_bath

f_bath.value_counts(dropna=False)


0        364
1      41417
2       9628
NaN      237
dtype: int64

In [21]:
# Pool num of reviews to 3 categories: none, 1-51 and >51

bins = pd.IntervalIndex.from_tuples(
    [(0, 1), (1, 51), (51, max(data.n_number_of_reviews))], closed="left"
)
fnor = pd.cut(data["n_number_of_reviews"].to_list(), bins, labels=["0", "1", "2"])
fnor.categories = [0, 1, 2]
data["f_number_of_reviews"] = fnor
data["f_number_of_reviews"].value_counts(dropna=False)


1      32683
0      15741
2       3221
NaN        1
Name: f_number_of_reviews, dtype: int64

In [22]:
# Pool and categorize the number of minimum nights: 1,2,3, 3+

bins = pd.IntervalIndex.from_tuples(
    [(1, 2), (2, 3), (3, max(data.n_minimum_nights))], closed="left"
)
f_min_n = pd.cut(data["n_minimum_nights"].to_list(), bins, labels=["1", "2", "3"])
f_min_n.categories = [1, 2, 3]
data["f_minimum_nights"] = f_min_n
data["f_minimum_nights"].value_counts(dropna=False)


1      19454
3      18075
2      14116
NaN        1
Name: f_minimum_nights, dtype: int64

In [23]:
# Change Infinite values with NaNs
data = data.replace([np.inf, -np.inf], np.nan)


In [24]:
# ------------------------------------------------------------------------------------------------
# where do we have missing variables now?
to_filter = data.isna().sum()
to_filter[to_filter > 0]


usd_cleaning_fee          20017
n_bathrooms                 237
n_review_scores_rating    16501
n_reviews_per_month       15741
n_beds                      167
n_days_since              15741
ln_beds                     168
f_bathroom                  237
f_number_of_reviews           1
f_minimum_nights              1
dtype: int64

In [25]:
# what to do with missing values?
# 1. drop if no target
data = data.loc[lambda x: x.price.notnull()]


In [26]:
# 2. imput when few, not that important
data = data.assign(
    n_bathrooms=lambda x: x["n_bathrooms"].fillna(np.median(x["n_bathrooms"].dropna())),
    n_beds=lambda x: np.where(x["n_beds"].isnull(), x["n_accommodates"], x["n_beds"]),
    f_bathroom=lambda x: x["f_bathroom"].fillna(1),
    f_minimum_nights=lambda x: x["f_minimum_nights"].fillna(1),
    f_number_of_reviews=lambda x: x["f_number_of_reviews"].fillna(1),
    ln_beds=lambda x: x["ln_beds"].fillna(0),
)


In [27]:
# 3. drop columns when many missing not important
data = data.drop(["usd_cleaning_fee", "p_host_response_rate"], axis=1)


In [28]:
to_filter = data.isna().sum()
to_filter[to_filter > 0]


n_review_scores_rating    16501
n_reviews_per_month       15741
n_days_since              15741
dtype: int64

In [29]:
# 4. Replace missing variables re reviews with zero, when no review + add flags
data = data.assign(
    flag_days_since=np.multiply(data.n_days_since.isna(), 1),
    n_days_since=data.n_days_since.fillna(np.median(data.n_days_since.dropna())),
    flag_review_scores_rating=np.multiply(data.n_review_scores_rating.isna(), 1),
    n_review_scores_rating=data.n_review_scores_rating.fillna(
        np.median(data.n_review_scores_rating.dropna())
    ),
    flag_reviews_per_month=np.multiply(data.n_reviews_per_month.isna(), 1),
    n_reviews_per_month=data.n_reviews_per_month.fillna(
        np.median(data.n_reviews_per_month.dropna())
    ),
    flag_n_number_of_reviews=np.multiply(data.n_number_of_reviews.isna(), 1),
)


In [30]:
data.flag_days_since.value_counts()


0    35905
1    15741
Name: flag_days_since, dtype: int64

In [31]:
# redo features
# Create variables, measuring the time since: squared, cubic, logs
data = data.assign(
    ln_days_since=lambda x: np.log(x["n_days_since"] + 1),
    ln_days_since2=lambda x: np.log(x["n_days_since"] + 1) ** 2,
    ln_days_since3=lambda x: np.log(x["n_days_since"] + 1) ** 3,
    n_days_since2=lambda x: x["n_days_since"] ** 2,
    n_days_since3=lambda x: x["n_days_since"] ** 3,
    ln_review_scores_rating=lambda x: np.log(x["n_review_scores_rating"]),
)


In [32]:
data.ln_days_since = data["ln_days_since"].fillna(0)
data.ln_days_since2 = data["ln_days_since2"].fillna(0)
data.ln_days_since3 = data["ln_days_since3"].fillna(0)


In [33]:
to_filter = data.isna().sum()
to_filter[to_filter > 0]


Series([], dtype: int64)

In [34]:
data.describe()


Unnamed: 0,n_accommodates,n_bathrooms,n_review_scores_rating,n_number_of_reviews,n_guests_included,n_reviews_per_month,n_extra_people,n_minimum_nights,n_beds,n_days_since,...,flag_days_since,flag_review_scores_rating,flag_reviews_per_month,flag_n_number_of_reviews,ln_days_since,ln_days_since2,ln_days_since3,n_days_since2,n_days_since3,ln_review_scores_rating
count,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,...,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0
mean,3.057178,1.260708,92.439627,12.350327,1.415773,1.13897,6.66555,3.310266,1.708884,418.132595,...,0.304786,0.319502,0.304786,0.0,5.714039,33.4644,199.676817,293617.1,305114100.0,4.520661
std,1.888509,0.527094,8.438353,25.860475,1.044884,1.236552,12.691355,29.083719,1.168387,344.651296,...,0.460321,0.466288,0.460321,0.0,0.902317,9.507929,80.289121,564084.9,962119500.0,0.120946
min,1.0,0.0,20.0,0.0,1.0,0.01,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.995732
25%,2.0,1.0,92.0,0.0,1.0,0.47,0.0,1.0,1.0,228.0,...,0.0,0.0,0.0,0.0,5.433722,29.525335,160.432461,51984.0,11852350.0,4.521789
50%,2.0,1.0,94.0,3.0,1.0,0.77,0.0,2.0,1.0,327.0,...,0.0,0.0,0.0,0.0,5.793014,33.559007,194.407782,106929.0,34965780.0,4.543295
75%,4.0,1.5,97.0,12.0,1.0,1.17,10.0,3.0,2.0,504.0,...,1.0,1.0,1.0,0.0,6.224558,38.745128,241.171311,254016.0,128024100.0,4.574711
max,16.0,8.0,100.0,396.0,16.0,15.0,240.0,5000.0,16.0,2722.0,...,1.0,1.0,1.0,0.0,7.909489,62.560024,494.817853,7409284.0,20168070000.0,4.60517


In [35]:
data.to_csv(data_out + "airbnb_london_workfile_adj.csv", index=False)
