#### Prepared for Gabor's Data Analysis

### Data Analysis for Business, Economics, and Policy
by Gabor Bekes and Gabor Kezdi
 
Cambridge University Press 2021

**[gabors-data-analysis.com](https://gabors-data-analysis.com/)**

 License: Free to share, modify and use for educational purposes. 
 Not to be used for commercial purposes.

### Chapter 16
**CH16A Predicting apartment prices with random forest**

using the airbnb dataset

version 0.92 2021-07-05

In [None]:
import os
import re
import sys
import warnings
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
from skimpy import skim

# Suppress every warning category for the rest of the session.
warnings.filterwarnings("ignore")


-------------------------------------------------------
### Import data

In [None]:
# Download the listings CSV; its first column is used as the row index.
data = pd.read_csv(
    filepath_or_buffer="https://osf.io/7n96w/download",
    index_col=0,
)

In [None]:
# Preview the first rows of the loaded data.
data.head()

Keep if property type is Apartment, House or Townhouse

In [None]:
# Frequency of each property type before any filtering.
data["property_type"].value_counts()


In [None]:
# Restrict the sample to apartments, houses and townhouses.
kept_property_types = ["Apartment", "House", "Townhouse"]
data = data[data["property_type"].isin(kept_property_types)]

Rename Townhouse to House

In [None]:
# Fold townhouses into the "House" category, then store the result as a factor.
is_townhouse = data["property_type"] == "Townhouse"
data["property_type"] = np.where(is_townhouse, "House", data["property_type"])
data["f_property_type"] = data["property_type"].astype("category")

In [None]:
# Frequency of each room type.
data["room_type"].value_counts()

Room type as factor

In [None]:
# Categorical (factor) version of the room type.
data["f_room_type"] = pd.Categorical(data["room_type"])

Shorten the room type labels because they are too long

In [None]:
# Shorter display labels for the three room types.
room_type_labels = {
    "Entire home/apt": "Entire/Apt",
    "Private room": "Private",
    "Shared room": "Shared",
}
data["f_room_type2"] = data["f_room_type"].map(room_type_labels)

Cancellation policy as factor

In [None]:
# Frequency of each cancellation policy before recoding.
data["cancellation_policy"].value_counts()

If cancellation policy is super strict 30 or 60, rename it as strict

In [None]:
# Collapse both "super strict" variants into plain "strict", then build the factor.
is_super_strict = data["cancellation_policy"].isin(
    ["super_strict_30", "super_strict_60"]
)
data["cancellation_policy"] = np.where(
    is_super_strict, "strict", data["cancellation_policy"]
)
data["f_cancellation_policy"] = data["cancellation_policy"].astype("category")

 bed_type and neighbourhood_cleansed as factors


In [None]:
# Futons, pull-out sofas and airbeds are pooled into a single "Couch" bed type.
couch_like = data["bed_type"].isin(["Futon", "Pull-out Sofa", "Airbed"])
data["bed_type"] = np.where(couch_like, "Couch", data["bed_type"])

# Factor versions of the (recoded) bed type and of the neighbourhood.
data["f_bed_type"] = data["bed_type"].astype("category")
data["f_neighbourhood_cleansed"] = data["neighbourhood_cleansed"].astype("category")

---------

### Create Numerical variables

In [None]:
# usd_price_day copies price as it is at this point, i.e. before the comma stripping
# and float conversion that is applied to price further down.
# p_host_response_rate is host_response_rate with missing values set to 0, cast to int.
data = data.assign(
    usd_price_day=data["price"],
    p_host_response_rate=data["host_response_rate"].fillna(0).astype(int),
)

Rename cleaning_fee column

In [None]:
# Give the cleaning fee the "usd_" prefix used for monetary columns.
data = data.rename({"cleaning_fee": "usd_cleaning_fee"}, axis="columns")

Add new numeric columns from certain columns

In [None]:
# Raw columns that receive a numeric, "n_"-prefixed copy (order is preserved).
numericals = [
    "accommodates", "bathrooms", "review_scores_rating",
    "number_of_reviews", "guests_included", "reviews_per_month",
    "extra_people", "minimum_nights", "beds",
]

Create a numeric copy of each of these columns, with the name prefixed by n_

In [None]:
# Add an "n_"-prefixed numeric copy of each listed column; values that cannot be
# parsed as numbers become NaN.
numeric_copies = {
    "n_" + name: pd.to_numeric(data[name], errors="coerce") for name in numericals
}
data = data.assign(**numeric_copies)

Create days since first review

In [None]:
# Days between the scrape date and the listing's first review.
# Both columns are parsed in one vectorised call each instead of a row-wise strptime;
# a missing first_review becomes NaT, so the difference is NaN for those rows without
# needing a placeholder date that has to be undone afterwards.
# The final cast keeps the column float-typed even when no value is missing.
data["n_days_since"] = (
    pd.to_datetime(data["calendar_last_scraped"], format="%Y-%m-%d")
    - pd.to_datetime(data["first_review"], format="%Y-%m-%d")
).dt.days.astype(float)


Create dummy vars

In [None]:
# Take the columns at positions 71 to 120 (inclusive) as the dummy-variable block.
# NOTE(review): this is a positional slice, so it depends on the column order of the
# input file and on how many columns were added above — verify if either changes.
dummies = data.columns[71:121]

Rename columns

In [None]:
# Add a "d_"-prefixed copy of every dummy column under a cleaned name: slashes,
# whitespace and hyphens are removed, "(s)" becomes "s", and the name is lower-cased.
# The pattern is a raw string so that "\s" reaches the regex engine rather than being
# parsed as an (invalid) string escape; it is compiled once outside the loop.
separator_pattern = re.compile(r"/|\s|-")
for col in dummies:
    clean_name = separator_pattern.sub("", col).replace("(s)", "s").lower()
    data["d_" + clean_name] = data[col]


Keep columns whose names start with d_, n_, f_, p_ or usd_, plus a few others

In [None]:
# Keep the engineered columns (prefixes d_, n_, f_, p_, usd_) followed by a handful
# of raw columns.
engineered_cols = data.filter(regex="^d_.*|^n_.*|^f_.*|^p_.*|^usd_.*")
raw_cols = data[
    [
        "price",
        "id",
        "neighbourhood_cleansed",
        "cancellation_policy",
        "room_type",
        "property_type",
    ]
]
data = pd.concat([engineered_cols, raw_cols], axis=1)


### Take a look at price

In [None]:
# Remove commas from the price strings so they can be cast to float,
# then keep only listings priced below 1000.
price_text = data["price"].str.replace(",", "")
data["price"] = price_text.astype(float)

data = data[data["price"] < 1000]


In [None]:
# Summary statistics of price (skim is given a one-column DataFrame).
skim(data[["price"]])

In [None]:
# Histogram of price in levels.
data["price"].hist()

In [None]:
# Histogram of the natural log of price.
np.log(data["price"]).hist()

Squares and further values to create

In [None]:
# Functional forms: square and log of accommodates, log of beds,
# and log of (number of reviews + 1).
log_accommodates = np.log(data["n_accommodates"])
data = data.assign(
    n_accommodates2=data["n_accommodates"] ** 2,
    ln_accommodates=log_accommodates,
    ln_accommodates2=log_accommodates**2,
    ln_beds=np.log(data["n_beds"]),
    ln_number_of_reviews=np.log(data["n_number_of_reviews"] + 1),
)


Pool accommodations by number of bathrooms into three groups: [0, 1), [1, 2) and [2, 10)

In [None]:
# Pool the number of bathrooms into three left-closed bins: [0, 1), [1, 2), [2, 10).
bins = pd.IntervalIndex.from_tuples([(0, 1), (1, 2), (2, 10)], closed="left")
# pd.cut ignores `labels` when `bins` is an IntervalIndex, so the interval categories
# are renamed to 0/1/2 afterwards. rename_categories is used because assigning to
# `.categories` directly is no longer supported by current pandas.
f_bath = pd.cut(data["n_bathrooms"].to_list(), bins).rename_categories([0, 1, 2])
data["f_bathroom"] = f_bath

# Show the group sizes, including values that fell outside every bin (NaN).
f_bath.value_counts(dropna=False)


Pool the number of reviews into 3 categories: none, 1-50, and 51 or more

In [None]:
# Pool the number of reviews into three left-closed bins: none, 1-50, 51 or more.
# The top bin ends one above the observed maximum: the bins are open on the right,
# so using the maximum itself as the edge would leave the listing(s) with the most
# reviews outside every bin (NaN).
# Series.max() is used instead of the builtin max() because it skips missing values.
top_edge = data["n_number_of_reviews"].max() + 1
bins = pd.IntervalIndex.from_tuples([(0, 1), (1, 51), (51, top_edge)], closed="left")
# pd.cut ignores `labels` for IntervalIndex bins; rename the categories explicitly
# (direct assignment to `.categories` is no longer supported by current pandas).
fnor = pd.cut(data["n_number_of_reviews"].to_list(), bins).rename_categories([0, 1, 2])
data["f_number_of_reviews"] = fnor
data["f_number_of_reviews"].value_counts(dropna=False)


Pool and categorize the number of minimum nights: 1, 2, and 3 or more

In [None]:
# Pool the minimum number of nights into three left-closed bins: 1, 2, 3 or more.
# The top bin ends one above the observed maximum: the bins are open on the right,
# so using the maximum itself as the edge would leave the listing(s) with the largest
# minimum stay outside every bin (NaN).
# Series.max() is used instead of the builtin max() because it skips missing values.
top_edge = data["n_minimum_nights"].max() + 1
bins = pd.IntervalIndex.from_tuples([(1, 2), (2, 3), (3, top_edge)], closed="left")
# pd.cut ignores `labels` for IntervalIndex bins; rename the categories explicitly
# (direct assignment to `.categories` is no longer supported by current pandas).
f_min_n = pd.cut(data["n_minimum_nights"].to_list(), bins).rename_categories([1, 2, 3])
data["f_minimum_nights"] = f_min_n
data["f_minimum_nights"].value_counts(dropna=False)


Replace infinite values with NaNs

In [None]:
# Turn +inf and -inf into NaN so they are handled as missing values from here on.
data = data.replace(to_replace=[np.inf, -np.inf], value=np.nan)

___

Where do we have missing values now?

In [None]:
# Count missing values per column and show only the columns that have any.
missing_per_column = data.isna().sum()
missing_per_column[missing_per_column > 0]

What to do with missing values?

1. drop if no target

In [None]:
# Drop observations whose target (price) is missing.
data = data.dropna(subset=["price"])

2. Impute when only a few values are missing and the variable is not that important

In [None]:
# Simple imputations:
#   n_bathrooms      -> median of the observed values
#   n_beds           -> the listing's n_accommodates
#   factor variables -> category 1
#   ln_beds          -> 0
median_bathrooms = data["n_bathrooms"].median()
data = data.assign(
    n_bathrooms=data["n_bathrooms"].fillna(median_bathrooms),
    n_beds=np.where(data["n_beds"].isnull(), data["n_accommodates"], data["n_beds"]),
    f_bathroom=data["f_bathroom"].fillna(1),
    f_minimum_nights=data["f_minimum_nights"].fillna(1),
    f_number_of_reviews=data["f_number_of_reviews"].fillna(1),
    ln_beds=data["ln_beds"].fillna(0),
)


3. Drop columns when many values are missing and the variable is not important

In [None]:
# Remove the two columns that are dropped rather than imputed.
data = data.drop(columns=["usd_cleaning_fee", "p_host_response_rate"])

In [None]:
# Re-check which columns still contain missing values.
missing_per_column = data.isna().sum()
missing_per_column[missing_per_column > 0]

4. Replace missing review-related values with the median when there is no review, and add flags

In [None]:
# For the review-related variables: store a 0/1 "was missing" flag, then fill the
# gaps with the median of the observed values. n_number_of_reviews only gets a flag.
days_since = data["n_days_since"]
rating = data["n_review_scores_rating"]
reviews_per_month = data["n_reviews_per_month"]

data = data.assign(
    flag_days_since=days_since.isnull().astype(int),
    n_days_since=days_since.fillna(days_since.median()),
    flag_review_scores_rating=rating.isnull().astype(int),
    n_review_scores_rating=rating.fillna(rating.median()),
    flag_reviews_per_month=reviews_per_month.isnull().astype(int),
    n_reviews_per_month=reviews_per_month.fillna(reviews_per_month.median()),
    flag_n_number_of_reviews=data["n_number_of_reviews"].isnull().astype(int),
)

In [None]:
# How many listings had a missing n_days_since (flag = 1) versus an observed one.
data["flag_days_since"].value_counts()


Redo features

Create variables, measuring the time since: squared, cubic, logs

In [None]:
# Functional forms of the time since first review (log of days + 1 and its powers,
# plus square and cube of the level) and the log of the review score.
log_days = np.log(data["n_days_since"] + 1)
data = data.assign(
    ln_days_since=log_days,
    ln_days_since2=log_days**2,
    ln_days_since3=log_days**3,
    n_days_since2=data["n_days_since"] ** 2,
    n_days_since3=data["n_days_since"] ** 3,
    ln_review_scores_rating=np.log(data["n_review_scores_rating"]),
)


In [None]:
# Zero-fill any missing values in the log-transformed "days since first review" terms.
# Each column is filled in place; in particular ln_days_since is written back to
# ln_days_since, so n_days_since keeps its level values and stays consistent with
# n_days_since2 / n_days_since3.
for log_col in ("ln_days_since", "ln_days_since2", "ln_days_since3"):
    data[log_col] = data[log_col].fillna(0)

In [None]:
# Final check for columns that still contain missing values.
missing_per_column = data.isna().sum()
missing_per_column[missing_per_column > 0]

In [None]:
# Descriptive statistics of the cleaned data.
data.describe()


In [None]:
# skimpy summary of every column in the cleaned data.
skim(data)

In [None]:
# Write the cleaned work file (without the index).
# The output directory is created first so the save does not fail on a fresh checkout.
output_path = Path("data") / "airbnb_london_workfile_adj.csv"
output_path.parent.mkdir(parents=True, exist_ok=True)
data.to_csv(output_path, index=False)