# Lecture 19 – part I

## Data preparation 

   - Data cleaning & refactoring   
   - Basic feature engineering
                        
#### Case Study:                                 
  - CH14B Predicting AirBnB apartment prices: selecting a regression model       

####  Dataset:       
    airbnb
---

In [None]:
import os
import sys
import warnings

import numpy as np
import pandas as pd
from skimpy import skim
from mizani.formatters import percent_format
from plotnine import *

warnings.filterwarnings("ignore")

Import data

In [None]:
# Load the Hackney Airbnb workfile (path relative to the working directory)
# into the notebook-wide DataFrame `data`.
data = pd.read_csv("data/airbnb_hackney_workfile_adj_book1.csv")


In [None]:
# Summarize the freshly loaded frame with skimpy's skim().
skim(data)


Which variables have missing values now?

In [None]:
# Count missing values per column, then show only the names of the columns
# that have at least one missing value.
to_filter = data.isnull().sum()
to_filter.loc[to_filter.gt(0)].index


What to do with missing values?

1. Drop observations with a missing target (already done)

In [None]:
# Keep only the rows whose target variable (price) is present.
data = data[data["price"].notna()]


2. Impute when only a few values are missing and the variable is not that important

In [None]:
# Impute the sparsely missing columns in one pass:
#   n_bathrooms -> median of its observed values
#   n_beds      -> the row's n_accommodates value
#   f_bathroom, f_minimum_nights, f_number_of_reviews -> 1
#   ln_beds     -> 0
data = data.assign(
    n_bathrooms=lambda df: df["n_bathrooms"].fillna(df["n_bathrooms"].median()),
    n_beds=lambda df: df["n_beds"].fillna(df["n_accommodates"]),
    f_bathroom=lambda df: df["f_bathroom"].fillna(1),
    f_minimum_nights=lambda df: df["f_minimum_nights"].fillna(1),
    f_number_of_reviews=lambda df: df["f_number_of_reviews"].fillna(1),
    ln_beds=lambda df: df["ln_beds"].fillna(0),
)


In [None]:
# Summary statistics of n_bathrooms, shown as the cell output.
data["n_bathrooms"].describe()


 3. Drop columns when many values are missing and the variable is not important

In [None]:
# Remove three columns from the frame.
data = data.drop(
    columns=["usd_cleaning_fee", "p_host_response_rate", "d_reviews_per_month"]
)

 Which variables still have missing values?

In [None]:
# Recount missing values per column after the imputation and column drops;
# show the names of the columns that still have missing values.
to_filter = data.isnull().sum()
to_filter.loc[to_filter.gt(0)].index


In [None]:
# Add 0/1 indicator columns marking the rows where the matching n_* review
# variable is missing. This has to run while the n_* columns still contain
# their missing values, i.e. before they are imputed.
for var in ["flag_days_since", "flag_review_scores_rating", "flag_reviews_per_month"]:
    # "flag_<x>" is derived from "n_<x>"; astype(int) converts the boolean
    # missing-value mask to 0/1 without a Python-level loop over the rows.
    data[var] = data[var.replace("flag", "n")].isna().astype(int)


 4. Replace missing review-related variables with the column median when there is no review + add flags

In [None]:
# Fill the gaps in the three review-related columns with each column's own
# median, computed over its non-missing values.
data = data.assign(
    n_days_since=lambda df: df["n_days_since"].fillna(df["n_days_since"].median()),
    n_review_scores_rating=lambda df: df["n_review_scores_rating"].fillna(
        df["n_review_scores_rating"].median()
    ),
    n_reviews_per_month=lambda df: df["n_reviews_per_month"].fillna(
        df["n_reviews_per_month"].median()
    ),
)


In [None]:
# Frequency table of the flag_days_since indicator.
data["flag_days_since"].value_counts()


Add features -> different functional forms

Create variables, measuring the time since: squared, cubic, logs

In [None]:
# Functional forms of n_days_since: log(1 + x) and its square and cube, plus
# the raw square and cube, and the log of the review score.
# Any missing value in the three log terms is replaced by 0.
data = data.assign(
    ln_days_since=lambda df: np.log(df["n_days_since"] + 1).fillna(0),
    ln_days_since2=lambda df: (np.log(df["n_days_since"] + 1) ** 2).fillna(0),
    ln_days_since3=lambda df: (np.log(df["n_days_since"] + 1) ** 3).fillna(0),
    n_days_since2=lambda df: df["n_days_since"] ** 2,
    n_days_since3=lambda df: df["n_days_since"] ** 3,
    ln_review_scores_rating=lambda df: np.log(df["n_review_scores_rating"]),
)

In [None]:
# Look at price: summary statistics of the target variable (cell output)
data["price"].describe()


In [None]:
# Final check: per-column missing-value counts, restricted to the columns
# that still have at least one missing value.
to_filter = data.isnull().sum()
to_filter.loc[to_filter.gt(0)]


## Business logic- define our prediction problem

Decision – size: we need a normal apartment for 1–7 persons


In [None]:
# Restrict the sample to rows with n_accommodates below 8.
data = data[data["n_accommodates"].lt(8)]


That's gonna be our sample

In [None]:
# Summarize the final working sample with skimpy's skim().
skim(data)

### Descriptive statistics

 How is the average price changing in my district by `property_type`, `room_type` and the `bed_type`?


In [None]:
# Mean price for every (property type, room type) combination.
# The aggregation is named by string because pandas has deprecated passing
# NumPy callables such as np.mean to agg(); "mean" gives the same group mean.
data.groupby(["f_property_type", "f_room_type"]).agg(mean_price=("price", "mean"))

In [None]:
# Mean price per bed type.
# The aggregation is named by string because pandas has deprecated passing
# NumPy callables such as np.mean to agg(); "mean" gives the same group mean.
data.groupby(["f_bed_type"]).agg(mean_price=("price", "mean"))


In [None]:
# Summary statistics of price in the restricted sample (cell output).
data.price.describe()


For the plots, we exclude extreme values of price

In [None]:
# Plotting sample: rows with price below 400.
datau = data[data["price"].lt(400)]


In [None]:
# Histogram of price on the plotting sample (price below 400).
# Bins are 10 wide, start at 0 and are closed on the left; the y-axis shows
# each bin's share of all observations, formatted as a percentage.
(
    ggplot(datau, aes(x="price"))
    + geom_histogram(
        # after_stat() replaces the legacy "stat(count)" string syntax; the
        # computed value (bin count / total count) is the same.
        aes(y=after_stat("count / sum(count)")),
        binwidth=10,
        fill="blue",
        color="white",
        alpha=0.8,
        boundary=0,
        closed="left",
    )
    + labs(x="Price (US dollars)", y="Percent")
    + scale_y_continuous(
        expand=(0.00, 0.00),
        limits=(0, 0.15),
        breaks=np.arange(0, 0.16, 0.03),
        labels=percent_format(),
    )
    + scale_x_continuous(
        expand=(0.00, 0.00), limits=(0, 400), breaks=np.arange(0, 401, 50)
    )
    + theme_bw()
)

In [None]:
# Histogram of ln_price on the plotting sample.
# Bins are 0.2 wide and closed on the left; the y-axis shows each bin's share
# of all observations, formatted as a percentage.
(
    ggplot(datau, aes(x="ln_price"))
    + geom_histogram(
        # after_stat() replaces the legacy "stat(count)" string syntax; the
        # computed value (bin count / total count) is the same.
        aes(y=after_stat("count / sum(count)")),
        binwidth=0.2,
        fill="blue",
        color="white",
        alpha=0.8,
        boundary=0,
        closed="left",
    )
    # coord_cartesian zooms the x-axis without touching the data behind the bins
    + coord_cartesian(xlim=(2.5, 6.5))
    + scale_y_continuous(
        expand=(0.00, 0.00),
        limits=(0, 0.16),
        breaks=np.arange(0, 0.16, 0.05),
        labels=percent_format(),
    )
    + scale_x_continuous(expand=(0.00, 0.01), breaks=np.arange(2.4, 6.7, 0.6))
    + labs(x="ln(price, US dollars)", y="Percent")
    + theme_bw()
)


Boxplot of price by room type

In [None]:
# Boxplot of price by room type on the plotting sample (datau).
# Two layers share the same grouping: stat_boxplot with geom="errorbar" adds
# error bars from the boxplot statistics, and geom_boxplot draws the boxes.
# NOTE(review): the three-element colour tuples presumably line up with three
# room-type groups — confirm against the data.
# NOTE(review): the y scale is limited to (0, 300) while datau keeps prices up
# to 400; presumably the out-of-range rows are dropped by the scale — confirm.
(
    ggplot(datau, aes(x="f_room_type", y="price"))
    + stat_boxplot(
        aes(group="f_room_type"),
        geom="errorbar",
        width=0.3,
        color=("red", "blue", "black"),
        size=0.5,
        na_rm=True,
    )
    + geom_boxplot(
        aes(group="f_room_type"),
        color=("red", "blue", "black"),
        fill=("red", "blue", "black"),
        size=0.5,
        width=0.6,
        alpha=0.3,
        na_rm=True,
        outlier_alpha=0,  # to remove outliers
    )
    + scale_y_continuous(
        expand=(0.01, 0.01), limits=(0, 300), breaks=np.arange(0, 301, 100)
    )
    + labs(x="Room type", y="Price (US dollars)")
    + theme_bw()
)

In [None]:
# Boxplot of price by n_accommodates (as a discrete axis), with fill and
# outline colour split by property type, on the plotting sample (datau).
# NOTE(review): the manual colour/fill scales supply two values, so this
# presumably expects exactly two property types — confirm against the data.
# NOTE(review): the x-axis label spells "Accomodates"; the usual spelling is
# "Accommodates".
(
    ggplot(
        datau,
        aes(
            x="factor(n_accommodates)",
            y="price",
            fill="factor(f_property_type)",
            color="factor(f_property_type)",
        ),
    )
    # outlier_alpha=0 makes the outlier points fully transparent
    + geom_boxplot(alpha=0.3, na_rm=True, outlier_alpha=0, width=0.8, stat="boxplot")
    + stat_boxplot(geom="errorbar", width=0.8, size=0.3, na_rm=True)
    + scale_color_manual(name=" ", values=("red", "blue"))
    + scale_fill_manual(name=" ", values=("red", "blue"))
    + labs(x="Accomodates (Persons)", y="Price (US dollars)")
    + scale_y_continuous(
        expand=(0.01, 0.01), limits=(0, 400), breaks=np.arange(0, 401, 50)
    )
    + theme_bw()
    + theme(
        # legend placed inside the panel, with no title and no background box
        legend_position=(0.3, 0.78),
        legend_title=element_blank(),
        legend_background=element_blank(),
    )
)

Save data for part II

In [None]:
# Write the prepared frame to CSV for part II; index=False leaves the row
# index out of the file.
data.to_csv("data/airbnb_hackney_work.csv",index=False)