#### Prepared for Gabor's Data Analysis

### Data Analysis for Business, Economics, and Policy
by Gabor Bekes and Gabor Kezdi
 
Cambridge University Press 2021

**[gabors-data-analysis.com ](https://gabors-data-analysis.com/)**

 License: Free to share, modify and use for educational purposes. 
 Not to be used for commercial purposes.

### Chapter 16
**CH16A Predicting apartment prices with random forest**

using the airbnb dataset

version 0.9.0 2025-08-14

In [53]:
import os
import re
import sys
import warnings
from datetime import datetime

import numpy as np
import pandas as pd

warnings.filterwarnings("ignore")

In [54]:
# Current script folder
current_path = os.getcwd()
dirname = current_path.split("da_case_studies")[0]

# location folders
data_in = dirname + "da_data_repo/airbnb/clean/"
data_out = dirname + "da_case_studies/ch16-airbnb-random-forest/"
output = dirname + "da_case_studies/ch16-airbnb-random-forest/output/"
func = dirname + "da_case_studies/ch00-tech-prep/"
sys.path.append(func)

In [55]:
import py_helper_functions as da

-------------------------------------------------------
### Import data

In [56]:
area = "london"
data = pd.read_csv(data_in + "airbnb_" + area + "_cleaned_book.csv", index_col=0)
#data = pd.read_csv("https://osf.io/download/7n96w/", index_col=0)


In [57]:
data["property_type"].value_counts()


property_type
Apartment             38270
House                 13055
Bed & Breakfast        1066
Townhouse               372
Other                   267
Loft                    254
Dorm                    127
Guesthouse               81
Boat                     69
Serviced apartment       65
Condominium              56
Bungalow                 47
Boutique hotel           35
Hostel                   32
Cabin                    32
Villa                    12
Camper/RV                 9
Chalet                    9
Yurt                      4
Hut                       3
Castle                    3
Tent                      2
Parking Space             2
Ryokan (Japan)            1
Lighthouse                1
Igloo                     1
Cave                      1
Name: count, dtype: int64

In [58]:
# keep if property type is Apartment, House or Townhouse
data = data.loc[lambda x: x["property_type"].isin(["Apartment", "House", "Townhouse"])]


In [59]:
# rename Townhouse to House
# f_ = Factor (Categorical variable)

data["property_type"] = np.where(
    data["property_type"] == "Townhouse", "House", data["property_type"]
)
data["f_property_type"] = data["property_type"].astype("category")


In [60]:
data["room_type"].value_counts()


room_type
Entire home/apt    26742
Private room       24415
Shared room          540
Name: count, dtype: int64

In [61]:
# Room type as factor

data["f_room_type"] = data["room_type"].astype("category")


In [62]:
# Rename roomt type because it is too long
data["f_room_type2"] = data["f_room_type"].map(
    {
        "Entire home/apt": "Entire/Apt",
        "Private room": "Private",
        "Shared room": "Shared",
    }
)


In [63]:
# cancellation policy as factor
data["cancellation_policy"].value_counts()


cancellation_policy
strict             21287
flexible           18435
moderate           11959
super_strict_30       15
super_strict_60        1
Name: count, dtype: int64

In [64]:
# if cancellation policy is super strict 30 or 60, rename it as strict
data["cancellation_policy"] = np.where(
    (data["cancellation_policy"] == "super_strict_30")
    | (data["cancellation_policy"] == "super_strict_60"),
    "strict",
    data["cancellation_policy"],
)
data["f_cancellation_policy"] = data["cancellation_policy"].astype("category")


In [65]:
# bed_type and neighbourhood_cleansed as factors

data["bed_type"] = np.where(
    data["bed_type"].isin(["Futon", "Pull-out Sofa", "Airbed"]),
    "Couch",
    data["bed_type"],
)

data["f_bed_type"] = data["bed_type"].astype("category")
data["f_neighbourhood_cleansed"] = data["neighbourhood_cleansed"].astype("category")


---------

### Create Numerical variables

In [66]:
data["usd_price_day"] = data["price"]
data["p_host_response_rate"] = data["host_response_rate"].fillna(0).astype(int)
# rename cleaning_fee column

data = data.rename(columns={"cleaning_fee": "usd_cleaning_fee"})


### üïµÔ∏è‚Äç‚ôÄÔ∏è Bellonda's Logic Decoder: Type Enforcement & Sanitization
**The Syntax Anatomy:**
* `data[col]` (**Dirty Series**) $\to$ `pd.to_numeric(...)` (**The Sanitizer**)
* `errors="coerce"`: **CRITICAL.** If value cannot be parsed, replace with `NaN`. Do not raise error.

**Data Flow:** `(N_rows, Mixed_Type)` $\to$ `(N_rows, Float64)`
**Student Note:** The prefix `n_` marks these as "safe" for the Random Forest. Any generated `NaN`s must be handled (imputed) before modeling.

In [67]:
# add new numeric columns from certain columns
# n_ = Numeric

numericals = [
    "accommodates",
    "bathrooms",
    "review_scores_rating",
    "number_of_reviews",
    "guests_included",
    "reviews_per_month",
    "extra_people",
    "minimum_nights",
    "beds",
]

for col in numericals:
    data["n_" + col] = pd.to_numeric(data[col], errors="coerce")

# # Vectorized approach using apply
# data[["n_" + col for col in numericals]] = data[numericals].apply(pd.to_numeric, errors="coerce")



In [68]:
data.head()

Unnamed: 0,id,scrape_id,host_id,host_name,host_since,host_response_rate,host_acceptance_rate,host_is_superhost,host_neighbourhood,host_listings_count,host_total_listings_count,host_verifications,host_has_profile_pic,host_identity_verified,street,neighbourhood,neighbourhood_cleansed,neighbourhood_group_cleansed,city,state,zipcode,market,smart_location,country_code,country,latitude,longitude,is_location_exact,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,square_feet,price,weekly_price,monthly_price,security_deposit,usd_cleaning_fee,guests_included,extra_people,minimum_nights,maximum_nights,calendar_updated,has_availability,availability_30,availability_60,availability_90,availability_365,calendar_last_scraped,number_of_reviews,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,requires_license,license,jurisdiction_names,instant_bookable,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,reviews_per_month,24-hour check-in,Air conditioning,Breakfast,Buzzer/wireless intercom,Cable TV,Carbon monoxide detector,Cat(s),Dog(s),Doorman,Doorman Entry,Dryer,Elevator in building,Essentials,Family/kid friendly,Fire extinguisher,First aid kit,Free parking on premises,Free parking on street,Gym,Hair dryer,Hangers,Heating,Hot tub,Indoor fireplace,Internet,Iron,Keypad,Kitchen,Laptop friendly workspace,Lock on bedroom door,Lockbox,Other pet(s),Paid parking off premises,Pets allowed,Pets live on this property,Pool,Private entrance,Private living room,Safety card,Self Check-In,Shampoo,Smartlock,Smoke detector,Smoking allowed,Suitable for events,TV,Washer,Washer / Dryer,Wheelchair accessible,Wireless Internet,f_property_type,f_room_type,f_room_type2,f_cancellation_policy,f_bed_type,f_neighbourhood_cleansed,usd_price_day,p_host_response_rate,n_accommodates,n_bathrooms,n_review_scores_rating,n_number_of_reviews,n_guests_included,n_reviews_per_month,n_extra_people,n_minimum_nights,n_beds
1,15896822,20170304065726,69018624,Dafina,2016-04-26,100.0,,0.0,RB of Kingston upon Thames,1.0,1.0,"['email', 'phone', 'facebook']",1.0,0.0,"A Thames Street, Kingston upon Thames, England...",RB of Kingston upon Thames,Kingston upon Thames,,Kingston upon Thames,England,KT1 1PE,London,"Kingston upon Thames, United Kingdom",GB,United Kingdom,51.410034,-0.306323,1.0,Apartment,Private room,1.0,1.0,1.0,1.0,Real Bed,,23.0,,,,,1.0,8.0,7.0,1125.0,6 weeks ago,,1.0,31.0,61.0,61.0,2017-03-05,1.0,2016-12-03,2016-12-03,100.0,10.0,10.0,10.0,10.0,10.0,10.0,0.0,,,0.0,flexible,0.0,0.0,1.0,0.32,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,Apartment,Private room,Private,flexible,Real Bed,Kingston upon Thames,23.0,100,1.0,1.0,100.0,1.0,1.0,0.32,8.0,7.0,1.0
2,4836957,20170304065726,18154504,Anas,2014-07-15,100.0,,0.0,RB of Kingston upon Thames,1.0,1.0,"['email', 'phone', 'reviews', 'jumio']",1.0,1.0,"London Road, Kingston upon Thames, Greater Lon...",RB of Kingston upon Thames,Kingston upon Thames,,Kingston upon Thames,Greater London,KT2 6QS,London,"Kingston upon Thames, United Kingdom",GB,United Kingdom,51.411484,-0.290704,1.0,Apartment,Private room,2.0,1.0,1.0,1.0,Couch,,50.0,300.0,,,,1.0,0.0,1.0,1125.0,5 months ago,,29.0,59.0,89.0,364.0,2017-03-04,15.0,2015-05-03,2016-09-07,91.0,9.0,9.0,10.0,9.0,9.0,10.0,0.0,,,0.0,moderate,0.0,0.0,1.0,0.67,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,Apartment,Private room,Private,moderate,Couch,Kingston upon Thames,50.0,100,2.0,1.0,91.0,15.0,1.0,0.67,0.0,1.0,1.0
3,13355982,20170304065726,75741819,Maria,2016-06-04,,,0.0,RB of Kingston upon Thames,1.0,1.0,"['email', 'phone', 'reviews']",1.0,0.0,"Kingston Hill, Kingston upon Thames, KT2 7PW, ...",RB of Kingston upon Thames,Kingston upon Thames,,Kingston upon Thames,,KT2 7PW,,"Kingston upon Thames, United Kingdom",GB,United Kingdom,51.415852,-0.286496,1.0,Apartment,Private room,2.0,1.0,1.0,1.0,Real Bed,,24.0,,,400.0,,1.0,0.0,1.0,1125.0,8 months ago,,0.0,0.0,0.0,0.0,2017-03-05,2.0,2016-07-06,2016-07-27,80.0,10.0,8.0,10.0,10.0,10.0,8.0,0.0,,,0.0,flexible,0.0,0.0,1.0,0.25,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,Apartment,Private room,Private,flexible,Real Bed,Kingston upon Thames,24.0,0,2.0,1.0,80.0,2.0,1.0,0.25,0.0,1.0,1.0
4,13472704,20170304065726,77078182,Hannah,2016-06-11,,,0.0,RB of Kingston upon Thames,1.0,1.0,"['email', 'phone']",1.0,0.0,"Canbury Avenue, Kingston upon Thames, KT2 6JR,...",RB of Kingston upon Thames,Kingston upon Thames,,Kingston upon Thames,,KT2 6JR,,"Kingston upon Thames, United Kingdom",GB,United Kingdom,51.415722,-0.292246,1.0,House,Private room,2.0,1.5,1.0,1.0,Real Bed,,50.0,,,70.0,,1.0,0.0,2.0,1125.0,9 months ago,,0.0,0.0,0.0,0.0,2017-03-05,0.0,,,,,,,,,,0.0,,,0.0,flexible,0.0,0.0,1.0,,1,0,0,0,1,1,0,0,0,0,1,0,1,0,0,1,0,0,0,1,1,1,0,1,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,1,House,Private room,Private,flexible,Real Bed,Kingston upon Thames,50.0,0,2.0,1.5,,0.0,1.0,,0.0,2.0,1.0
5,17430865,20170304065726,113972982,Jung Kyung,2017-01-30,100.0,,0.0,RB of Kingston upon Thames,1.0,1.0,"['email', 'phone', 'google', 'jumio', 'offline...",1.0,1.0,"Kingston Road, New Malden, England KT3 3RX, Un...",RB of Kingston upon Thames,Kingston upon Thames,,New Malden,England,KT3 3RX,London,"New Malden, United Kingdom",GB,United Kingdom,51.404285,-0.275426,1.0,House,Private room,1.0,1.0,1.0,1.0,Real Bed,,25.0,,,,,1.0,0.0,1.0,14.0,5 days ago,,29.0,59.0,89.0,179.0,2017-03-05,0.0,,,,,,,,,,0.0,,,0.0,flexible,0.0,0.0,1.0,,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,House,Private room,Private,flexible,Real Bed,Kingston upon Thames,25.0,100,1.0,1.0,,0.0,1.0,,0.0,1.0,1.0


In [69]:
# # create days since first review

# data["n_days_since"] = (
#     data.calendar_last_scraped.apply(lambda x: datetime.strptime(x, "%Y-%m-%d"))
#     - data.first_review.fillna("1950-01-01").apply(
#         lambda x: datetime.strptime(x, "%Y-%m-%d")
#     )
# ).dt.days

# data["n_days_since"] = np.where(data.first_review.isnull(), np.nan, data.n_days_since)


### üïµÔ∏è‚Äç‚ôÄÔ∏è Bellonda's Logic Decoder: Feature Engineering (Tenure)
**The Syntax Anatomy:**
* `pd.to_datetime(Col_A)` - `pd.to_datetime(Col_B)` (**Vectorized Ops**) $\to$ `TimeDelta`
* `.dt.days`: Extracts the scalar "Days" from the duration.

**Data Flow:** `(N, String) - (N, String)` $\to$ `(N, Float64)`
**Student Note:** Returns Float64 because `NaN` values (missing reviews) prevent integer storage.

In [70]:
# Bellonda's Vectorized Alternative
# No loops, no "1950" hacks, no np.where fixing.
data["n_days_since"] = (
    pd.to_datetime(data["calendar_last_scraped"], format="%Y-%m-%d")
    - pd.to_datetime(data["first_review"], format="%Y-%m-%d", errors="coerce")
).dt.days

In [71]:
f"Percentage of missing values in 'n_days_since': {data['n_days_since'].isna().sum() / len(data) * 100:.2f}%"

"Percentage of missing values in 'n_days_since': 30.51%"

### üïµÔ∏è‚Äç‚ôÄÔ∏è Bellonda's Logic Decoder: Namespace Standardization
**The Syntax Anatomy:**
* `data.columns[X:Y]` (**Positional Slicing**) $\to$ `dummies`
* `re.sub` (**Regex Sanitization**) $\to$ Removes ` ` , `/`, `-`.

**Data Flow:** `(N, 50)` $\to$ `(N, 100)` (Duplicates columns with new names)
**Student Note:** Creating copies doubles memory usage for these columns. Renaming is usually preferred.

In [72]:
# create dummy vars
dummies = data.columns[71:121]

for col in dummies:
    data["d_" + (re.sub("/|\s|-", "", col)).replace("(s)", "s").lower()] = data[col]


### üïµÔ∏è‚Äç‚ôÄÔ∏è Bellonda's Logic Decoder: Feature Subsetting
**The Syntax Anatomy:**
* `regex="^d_.*..."` (**Pattern Match**) $\to$ Auto-selects engineering features.
* `pd.concat(..., axis=1)` (**Column Bind**) $\to$ Merges features + admin IDs.

**Data Flow:** `(N, Many_Cols)` $\to$ `(N, Curated_Cols)`
**Student Note:** This step effectively "Cleans" the workspace by dropping all intermediate variables not matching the naming convention.

In [73]:
# keep columns if contain d_, n_,f_, p_, usd_ and some others

data = pd.concat(
    [
        data.filter(
            regex="^d_.*|^n_.*|^f_.*|^p_.*|^usd_.*",
        ),
        data[
            [
                "price",
                "id",
                "neighbourhood_cleansed",
                "cancellation_policy",
                "room_type",
                "property_type",
            ]
        ],
    ],
    axis=1,
)


In [74]:
#####################
### look at price ###
#####################

data["price"] = data["price"].str.replace(",", "").astype(float)

data = data.loc[lambda x: x.price < 1000]


In [75]:
# Squares and further values to create
data = data.assign(
    n_accommodates2=lambda x: x["n_accommodates"] ** 2,
    ln_accommodates=lambda x: np.log(x["n_accommodates"]),
    ln_accommodates2=lambda x: np.log(x["n_accommodates"]) ** 2,
    ln_beds=lambda x: np.log(x["n_beds"]),
    ln_number_of_reviews=lambda x: np.log(x["n_number_of_reviews"] + 1),
)


### üïµÔ∏è‚Äç‚ôÄÔ∏è Bellonda's Logic Decoder: Variable Binning (Discretization)
**The Syntax Anatomy:**
* `Variable` + `[(Start, End), ...]` (**Bin Definitions**) $\to$ `Category Label`

**Data Flow:** `(N, Float)` $\to$ `(N, Categorical Int)`
**Student Note:** Categorizing prevents the model from interpolating. It treats "1 bathroom" and "2 bathrooms" as distinct entities with potentially totally different price behaviors.

In [76]:
# Pool accomodations with 0,1,2,10 bathrooms

data["f_bathroom"] = da.pool_and_categorize_continuous_variable(
    data["n_bathrooms"], [(0, 1), (1, 2), (2, 10)], [0, 1, 2]
)

data["f_bathroom"].value_counts(dropna=False)


f_bathroom
1      41417
2       9628
0        364
NaN      237
Name: count, dtype: int64

In [77]:
# Pool num of reviews to 3 categories: none, 1-51 and >51

data["f_number_of_reviews"] = da.pool_and_categorize_continuous_variable(
    data["n_number_of_reviews"],
    [(0, 1), (1, 51), (51, data["n_number_of_reviews"].max())],
    [0, 1, 2],
)
data["f_number_of_reviews"].value_counts(dropna=False)

f_number_of_reviews
1      32683
0      15741
2       3221
NaN        1
Name: count, dtype: int64

In [78]:
# Pool and categorize the number of minimum nights: 1,2,3, 3+

data["f_minimum_nights"] = da.pool_and_categorize_continuous_variable(
    data["n_minimum_nights"], [(1, 2), (2, 3), (3, data["n_minimum_nights"].max())], [1, 2, 3]
)
data["f_minimum_nights"].value_counts(dropna=False)


f_minimum_nights
1      19454
3      18075
2      14116
NaN        1
Name: count, dtype: int64

In [79]:
# Change Infinite values with NaNs
data = data.replace([np.inf, -np.inf], np.nan)


In [80]:
# ------------------------------------------------------------------------------------------------
# where do we have missing variables now?
to_filter = data.isna().sum()
to_filter[to_filter > 0]


usd_cleaning_fee          20017
n_bathrooms                 237
n_review_scores_rating    16501
n_reviews_per_month       15741
n_beds                      167
n_days_since              15741
ln_beds                     168
f_bathroom                  237
f_number_of_reviews           1
f_minimum_nights              1
dtype: int64

In [81]:
# what to do with missing values?
# 1. drop if no target
data = data.loc[lambda x: x.price.notnull()]


In [82]:
# 2. imput when few, not that important
data = data.assign(
    n_bathrooms=lambda x: x["n_bathrooms"].fillna(np.median(x["n_bathrooms"].dropna())),
    n_beds=lambda x: np.where(x["n_beds"].isnull(), x["n_accommodates"], x["n_beds"]),
    f_bathroom=lambda x: x["f_bathroom"].fillna(1),
    f_minimum_nights=lambda x: x["f_minimum_nights"].fillna(1),
    f_number_of_reviews=lambda x: x["f_number_of_reviews"].fillna(1),
    ln_beds=lambda x: x["ln_beds"].fillna(0),
)


In [83]:
# 3. drop columns when many missing not important
data = data.drop(["usd_cleaning_fee", "p_host_response_rate"], axis=1)


In [84]:
to_filter = data.isna().sum()
to_filter[to_filter > 0]


n_review_scores_rating    16501
n_reviews_per_month       15741
n_days_since              15741
dtype: int64

In [85]:
# 4. Replace missing variables re reviews with zero, when no review + add flags
data = data.assign(
    flag_days_since=np.multiply(data.n_days_since.isna(), 1),
    n_days_since=data.n_days_since.fillna(np.median(data.n_days_since.dropna())),
    flag_review_scores_rating=np.multiply(data.n_review_scores_rating.isna(), 1),
    n_review_scores_rating=data.n_review_scores_rating.fillna(
        np.median(data.n_review_scores_rating.dropna())
    ),
    flag_reviews_per_month=np.multiply(data.n_reviews_per_month.isna(), 1),
    n_reviews_per_month=data.n_reviews_per_month.fillna(
        np.median(data.n_reviews_per_month.dropna())
    ),
    flag_n_number_of_reviews=np.multiply(data.n_number_of_reviews.isna(), 1),
)


In [86]:
data.flag_days_since.value_counts()


flag_days_since
0    35905
1    15741
Name: count, dtype: int64

In [87]:
# redo features
# Create variables, measuring the time since: squared, cubic, logs
data = data.assign(
    ln_days_since=lambda x: np.log(x["n_days_since"] + 1),
    ln_days_since2=lambda x: np.log(x["n_days_since"] + 1) ** 2,
    ln_days_since3=lambda x: np.log(x["n_days_since"] + 1) ** 3,
    n_days_since2=lambda x: x["n_days_since"] ** 2,
    n_days_since3=lambda x: x["n_days_since"] ** 3,
    ln_review_scores_rating=lambda x: np.log(x["n_review_scores_rating"]),
)


In [88]:
data.ln_days_since = data["ln_days_since"].fillna(0)
data.ln_days_since2 = data["ln_days_since2"].fillna(0)
data.ln_days_since3 = data["ln_days_since3"].fillna(0)


In [89]:
to_filter = data.isna().sum()
to_filter[to_filter > 0]


Series([], dtype: int64)

In [90]:
data.describe()


Unnamed: 0,n_accommodates,n_bathrooms,n_review_scores_rating,n_number_of_reviews,n_guests_included,n_reviews_per_month,n_extra_people,n_minimum_nights,n_beds,n_days_since,d_24hourcheckin,d_airconditioning,d_breakfast,d_buzzerwirelessintercom,d_cabletv,d_carbonmonoxidedetector,d_cats,d_dogs,d_doorman,d_doormanentry,d_dryer,d_elevatorinbuilding,d_essentials,d_familykidfriendly,d_fireextinguisher,d_firstaidkit,d_freeparkingonpremises,d_freeparkingonstreet,d_gym,d_hairdryer,d_hangers,d_heating,d_hottub,d_indoorfireplace,d_internet,d_iron,d_keypad,d_kitchen,d_laptopfriendlyworkspace,d_lockonbedroomdoor,d_lockbox,d_otherpets,d_paidparkingoffpremises,d_petsallowed,d_petsliveonthisproperty,d_pool,d_privateentrance,d_privatelivingroom,d_safetycard,d_selfcheckin,d_shampoo,d_smartlock,d_smokedetector,d_smokingallowed,d_suitableforevents,d_tv,d_washer,d_washerdryer,d_wheelchairaccessible,d_wirelessinternet,price,id,n_accommodates2,ln_accommodates,ln_accommodates2,ln_beds,ln_number_of_reviews,flag_days_since,flag_review_scores_rating,flag_reviews_per_month,flag_n_number_of_reviews,ln_days_since,ln_days_since2,ln_days_since3,n_days_since2,n_days_since3,ln_review_scores_rating
count,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0,51646.0
mean,3.057178,1.260708,92.439627,12.350327,1.415773,1.13897,6.66555,3.310266,1.708884,418.132595,0.226058,0.050769,0.139682,0.280835,0.200926,0.460229,0.038454,0.020641,0.037989,0.00577,0.432792,0.235023,0.840259,0.535317,0.254386,0.276633,0.227162,0.000678,0.031464,0.53158,0.605332,0.940499,0.071932,0.088642,0.608856,0.595167,0.002885,0.930159,0.482109,0.121345,0.031832,0.002556,0.000445,0.078806,0.069783,0.007551,0.030728,0.013108,0.116408,0.041823,0.572494,0.001859,0.776517,0.081516,0.027708,0.667796,0.841575,0.000794,0.064652,0.946985,94.884831,10487300.0,12.912733,0.955356,1.228098,0.378869,1.521264,0.304786,0.319502,0.304786,0.0,5.714039,33.4644,199.676817,293617.1,305114100.0,4.520661
std,1.888509,0.527094,8.438353,25.860475,1.044884,1.236552,12.691355,29.083719,1.168387,344.651296,0.418281,0.219527,0.34666,0.449411,0.400696,0.498421,0.192292,0.142179,0.191172,0.075742,0.495467,0.424017,0.36637,0.498756,0.43552,0.447338,0.419002,0.026024,0.17457,0.499007,0.488784,0.236563,0.258378,0.284229,0.488011,0.490864,0.053635,0.254881,0.499685,0.326531,0.175555,0.050491,0.021099,0.269438,0.254783,0.086571,0.172583,0.11374,0.320716,0.200187,0.494722,0.043074,0.416583,0.273629,0.164136,0.471008,0.365142,0.028165,0.245913,0.224065,80.928011,5173373.0,18.738439,0.561604,1.207531,0.512998,1.403402,0.460321,0.466288,0.460321,0.0,0.902317,9.507929,80.289121,564084.9,962119500.0,0.120946
min,1.0,0.0,20.0,0.0,1.0,0.01,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,8795.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.995732
25%,2.0,1.0,92.0,0.0,1.0,0.47,0.0,1.0,1.0,228.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,43.0,6471128.0,4.0,0.693147,0.480453,0.0,0.0,0.0,0.0,0.0,0.0,5.433722,29.525335,160.432461,51984.0,11852350.0,4.521789
50%,2.0,1.0,94.0,3.0,1.0,0.77,0.0,2.0,1.0,327.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,74.0,11494970.0,4.0,0.693147,0.480453,0.0,1.386294,0.0,0.0,0.0,0.0,5.793014,33.559007,194.407782,106929.0,34965780.0,4.543295
75%,4.0,1.5,97.0,12.0,1.0,1.17,10.0,3.0,2.0,504.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,120.0,15139390.0,16.0,1.386294,1.921812,0.693147,2.564949,1.0,1.0,1.0,0.0,6.224558,38.745128,241.171311,254016.0,128024100.0,4.574711
max,16.0,8.0,100.0,396.0,16.0,15.0,240.0,5000.0,16.0,2722.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,999.0,17550340.0,256.0,2.772589,7.687248,2.772589,5.983936,1.0,1.0,1.0,0.0,7.909489,62.560024,494.817853,7409284.0,20168070000.0,4.60517


In [91]:
data.to_csv(data_out + "airbnb_london_workfile_adj.csv", index=False)
