#### Prepared for Gabor's Data Analysis

### Data Analysis for Business, Economics, and Policy
by Gabor Bekes and Gabor Kezdi
 
Cambridge University Press 2021

**[gabors-data-analysis.com](https://gabors-data-analysis.com/)**

 License: Free to share, modify and use for educational purposes. 
 Not to be used for commercial purposes.
 
Cleaning the London Airbnb listings file

v.1.3. 2021-05-17 paths changed


in: data from web

out: airbnb_london_cleaned.csv

In [1]:
import pandas as pd
import os
import warnings

# Suppress every warning for the rest of the session. Note that this also
# hides deprecation notices raised by libraries, so API changes can go unnoticed.
warnings.filterwarnings("ignore")

In [2]:
# Root of the data repository. Set the DA_DATA_REPO environment variable to
# point at your own copy; when it is unset the original author's local path
# is used, so existing setups keep working unchanged.
data_dir = os.environ.get(
    "DA_DATA_REPO", "/Users/vigadam/Dropbox/work/data_book/da_data_repo/"
)

# location of folders
# The trailing "" makes os.path.join end each folder with a path separator,
# which the later `data_in + "file.csv"` style concatenations rely on.
data_in = os.path.join(data_dir, "airbnb", "raw", "")
data_out = os.path.join(data_dir, "airbnb", "clean", "")

In [3]:
# zero step: read the raw listings with every column as text, discard the
# columns listed below, and save the trimmed file next to the raw one.
drops = [
    # image / page links
    "host_thumbnail_url",
    "host_picture_url",
    "listing_url",
    "thumbnail_url",
    "medium_url",
    "picture_url",
    "xl_picture_url",
    "host_url",
    # remaining columns that are not carried forward
    "last_scraped",
    "description",
    "experiences_offered",
    "neighborhood_overview",
    "notes",
    "transit",
    "access",
    "interaction",
    "house_rules",
    "host_about",
    "host_response_time",
    "name",
    "summary",
    "space",
    "host_location",
]
data = pd.read_csv(data_in + "listings.csv", sep=",", dtype="unicode")
data = data.drop(columns=drops)
data.to_csv(data_in + "airbnb_london_listing.csv", index=False)

In [4]:
# Reload the trimmed listings file; no dtype is passed this time, so pandas
# infers the column types itself.
df = pd.read_csv(f"{data_in}airbnb_london_listing.csv", sep=",")

In [5]:
# Drop broken lines: an id that cannot be parsed as a number becomes NaN
# and the corresponding row is removed.
df["id"] = pd.to_numeric(df["id"], errors="coerce")
df = df.dropna(subset=["id"])

In [6]:
# Display the dtype pandas assigned to each column
df.dtypes

id                                    int64
scrape_id                             int64
host_id                               int64
host_name                            object
host_since                           object
                                     ...   
cancellation_policy                  object
require_guest_profile_picture        object
require_guest_phone_verification     object
calculated_host_listings_count        int64
reviews_per_month                   float64
Length: 72, dtype: object

In [7]:
#####################
# formatting columns
# Convert the rate columns to numbers. A trailing "%" is stripped first:
# without that, any value written like "95%" fails to parse and is coerced
# to NaN. (Presumably the raw scrape stores rates in that form -- verify
# against listings.csv.) Text that is still not numeric becomes NaN.
for perc in ["host_response_rate", "host_acceptance_rate"]:
    # astype(str) keeps this working even if pandas already read the column
    # as numeric; the string "nan" parses back to NaN.
    df[perc] = pd.to_numeric(
        df[perc].astype(str).str.rstrip("%"), errors="coerce"
    )

In [8]:
# remove dollar signs and convert the money columns to numbers
for pricevars in [
    "price",
    "weekly_price",
    "monthly_price",
    "security_deposit",
    "cleaning_fee",
    "extra_people",
]:
    # Match "$" literally. The previous pattern "\\$" only worked while
    # str.replace treated patterns as regular expressions by default; with a
    # literal-match default it never matches and every price turns into NaN.
    without_dollar = df[pricevars].str.replace("$", "", regex=False)
    # Anything that is still not a plain number (for example a value that
    # contains a thousands separator) is coerced to NaN, as before.
    df[pricevars] = pd.to_numeric(without_dollar, errors="coerce")

In [9]:
# format binary variables: "t" -> True, "f" -> False; any other value
# (including missing) becomes NaN because it has no entry in the mapping.
tf_to_bool = {"t": True, "f": False}
for binary in (
    "host_is_superhost",
    "host_has_profile_pic",
    "host_identity_verified",
    "is_location_exact",
    "requires_license",
    "instant_bookable",
    "require_guest_profile_picture",
    "require_guest_phone_verification",
):
    recoded = df[binary].map(tf_to_bool)
    df[binary] = recoded

In [10]:
# amenities: turn the raw text into a Python list per listing by stripping
# the surrounding braces, removing double quotes and splitting on commas.
df["amenities"] = (
    df["amenities"]
    .str.strip("{}")
    .str.replace('"', "")
    .str.split(",")
)

In [11]:
# generate dummies by amenities
# One row per (listing, amenity) pair, indexed by the listing's row label.
amenity_long = df["amenities"].explode()
print(sorted(amenity_long.unique()))

# Build one indicator column per amenity, then sum within each row label to
# get back to one row per listing. groupby(level=0).sum() replaces the
# `.sum(level=0)` form, which newer pandas releases no longer accept.
# dropna() keeps listings without any amenity value out of the dummy table,
# matching what stacking the expanded lists did.
dummies = pd.get_dummies(amenity_long.dropna()).groupby(level=0).sum()
df = pd.concat([df, dummies], axis=1)


# The raw list column, the empty-string amenity and the two untranslated
# placeholder amenities are not kept.
drops = [
    "",
    "amenities",
    "translation missing: en.hosting_amenity_49",
    "translation missing: en.hosting_amenity_50",
]
df = df.drop(columns=drops)

['', '24-hour check-in', 'Air conditioning', 'Breakfast', 'Buzzer/wireless intercom', 'Cable TV', 'Carbon monoxide detector', 'Cat(s)', 'Dog(s)', 'Doorman', 'Doorman Entry', 'Dryer', 'Elevator in building', 'Essentials', 'Family/kid friendly', 'Fire extinguisher', 'First aid kit', 'Free parking on premises', 'Free parking on street', 'Gym', 'Hair dryer', 'Hangers', 'Heating', 'Hot tub', 'Indoor fireplace', 'Internet', 'Iron', 'Keypad', 'Kitchen', 'Laptop friendly workspace', 'Lock on bedroom door', 'Lockbox', 'Other pet(s)', 'Paid parking off premises', 'Pets allowed', 'Pets live on this property', 'Pool', 'Private entrance', 'Private living room', 'Safety card', 'Self Check-In', 'Shampoo', 'Smartlock', 'Smoke detector', 'Smoking allowed', 'Suitable for events', 'TV', 'Washer', 'Washer / Dryer', 'Wheelchair accessible', 'Wireless Internet', 'translation missing: en.hosting_amenity_49', 'translation missing: en.hosting_amenity_50']


In [12]:
# MINOR STUFF
# The data changed marginally since the textbook was written; to stay
# compatible with the textbook we drop the 27 rows listed in not_in_book.csv.
not_in_book = pd.read_csv(f"{data_in}not_in_book.csv", sep=",", dtype="unicode")
# ids are read as text above, so cast them to int before matching on df.id
df = df.loc[~df["id"].isin(not_in_book["id"].astype(int))].reset_index(drop=True)

In [13]:
# Save the cleaned listings to the output folder, without the row index
df.to_csv(f"{data_out}airbnb_london_cleaned.csv", index=False)