In [1]:
# Shell escape: print the current working directory of the notebook session.
!pwd

/Users/shelvia.hotama/IdeaProjects/airbnb-ml/notebooks


In [2]:
import os

# Step up one level so that the `src` package can be imported by later cells.
# The guard makes the cell idempotent: a bare `cd ..` climbs one more directory
# on every re-run, which eventually breaks `import src.transform`.
# NOTE(review): assumes the project root is the directory that contains `src/`
# and that the notebook's own directory does not — confirm.
if not os.path.isdir("src"):
    os.chdir("..")
print(os.getcwd())

/Users/shelvia.hotama/IdeaProjects/airbnb-ml


In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import src.transform as trans

# Turn off pandas' display truncation: passing None removes the limit on
# column width, number of columns and number of rows shown.
for display_option in ('display.max_colwidth', 'display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [4]:
from datetime import date, timedelta

# NOTE: despite its name, `today` holds *yesterday's* date (one day is subtracted).
# NOTE(review): presumably the most recent partition is written a day late — confirm.
today = date.today() - timedelta(days=1)

# ISO-style YYYY-MM-DD string built from `today`.
currentDate = today.strftime("%Y-%m-%d")

In [6]:
import s3fs
import pyarrow.parquet as pq

# Read the parquet dataset for the `currentDate` partition from S3 into pandas.
s3 = s3fs.S3FileSystem()
filePath = 's3://airbnb-barcelona/valid/currentDate=%s' % currentDate
listings_dataset = pq.ParquetDataset(filePath, filesystem=s3)
airbnb_df = listings_dataset.read_pandas().to_pandas()

# Columns left out of the trimmed `airbnb` frame; `airbnb_df` itself is untouched.
dropped_columns = [
    'rowId',
    'id',
    'host_location',
    'host_neighbourhood',
    'street',
    'neighbourhood',
    'neighbourhood_cleansed',
    'market',
    'license',
    'zipcode',
]
airbnb = airbnb_df.drop(columns=dropped_columns)

# NOTE(review): the summary below describes the raw frame (`airbnb_df`),
# not the trimmed `airbnb` — confirm that this is intended.
print(airbnb_df.shape)
print(airbnb_df.dtypes.value_counts())

(20428, 66)
float64    36
object     30
dtype: int64


In [5]:
import re

def explode_string(amenities):
    """Split a raw amenities string into a list of labels.

    Leading and trailing whitespace is stripped from the whole string, every
    curly brace and double-quote character is removed, and the remainder is
    split on commas. Individual pieces are not trimmed, and an empty set such
    as '{}' yields [''] (a single empty label).

    Parameters
    ----------
    amenities : str
        Raw value, e.g. '{TV,"Cable TV",Wifi}'.

    Returns
    -------
    list of str
    """
    without_wrappers = re.sub('[{}"]', '', amenities.strip())
    return without_wrappers.split(",")


In [8]:
import itertools

def flatten(list_of_lists):
    """Lazily chain the items of every inner iterable into one flat iterator.

    Parameters
    ----------
    list_of_lists : iterable of iterables
        Outer collection whose members are themselves iterable.

    Returns
    -------
    itertools.chain
        Single-pass iterator over all inner items, in order.
    """
    # Work on the argument only — never on a notebook-level variable — so the
    # function gives the right answer for any input and in any cell order.
    return itertools.chain.from_iterable(list_of_lists)


In [9]:
from collections import Counter

# One list of amenity labels per row of the 'amenities' column.
amenities_df = airbnb_df['amenities'].apply(explode_string)

# Tally how often each label occurs across all rows, stored as a plain dict.
amenity_counter = Counter(flatten(amenities_df.values))
amenities_dict = dict(amenity_counter)

In [10]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

# Binarize the per-row label lists: one indicator column per distinct label,
# rows aligned with the index of `amenities_df`.
mlb = MultiLabelBinarizer()
amenity_indicator_matrix = mlb.fit_transform(amenities_df)
one_hot_encoded_amenities = pd.DataFrame(
    amenity_indicator_matrix,
    index=amenities_df.index,
    columns=mlb.classes_,
)

In [12]:
def remove_item_from_dict(dictionary, keys_to_remove):
    """Delete each of the given keys from `dictionary`, in place.

    Parameters
    ----------
    dictionary : dict
        Mapping to modify; it is mutated and nothing is returned.
    keys_to_remove : iterable
        Keys to delete, processed in iteration order.

    Raises
    ------
    KeyError
        If a plain dict does not contain one of the keys (including a key
        listed twice); keys deleted before the failure stay deleted.
    """
    for unwanted_key in keys_to_remove:
        del dictionary[unwanted_key]

In [13]:
# Collect every amenity label that was counted fewer than 51 times.
keys_to_remove = [
    amenity
    for amenity, count in amenities_dict.items()
    if count < 51
]
        

In [14]:
# Number of distinct amenity labels in the count dictionary.
len(amenities_dict)

192

In [15]:
# Labels dropped regardless of how often they occur.
# NOTE(review): these look like parsing artefacts rather than real amenities — confirm.
irrelevant_amenities = ['1125', '1', '', 'translation missing: en.hosting_amenity_49', 'translation missing: en.hosting_amenity_50']

# Rebind (instead of `+=`) and de-duplicate while keeping order, so that
# re-running this cell leaves `keys_to_remove` unchanged rather than growing it.
keys_to_remove = list(dict.fromkeys(keys_to_remove + irrelevant_amenities))

# errors='ignore': a hard-coded label may be absent from the loaded data;
# without it `drop` raises KeyError and the cell fails.
amenities_df = one_hot_encoded_amenities.drop(columns=keys_to_remove, errors='ignore')
amenities_df.head(5)

Unnamed: 0,toilet,24-hour check-in,Accessible-height bed,Accessible-height toilet,Air conditioning,BBQ grill,Baby bath,Babysitter recommendations,Balcony,Bath towel,Bathroom essentials,Bathtub,Beach essentials,Beachfront,Bed linens,Bedroom comforts,Body soap,Breakfast,Breakfast table,Building staff,Buzzer/wireless intercom,Cable TV,Carbon monoxide detector,Cat(s),Changing table,Children’s books and toys,Children’s dinnerware,Cleaning before checkout,Coffee maker,Cooking basics,Crib,Disabled parking spot,Dishes and silverware,Dishwasher,Dog(s),Doorman,Dryer,EV charger,Elevator,En suite bathroom,Espresso machine,Essentials,Ethernet connection,Extra pillows and blankets,Extra space around bed,Family/kid friendly,Fire extinguisher,First aid kit,Fixed grab bars for shower,Flat path to guest entrance,Free parking on premises,Free street parking,Full kitchen,Game console,Garden or backyard,Gym,Hair dryer,Handheld shower head,Hangers,Heated towel rack,Heating,High chair,Host greets you,Hot tub,Hot water,Hot water kettle,Indoor fireplace,Internet,Iron,Keypad,Kitchen,Laptop friendly workspace,Lock on bedroom door,Lockbox,Long term stays allowed,Luggage dropoff allowed,Microwave,Netflix,No stairs or steps to enter,Other,Outdoor seating,Outlet covers,Oven,Pack ’n Play/travel crib,Paid parking off premises,Paid parking on premises,Patio or balcony,Pets allowed,Pets live on this property,Pocket wifi,Pool,Private entrance,Private living room,Rain shower,Refrigerator,Room-darkening shades,Safety card,Self check-in,Shampoo,Single level home,Ski-in/Ski-out,Smart TV,Smart lock,Smoke detector,Smoking allowed,Stair gates,Step-free shower,Stove,Suitable for events,TV,Table corner guards,Terrace,Toilet paper,Walk-in shower,Washer,Waterfront,Well-lit path to entrance,Wheelchair accessible,Wide clearance to shower,Wide doorway to guest bathroom,Wide entrance,Wide entrance for guests,Wide entryway,Wide hallways,Wifi,Window guards
0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,1,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0
1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,0,1,0,1,0,1,0,0,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,1,1,1,0,1,1,0,0,1,0,1,0,0,1,0,0,0,1,1,0,0,1,0,0,0,0,0,0,1,0,1,0,1,1,1,0,1,0,0,1,1,0,1,1,0,0,1,1,1,0,1,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,1,0,1,1,0,0,0,1,1,1,1,0
3,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,1,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0
