In [3]:
cd ..

/Users/sgemma.sun/Documents/data101/airbnb-ml


In [4]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import src.transform as trans

In [3]:
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [20]:
import s3fs
import pyarrow.parquet as pq

s3 = s3fs.S3FileSystem()

filePath = 's3://airbnb-barcelona/valid/currentDate=2020-03-11'
airbnb_df = pq.ParquetDataset(filePath, filesystem=s3).read_pandas().to_pandas()

airbnb = airbnb_df.drop(columns=[
    'rowId',
    'id',
    'host_location',
    'host_neighbourhood',
    'street',
    'neighbourhood',
    'neighbourhood_cleansed',
    'market',
    'license',
    'zipcode',
    'calendar_updated'
])

print(airbnb.shape)

(20428, 55)


In [21]:
cat_df = airbnb.select_dtypes(include=['object']).copy()

In [22]:
# drop outliers
airbnb = trans.drop_rows_occurs_less_than(airbnb, "cancellation_policy", 2)
airbnb = trans.drop_rows_occurs_less_than(airbnb, "host_response_time", 1)
# boolean to float
airbnb = trans.encode_boolean_to_float(airbnb, "host_is_superhost")
airbnb = trans.encode_boolean_to_float(airbnb, "host_has_profile_pic")
airbnb = trans.encode_boolean_to_float(airbnb, "host_identity_verified")
airbnb = trans.encode_boolean_to_float(airbnb, "is_location_exact")
airbnb = trans.encode_boolean_to_float(airbnb, "has_availability")
airbnb = trans.encode_boolean_to_float(airbnb, "instant_bookable")
airbnb = trans.encode_boolean_to_float(airbnb, "require_guest_profile_picture")
airbnb = trans.encode_boolean_to_float(airbnb, "require_guest_phone_verification")
# fillna
airbnb = trans.fillna_with_lowest_occurance(airbnb, "host_since")
airbnb = trans.fillna_with_lowest_occurance(airbnb, "host_response_time")
airbnb = trans.fillna_with_lowest_occurance(airbnb, "host_is_superhost")
airbnb = trans.fillna_with_lowest_occurance(airbnb, "host_has_profile_pic")
airbnb = trans.fillna_with_lowest_occurance(airbnb, "host_identity_verified")
airbnb = trans.fillna_with_lowest_occurance(airbnb, "first_review")
airbnb = trans.fillna_with_lowest_occurance(airbnb, "last_review")
# element count
airbnb = trans.extract_num_of_items_for_column(airbnb, "host_verifications")
airbnb = trans.extract_num_of_items_for_column(airbnb, "amenities")
# category encode
category_encoder = trans.encode_category_dic(airbnb)
category_columns = [
    "neighbourhood_group_cleansed",
    "property_type",
    "room_type",
    "bed_type",
    "cancellation_policy"
]
category_dic = trans.foldleft(category_encoder, {}, category_columns)
dic_host_response_time = {'host_response_time': {'N/A': 1, 'a few days or more': 2, 'within a day': 3, 'within a few hours': 4, 'within an hour': 5}}
category_dic = dict(dic_host_response_time, **category_dic)
airbnb = airbnb.replace(category_dic)

In [23]:
# date columns
ymd_to_time = trans.string_to_timestamp('%Y-%m-%d')
airbnb["host_since_dt"] = airbnb["host_since"].apply(ymd_to_time)
airbnb["first_review_dt"] = airbnb["first_review"].apply(ymd_to_time)
airbnb["last_review_dt"] = airbnb["last_review"].apply(ymd_to_time)

days_from_2020_03_11 = trans.days_from_date(compare_date=pd.to_datetime('2020-03-11', format='%Y-%m-%d'))
airbnb["host_since_2020_03_11"] = airbnb["host_since_dt"].apply(days_from_2020_03_11)
airbnb["first_review_2020_03_11"] = airbnb["host_since_dt"].apply(days_from_2020_03_11)
airbnb["last_review_2020_03_11"] = airbnb["host_since_dt"].apply(days_from_2020_03_11)

In [24]:
airbnb = airbnb.drop(columns=[
    'host_since',
    'first_review',
    'last_review',
    'host_since_dt',
    'first_review_dt',
    'last_review_dt',
])
airbnb.head(10)

Unnamed: 0,host_response_time,host_response_rate,host_is_superhost,host_listings_count,host_total_listings_count,host_verifications,host_has_profile_pic,host_identity_verified,neighbourhood_group_cleansed,latitude,...,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month,host_since_2020_03_11,first_review_2020_03_11,last_review_2020_03_11
0,5,0.99,0.0,46.0,46.0,5,1.0,1.0,8,41.40889,...,0.0,0.0,30.0,30.0,0.0,0.0,0.02,3704,3704,3704
1,5,0.99,0.0,46.0,46.0,5,1.0,1.0,2,41.4042,...,0.0,0.0,30.0,30.0,0.0,0.0,0.25,3704,3704,3704
2,5,1.0,1.0,5.0,5.0,8,1.0,1.0,8,41.41203,...,0.0,1.0,2.0,2.0,0.0,0.0,0.48,3655,3655,3655
3,5,1.0,1.0,1.0,1.0,8,1.0,1.0,3,41.40145,...,1.0,1.0,1.0,0.0,1.0,0.0,2.38,3619,3619,3619
4,5,0.92,0.0,39.0,39.0,8,1.0,0.0,3,41.4095,...,0.0,0.0,39.0,39.0,0.0,0.0,1.71,3570,3570,3570
5,5,0.92,0.0,39.0,39.0,8,1.0,0.0,3,41.40928,...,0.0,0.0,39.0,39.0,0.0,0.0,0.84,3570,3570,3570
6,1,,0.0,1.0,1.0,4,1.0,0.0,5,41.3872,...,0.0,0.0,1.0,0.0,1.0,0.0,0.17,3563,3563,3563
7,5,1.0,0.0,13.0,13.0,8,1.0,1.0,3,41.40464,...,0.0,0.0,14.0,14.0,0.0,0.0,0.58,3572,3572,3572
8,4,1.0,0.0,3.0,3.0,8,1.0,1.0,1,41.37916,...,0.0,0.0,2.0,1.0,1.0,0.0,0.07,3699,3699,3699
9,4,1.0,0.0,3.0,3.0,8,1.0,1.0,1,41.37859,...,0.0,0.0,2.0,1.0,1.0,0.0,1.28,3699,3699,3699


# Numeric values

In [26]:
numeric_df = airbnb_df.select_dtypes(include=['float64', 'int32']).copy()
numeric_df.head(1)

Unnamed: 0,host_response_rate,host_listings_count,host_total_listings_count,latitude,longitude,accommodates,bathrooms,bedrooms,beds,price,...,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,0.99,46.0,46.0,41.40889,2.18555,6.0,1.0,2.0,4.0,130.0,...,10.0,2.0,10.0,10.0,8.0,30.0,30.0,0.0,0.0,0.02


In [27]:
numeric_df.isnull().sum()

host_response_rate                              2745
host_listings_count                               19
host_total_listings_count                         19
latitude                                           2
longitude                                          2
accommodates                                       1
bathrooms                                          6
bedrooms                                          24
beds                                              39
price                                              1
security_deposit                                6807
cleaning_fee                                    4160
guests_included                                    1
extra_people                                       0
minimum_nights                                     1
maximum_nights                                     1
minimum_nights_avg_ntm                             0
maximum_nights_avg_ntm                             0
availability_30                               