In [1]:
cd ..

/Users/shelvia.hotama/IdeaProjects/airbnb-ml


In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import src.transform as trans

# Remove pandas' display caps (column width, column count, row count) by
# setting each of them to None.
for _display_option in ('display.max_colwidth', 'display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, None)

In [3]:
from datetime import date, timedelta

# NOTE: despite its name, `today` holds *yesterday's* date (one day is subtracted).
today = date.today() - timedelta(days=1)
# Same date rendered as a YYYY-MM-DD string.
currentDate = format(today, "%Y-%m-%d")

In [4]:
import s3fs
import pyarrow.parquet as pq

# Filesystem handle passed to pyarrow for reading from S3.
s3 = s3fs.S3FileSystem()

# Location of the `currentDate=<date>` prefix for the date computed earlier.
filePath = 's3://airbnb-barcelona/valid/currentDate=%s' % (currentDate,)

# Columns removed immediately after loading.
columns_to_drop = [
    'rowId',
    'id',
    'host_location',
    'host_neighbourhood',
    'street',
    'neighbourhood',
    'neighbourhood_cleansed',
    'market',
    'license',
    'zipcode',
]

# Read the parquet dataset into pandas and drop the unwanted columns in one chain.
airbnb_df = (
    pq.ParquetDataset(filePath, filesystem=s3)
    .read_pandas()
    .to_pandas()
    .drop(columns=columns_to_drop)
)

# Quick sanity summary: frame dimensions and how many columns share each dtype.
print(airbnb_df.shape)
print(airbnb_df.dtypes.value_counts())

(20428, 56)
float64    36
object     20
dtype: int64


## Data cleaning and encoding

In [10]:
# NOTE(review): every step below reads and reassigns `airbnb_df`, so re-running
# this cell applies the transforms to already-transformed data.

# drop outliers: (column, threshold) pairs handed to trans.drop_rows_occurs_less_than
rare_value_rules = [
    ("cancellation_policy", 2),
    ("host_response_time", 1),
]
for column_name, threshold in rare_value_rules:
    airbnb_df = trans.drop_rows_occurs_less_than(airbnb_df, column_name, threshold)

# boolean to float
boolean_columns = [
    "host_is_superhost",
    "host_has_profile_pic",
    "host_identity_verified",
    "is_location_exact",
    "has_availability",
    "instant_bookable",
    "require_guest_profile_picture",
    "require_guest_phone_verification",
]
for column_name in boolean_columns:
    airbnb_df = trans.encode_boolean_to_float(airbnb_df, column_name)

# fillna
columns_to_fill = [
    "host_since",
    "host_response_time",
    "host_is_superhost",
    "host_has_profile_pic",
    "host_identity_verified",
    "first_review",
    "last_review",
]
for column_name in columns_to_fill:
    airbnb_df = trans.fillna_with_lowest_occurance(airbnb_df, column_name)

# category encode
category_encoder = trans.encode_category_dic(airbnb_df)
category_columns = [
    "neighbourhood_group_cleansed",
    "property_type",
    "room_type",
    "bed_type",
    "cancellation_policy",
]
category_dic = trans.foldleft(category_encoder, {}, category_columns)

# host_response_time gets an explicit, hand-written code for each label.
dic_host_response_time = {
    'host_response_time': {
        'N/A': 1,
        'a few days or more': 2,
        'within a day': 3,
        'within a few hours': 4,
        'within an hour': 5,
    }
}
# Merge both mappings; on a key collision the entry from `category_dic` wins.
category_dic = dict(dic_host_response_time, **category_dic)

# Apply the combined {column: {old_value: new_value}} mapping to the frame.
airbnb_df = airbnb_df.replace(category_dic)
airbnb_df.head(1)

Unnamed: 0,host_since,host_response_time,host_response_rate,host_is_superhost,host_listings_count,host_total_listings_count,host_verifications,host_has_profile_pic,host_identity_verified,neighbourhood_group_cleansed,latitude,longitude,is_location_exact,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,amenities,price,security_deposit,cleaning_fee,guests_included,extra_people,minimum_nights,maximum_nights,minimum_nights_avg_ntm,maximum_nights_avg_ntm,calendar_updated,has_availability,availability_30,availability_60,availability_90,availability_365,number_of_reviews,number_of_reviews_ltm,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,2010-01-19,5,0.99,0.0,46.0,46.0,"['email', 'phone', 'reviews', 'jumio', 'government_id']",1.0,1.0,8,41.40889,2.18555,1.0,2,1,6.0,1.0,2.0,4.0,5,"{TV,Internet,Wifi,""Air conditioning"",""Wheelchair accessible"",Kitchen,Elevator,""Free street parking"",Heating,""Family/kid friendly"",Washer,Dryer,Essentials,Shampoo,""Hair dryer"",""Hot water"",""Host greets you"",""Paid parking on premises""}",130.0,150.0,42.0,2.0,25.0,3.0,730.0,3.0,730.0,3 months ago,1.0,0.0,0.0,29.0,304.0,1.0,0.0,2015-10-10,2015-10-10,80.0,10.0,10.0,2.0,10.0,10.0,8.0,0.0,1,0.0,0.0,30.0,30.0,0.0,0.0,0.02


## One Hot Encoding for amenities and host_verifications

In [13]:
# Run trans.explode_string_to_list over every raw `amenities` string
# (presumably yielding one list of amenity names per row -- confirm in src.transform).
all_amenities_series = airbnb_df['amenities'].apply(
    trans.explode_string_to_list,
    pattern_to_remove='[{}""]',
)

# Keys that are always discarded: the empty string and two untranslated placeholders.
irrelevant_amenities = [
    '',
    'translation missing: en.hosting_amenity_49',
    'translation missing: en.hosting_amenity_50',
]
# Keys reported by trans.get_keys_below_threshold for a threshold of 51.
rare_amenities = trans.get_keys_below_threshold(all_amenities_series, 51)
amenities_to_remove = irrelevant_amenities + rare_amenities

# One-hot encode, then drop the unwanted columns.
ohe_amenities_df = trans.get_one_hot_encoded_df(all_amenities_series).drop(columns=amenities_to_remove)

In [14]:

# Run trans.explode_string_to_list over every raw `host_verifications` string.
# The pattern is written as a raw string so that `\[`, `\]` and `\s` no longer
# depend on Python's invalid-escape fallback (a warning today, an error in a
# future Python version). Its value is character-for-character unchanged.
all_host_verifications_series = airbnb_df['host_verifications'].apply(
    trans.explode_string_to_list,
    pattern_to_remove=r"[\[\]']|['\s]",
)

# Keys that are always discarded: the empty string and the literal 'none'.
invalid_verifications = ['', 'none']
# Keys reported by trans.get_keys_below_threshold for a threshold of 51.
rare_verifications = trans.get_keys_below_threshold(all_host_verifications_series, 51)
verifications_to_remove = invalid_verifications + rare_verifications

# One-hot encode, then drop the unwanted columns.
ohe_host_verifications_df = trans.get_one_hot_encoded_df(all_host_verifications_series).drop(columns=verifications_to_remove)

In [15]:
# Attach both one-hot frames to the main frame (index-aligned joins), then drop
# the two raw string columns they were derived from.
encoded_source_columns = ['amenities', 'host_verifications']
ohe_airbnb_df = (
    airbnb_df
    .join(ohe_amenities_df)
    .join(ohe_host_verifications_df)
    .drop(columns=encoded_source_columns)
)

## Feature Selection

## p-value analysis