In [1]:
cd ..

/Users/sgemma.sun/Documents/data101/airbnb-ml


In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import src.transform as trans

In [3]:
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [4]:
import s3fs
import pyarrow.parquet as pq

s3 = s3fs.S3FileSystem()

filePath = 's3://airbnb-barcelona/valid/currentDate=2020-03-11'
airbnb_df = pq.ParquetDataset(filePath, filesystem=s3).read_pandas().to_pandas()

airbnb = airbnb_df.drop(columns=[
    'rowId',
    'id',
    'host_location',
    'host_neighbourhood',
    'street',
    'neighbourhood',
    'neighbourhood_cleansed',
    'market',
    'license',
    'zipcode'
])

print(airbnb_df.shape)
print(airbnb_df.dtypes.value_counts())

(20428, 66)
float64    36
object     30
dtype: int64


In [5]:
cat_df = airbnb.select_dtypes(include=['object']).copy()
cat_df.head(1)

Unnamed: 0,host_since,host_response_time,host_is_superhost,host_verifications,host_has_profile_pic,host_identity_verified,neighbourhood_group_cleansed,is_location_exact,property_type,room_type,bed_type,amenities,calendar_updated,has_availability,first_review,last_review,instant_bookable,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification
0,2010-01-19,within an hour,False,"['email', 'phone', 'reviews', 'jumio', 'government_id']",True,True,Sant Martí,True,Apartment,Entire home/apt,Real Bed,"{TV,Internet,Wifi,""Air conditioning"",""Wheelchair accessible"",Kitchen,Elevator,""Free street parking"",Heating,""Family/kid friendly"",Washer,Dryer,Essentials,Shampoo,""Hair dryer"",""Hot water"",""Host greets you"",""Paid parking on premises""}",3 months ago,True,2015-10-10,2015-10-10,False,flexible,False,False


In [9]:
cat_df["host_verifications"].value_counts()

nt_id', 'government_id']                                                                   2
['email', 'google', 'reviews', 'jumio', 'offline_government_id', 'selfie', 'government_id', 'identity_manual']                                                                   2
['email', 'phone', 'facebook', 'reviews', 'jumio', 'work_email']                                                                                                                 2
['email', 'phone', 'facebook', 'reviews', 'selfie', 'work_email']                                                                                                                2
['email', 'phone', 'facebook', 'reviews', 'offline_government_id', 'selfie', 'government_id', 'work_email']                                                                      2
['email', 'phone', 'jumio', 'government_id', 'work_email']                                                                                                                       2
['email', 'p

In [6]:
# drop outliers
cat_df = trans.drop_rows_occurs_less_than(cat_df, "cancellation_policy", 2)
cat_df = trans.drop_rows_occurs_less_than(cat_df, "host_response_time", 1)
# boolean to float
cat_df = trans.encode_boolean_to_float(cat_df, "host_is_superhost")
cat_df = trans.encode_boolean_to_float(cat_df, "host_has_profile_pic")
cat_df = trans.encode_boolean_to_float(cat_df, "host_identity_verified")
cat_df = trans.encode_boolean_to_float(cat_df, "is_location_exact")
cat_df = trans.encode_boolean_to_float(cat_df, "has_availability")
cat_df = trans.encode_boolean_to_float(cat_df, "instant_bookable")
cat_df = trans.encode_boolean_to_float(cat_df, "require_guest_profile_picture")
cat_df = trans.encode_boolean_to_float(cat_df, "require_guest_phone_verification")
# fillna
cat_df = trans.fillna_with_lowest_occurance(cat_df, "host_since")
cat_df = trans.fillna_with_lowest_occurance(cat_df, "host_response_time")
cat_df = trans.fillna_with_lowest_occurance(cat_df, "host_is_superhost")
cat_df = trans.fillna_with_lowest_occurance(cat_df, "host_has_profile_pic")
cat_df = trans.fillna_with_lowest_occurance(cat_df, "host_identity_verified")
cat_df = trans.fillna_with_lowest_occurance(cat_df, "first_review")
cat_df = trans.fillna_with_lowest_occurance(cat_df, "last_review")
# category encode
dic_host_response_time = {'host_response_time': {'N/A': 1, 'a few days or more': 2, 'within a day': 3, 'within a few hours': 4, 'within an hour': 5}}
cat_df = cat_df.replace(dic_host_response_time)

In [8]:
cat_df.head(1)

Unnamed: 0,host_since,host_response_time,host_is_superhost,host_verifications,host_has_profile_pic,host_identity_verified,neighbourhood_group_cleansed,is_location_exact,property_type,room_type,bed_type,amenities,calendar_updated,has_availability,first_review,last_review,instant_bookable,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification
0,2010-01-19,within an hour,0,"['email', 'phone', 'reviews', 'jumio', 'government_id']",1,1,Sant Martí,1.0,Apartment,Entire home/apt,Real Bed,"{TV,Internet,Wifi,""Air conditioning"",""Wheelchair accessible"",Kitchen,Elevator,""Free street parking"",Heating,""Family/kid friendly"",Washer,Dryer,Essentials,Shampoo,""Hair dryer"",""Hot water"",""Host greets you"",""Paid parking on premises""}",3 months ago,1.0,2015-10-10,2015-10-10,0.0,flexible,0.0,0.0


In [6]:
import src.transform as trans
category_encoder = trans.encode_category_dic(airbnb)
category_columns = [
    "neighbourhood_group_cleansed",
    "property_type",
    "room_type",
    "bed_type"
]
category_dic = trans.foldleft(category_encoder, {}, category_columns)
category_dic

{'neighbourhood_group_cleansed': {'1.0': 1,
  'Ciutat Vella': 2,
  'Eixample': 3,
  'Gràcia': 4,
  'Horta-Guinardó': 5,
  'Les Corts': 6,
  'Nou Barris': 7,
  'Sant Andreu': 8,
  'Sant Martí': 9,
  'Sants-Montjuïc': 10,
  'Sarrià-Sant Gervasi': 11},
 'property_type': {'1': 1,
  '2.15484': 2,
  'Aparthotel': 3,
  'Apartment': 4,
  'Barn': 5,
  'Bed and breakfast': 6,
  'Boat': 7,
  'Boutique hotel': 8,
  'Cabin': 9,
  'Camper/RV': 10,
  'Casa particular (Cuba)': 11,
  'Castle': 12,
  'Chalet': 13,
  'Condominium': 14,
  'Cottage': 15,
  'Dome house': 16,
  'Farm stay': 17,
  'Guest suite': 18,
  'Guesthouse': 19,
  'Hostel': 20,
  'Hotel': 21,
  'House': 22,
  'Houseboat': 23,
  'Island': 24,
  'Loft': 25,
  'Nature lodge': 26,
  'Other': 27,
  'Serviced apartment': 28,
  'Tiny house': 29,
  'Townhouse': 30,
  'Treehouse': 31,
  'Villa': 32},
 'room_type': {'$0.00': 1,
  'Entire home/apt': 2,
  'Hotel room': 3,
  'Private room': 4,
  'Shared room': 5,
  't': 6},
 'bed_type': {'0': 1,
  

In [None]:
numeric_df = airbnb_df.select_dtypes(include=['float64', 'int32']).copy()

In [None]:
replace_map = foldleft(airbnb_replace, {}, category_columns)
replace_map

In [None]:
cat_airbnb = airbnb_drop.replace(mapp)