In [1]:
import numpy as np
import pandas as pd

In [2]:
# Turn off pandas' display truncation: full cell text, every column, every row.
for display_option in (
    'display.max_colwidth',
    'display.max_columns',
    'display.max_rows',
):
    pd.set_option(display_option, None)

Type notes for the rate columns:
- `host_response_rate`: percentage strings such as `99%`, `100%` -> Double
- `host_acceptance_rate`: `NaN` -> Double

In [3]:
import s3fs
import pyarrow.parquet as pq

# Filesystem handle passed to pyarrow for reading from S3; built with no explicit arguments.
s3 = s3fs.S3FileSystem()

# Location of the parquet dataset to load (a currentDate=2020-03-09 directory).
filePath = 's3://airbnb-barcelona/valid/currentDate=2020-03-09'
# Read the dataset at filePath through the S3 filesystem and convert it to a pandas DataFrame.
airbnb_df = pq.ParquetDataset(filePath, filesystem=s3).read_pandas().to_pandas()

# Display (rows, columns) of the loaded frame.
airbnb_df.shape

(20428, 81)

In [4]:
# Show the first row to inspect the column names and one sample of values.
airbnb_df.head(1)

Unnamed: 0,rowId,id,experiences_offered,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_neighbourhood,host_listings_count,host_total_listings_count,host_verifications,host_has_profile_pic,host_identity_verified,street,neighbourhood,neighbourhood_cleansed,neighbourhood_group_cleansed,city,state,zipcode,market,smart_location,latitude,longitude,is_location_exact,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,amenities,square_feet,price,weekly_price,monthly_price,security_deposit,cleaning_fee,guests_included,extra_people,minimum_nights,maximum_nights,minimum_minimum_nights,maximum_minimum_nights,minimum_maximum_nights,maximum_maximum_nights,minimum_nights_avg_ntm,maximum_nights_avg_ntm,calendar_updated,has_availability,availability_30,availability_60,availability_90,availability_365,calendar_last_scraped,number_of_reviews,number_of_reviews_ltm,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,requires_license,license,instant_bookable,is_business_travel_ready,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,e65f8fba-5c49-4d1f-8d2a-59da66ae1cba1583723740946,18666,none,2010-01-19,"Barcelona, Cataluña, Spain",within an hour,,,False,El Camp de l'Arpa del Clot,46.0,46.0,"['email', 'phone', 'reviews', 'jumio', 'government_id']",True,True,"Barcelona, CT, Spain",Sant Martí,el Camp de l'Arpa del Clot,Sant Martí,Barcelona,CT,8026,Barcelona,"Barcelona, Spain",41.40889,2.18555,True,Apartment,Entire home/apt,6.0,1.0,2.0,4.0,Real Bed,"{TV,Internet,Wifi,""Air conditioning"",""Wheelchair accessible"",Kitchen,Elevator,""Free street parking"",Heating,""Family/kid friendly"",Washer,Dryer,Essentials,Shampoo,""Hair dryer"",""Hot water"",""Host greets you"",""Paid parking on premises""}",75.0,130.0,,,150.0,42.0,2.0,25.0,3.0,730.0,3,3.0,730.0,730,3.0,730.0,3 months ago,True,0.0,0.0,29.0,304.0,2019-11-09,1.0,0.0,2015-10-10,2015-10-10,80.0,10.0,10.0,2.0,10.0,10.0,8.0,True,HUTB-003004,False,False,flexible,False,False,30.0,30.0,0.0,0.0,0.02


In [5]:
# Count how many columns there are of each dtype.
airbnb_df.dtypes.value_counts()

float64    42
object     37
int32       2
dtype: int64

In [187]:
# Independent copy holding only the object-dtype columns of airbnb_df.
cat_df = airbnb_df.select_dtypes(include=['object']).copy()

In [189]:
# Independent copy holding only the float64 and int32 columns of airbnb_df.
numeric_df = airbnb_df.select_dtypes(include=['float64', 'int32']).copy()
# List each selected column together with its dtype.
numeric_df.dtypes

host_response_rate                              float64
host_acceptance_rate                            float64
host_listings_count                             float64
host_total_listings_count                       float64
latitude                                        float64
longitude                                       float64
accommodates                                    float64
bathrooms                                       float64
bedrooms                                        float64
beds                                            float64
square_feet                                     float64
price                                           float64
weekly_price                                    float64
monthly_price                                   float64
security_deposit                                float64
cleaning_fee                                    float64
guests_included                                 float64
extra_people                                    

In [161]:
# Working frame: airbnb_df with a fixed set of columns removed.
# drop() returns a new frame, so airbnb_df keeps all of its columns.
airbnb_drop = airbnb_df.drop(
    labels=[
        'host_acceptance_rate', 'square_feet', 'host_since', 'host_location',
        'state', 'host_neighbourhood', 'street', 'neighbourhood',
        'neighbourhood_cleansed', 'city', 'market', 'smart_location',
        'has_availability', 'calendar_last_scraped', 'first_review',
        'requires_license', 'license', 'is_business_travel_ready',
    ],
    axis='columns',
)

# Inspect host_response_rate in the reduced frame:
# frequency table of its values first, then its first ten entries.
print(airbnb_drop["host_response_rate"].value_counts())
airbnb_drop["host_response_rate"].head(10)

Series([], Name: host_response_rate, dtype: int64)


0   NaN
1   NaN
2   NaN
3   NaN
4   NaN
5   NaN
6   NaN
7   NaN
8   NaN
9   NaN
Name: host_response_rate, dtype: float64

In [119]:
def generate_replace_map(dataframe):
    """Build a reducer that accumulates per-column label-encoding maps.

    Args:
        dataframe: pandas DataFrame whose columns will be encoded.

    Returns:
        A function ``h(dica, columnName)`` that returns a new dict containing
        everything in ``dica`` plus ``{columnName: {label: code}}``. The labels
        are the column's categories in ``astype('category')`` order, and the
        codes run from 1 up to the number of categories. ``dica`` itself is
        not modified. The two-argument shape lets ``h`` serve as the step
        function of a left fold over a list of column names.
    """
    def h(dica, columnName):
        labels = dataframe[columnName].astype('category').cat.categories.tolist()
        codes = {label: code for code, label in enumerate(labels, start=1)}
        # Merge with dict unpacking rather than dict(dica, **{...}): keyword
        # expansion raises TypeError when the column label is not a string.
        return {**dica, columnName: codes}
    return h

import functools
def foldleft(func, acc, xs):
    """Left fold: thread an accumulator through ``func(acc, x)`` for each ``x`` in ``xs``.

    Starts from ``acc``, visits ``xs`` in iteration order, and returns the
    final accumulator. When ``xs`` is empty, ``acc`` is returned as-is.
    """
    result = acc
    for item in xs:
        result = func(result, item)
    return result

# Bind the reducer to airbnb_drop: airbnb_replace_fn(acc, column_name) returns acc
# extended with that column's label -> code mapping, ready to be folded over a column list.
airbnb_replace_fn = generate_replace_map(airbnb_drop)

In [145]:
# Columns of airbnb_drop to label-encode.
category_columns = [
    'host_response_time',
    'host_is_superhost','host_has_profile_pic','host_identity_verified',
    'neighbourhood_group_cleansed','is_location_exact',
    'property_type','room_type','bed_type',
    'instant_bookable','cancellation_policy','require_guest_profile_picture','require_guest_phone_verification'
]
# Fold the reducer bound to airbnb_drop (airbnb_replace_fn) over the column list,
# starting from an empty dict, to build {column: {label: code}}.
replace_map = foldleft(airbnb_replace_fn, {}, category_columns)
replace_map

{'host_response_time': {'ES': 1,
  'N/A': 2,
  'a few days or more': 3,
  'el Barri Gòtic': 4,
  'within a day': 5,
  'within a few hours': 6,
  'within an hour': 7},
 'host_is_superhost': {False: 1, True: 2},
 'host_has_profile_pic': {False: 1, True: 2},
 'host_identity_verified': {False: 1, True: 2},
 'neighbourhood_group_cleansed': {'1.0': 1,
  'Ciutat Vella': 2,
  'Eixample': 3,
  'Gràcia': 4,
  'Horta-Guinardó': 5,
  'Les Corts': 6,
  'Nou Barris': 7,
  'Sant Andreu': 8,
  'Sant Martí': 9,
  'Sants-Montjuïc': 10,
  'Sarrià-Sant Gervasi': 11},
 'is_location_exact': {False: 1, True: 2},
 'property_type': {'1': 1,
  '2.15484': 2,
  'Aparthotel': 3,
  'Apartment': 4,
  'Barn': 5,
  'Bed and breakfast': 6,
  'Boat': 7,
  'Boutique hotel': 8,
  'Cabin': 9,
  'Camper/RV': 10,
  'Casa particular (Cuba)': 11,
  'Castle': 12,
  'Chalet': 13,
  'Condominium': 14,
  'Cottage': 15,
  'Dome house': 16,
  'Farm stay': 17,
  'Guest suite': 18,
  'Guesthouse': 19,
  'Hostel': 20,
  'Hotel': 21,
  

In [113]:
# Encode the categorical columns: replace_map is the nested {column: {label: code}}
# dict, so each listed column has its labels swapped for their integer codes.
cat_airbnb = airbnb_drop.replace(replace_map)

# fillna

In [108]:
# Number of missing values in each column of airbnb_drop.
airbnb_drop.isnull().sum()

rowId                                               0
id                                                  0
experiences_offered                                 0
host_response_time                                 17
host_is_superhost                                  19
host_listings_count                                19
host_total_listings_count                          19
host_verifications                                  0
host_has_profile_pic                               19
host_identity_verified                             18
street                                              0
neighbourhood                                      13
neighbourhood_cleansed                              0
neighbourhood_group_cleansed                        0
city                                                7
zipcode                                           586
market                                             16
smart_location                                      1
latitude                    