# Getting the Data Ready
This notebook is for cleaning up data, generate more columns, change data type, etc.


In [92]:
import os
import io
import re
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

import warnings
warnings.filterwarnings('ignore')

In [93]:
#read data, specify index so it's easier to join and search using loc
path = '../../data/new-york-city-airbnb-open-data/'
listings_csv = os.path.join(path,'selected_columns_listings.csv')
listings = pd.read_csv(listings_csv)

In [94]:
#to get all columns, set option
pd.set_option('display.max_columns', 107)
#to get text with no truncation
pd.set_option('display.max_colwidth', -1)

In [95]:
#shape
listings.shape

(50796, 73)

In [96]:
listings.columns

Index(['id', 'listing_url', 'name', 'host_id', 'host_url', 'host_name',
       'host_since', 'host_location', 'host_response_time',
       'host_response_rate', 'host_acceptance_rate', 'host_is_superhost',
       'host_neighbourhood', 'host_listings_count',
       'host_total_listings_count', 'host_verifications',
       'host_identity_verified', 'street', 'neighbourhood',
       'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'city',
       'state', 'zipcode', 'market', 'smart_location', 'latitude', 'longitude',
       'is_location_exact', 'property_type', 'room_type', 'accommodates',
       'bathrooms', 'bedrooms', 'beds', 'bed_type', 'amenities', 'price',
       'security_deposit', 'cleaning_fee', 'guests_included', 'extra_people',
       'minimum_nights', 'maximum_nights', 'minimum_minimum_nights',
       'maximum_minimum_nights', 'minimum_maximum_nights',
       'maximum_maximum_nights', 'minimum_nights_avg_ntm',
       'maximum_nights_avg_ntm', 'calendar_updated', 'avai

In [97]:
#head
listings.head(1)

Unnamed: 0,id,listing_url,name,host_id,host_url,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_neighbourhood,host_listings_count,host_total_listings_count,host_verifications,host_identity_verified,street,neighbourhood,neighbourhood_cleansed,neighbourhood_group_cleansed,city,state,zipcode,market,smart_location,latitude,longitude,is_location_exact,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,amenities,price,security_deposit,cleaning_fee,guests_included,extra_people,minimum_nights,maximum_nights,minimum_minimum_nights,maximum_minimum_nights,minimum_maximum_nights,maximum_maximum_nights,minimum_nights_avg_ntm,maximum_nights_avg_ntm,calendar_updated,availability_30,availability_60,availability_90,availability_365,number_of_reviews,number_of_reviews_ltm,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,cancellation_policy,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,2060,https://www.airbnb.com/rooms/2060,Modern NYC,2259,https://www.airbnb.com/users/show/2259,Jenny,2008-08-18,US,a few days or more,22%,50%,f,Washington Heights,0.0,0.0,['reviews'],f,"New York, NY, United States",Manhattan,Washington Heights,Manhattan,New York,NY,10040,New York,"New York, NY",40.85722,-73.9379,t,Other,Private room,2,,1.0,2.0,Real Bed,"{Internet,Wifi}",$100.00,,,1,$0.00,1,730,1,1,730,730,1.0,730.0,4 weeks ago,30,60,90,365,1,0,2008-09-22,2008-09-22,80.0,,,,,,,f,flexible,1,0,1,0,0.01


# Drop unused columns
May use it later but remove for now

In [98]:
Unused_columns = ['listing_url', 'host_location', 'name', 'host_id', 'host_url', 'host_name', 'host_total_listings_count', 'city', 'street', 'state', 'zipcode', 'neighbourhood', 'host_neighbourhood', 'neighbourhood_cleansed', 'market', 'smart_location', 'calculated_host_listings_count', 'calculated_host_listings_count_entire_homes', 'calculated_host_listings_count_private_rooms', 'calculated_host_listings_count_shared_rooms', 'minimum_nights', 'maximum_nights', 'minimum_minimum_nights', 'maximum_minimum_nights', 'minimum_maximum_nights', 'maximum_maximum_nights', 'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', 'calendar_updated', 'availability_30',
'availability_60', 'availability_90', 'availability_365']

listings.drop(columns=Unused_columns, inplace = True)

# Dealing with null values
Fill with 0, dropping null, fill with avg

## Analysis
group by, count null, etc


In [99]:
##run this to find out how many nulls
colname = 'host_since'

listings[colname].isna().sum()

5

In [100]:
#run this to get group by
listings.groupby(colname)[colname].count().head(5)

host_since
2008-08-18    1
2008-08-22    1
2008-08-27    1
2008-09-07    8
2008-09-09    2
Name: host_since, dtype: int64

In [101]:
#get sample records of null to see if we can use other columns to fill
filt = listings[colname] == '2.5'
listings[filt]['host_since'].head(1)

Series([], Name: host_since, dtype: object)

## Dropping null rows
Dropping na when rec counts as  there are not too many


In [102]:
Drop_na = ['host_since', 'host_listings_count', 'host_identity_verified', 'host_is_superhost']
listings = listings.dropna(subset=Drop_na)


## Fill with 0 or 0 variance

In [103]:
#fill with 0
Fill_with_zero = ['bathrooms', 'bedrooms', 'host_is_superhost', 'beds', 'review_scores_rating', 'review_scores_accuracy',
'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication', 'review_scores_location', 'review_scores_value', 'reviews_per_month']

for column in Fill_with_zero:
    listings[column] = listings[column].replace(np.nan, '0')




In [104]:
#fill with $0.00
Fill_with_zero = ['security_deposit', 'cleaning_fee']

for column in Fill_with_zero:
    listings[column] = listings[column].replace(np.nan, '$0.00')

## Fill with average

In [105]:
#host_response_time will be converted to 1,2,3,4. So 2.5 is the middle value
listings['host_response_time'] = listings['host_response_time'].replace(np.nan, '2.5')
#host_acceptance_rate avg 50%
listings['host_acceptance_rate'] = listings['host_acceptance_rate'].replace(np.nan, '50%')


# Put aside features
Need to deal with this later: 'first_review', 'last_review', 'host_response_rate', 'host_acceptance_rate'
 

# Array data type - exploration
First part is querying the two columns which as array: host_verifications and amenities

In [106]:
#convert values of host_verifications to list
host_verifications = listings['host_verifications'].values.tolist()

#host verivication split and add to list
host_verification_all = []

for row in host_verifications:
    row_clean = row.replace('[','').replace(']','')
    methods = row_clean.split(',')
    for method in methods:
        method_clean = method.replace("'","").strip()
        host_verification_all.append(method_clean)

Counter(host_verification_all).most_common(5)

[('phone', 50628),
 ('email', 47312),
 ('reviews', 34126),
 ('government_id', 32313),
 ('offline_government_id', 22301)]

In [107]:
#convert values of amenities to list
amenities = listings['amenities'].values.tolist()

#amenities split and add to list
amenities_all = []

for row in amenities:
    row_string = str(row)
    row_string_clean = row_string.replace('{','').replace('}','')
    amenities_rows = row_string_clean.split(',')
    for row in amenities_rows:
        row_clean = row.replace('"','')
        amenities_all.append(row_clean)

Counter(amenities_all).most_common()

[('Wifi', 49523),
 ('Heating', 47853),
 ('Essentials', 46990),
 ('Kitchen', 46232),
 ('Smoke detector', 44421),
 ('Air conditioning', 43261),
 ('Hangers', 38362),
 ('Carbon monoxide detector', 36183),
 ('TV', 34934),
 ('Shampoo', 34311),
 ('Hair dryer', 33848),
 ('Laptop friendly workspace', 32315),
 ('Iron', 32254),
 ('Hot water', 28864),
 ('Refrigerator', 23195),
 ('Dishes and silverware', 22567),
 ('Washer', 20699),
 ('Dryer', 20347),
 ('Fire extinguisher', 19982),
 ('Microwave', 19629),
 ('Cooking basics', 19392),
 ('Stove', 19227),
 ('Lock on bedroom door', 19043),
 ('Oven', 18829),
 ('Free street parking', 18679),
 ('Coffee maker', 18068),
 ('First aid kit', 17557),
 ('Bed linens', 17022),
 ('Internet', 13600),
 ('Elevator', 13404),
 ('Family/kid friendly', 12059),
 ('Cable TV', 11949),
 ('Extra pillows and blankets', 11763),
 ('Self check-in', 11092),
 ('Private entrance', 11012),
 ('Long term stays allowed', 10433),
 ('Buzzer/wireless intercom', 9630),
 ('Dishwasher', 8636),
 (

# Array data type - reformatting
This step is creating multiple boolean columns to be used in the model

In [108]:
#function to remove some characters, clean up value
def clean_array(row):
    row = row.str.replace('[', '')
    row = row.str.replace(']', '')
    row = row.str.replace('{', '')
    row = row.str.replace('}', '')
    row = row.str.replace("'", '')
    row = row.str.replace('"', '')
    return row

In [109]:
#array_columns = ['host_verifications', 'amenities']
array_columns = ['host_verifications']
#apply get
array_columns_clean = listings[array_columns].apply(lambda col: clean_array(col))

# Create new columns based on unique value in the array
array_bool_columns = []

for col in listings[array_columns].columns:
    colname = array_columns_clean[col].str.get_dummies(sep=',');
    colname = colname.add_prefix(col + '_');
    array_bool_columns.append(colname);

#add to the listing
listings = listings.join(array_bool_columns)

#drop original columns
listings = listings.drop(array_columns,axis = 1)

#print all addtional columns
#array_bool_columns

In [110]:
#array_columns = ['host_verifications', 'amenities']
array_columns = ['amenities']
#apply get
array_columns_clean = listings[array_columns].apply(lambda col: clean_array(col))

# Create new columns based on unique value in the array
array_bool_columns = []

for col in listings[array_columns].columns:
    colname = array_columns_clean[col].str.get_dummies(sep=',');
    colname = colname.add_prefix(col + '_');
    array_bool_columns.append(colname);

#add to the listing
listings = listings.join(array_bool_columns)

#drop original columns
listings = listings.drop(array_columns,axis = 1)

#print all addtional columns
#array_bool_columns

# Currency conversion
This will remove dollars sign, comma and then convert to float

In [112]:
#function
def fix_currency(row):
    row = row.replace(',', '')
    row = row.replace('$', '')
    return row

In [113]:
#Currency = ['extra_people', 'price', 'cleaning_fee' ]
#also convert to float

colname = 'extra_people'
listings[colname] = listings[colname].apply(lambda col: fix_currency(col)).astype(float)

colname = 'price'
listings[colname] = listings[colname].apply(lambda col: fix_currency(col)).astype(float)

colname = 'cleaning_fee'
listings[colname] = listings[colname].apply(lambda col: fix_currency(col)).astype(float)

colname = 'security_deposit'
listings[colname] = listings[colname].apply(lambda col: fix_currency(col)).astype(float)

In [114]:
listings['cleaning_fee'].head(10)


0    0.0  
1    95.0 
2    0.0  
3    125.0
4    0.0  
5    0.0  
6    15.0 
7    0.0  
8    40.0 
9    0.0  
Name: cleaning_fee, dtype: float64

# Category conversion
Use one hot encoding

In [115]:
# Get one hot encoding of column neighbourhood_group_cleansed
one_hot_encoding_columns = ['neighbourhood_group_cleansed', 'property_type', 'room_type', 'cancellation_policy', 'bed_type']
listings = pd.get_dummies(data=listings, columns=one_hot_encoding_columns)

# Boolean conversion
Converting the value to 1 and 0 from 't' and 'f'.

In [116]:
#function to replace
def convert_bool(row):
    row = row.replace('f', '0')
    row = row.replace('t', '1')
    return row

In [117]:
# #update rows and convert to boo
Boolean_columns = ['host_is_superhost', 'is_location_exact', 'instant_bookable', 'host_identity_verified']

for column in Boolean_columns:
    listings[column] = listings[column].apply(lambda col: convert_bool(col)).astype(int)


# String conversion
Needs to convert to numbers/float

In [118]:
#function to replace
def convert_string_to_int(row):
    row = row.replace('within an hour', '1')
    row = row.replace('within a few hours', '2')
    row = row.replace('within a day', '3')
    row = row.replace('a few days or more', '4')
    return row

In [119]:
colname = ['host_response_time']

listings[colname] = listings[colname].apply(lambda col: convert_string_to_int(col)).astype(float)

In [120]:
listings.groupby('host_response_time')['host_response_time'].count()

host_response_time
1.0    20445
2.0    6009 
2.5    19001
3.0    4218 
4.0    1118 
Name: host_response_time, dtype: int64

# Date conversion
Convert to date then to ordinal

In [125]:
#conver to date first
listings['host_since'] = pd.to_datetime(listings['host_since'])

listings['host_since'] = listings['host_since'].apply(lambda x: x.toordinal())



In [126]:
listings['host_since'].head()

0    733272
1    733294
2    733383
3    733440
4    733440
Name: host_since, dtype: int64

In [127]:
listings.head(1)

Unnamed: 0,id,host_since,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,host_identity_verified,latitude,longitude,is_location_exact,accommodates,bathrooms,bedrooms,beds,price,security_deposit,cleaning_fee,guests_included,extra_people,number_of_reviews,number_of_reviews_ltm,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,reviews_per_month,host_verifications_ facebook,host_verifications_ google,host_verifications_ government_id,host_verifications_ identity_manual,host_verifications_ jumio,host_verifications_ kba,host_verifications_ manual_offline,host_verifications_ manual_online,host_verifications_ offline_government_id,host_verifications_ phone,host_verifications_ reviews,host_verifications_ selfie,host_verifications_ sent_id,host_verifications_ sesame,host_verifications_ sesame_offline,host_verifications_ weibo,host_verifications_ work_email,host_verifications_ zhima_selfie,host_verifications_email,host_verifications_facebook,...,property_type_Barn,property_type_Bed and breakfast,property_type_Boat,property_type_Boutique hotel,property_type_Bungalow,property_type_Bus,property_type_Cabin,property_type_Camper/RV,property_type_Casa particular (Cuba),property_type_Castle,property_type_Cave,property_type_Condominium,property_type_Cottage,property_type_Dome house,property_type_Dorm,property_type_Earth house,property_type_Farm stay,property_type_Guest suite,property_type_Guesthouse,property_type_Hostel,property_type_Hotel,property_type_House,property_type_Houseboat,property_type_In-law,property_type_Island,property_type_Lighthouse,property_type_Loft,property_type_Other,property_type_Resort,property_type_Serviced apartment,property_type_Tent,property_type_Timeshare,property_type_Tiny house,property_type_Townhouse,property_type_Train,property_type_Treehouse,property_type_Villa,property_type_Yurt,room_type_Entire home/apt,room_type_Hotel room,room_type_Private room,room_type_Shared room,cancellation_policy_flexible,cancellation_policy_moderate,cancellation_policy_strict,cancellation_policy_strict_14_with_grace_period,cancellation_policy_super_strict_30,cancellation_policy_super_strict_60,bed_type_Airbed,bed_type_Couch,bed_type_Futon,bed_type_Pull-out Sofa,bed_type_Real Bed
0,2060,733272,4.0,22%,50%,0,0.0,0,40.85722,-73.9379,1,2,0,1,2,100.0,0.0,0.0,1,0.0,1,0,2008-09-22,2008-09-22,80,0,0,0,0,0,0,0,0.01,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1


In [128]:
listings.columns.values

array(['id', 'host_since', 'host_response_time', 'host_response_rate',
       'host_acceptance_rate', 'host_is_superhost', 'host_listings_count',
       'host_identity_verified', 'latitude', 'longitude',
       'is_location_exact', 'accommodates', 'bathrooms', 'bedrooms',
       'beds', 'price', 'security_deposit', 'cleaning_fee',
       'guests_included', 'extra_people', 'number_of_reviews',
       'number_of_reviews_ltm', 'first_review', 'last_review',
       'review_scores_rating', 'review_scores_accuracy',
       'review_scores_cleanliness', 'review_scores_checkin',
       'review_scores_communication', 'review_scores_location',
       'review_scores_value', 'instant_bookable', 'reviews_per_month',
       'host_verifications_ facebook', 'host_verifications_ google',
       'host_verifications_ government_id',
       'host_verifications_ identity_manual', 'host_verifications_ jumio',
       'host_verifications_ kba', 'host_verifications_ manual_offline',
       'host_verifications_ 

# Export to CSV
Clean data for model to use

In [129]:
#export columns left to csv to be used with second notebook
path = '../../data/new-york-city-airbnb-open-data/'

listings_csv = os.path.join(path,'model_columns_listings.csv')
listings.to_csv(listings_csv)