# Getting the Data Ready
This notebook is for cleaning up data, generate more columns, change data type, etc.
Also playing around with different types regression model.


In [1]:
import os
import io
import re
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [2]:
#read data, specify index so it's easier to join and search using loc
path = '../../data/new-york-city-airbnb-open-data/'
listings_csv = os.path.join(path,'selected_columns_listings.csv')
listings = pd.read_csv(listings_csv)

In [3]:
#to get all columns, set option
pd.set_option('display.max_columns', 107)
#to get text with no truncation
pd.set_option('display.max_colwidth', -1)

In [4]:
#shape
listings.shape

(50796, 73)

In [5]:
#head
listings.head(1)

Unnamed: 0,id,listing_url,name,host_id,host_url,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_neighbourhood,host_listings_count,host_total_listings_count,host_verifications,host_identity_verified,street,neighbourhood,neighbourhood_cleansed,neighbourhood_group_cleansed,city,state,zipcode,market,smart_location,latitude,longitude,is_location_exact,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,amenities,price,security_deposit,cleaning_fee,guests_included,extra_people,minimum_nights,maximum_nights,minimum_minimum_nights,maximum_minimum_nights,minimum_maximum_nights,maximum_maximum_nights,minimum_nights_avg_ntm,maximum_nights_avg_ntm,calendar_updated,availability_30,availability_60,availability_90,availability_365,number_of_reviews,number_of_reviews_ltm,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,cancellation_policy,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,2060,https://www.airbnb.com/rooms/2060,Modern NYC,2259,https://www.airbnb.com/users/show/2259,Jenny,2008-08-18,US,a few days or more,22%,50%,f,Washington Heights,0.0,0.0,['reviews'],f,"New York, NY, United States",Manhattan,Washington Heights,Manhattan,New York,NY,10040,New York,"New York, NY",40.85722,-73.9379,t,Other,Private room,2,,1.0,2.0,Real Bed,"{Internet,Wifi}",$100.00,,,1,$0.00,1,730,1,1,730,730,1.0,730.0,4 weeks ago,30,60,90,365,1,0,2008-09-22,2008-09-22,80.0,,,,,,,f,flexible,1,0,1,0,0.01


# Dealing with null values
Fill with 0, dropping null

In [6]:
#fill with 0

listings['reviews_per_month'] = listings['reviews_per_month'].replace(np.nan, 0)
listings['bathrooms'] = listings['bathrooms'].replace(np.nan, 0)
listings['bedrooms'] = listings['bedrooms'].replace(np.nan, 0)

listings['host_is_superhost'] = listings['host_is_superhost'].replace(np.nan, 0)

listings['cleaning_fee'] = listings['cleaning_fee'].replace(np.nan, '0')

In [7]:
#dropping na when rec counts as  there are not too many

listings = listings.dropna(subset=['host_since'])

#filt = listings['host_since'].isnull()
#listings[filt]
#listings['host_since'].isnull().sum()

# Array data type - exploration
First part is querying the two columns which as array: host_verifications and amenities

In [8]:
#convert values of host_verifications to list
host_verifications = listings['host_verifications'].values.tolist()

#host verivication split
#host_verification_ = host_verifications[0:10] -- for testing
host_verification_unique = []

for row in host_verifications:
    row_clean = row.replace('[','').replace(']','')
    methods = row_clean.split(',')
    for method in methods:
        method_clean = method.replace("'","").strip()
        if method_clean not in host_verification_unique:
            host_verification_unique.append(method_clean)

print(host_verification_unique)

['reviews', 'email', 'phone', 'offline_government_id', 'kba', 'selfie', 'government_id', 'identity_manual', 'work_email', 'jumio', 'facebook', 'google', 'manual_online', 'sent_id', 'manual_offline', 'weibo', 'sesame', 'sesame_offline', 'zhima_selfie', '']


In [9]:
#convert values of amenities to list
amenities = listings['amenities'].values.tolist()

#amenities split
amenities_unique = []

for row in amenities:
    row_string = str(row)
    row_string_clean = row_string.replace('{','').replace('}','')
    amenities_rows = row_string_clean.split(',')
    for row in amenities_rows:
        row_clean = row.replace('"','')
        if row_clean not in amenities_unique:
            amenities_unique.append(row_clean)
            
print(amenities_unique)

['Internet', 'Wifi', 'TV', 'Air conditioning', 'Kitchen', 'Paid parking off premises', 'Free street parking', 'Heating', 'Family/kid friendly', 'Smoke detector', 'Carbon monoxide detector', 'Fire extinguisher', 'Essentials', 'Lock on bedroom door', 'Hair dryer', 'Iron', 'Laptop friendly workspace', 'Self check-in', 'Keypad', 'Private living room', 'Bathtub', 'Hot water', 'Bed linens', 'Extra pillows and blankets', 'Ethernet connection', 'Coffee maker', 'Refrigerator', 'Dishes and silverware', 'Cooking basics', 'Oven', 'Stove', 'Luggage dropoff allowed', 'Long term stays allowed', 'Cleaning before checkout', 'Cable TV', 'Pets allowed', 'Shampoo', '24-hour check-in', 'Hangers', 'Lockbox', 'High chair', 'Stair gates', 'Children’s books and toys', 'Pack ’n Play/travel crib', 'Microwave', 'Buzzer/wireless intercom', 'First aid kit', 'translation missing: en.hosting_amenity_49', 'translation missing: en.hosting_amenity_50', 'Host greets you', 'Elevator', 'Washer', 'Dryer', 'Pets live on this

# Array data type - reformatting
This step is creating multiple boolean columns to be used in the model

In [10]:
#function to remove some characters, clean up value
def clean_array(row):
    row = row.str.replace('[', '')
    row = row.str.replace(']', '')
    row = row.str.replace('{', '')
    row = row.str.replace('}', '')
    row = row.str.replace("'", '')
    row = row.str.replace('"', '')
    return row

In [11]:
#array_columns = ['host_verifications', 'amenities']
array_columns = ['host_verifications']
#apply get
array_columns_clean = listings[array_columns].apply(lambda col: clean_array(col))

# Create new columns based on unique value in the array
array_bool_columns = []

for col in listings[array_columns].columns:
    colname = array_columns_clean[col].str.get_dummies(sep=',');
    colname = colname.add_prefix(col + '_');
    array_bool_columns.append(colname);

#add to the listing
listings = listings.join(array_bool_columns)

#drop original columns
listings = listings.drop(array_columns,axis = 1)

#print all addtional columns
#array_bool_columns

# Currency data type
This will remove dollars sign, comma and then convert to float

In [13]:
#function
def fix_currency(row):
    row = row.replace(',', '')
    row = row.replace('$', '')
    return row

In [16]:
#run on price, extra_people
#also convert to float
colname = 'extra_people'
listings[colname] = listings[colname].apply(lambda col: fix_currency(col)).astype(float)

colname = 'price'
listings[colname] = listings[colname].apply(lambda col: fix_currency(col)).astype(float)

colname = 'cleaning_fee'
listings[colname] = listings[colname].apply(lambda col: fix_currency(col)).astype(float)


In [19]:
listings['extra_people'].head(10)

0    0.0  
1    0.0  
2    0.0  
3    100.0
4    13.0 
5    30.0 
6    12.0 
7    0.0  
8    10.0 
9    0.0  
Name: extra_people, dtype: float64

# Category data type

In [20]:
# Get one hot encoding of column neighbourhood_group_cleansed
one_hot = pd.get_dummies(listings['neighbourhood_group_cleansed'])
# Drop original column as it is now encoded
listings = listings.drop('neighbourhood_group_cleansed',axis = 1)
# Join the encoded df
listings = listings.join(one_hot)


# Boolean data type
Converting the value to 1 and 0 from 't' and 'f'.

In [21]:
#function to replace
def convert_bool(row):
    row = row.replace('f', 0)
    row = row.replace('t', 1)
    return row

In [22]:
listings.groupby(['host_is_superhost'])[['host_is_superhost']].count()
listings['host_is_superhost'].isna().sum()

0

In [23]:
#update rows and convert to bool
colname = ['host_is_superhost']

#run fuction and convert to int
listings[colname] = listings[colname].apply(lambda col: convert_bool(col)).astype(int)

# Date data type
Convert to date then to ordinal

In [24]:
date_columns = ['host_since']
#conver to date first
listings['host_since'] = pd.to_datetime(listings['host_since'])

for col in date_columns:
    listings['host_since_calc'] = listings[col].apply(lambda x: x.toordinal())

listings[['host_since', 'host_since_calc']].head()


Unnamed: 0,host_since,host_since_calc
0,2008-08-18,733272
1,2008-09-09,733294
2,2008-12-07,733383
3,2009-02-02,733440
4,2009-02-02,733440


In [25]:
listings.head(1)

Unnamed: 0,id,listing_url,name,host_id,host_url,host_name,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_neighbourhood,host_listings_count,host_total_listings_count,host_identity_verified,street,neighbourhood,neighbourhood_cleansed,city,state,zipcode,market,smart_location,latitude,longitude,is_location_exact,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,amenities,price,security_deposit,cleaning_fee,guests_included,extra_people,minimum_nights,maximum_nights,minimum_minimum_nights,maximum_minimum_nights,minimum_maximum_nights,maximum_maximum_nights,minimum_nights_avg_ntm,maximum_nights_avg_ntm,calendar_updated,availability_30,availability_60,availability_90,availability_365,number_of_reviews,number_of_reviews_ltm,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,cancellation_policy,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month,host_verifications_ facebook,host_verifications_ google,host_verifications_ government_id,host_verifications_ identity_manual,host_verifications_ jumio,host_verifications_ kba,host_verifications_ manual_offline,host_verifications_ manual_online,host_verifications_ offline_government_id,host_verifications_ phone,host_verifications_ reviews,host_verifications_ selfie,host_verifications_ sent_id,host_verifications_ sesame,host_verifications_ sesame_offline,host_verifications_ weibo,host_verifications_ work_email,host_verifications_ zhima_selfie,host_verifications_email,host_verifications_facebook,host_verifications_google,host_verifications_jumio,host_verifications_offline_government_id,host_verifications_phone,host_verifications_reviews,Bronx,Brooklyn,Manhattan,Queens,Staten Island,host_since_calc
0,2060,https://www.airbnb.com/rooms/2060,Modern NYC,2259,https://www.airbnb.com/users/show/2259,Jenny,2008-08-18,US,a few days or more,22%,50%,0,Washington Heights,0.0,0.0,f,"New York, NY, United States",Manhattan,Washington Heights,New York,NY,10040,New York,"New York, NY",40.85722,-73.9379,t,Other,Private room,2,0.0,1.0,2.0,Real Bed,"{Internet,Wifi}",100.0,,0.0,1,0.0,1,730,1,1,730,730,1.0,730.0,4 weeks ago,30,60,90,365,1,0,2008-09-22,2008-09-22,80.0,,,,,,,f,flexible,1,0,1,0,0.01,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,733272


# Modelling
Try different type of models

In [29]:
#test model
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

#not sure how to use this one
from sklearn.model_selection import cross_val_score

In [30]:
#exclude for now
X = listings[['Bronx', 'Bronx', 'Bronx', 'Bronx', 'Queens', 'Staten Island', 'reviews_per_month', 'host_since_calc', 'bathrooms', 'bedrooms', 'extra_people', 'host_is_superhost', 'cleaning_fee', 'host_total_listings_count']]
y = listings['price']

# Split your data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize our algorithm
lr = LogisticRegression(random_state=1, solver='liblinear')

# Fit the training data to the model
log_reg = lr.fit(X_train, y_train)

#Score
log_reg.score(X_test, y_test)

0.04340978442760114

In [31]:
from sklearn import linear_model

clf = linear_model.Lasso(alpha=0.1)
log_clf = clf.fit(X_train, y_train)
log_clf.score(X_test, y_test)

0.04022846526034418

In [43]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

#train, mse and r2
pred_train = clf.predict(X_train)
print(np.sqrt(mean_squared_error(y_train,pred_train)))
print(r2_score(y_train, pred_train))


436.56677542201965
0.03609072890220211


In [32]:
from sklearn.linear_model import Ridge

rr = Ridge(alpha=0.01)
log_rr = rr.fit(X_train, y_train) 
log_rr.score(X_test, y_test)


0.04011249195055011

In [45]:
pred_train = rr.predict(X_train)
print(np.sqrt(mean_squared_error(y_train,pred_train)))
print(r2_score(y_train, pred_train))

436.5644429085179
0.036101028931932455
