In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import math
import re
import statsmodels.api as sm

from util.clean import rinse_listings 


In [2]:
# Load the Seattle listings export; report its dimensions and preview the first rows.
df_seattle_list = pd.read_csv(filepath_or_buffer='input/seattle/listings.csv')
print(df_seattle_list.shape)
df_seattle_list.head()

(3818, 92)


Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,summary,space,description,experiences_offered,neighborhood_overview,...,review_scores_value,requires_license,license,jurisdiction_names,instant_bookable,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,reviews_per_month
0,241032,https://www.airbnb.com/rooms/241032,20160104002432,2016-01-04,Stylish Queen Anne Apartment,,Make your self at home in this charming one-be...,Make your self at home in this charming one-be...,none,,...,10.0,f,,WASHINGTON,f,moderate,f,f,2,4.07
1,953595,https://www.airbnb.com/rooms/953595,20160104002432,2016-01-04,Bright & Airy Queen Anne Apartment,Chemically sensitive? We've removed the irrita...,"Beautiful, hypoallergenic apartment in an extr...",Chemically sensitive? We've removed the irrita...,none,"Queen Anne is a wonderful, truly functional vi...",...,10.0,f,,WASHINGTON,f,strict,t,t,6,1.48
2,3308979,https://www.airbnb.com/rooms/3308979,20160104002432,2016-01-04,New Modern House-Amazing water view,New modern house built in 2013. Spectacular s...,"Our house is modern, light and fresh with a wa...",New modern house built in 2013. Spectacular s...,none,Upper Queen Anne is a charming neighborhood fu...,...,10.0,f,,WASHINGTON,f,strict,f,f,2,1.15
3,7421966,https://www.airbnb.com/rooms/7421966,20160104002432,2016-01-04,Queen Anne Chateau,A charming apartment that sits atop Queen Anne...,,A charming apartment that sits atop Queen Anne...,none,,...,,f,,WASHINGTON,f,flexible,f,f,1,
4,278830,https://www.airbnb.com/rooms/278830,20160104002432,2016-01-04,Charming craftsman 3 bdm house,Cozy family craftman house in beautiful neighb...,Cozy family craftman house in beautiful neighb...,Cozy family craftman house in beautiful neighb...,none,We are in the beautiful neighborhood of Queen ...,...,9.0,f,,WASHINGTON,f,strict,f,f,1,0.89


In [3]:
# Load the Seattle calendar export; the column at position 1 is parsed as datetimes.
df_seattle_cal = pd.read_csv(
    filepath_or_buffer='input/seattle/calendar.csv',
    parse_dates=[1],
)
print(df_seattle_cal.shape)
df_seattle_cal.head()

(1393570, 4)


Unnamed: 0,listing_id,date,available,price
0,241032,2016-01-04,t,$85.00
1,241032,2016-01-05,t,$85.00
2,241032,2016-01-06,f,
3,241032,2016-01-07,f,
4,241032,2016-01-08,f,


# First cleaning
Let's first do some initial cleaning based on the exploration we did in "exploration_listing.ipynb". It showed many columns that have all unique values (categorical or ordinal), or just 1 unique value. These are removed below. Also, a selection of columns containing free text or information that is not immediately interesting is removed to reduce clutter. Finally, columns with percentage, currency or boolean values are converted to usable numeric/boolean values. 

We are left with a dataframe that is not free of NaN values, but that is possible to work with. 

In [5]:
# Run the project's cleaning helper (imported from util.clean) on the raw listings frame.
# NOTE(review): the helper's implementation is not visible in this notebook; per the
# narrative above it presumably drops uninformative columns and reformats
# percentage/currency/boolean fields — confirm against util/clean.py.
df_rinsed_listing = rinse_listings(df_seattle_list)

Index(['id', 'listing_url', 'thumbnail_url', 'medium_url', 'picture_url',
       'xl_picture_url', 'latitude', 'longitude'],
      dtype='object')


In [6]:
# Preview only the first rows of the cleaned frame; rendering the whole frame
# bloats the saved notebook and hides the narrative.
df_rinsed_listing.head(10)

Unnamed: 0,id,host_id,host_since,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,host_total_listings_count,host_verifications,...,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,reviews_per_month
0,241032,956883,2011-08-11,within a few hours,0.96,1.0,0.0,3.0,3.0,"['email', 'phone', 'reviews', 'kba']",...,10.0,10.0,9.0,10.0,False,moderate,False,False,2,4.07
1,953595,5177328,2013-02-21,within an hour,0.98,1.0,1.0,6.0,6.0,"['email', 'phone', 'facebook', 'linkedin', 're...",...,10.0,10.0,10.0,10.0,False,strict,True,True,6,1.48
2,3308979,16708587,2014-06-12,within a few hours,0.67,1.0,0.0,2.0,2.0,"['email', 'phone', 'google', 'reviews', 'jumio']",...,10.0,10.0,10.0,10.0,False,strict,False,False,2,1.15
3,7421966,9851441,2013-11-06,,,,0.0,1.0,1.0,"['email', 'phone', 'facebook', 'reviews', 'jum...",...,,,,,False,flexible,False,False,1,
4,278830,1452570,2011-11-29,within an hour,1.00,,0.0,2.0,2.0,"['email', 'phone', 'facebook', 'reviews', 'kba']",...,10.0,10.0,9.0,9.0,False,strict,False,False,1,0.89
5,5956968,326758,2010-12-25,,,,0.0,1.0,1.0,"['email', 'phone', 'facebook', 'reviews', 'kba']",...,10.0,10.0,10.0,10.0,False,strict,False,False,1,2.45
6,1909058,2497928,2012-05-30,within an hour,1.00,1.0,1.0,1.0,1.0,"['email', 'phone', 'reviews', 'kba']",...,10.0,10.0,10.0,10.0,False,moderate,False,False,1,2.46
7,856550,4016632,2012-10-30,within an hour,1.00,1.0,1.0,5.0,5.0,"['email', 'phone', 'facebook', 'google', 'link...",...,10.0,10.0,9.0,10.0,False,strict,True,True,5,4.73
8,4948745,2166277,2012-04-18,,,1.0,0.0,1.0,1.0,"['email', 'phone', 'facebook', 'linkedin', 're...",...,10.0,9.0,10.0,10.0,False,strict,False,False,1,1.22
9,2493658,5177328,2013-02-21,within an hour,0.98,1.0,1.0,6.0,6.0,"['email', 'phone', 'facebook', 'linkedin', 're...",...,10.0,10.0,10.0,9.0,False,strict,True,True,6,1.55


In [None]:
# Fraction of non-null values per column. `count` has to be called: the bare
# attribute is a bound method, and dividing it by an int raises TypeError.
df_rinsed_listing.count() / df_rinsed_listing.shape[0]