In [1]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests

from IPython.core.display import display, HTML

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time, os

import re

In [2]:
import sys

import pickle
import random

### Listings

In [3]:
with open('project/data/listing_master.pickle','rb') as read_file:
    listings = pickle.load(read_file)

listings.head()

Unnamed: 0,url,property_name,type,price,reviews,hostel,url_cleaned
145,www.airbnb.com/hotels/42535128?check_in=2021-0...,Squaw Valley Lodge,Aparthotel in Olympic Valley,$275 per night,,3 room types · 99 total rooms on property,https://www.airbnb.com/hotels/42535128?check_i...
25,www.airbnb.com/rooms/10012754?check_in=2021-06...,Tahoe Stonehenge,Entire house in South Lake Tahoe,$918 per night,4.90 (110 reviews),12 guests · 5 bedrooms · 8 beds · 4 baths,https://www.airbnb.com/rooms/10012754?check_in...
241,www.airbnb.com/rooms/10039236?check_in=2021-06...,Vidor Village and Family Retreat #322,Entire condominium in North Lake Tahoe Region,$262 per night,4.0 (16 reviews),11 guests · 3 bedrooms · 8 beds · 2 baths,https://www.airbnb.com/rooms/10039236?check_in...
187,www.airbnb.com/rooms/10041987?check_in=2021-06...,Comfortable Lake Tahoe Charmer w/ Hot Tub,Entire house in South Lake Tahoe,$385 per night,4.77 (123 reviews),8 guests · 4 bedrooms · 5 beds · 2 baths,https://www.airbnb.com/rooms/10041987?check_in...
241,www.airbnb.com/rooms/10052096?check_in=2021-06...,Northstar Cabin - Summer access to pool and te...,Entire house in North Lake Tahoe Region,$375 per night,4.75 (48 reviews),9 guests · 4 bedrooms · 6 beds · 2 baths,https://www.airbnb.com/rooms/10052096?check_in...


In [4]:
listings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1349 entries, 145 to 254
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   url            1349 non-null   object
 1   property_name  1349 non-null   object
 2   type           1349 non-null   object
 3   price          1349 non-null   object
 4   reviews        1349 non-null   object
 5   hostel         1349 non-null   object
 6   url_cleaned    1349 non-null   object
dtypes: object(7)
memory usage: 84.3+ KB


In [5]:
listings_cleaned = listings.copy()

In [6]:
# Clean up property types
def type_desc(prop_type):
    if "Entire place" in prop_type:
        return "Entire house"
    elif "Entire bungalow" in prop_type:
        return "Entire house"
    elif "Room in boutique hotel" in prop_type:
        return "Hotel room"
    elif "Tiny house" in prop_type:
        return "Entire house"
    elif "Entire villa" in prop_type:
        return "Entire house"
    elif "Aparthote" in prop_type:
        return "Entire serviced apartment"
    elif "Entire house" in prop_type:
        return "Entire house"
    elif "Entire condominium" in prop_type:
        return "Entire condominium"
    elif "Entire cabin" in prop_type:
        return "Entire cabin"
    elif "Private room" in prop_type:
        return "Private room"
    elif "Entire townhouse" in prop_type:
        return "Entire townhouse"
    elif "Entire apartment" in prop_type:
        return "Entire apartment"
    elif "Entire chalet" in prop_type:
        return "Entire chalet"
    elif "Entire guesthouse" in prop_type:
        return "Entire guesthouse"
    elif "Hotel room" in prop_type:
        return "Hotel room"
    elif "Entire loft" in prop_type:
        return "Entire loft"
    elif "Entire cottage" in prop_type:
        return "Entire cottage"
    elif "Entire guest suite" in prop_type:
        return "Entire guest suite"
    elif "Resort room" in prop_type:
        return "Resort room"
    elif "Camper/RV" in prop_type:
        return "Camper/RV"
    elif "Shared room" in prop_type:
        return "Shared room"
    elif "Entire serviced apartment" in prop_type:
        return "Entire serviced apartment"
    elif "Hostel beds" in prop_type:
        return "Hotel room"
    else:
        return prop_type

In [7]:
listings_cleaned['type_desc'] = listings_cleaned['type'].map(lambda x: type_desc(x))

In [8]:
listings_cleaned.type_desc.unique()

array(['Entire serviced apartment', 'Entire house', 'Entire condominium',
       'Entire cabin', 'Private room', 'Entire townhouse',
       'Entire chalet', 'Entire apartment', 'Entire guest suite',
       'Hotel room', 'Resort room', 'Entire guesthouse', 'Entire loft',
       'Camper/RV', 'Entire cottage', 'Shared room'], dtype=object)

In [9]:
# Clean up price
listings_cleaned['price_cleaned'] = listings_cleaned['price'].str.split(" ").apply(lambda x: x[0]).str.replace("$","").str.replace(",","").astype(float)

In [10]:
listings_cleaned.drop(columns = 'price', inplace = True)
listings_cleaned.rename(columns = {"price_cleaned" : "price"}, inplace = True)

In [11]:
# Clean up reviews
listings_cleaned['reviews_tmp'] = listings_cleaned['reviews'].str.replace("(", " ").str.replace(")","").str.split(" ")

In [12]:
listings_cleaned['reviews_cleaned'] = listings_cleaned['reviews_tmp'].apply(lambda x: None if len(x) == 1 else x[0])
listings_cleaned['num_reviews'] = listings_cleaned['reviews_tmp'].apply(lambda x: x[1] if len(x) > 1 else None)

In [13]:
listings_cleaned.head()

Unnamed: 0,url,property_name,type,reviews,hostel,url_cleaned,type_desc,price,reviews_tmp,reviews_cleaned,num_reviews
145,www.airbnb.com/hotels/42535128?check_in=2021-0...,Squaw Valley Lodge,Aparthotel in Olympic Valley,,3 room types · 99 total rooms on property,https://www.airbnb.com/hotels/42535128?check_i...,Entire serviced apartment,275.0,[],,
25,www.airbnb.com/rooms/10012754?check_in=2021-06...,Tahoe Stonehenge,Entire house in South Lake Tahoe,4.90 (110 reviews),12 guests · 5 bedrooms · 8 beds · 4 baths,https://www.airbnb.com/rooms/10012754?check_in...,Entire house,918.0,"[4.90 , 110, reviews]",4.9,110.0
241,www.airbnb.com/rooms/10039236?check_in=2021-06...,Vidor Village and Family Retreat #322,Entire condominium in North Lake Tahoe Region,4.0 (16 reviews),11 guests · 3 bedrooms · 8 beds · 2 baths,https://www.airbnb.com/rooms/10039236?check_in...,Entire condominium,262.0,"[4.0 , 16, reviews]",4.0,16.0
187,www.airbnb.com/rooms/10041987?check_in=2021-06...,Comfortable Lake Tahoe Charmer w/ Hot Tub,Entire house in South Lake Tahoe,4.77 (123 reviews),8 guests · 4 bedrooms · 5 beds · 2 baths,https://www.airbnb.com/rooms/10041987?check_in...,Entire house,385.0,"[4.77 , 123, reviews]",4.77,123.0
241,www.airbnb.com/rooms/10052096?check_in=2021-06...,Northstar Cabin - Summer access to pool and te...,Entire house in North Lake Tahoe Region,4.75 (48 reviews),9 guests · 4 bedrooms · 6 beds · 2 baths,https://www.airbnb.com/rooms/10052096?check_in...,Entire house,375.0,"[4.75 , 48, reviews]",4.75,48.0


In [14]:
listings_cleaned.drop(columns = ["reviews", "reviews_tmp"], inplace = True)
listings_cleaned.rename(columns = {'reviews_cleaned' :'reviews'}, inplace = True)

In [15]:
listings_cleaned['reviews'] = listings_cleaned['reviews'].astype(float)
listings_cleaned['num_reviews'] = listings_cleaned['num_reviews'].astype(float)

In [16]:
listings_cleaned.drop(columns = ["url"], inplace = True)
listings_cleaned.rename(columns = {"url_cleaned" : "url"}, inplace = True)
listings_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1349 entries, 145 to 254
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   property_name  1349 non-null   object 
 1   type           1349 non-null   object 
 2   hostel         1349 non-null   object 
 3   url            1349 non-null   object 
 4   type_desc      1349 non-null   object 
 5   price          1349 non-null   float64
 6   reviews        1107 non-null   float64
 7   num_reviews    1107 non-null   float64
dtypes: float64(3), object(5)
memory usage: 94.9+ KB


In [17]:
# Clean up hostel
listings_cleaned['accomdations_tmp'] = listings_cleaned['hostel'].str.split(" · ")

listings_cleaned['num_guets'] = listings_cleaned['accomdations_tmp'].apply(lambda x: x[0])
listings_cleaned['num_bedrooms'] = listings_cleaned['accomdations_tmp'].apply(lambda x: x[1] if len(x) >= 2 else None)
listings_cleaned['num_beds'] = listings_cleaned['accomdations_tmp'].apply(lambda x: x[1] if len(x) >= 3 else None)
listings_cleaned['num_baths'] = listings_cleaned['accomdations_tmp'].apply(lambda x: x[1] if len(x) >= 4 else None)

In [18]:
listings_cleaned['num_guets'] = listings_cleaned['num_guets'].str.split(" ").apply(lambda x: x[0])
listings_cleaned['num_bedrooms'] = listings_cleaned['num_bedrooms'].str.split(" ").apply(lambda x: x[0] if x is not None else np.nan)
listings_cleaned['num_baths'] = listings_cleaned['num_baths'].str.split(" ").apply(lambda x: x[0] if x is not None else np.nan)
listings_cleaned['num_beds'] = listings_cleaned['num_beds'].str.split(" ").apply(lambda x: x[0] if x is not None else np.nan)

In [19]:
#For studios, assign num_bedrooms = 0, num_baths = 1 and num_beds = 1 by default

listings_cleaned['num_bedrooms_cleaned'] = listings_cleaned['num_bedrooms'].apply(lambda x: "0" if x == 'Studio' else x)
listings_cleaned['num_baths_cleaned'] = listings_cleaned['num_baths'].apply(lambda x: "1" if x == 'Studio' else x)
listings_cleaned['num_beds_cleaned'] = listings_cleaned['num_beds'].apply(lambda x: "1" if x == 'Studio' else x)

In [20]:
listings_cleaned.drop(columns = ["num_bedrooms", "num_baths", "num_beds"], inplace = True)


In [21]:
listings_cleaned.rename(columns = {"num_bedrooms_cleaned" : "num_bedrooms",
                                   "num_baths_cleaned" : "num_baths",
                                   "num_beds_cleaned" : "num_beds"}, inplace = True)


In [22]:
listings_cleaned['num_guets'] = listings_cleaned['num_guets'].astype(float)
listings_cleaned['num_bedrooms'] = listings_cleaned['num_bedrooms'].astype(float)
listings_cleaned['num_baths'] = listings_cleaned['num_baths'].astype(float)
listings_cleaned['num_beds'] = listings_cleaned['num_beds'].astype(float)

In [23]:
columns = ['url', 'property_name', 'type_desc', 'price', 'reviews', 'num_reviews',
          'num_guets','num_bedrooms','num_baths', 'num_beds']

In [24]:
listings_cleaned_final = listings_cleaned[columns]

In [25]:
listings_cleaned_final.head()

Unnamed: 0,url,property_name,type_desc,price,reviews,num_reviews,num_guets,num_bedrooms,num_baths,num_beds
145,https://www.airbnb.com/hotels/42535128?check_i...,Squaw Valley Lodge,Entire serviced apartment,275.0,,,3.0,99.0,,
25,https://www.airbnb.com/rooms/10012754?check_in...,Tahoe Stonehenge,Entire house,918.0,4.9,110.0,12.0,5.0,5.0,5.0
241,https://www.airbnb.com/rooms/10039236?check_in...,Vidor Village and Family Retreat #322,Entire condominium,262.0,4.0,16.0,11.0,3.0,3.0,3.0
187,https://www.airbnb.com/rooms/10041987?check_in...,Comfortable Lake Tahoe Charmer w/ Hot Tub,Entire house,385.0,4.77,123.0,8.0,4.0,4.0,4.0
241,https://www.airbnb.com/rooms/10052096?check_in...,Northstar Cabin - Summer access to pool and te...,Entire house,375.0,4.75,48.0,9.0,4.0,4.0,4.0


In [27]:
# Clean up listing details
with open('project/data/listing_details_final_all.pickle','rb') as read_file:
    listing_details = pickle.load(read_file)

listing_details.head()

Unnamed: 0,url,cleanliness,accuracy,communication,location,checkin,value,response_rate,identify_verify,host_status
0,https://www.airbnb.com/hotels/42535128?check_i...,,,,,,,,Not Verified,Host
1,https://www.airbnb.com/rooms/10012754?check_in...,4.7,4.8,4.9,4.9,5.0,4.7,Response rate: 100%,Not Verified,Superhost
2,https://www.airbnb.com/rooms/10039236?check_in...,4.2,4.1,4.4,4.8,4.5,3.8,Response rate: 100%,Identity verified,Host
3,https://www.airbnb.com/rooms/10041987?check_in...,4.8,4.9,4.7,4.9,4.9,4.8,Response rate: 100%,Identity verified,Host
4,https://www.airbnb.com/rooms/10052096?check_in...,4.5,4.9,4.9,4.9,4.9,4.7,Response rate: 100%,Identity verified,Superhost


In [28]:
listing_details_cleaned = listing_details.copy()

In [29]:
listing_details_cleaned['cleanliness'] = listing_details_cleaned['cleanliness'].astype(float)
listing_details_cleaned['accuracy'] = listing_details_cleaned['accuracy'].astype(float)
listing_details_cleaned['communication'] = listing_details_cleaned['communication'].astype(float)
listing_details_cleaned['location'] = listing_details_cleaned['location'].astype(float)
listing_details_cleaned['checkin'] = listing_details_cleaned['checkin'].astype(float)
listing_details_cleaned['value'] = listing_details_cleaned['value'].astype(float)

In [30]:
listing_details_cleaned.dtypes

url                 object
cleanliness        float64
accuracy           float64
communication      float64
location           float64
checkin            float64
value              float64
response_rate       object
identify_verify     object
host_status         object
dtype: object

In [31]:
listing_details_cleaned['response_rate_cleaned'] = listing_details_cleaned['response_rate'].str.split(":").apply(lambda x: x if x is None else x[1])
listing_details_cleaned['response_rate_cleaned'] = listing_details_cleaned['response_rate_cleaned'].str.replace("%","").astype(float)

In [32]:
listing_details_cleaned.drop(columns = ['response_rate'], inplace = True)
listing_details_cleaned.rename(columns = {"response_rate_cleaned" : "response_rate"}, inplace = True)

listing_details_cleaned.head()

Unnamed: 0,url,cleanliness,accuracy,communication,location,checkin,value,identify_verify,host_status,response_rate
0,https://www.airbnb.com/hotels/42535128?check_i...,,,,,,,Not Verified,Host,
1,https://www.airbnb.com/rooms/10012754?check_in...,4.7,4.8,4.9,4.9,5.0,4.7,Not Verified,Superhost,100.0
2,https://www.airbnb.com/rooms/10039236?check_in...,4.2,4.1,4.4,4.8,4.5,3.8,Identity verified,Host,100.0
3,https://www.airbnb.com/rooms/10041987?check_in...,4.8,4.9,4.7,4.9,4.9,4.8,Identity verified,Host,100.0
4,https://www.airbnb.com/rooms/10052096?check_in...,4.5,4.9,4.9,4.9,4.9,4.7,Identity verified,Superhost,100.0


In [33]:
listing_details_cleaned_final = listing_details_cleaned.copy()

In [34]:
# Clean up location data
with open('project/data/location_all.pickle','rb') as read_file:
    locations = pickle.load(read_file)

locations.head()

Unnamed: 0,url,location
0,https://www.airbnb.com/hotels/42535128?check_i...,"Olympic Valley, California, United States"
1,https://www.airbnb.com/rooms/10012754?check_in...,"South Lake Tahoe, California, United States"
2,https://www.airbnb.com/rooms/10039236?check_in...,"Kings Beach, California, United States"
3,https://www.airbnb.com/rooms/10041987?check_in...,"South Lake Tahoe, California, United States"
4,https://www.airbnb.com/rooms/10052096?check_in...,"Truckee, California, United States"


In [35]:
locations_cleaned = locations.copy()

In [36]:
locations_cleaned['location_tmp'] = locations_cleaned['location'].str.split(",")

In [37]:
locations_cleaned['region'] = locations_cleaned['location_tmp'].apply(lambda x: x[0])
locations_cleaned['state'] = locations_cleaned['location_tmp'].apply(lambda x: x[1])

In [38]:
columns = ['url','region','state']

locations_cleaned_final = locations_cleaned[columns]

In [39]:
locations_cleaned_final.head()

Unnamed: 0,url,region,state
0,https://www.airbnb.com/hotels/42535128?check_i...,Olympic Valley,California
1,https://www.airbnb.com/rooms/10012754?check_in...,South Lake Tahoe,California
2,https://www.airbnb.com/rooms/10039236?check_in...,Kings Beach,California
3,https://www.airbnb.com/rooms/10041987?check_in...,South Lake Tahoe,California
4,https://www.airbnb.com/rooms/10052096?check_in...,Truckee,California


In [40]:
#Merge Data

In [41]:
tmp1 = listings_cleaned_final.merge(listing_details_cleaned_final,
                                   on = 'url',
                                   how = 'inner')

In [42]:
tmp1.sort_values('url', inplace = True)

In [43]:
tmp1.drop_duplicates('url',inplace = True)

In [44]:
tmp1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1265 entries, 0 to 1347
Data columns (total 19 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   url              1265 non-null   object 
 1   property_name    1265 non-null   object 
 2   type_desc        1265 non-null   object 
 3   price            1265 non-null   float64
 4   reviews          1034 non-null   float64
 5   num_reviews      1034 non-null   float64
 6   num_guets        1265 non-null   float64
 7   num_bedrooms     1265 non-null   float64
 8   num_baths        1263 non-null   float64
 9   num_beds         1264 non-null   float64
 10  cleanliness      1003 non-null   float64
 11  accuracy         1003 non-null   float64
 12  communication    1003 non-null   float64
 13  location         1003 non-null   float64
 14  checkin          1003 non-null   float64
 15  value            1003 non-null   float64
 16  identify_verify  1265 non-null   object 
 17  host_status   

In [45]:
listings_all_v1 = tmp1.merge(locations_cleaned_final,
                 on = 'url',
                 how = 'inner')

In [46]:
listings_all_v1.sort_values('url', inplace = True)

In [47]:
listings_all_v1.drop_duplicates('url',inplace = True)

In [48]:
listings_all_v1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1265 entries, 0 to 1310
Data columns (total 21 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   url              1265 non-null   object 
 1   property_name    1265 non-null   object 
 2   type_desc        1265 non-null   object 
 3   price            1265 non-null   float64
 4   reviews          1034 non-null   float64
 5   num_reviews      1034 non-null   float64
 6   num_guets        1265 non-null   float64
 7   num_bedrooms     1265 non-null   float64
 8   num_baths        1263 non-null   float64
 9   num_beds         1264 non-null   float64
 10  cleanliness      1003 non-null   float64
 11  accuracy         1003 non-null   float64
 12  communication    1003 non-null   float64
 13  location         1003 non-null   float64
 14  checkin          1003 non-null   float64
 15  value            1003 non-null   float64
 16  identify_verify  1265 non-null   object 
 17  host_status   

In [49]:
listings_all_v1.head()

Unnamed: 0,url,property_name,type_desc,price,reviews,num_reviews,num_guets,num_bedrooms,num_baths,num_beds,...,accuracy,communication,location,checkin,value,identify_verify,host_status,response_rate,region,state
0,https://www.airbnb.com/hotels/42535128?check_i...,Squaw Valley Lodge,Entire serviced apartment,275.0,,,3.0,99.0,,,...,,,,,,Not Verified,Host,,Olympic Valley,California
1,https://www.airbnb.com/rooms/10012754?check_in...,Tahoe Stonehenge,Entire house,918.0,4.9,110.0,12.0,5.0,5.0,5.0,...,4.8,4.9,4.9,5.0,4.7,Not Verified,Superhost,100.0,South Lake Tahoe,California
2,https://www.airbnb.com/rooms/10039236?check_in...,Vidor Village and Family Retreat #322,Entire condominium,262.0,4.0,16.0,11.0,3.0,3.0,3.0,...,4.1,4.4,4.8,4.5,3.8,Identity verified,Host,100.0,Kings Beach,California
3,https://www.airbnb.com/rooms/10041987?check_in...,Comfortable Lake Tahoe Charmer w/ Hot Tub,Entire house,385.0,4.77,123.0,8.0,4.0,4.0,4.0,...,4.9,4.7,4.9,4.9,4.8,Identity verified,Host,100.0,South Lake Tahoe,California
4,https://www.airbnb.com/rooms/10052096?check_in...,Northstar Cabin - Summer access to pool and te...,Entire house,375.0,4.75,48.0,9.0,4.0,4.0,4.0,...,4.9,4.9,4.9,4.9,4.7,Identity verified,Superhost,100.0,Truckee,California


In [50]:
#Clean up amenities
with open('project/data/amenities.pickle','rb') as read_file:
    amenities = pickle.load(read_file)

amenities.head()

Unnamed: 0,url,vals
0,https://www.airbnb.com/rooms/10012754?check_in...,"[TV with standard cable, Cable TV, Wifi, Kitch..."
1,https://www.airbnb.com/rooms/10039236?check_in...,"[HDTV with standard cable, Cable TV, Wifi, Sha..."
2,https://www.airbnb.com/rooms/10041987?check_in...,"[TV, Wifi, Kitchen, Pack ’n Play/travel crib, ..."
3,https://www.airbnb.com/rooms/10052096?check_in...,"[TV with standard cable, Cable TV, Wifi, Pool,..."
4,https://www.airbnb.com/rooms/10340660?check_in...,"[TV with standard cable, Washer, Cable TV, Dry..."


In [51]:
amenities_cleaned = amenities.copy()

In [52]:
amenities.vals.tolist()

[['TV with standard cable',
  'Cable TV',
  'Wifi',
  'Kitchen',
  'Free parking on premises',
  'Hot tub',
  'Indoor fireplace',
  'Heating',
  'Washer',
  'Dryer'],
 ['HDTV with standard cable',
  'Cable TV',
  'Wifi',
  'Shared outdoor pool',
  'Kitchen',
  'Free parking on premises – 2 spaces',
  'Shared gym nearby',
  'Free street parking',
  'Indoor fireplace',
  'Heating'],
 ['TV',
  'Wifi',
  'Kitchen',
  'Pack ’n Play/travel crib',
  'Free parking on premises',
  'Hot tub',
  'Heating',
  'Washer',
  'Dryer',
  'Smoke alarm'],
 ['TV with standard cable',
  'Cable TV',
  'Wifi',
  'Pool',
  'Kitchen',
  'Free parking on premises',
  'Room-darkening shades',
  'Gym',
  'Indoor fireplace',
  'Unavailable: Carbon monoxide alarmCarbon monoxide alarm'],
 ['TV with standard cable',
  'Washer',
  'Cable TV',
  'Dryer',
  'Smoke alarm',
  'Wifi',
  'Carbon monoxide alarm',
  'First aid kit',
  'Fire extinguisher',
  'Kitchen'],
 ['TV',
  'Wifi',
  'Free parking on premises',
  'Gym',
 

In [54]:
amenities_cleaned['vals_strings'] = [" ".join(map(str, item)) for item in amenities_cleaned['vals']]

In [55]:
amenities_cleaned.head()

Unnamed: 0,url,vals,vals_strings
0,https://www.airbnb.com/rooms/10012754?check_in...,"[TV with standard cable, Cable TV, Wifi, Kitch...",TV with standard cable Cable TV Wifi Kitchen F...
1,https://www.airbnb.com/rooms/10039236?check_in...,"[HDTV with standard cable, Cable TV, Wifi, Sha...",HDTV with standard cable Cable TV Wifi Shared ...
2,https://www.airbnb.com/rooms/10041987?check_in...,"[TV, Wifi, Kitchen, Pack ’n Play/travel crib, ...",TV Wifi Kitchen Pack ’n Play/travel crib Free ...
3,https://www.airbnb.com/rooms/10052096?check_in...,"[TV with standard cable, Cable TV, Wifi, Pool,...",TV with standard cable Cable TV Wifi Pool Kitc...
4,https://www.airbnb.com/rooms/10340660?check_in...,"[TV with standard cable, Washer, Cable TV, Dry...",TV with standard cable Washer Cable TV Dryer S...


In [56]:
amenities_cleaned['TV'] = [1 if 'TV' in x else 0 for x in amenities_cleaned['vals_strings']]
amenities_cleaned['Free_Parking'] = [1 if 'Free parking' in x else 0 for x in amenities_cleaned['vals_strings']]
amenities_cleaned['Wifi'] = [1 if 'Wifi' in x else 0 for x in amenities_cleaned['vals_strings']]
amenities_cleaned['Kitchen'] = [1 if 'Kitchen' in x else 0 for x in amenities_cleaned['vals_strings']]
amenities_cleaned['Heating'] = [1 if 'Heating' in x else 0 for x in amenities_cleaned['vals_strings']]
amenities_cleaned['Air_conditioning'] = [1 if 'Air conditioning' in x else 0 for x in amenities_cleaned['vals_strings']]


In [57]:
amenities_cleaned.head()

Unnamed: 0,url,vals,vals_strings,TV,Free_Parking,Wifi,Kitchen,Heating,Air_conditioning
0,https://www.airbnb.com/rooms/10012754?check_in...,"[TV with standard cable, Cable TV, Wifi, Kitch...",TV with standard cable Cable TV Wifi Kitchen F...,1,1,1,1,1,0
1,https://www.airbnb.com/rooms/10039236?check_in...,"[HDTV with standard cable, Cable TV, Wifi, Sha...",HDTV with standard cable Cable TV Wifi Shared ...,1,1,1,1,1,0
2,https://www.airbnb.com/rooms/10041987?check_in...,"[TV, Wifi, Kitchen, Pack ’n Play/travel crib, ...",TV Wifi Kitchen Pack ’n Play/travel crib Free ...,1,1,1,1,1,0
3,https://www.airbnb.com/rooms/10052096?check_in...,"[TV with standard cable, Cable TV, Wifi, Pool,...",TV with standard cable Cable TV Wifi Pool Kitc...,1,1,1,1,0,0
4,https://www.airbnb.com/rooms/10340660?check_in...,"[TV with standard cable, Washer, Cable TV, Dry...",TV with standard cable Washer Cable TV Dryer S...,1,0,1,1,0,0


In [58]:
columns = ['url','TV','Free_Parking', 'Wifi', 'Kitchen', 'Heating', 'Air_conditioning']

In [59]:
amenities_cleaned_final = amenities_cleaned[columns]

In [60]:
listings_all_v2 = listings_all_v1.merge(amenities_cleaned_final,
                 on = 'url',
                 how = 'inner')

In [61]:
listings_all_v2.sort_values('url', inplace = True)

In [62]:
listings_all_v2.drop_duplicates('url',inplace = True)

In [63]:
listings_all_v2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1261 entries, 0 to 1376
Data columns (total 27 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   url               1261 non-null   object 
 1   property_name     1261 non-null   object 
 2   type_desc         1261 non-null   object 
 3   price             1261 non-null   float64
 4   reviews           1031 non-null   float64
 5   num_reviews       1031 non-null   float64
 6   num_guets         1261 non-null   float64
 7   num_bedrooms      1261 non-null   float64
 8   num_baths         1260 non-null   float64
 9   num_beds          1261 non-null   float64
 10  cleanliness       1002 non-null   float64
 11  accuracy          1002 non-null   float64
 12  communication     1002 non-null   float64
 13  location          1002 non-null   float64
 14  checkin           1002 non-null   float64
 15  value             1002 non-null   float64
 16  identify_verify   1261 non-null   object 


In [64]:
with open('project/data/listings_final.pickle', 'wb') as to_write:
    pickle.dump(listings_all_v2, to_write)