In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### Features relevant to predicting price of a listing

In [192]:
columns = ["Country", "City", "State", "Neighbourhood Cleansed", "Neighbourhood Group Cleansed",
"Host Since", "Host Response Time", "Host Response Rate", "Calculated host listings count",
"Property Type", "Room Type", "Accommodates", "Bedrooms", "Beds", "Bed Type",  "Square Feet", "Cancellation Policy",
"Minimum Nights", "Maximum Nights", "Has Availability", "Availability 30", "Availability 60", "Availability 90", "Availability 365",
"Number of Reviews", "Reviews per Month", "First Review", "Last Review", "Review Scores Rating", "Review Scores Accuracy", "Review Scores Cleanliness","Review Scores Checkin", "Review Scores Communication", "Review Scores Location","Review Scores Value",
"Features",
"Amenities",
"Price"]

# "Square Feet": 97.56% of entries have NULL values => cannot be used
#"Calendar Updated"
#"Weekly Price", "Monthly Price", "Security Deposit", "Cleaning Fee", "Guests Included", "Extra People",

features_new_host = ["Country", "City", "Neighbourhood Cleansed",
                   "Property Type", "Room Type", "Accommodates","Bedrooms", "Beds", "Bed Type", "Cancellation Policy",
                   "Minimum Nights",
                   "Price"]

features_existing_host = ["Country", "City", "Neighbourhood Cleansed",
                          "Property Type", "Room Type", "Accommodates", "Bedrooms", "Beds", "Bed Type", "Cancellation Policy",
                          "Minimum Nights", "Availability 30", "Availability 60", "Availability 90", "Availability 365",
                          "Number of Reviews", "Reviews per Month", "Review Scores Rating", "Review Scores Accuracy", "Review Scores Cleanliness","Review Scores Checkin", "Review Scores Communication", "Review Scores Location","Review Scores Value",
                          "Host Since", "Host Response Time", "Host Response Rate", "Calculated host listings count",
                          "Price"]

############################################
########## Feature pre-processing ##########
#### Numerical (Input text fields)
# "Minimum Nights", "Availability 30", "Availability 60", "Availability 90", "Availability 365"
# "Number of Reviews", 
# Review Scores Rating(20-100)
# "Host Since" (num_days = current_date - host_since_date), Host Response Rate(0-100), "Calculated host listings count"
# Price (Convert from local currency to USD)

#### Numerical (Dropdown numerical values)
# Accommodates(1-16), "Bedrooms"(0-10), "Beds"(0-16)
# "Review Scores Accuracy(2-10)", "Review Scores Cleanliness(2-10)","Review Scores Checkin(2-10)", "Review Scores Communication(2-10)", "Review Scores Location(2-10)","Review Scores Value(2-10)"

#### Label encoding (0, 1, 2 .... n_categories-1) (Category dropdowns -> on user select must be assigned numerical value)
# "Country", "City", "Neighbourhood Cleansed"
# "Property Type", "Room Type", "Bed Type", "Cancellation Policy"
# "Host Response Time"
############################################


# TODO
# Host since -> convert to days ----------------------------------- Done
# Features -> Extract features  ----------------------------------- Done
# Amenities -> Extract features ----------------------------------- Done
# Price -> Convert to local currency
# Verify missing features (Listing type etc...) ------------------- Done
# Decide what plots to be shown in exploratory analysis section ---
# Flask vs Flast + React ------------------------------------------ Done (Flask)

In [193]:
df = pd.read_csv("../../data/airbnb-data-science/airbnb-listings.csv", usecols=columns, sep=';')
df = df[columns]
df.head(2)

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,Country,City,State,Neighbourhood Cleansed,Neighbourhood Group Cleansed,Host Since,Host Response Time,Host Response Rate,Calculated host listings count,Property Type,...,Review Scores Rating,Review Scores Accuracy,Review Scores Cleanliness,Review Scores Checkin,Review Scores Communication,Review Scores Location,Review Scores Value,Features,Amenities,Price
0,Netherlands,Amsterdam,Noord-Holland,Watergraafsmeer,,2017-03-28,,,1.0,Apartment,...,,,,,,,,Host Has Profile Pic,"TV,Kitchen,Heating,Washer,Smoke detector,Lapto...",80.0
1,Netherlands,Amsterdam,Noord-Holland,Watergraafsmeer,,2014-10-21,within a day,100.0,1.0,House,...,94.0,9.0,9.0,10.0,9.0,10.0,9.0,"Host Has Profile Pic,Host Identity Verified,Is...","TV,Wireless Internet,Kitchen,Heating,Family/ki...",195.0


In [196]:
#dfo = df.copy()
len(dfo)

485419

In [142]:
dfo["First Review"].unique()

array([nan, '2015-05-09', '2016-12-04', ..., '2010-05-06', '2010-05-12',
       '2009-09-03'], dtype=object)

In [320]:
dfo[pd.isnull(dfo["Price"])]

Unnamed: 0,Country,City,State,Neighbourhood Cleansed,Neighbourhood Group Cleansed,Host Since,Host Response Time,Host Response Rate,Calculated host listings count,Property Type,...,Review Scores Rating,Review Scores Accuracy,Review Scores Cleanliness,Review Scores Checkin,Review Scores Communication,Review Scores Location,Review Scores Value,Features,Amenities,Price
303,United States,Austin,TX,78746,,2014-01-18,,,1.0,House,...,96.0,9.0,10.0,10.0,10.0,10.0,10.0,"Host Has Profile Pic,Is Location Exact","TV,Cable TV,Internet,Wireless Internet,Air con...",
367,United States,Austin,TX,78748,,2014-10-08,within an hour,98.0,7.0,House,...,,,,,,,,"Host Has Profile Pic,Host Identity Verified,Is...",,
403,United States,Austin,TX,78702,,2017-02-08,,,2.0,House,...,,,,,,,,"Host Has Profile Pic,Host Identity Verified,Is...","Wireless Internet,Air conditioning,Kitchen,Fre...",
417,United States,Austin,TX,78704,,2013-01-09,within an hour,100.0,73.0,House,...,87.0,9.0,9.0,9.0,9.0,10.0,9.0,"Host Has Profile Pic,Host Identity Verified,Is...","TV,Cable TV,Internet,Wireless Internet,Air con...",
419,United States,Austin,TX,78704,,2014-10-22,within a day,85.0,3.0,House,...,99.0,10.0,10.0,10.0,10.0,10.0,10.0,"Host Has Profile Pic,Host Identity Verified,Is...","TV,Cable TV,Internet,Wireless Internet,Air con...",
420,United States,Austin,TX,78704,,2014-09-03,within an hour,100.0,1.0,Loft,...,100.0,10.0,10.0,10.0,10.0,10.0,9.0,"Host Has Profile Pic,Host Identity Verified,Is...","TV,Cable TV,Internet,Wireless Internet,Air con...",
456,United States,Austin,TX,78704,,2016-04-06,,,1.0,House,...,,,,,,,,"Host Has Profile Pic,Instant Bookable","TV,Internet,Wireless Internet,Air conditioning...",
468,United States,Austin,TX,78704,,2014-05-22,within an hour,100.0,45.0,House,...,,,,,,,,"Host Has Profile Pic,Host Identity Verified,Is...","TV,Wireless Internet,Air conditioning,Pool,Kit...",
486,United States,Austin,TX,78734,,2012-09-24,,,1.0,House,...,,,,,,,,"Host Has Profile Pic,Host Identity Verified,Is...",,
490,United States,Austin,TX,78701,,2014-08-10,,,1.0,Townhouse,...,,,,,,,,"Host Has Profile Pic,Host Identity Verified,Is...","TV,Wireless Internet,Air conditioning,Pool,Kit...",


In [310]:
dfo.groupby("Amenities").count()

Unnamed: 0_level_0,Country,City,State,Neighbourhood Cleansed,Neighbourhood Group Cleansed,Host Since,Host Response Time,Host Response Rate,Calculated host listings count,Property Type,...,Last Review,Review Scores Rating,Review Scores Accuracy,Review Scores Cleanliness,Review Scores Checkin,Review Scores Communication,Review Scores Location,Review Scores Value,Features,Price
Amenities,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
24-Hour Check-in,2,2,2,2,0,2,2,2,2,2,...,0,0,0,0,0,0,0,0,2,1
24-hour check-in,1,1,1,1,0,1,1,1,1,1,...,0,0,0,0,0,0,0,0,1,1
"24-hour check-in,translation missing: en.hosting_amenity_49,translation missing: en.hosting_amenity_50",1,1,1,1,0,1,0,0,1,1,...,1,1,1,1,1,1,1,1,1,1
9,1,1,1,1,1,0,1,0,0,1,...,0,0,0,0,0,0,0,0,1,1
Air Conditioning,3,3,3,3,0,3,1,1,3,3,...,0,0,0,0,0,0,0,0,3,2
"Air Conditioning,24-Hour Check-in",1,1,1,1,0,1,1,1,1,1,...,0,0,0,0,0,0,0,0,1,1
"Air Conditioning,Breakfast,Heating,Suitable for Events,First Aid Kit",1,1,1,1,0,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
"Air Conditioning,Elevator in Building,Family/Kid Friendly,Essentials,Shampoo,Hangers,Hair Dryer",1,1,1,1,0,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
"Air Conditioning,Elevator in Building,Family/Kid Friendly,Shampoo",1,1,1,1,0,1,0,0,1,1,...,1,1,1,1,1,1,1,1,1,1
"Air Conditioning,Elevator in Building,Heating,Essentials",1,1,1,1,0,1,1,1,1,1,...,0,0,0,0,0,0,0,0,1,1


## Data cleaning

### Existing hosts

In [325]:
df = dfo.copy()

enable_separate_features = 0
enable_separate_amenities = 0

########################################################
############ Location Features #############
###### Countries ######## 
# Remove countries with very few entries
exclude_low_count_countries_list = ["0", "Cuba", "Mexico", "Uruguay", "Vanuatu", "Vatican City"]
df = df[~df["Country"].isin(exclude_low_count_countries_list)]
# Remove null (3 entries)
df = df[~pd.isnull(df["Country"])]

###### City ######## 
# Remove null (451 entries)
df = df[~pd.isnull(df["City"])]


###### Neighbourhood Cleansed ######
# Remove null (1 entry)
df = df[~pd.isnull(df["Neighbourhood Cleansed"])]
#######################################################


####################################################################
############ Basic (Beds, Area, Cancellation) Features #############

###### Property Type #####
# Remove null (11 entries)
df = df[~pd.isnull(df["Property Type"])]
# Remove types with less than 10 entries
df = df.groupby("Property Type").filter(lambda x: len(x)>=10)

###### Room Type #####
# Remove null (1 entries)
df = df[~pd.isnull(df["Room Type"])]

###### Accommodates #####
# Remove null (63 entries)
df = df[~pd.isnull(df["Accommodates"])]
# Accommodate maximum 16 people
df = df[df["Accommodates"] <= 16]

###### Bedrooms #########
# Remove null (597 entries)
df = df[~pd.isnull(df["Bedrooms"])]
# Remove more than 10 bedrooms
df = df[df["Bedrooms"] <= 10]

###### Beds #########
# Remove null (903 entries)
df = df[~pd.isnull(df["Beds"])]
# Remove more than 16 beds (2 entries)
df = df[df["Beds"] <= 16]

###### Bed Type #########
# Remove null (1 entry)
df = df[~pd.isnull(df["Bed Type"])]

###### Cancellation Policy #########
# Remove cancellation policy with low entries (3)
exclude_cancellation_policy_list = ["long_term", "no_refunds"]
df = df[~df["Cancellation Policy"].isin(exclude_cancellation_policy_list)]
# Remove null (2 entries)
df = df[~pd.isnull(df["Cancellation Policy"])]
####################################################################


##########################################################
######### Availability #########

###### Minimum Nights #########
# Remove "Minimum Nights" above 30 except for multiples of 30 (corresponding to months)
df = df[(df["Minimum Nights"]<=31) | (df["Minimum Nights"].isin([60, 90, 120, 180]))]
# Remove null (2 entries)
df = df[~pd.isnull(df["Minimum Nights"])]

###### Availability 30 #########
# Remove null (2 entries)
df = df[~pd.isnull(df["Availability 30"])]

###### Availability 60 #########
# Remove null (2 entries)
df = df[~pd.isnull(df["Availability 60"])]

###### Availability 90 #########
# Remove null (2 entries)
df = df[~pd.isnull(df["Availability 90"])]

###### Availability 365 #########
# Remove null (2 entries)
df = df[~pd.isnull(df["Availability 365"])]
##########################################################


##########################################################
######### Reviews #########

###### Number of Reviews #########
# Remove entries with 0 reviews (Use Model for new host in this case)
df = df[df["Number of Reviews"] > 0]
# Remove null (2 entries)
df = df[~pd.isnull(df["Number of Reviews"])]

###### Reviews per Month #########
#df.at[(df["Number of Reviews"]==0) & (df[pd.isnull("Reviews per Month")]), "Reviews per Month"] = 0
# Remove null (235 entries)
df = df[~pd.isnull(df["Reviews per Month"])]

##### Reviews Scores #############
df = df[~pd.isnull(df["Review Scores Rating"])]
df = df[~pd.isnull(df["Review Scores Accuracy"])]
df = df[~pd.isnull(df["Review Scores Cleanliness"])]
df = df[~pd.isnull(df["Review Scores Checkin"])]
df = df[~pd.isnull(df["Review Scores Communication"])]
df = df[~pd.isnull(df["Review Scores Location"])]
df = df[~pd.isnull(df["Review Scores Value"])]
##########################################################



##########################################################
######### Host Features #########

###### Host Since #########
# Remove null (504 entries)
df = df[~pd.isnull(df["Host Since"])]

# Convert date to number of days till today
today_date = pd.to_datetime(pd.to_datetime("today").date())
df["Host Since"] = pd.to_datetime(df["Host Since"])
df["Host Since Days"] = (today_date - df["Host Since"])
df['Host Since Days'] = df[['Host Since Days']].apply(pd.to_numeric)
df['Host Since Days'] = df['Host Since Days'] / (24*60*60*1e9) #np.timedelta64(1, 'D')
df["Host Since Days"] = df["Host Since Days"].astype(int)

###### Host Response Time #####
# Assign "a few days or more" to entries with NULL values for "Host Response Time"
df.at[pd.isnull(df["Host Response Time"]), "Host Response Time"] = "a few days or more"

###### Host Response Rate #####
# Assign mean(93.4) to entries with NULL values for "Host Response Rate"
df.at[pd.isnull(df["Host Response Rate"]), "Host Response Rate"] = dfo["Host Response Rate"].mean()

###### Calculated host listings count #####
# Assign 1 to entries with NULL values for "Calculated host listings count"
df.at[pd.isnull(df["Calculated host listings count"]), "Calculated host listings count"] = 1
##########################################################



##########################################################
######### Extract Features #########
"""
features_list = []
for i in range(len(df["Features"].unique())):
    if not pd.isnull(df["Features"].unique()[i]):
        features = df["Features"].unique()[i].split(",")
        for ft in features:
            if ft not in features_list:
                features_list.append(ft)
"""

features_list = ['Host Has Profile Pic',
                 'Host Identity Verified',
                 'Is Location Exact',
                 'Instant Bookable',
                 'Host Is Superhost',
                 'Require Guest Phone Verification',
                 'Require Guest Profile Picture',
                 'Requires License']

# Assign 0 to all the above features
for i in range(len(features_list)):
    ft = features_list[i]
    df[ft] = 0


if enable_separate_features == 1:
    # Assign 1 if that feature is present
    for i in range(len(df)):
        idx = df.index[i]
        features = df.loc[idx]["Features"]
        if not pd.isnull(features):
            features = features.split(",")
            for ft in features:
                df.at[idx, ft] = 1
##########################################################



##########################################################
######### Extract Amenities #########
"""
amenities_list = []
for i in range(len(df["Amenities"].unique())):
    if not pd.isnull(df["Amenities"].unique()[i]):
        amenities = df["Amenities"].unique()[i].split(",")
        for ft in amenities:
            if ft not in amenities_list:
                amenities_list.append(ft)
amenities_list
"""
amenities_list = ['TV',
                 'Wireless Internet',
                 'Kitchen',
                 'Heating',
                 'Family/kid friendly',
                 'Washer',
                 'Smoke detector',
                 'Fire extinguisher',
                 'Essentials',
                 'Cable TV',
                 'Internet',
                 'Dryer',
                 'First aid kit',
                 'Safety card',
                 'Shampoo',
                 'Hangers',
                 'Laptop friendly workspace',
                 'Air conditioning',
                 'Breakfast',
                 'Free parking on premises',
                 'Elevator in building',
                 'Buzzer/wireless intercom',
                 'Hair dryer',
                 'Private living room',
                 'Iron',
                 'Wheelchair accessible',
                 'Hot tub',
                 'Carbon monoxide detector',
                 '24-hour check-in',
                 'Pets live on this property',
                 'Dog(s)',
                 'Gym',
                 'Lock on bedroom door',
                 'Private entrance',
                 'Indoor fireplace',
                 'Smoking allowed',
                 'Pets allowed',
                 'Cat(s)',
                 'Self Check-In',
                 'Doorman Entry',
                 'Suitable for events',
                 'Pool',
                 'Lockbox',
                 'Bathtub',
                 'Room-darkening shades',
                 'Game console',
                 'Doorman',
                 'High chair',
                 'Pack ’n Play/travel crib',
                 'Keypad',
                 'Other pet(s)',
                 'Smartlock']

# Assign 0 to all the above amenities
for i in range(len(amenities_list)):
    ft = amenities_list[i]
    df[ft] = 0

    
if enable_separate_amenities == 1:
    # Assign 1 if those amenities are present
    for i in range(len(df)):
        idx = df.index[i]
        amenities = df.loc[idx]["Amenities"]
        if not pd.isnull(amenities):
            amenities = amenities.split(",")
            for ft in amenities:
                df.at[idx, ft] = 1              
##########################################################



##########################################################
######### Price (Output variable) #########


### TODO: Convert to local currency

# Remove null (7954 entries)
df = df[~pd.isnull(df["Price"])]
# Remove 0 (4 entries)
df = df[df["Price"] != 0]

##########################################################

### Split Dataset into train and test sets

In [328]:
dfx = df.copy()
del dfx["Price"]
dfy = df["Price"].copy()

from sklearn.model_selection import train_test_split, StratifiedShuffleSplit

#"""
split = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=42)
for train_index, test_index in split.split(df, df["Price"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]
#"""

#x_train, x_test, y_train, y_test = train_test_split(dfx,dfy, test_size=0.1, random_state=42)

ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

In [329]:
df.groupby("Price").count()

Unnamed: 0_level_0,Country,City,State,Neighbourhood Cleansed,Neighbourhood Group Cleansed,Host Since,Host Response Time,Host Response Rate,Calculated host listings count,Property Type,...,Lockbox,Bathtub,Room-darkening shades,Game console,Doorman,High chair,Pack ’n Play/travel crib,Keypad,Other pet(s),Smartlock
Price,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,4,4,4,4,1,4,4,4,4,4,...,4,4,4,4,4,4,4,4,4,4
8.0,3,3,2,3,0,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3
9.0,18,18,15,18,11,18,18,18,18,18,...,18,18,18,18,18,18,18,18,18,18
10.0,141,141,113,141,67,141,141,141,141,141,...,141,141,141,141,141,141,141,141,141,141
11.0,51,51,47,51,30,51,51,51,51,51,...,51,51,51,51,51,51,51,51,51,51
12.0,108,108,96,108,56,108,108,108,108,108,...,108,108,108,108,108,108,108,108,108,108
13.0,94,94,84,94,37,94,94,94,94,94,...,94,94,94,94,94,94,94,94,94,94
14.0,158,158,142,158,74,158,158,158,158,158,...,158,158,158,158,158,158,158,158,158,158
15.0,686,686,602,686,325,686,686,686,686,686,...,686,686,686,686,686,686,686,686,686,686
16.0,304,304,266,304,165,304,304,304,304,304,...,304,304,304,304,304,304,304,304,304,304


In [331]:
df["Price"].max()

999.0

In [None]:
from pandas.plotting import scatter_matrix

attributes = ["Review Scores Rating", "Price"]
scatter_matrix(df[attributes], figsize=(12, 8))