In [None]:
import numpy as np
import pandas as pd
from load_data import load_data_cleansed_imputed
import requests
from bs4 import BeautifulSoup as bs
from urllib.parse import urljoin
import matplotlib.pyplot as plt
import statsmodels.api as sm
from scipy.stats import halfnorm

Load Data

In [None]:
# Load the pre-processed data via the project-local `load_data` module.
# The three return values are unpacked as `price`, `listings` and `reviews`;
# the cells below fill the missing values that remain in `listings`.
price, listings, reviews = load_data_cleansed_imputed()

Which variables have NaNs?

In [None]:
# Raw listings snapshot, downloaded over HTTP. Later cells use it as a
# reference copy for columns such as `host_url`, `id` and `last_scraped`.
url_listing = "http://data.insideairbnb.com/ireland/leinster/dublin/2021-11-07/data/listings.csv.gz"
listings_orig = pd.read_csv(url_listing)

# Number of missing values per column. This must be the LAST expression of the
# cell: Jupyter only renders the value of a cell's final expression, so the
# overview was silently discarded when it preceded the download.
listings.isna().sum()

Quite a lot, so let's look at them one after the other.

In [None]:
# Free-text columns: where the text is missing, fall back to another column of
# the same row. Each pair is (column to fill, column supplying the fallback):
#   * name / description     -> room_type
#   * neighborhood_overview  -> neighbourhood_cleansed
text_fallbacks = [
    ("name", "room_type"),
    ("description", "room_type"),
    ("neighborhood_overview", "neighbourhood_cleansed"),
]

for target_col, fallback_col in text_fallbacks:
    ind = listings.index[listings[target_col].isna()]
    # Write through `listings.loc[rows, col]`. The chained form
    # `listings[col].loc[rows] = ...` may assign into a temporary object and
    # leaves `listings` unchanged under pandas Copy-on-Write.
    listings.loc[ind, target_col] = listings.loc[ind, fallback_col]

In [None]:
# Host-related columns that are imputed in this cell (the commented-out names
# are currently not handled).
host_var = ["host_name", "host_since", #"host_is_superhost",
            "host_listings_count"] #, "host_identity_verified"]
# NOTE: not the last expression of the cell, so Jupyter does not display it.
listings.isna().sum()[host_var]

# Rows without a host name; the matching rows of the raw snapshot supply the
# host profile URL and the listing id.
# NOTE(review): assumes `listings` and `listings_orig` share the same row index - confirm.
ind_s = listings.index[listings["host_name"].isna()]
rel_URL = listings_orig.loc[ind_s, "host_url"]
ids = listings_orig.loc[ind_s, "id"]

name = []
id_ver = []  # 1 if the "verified" element was found on the profile page, else 0
# One session for all requests, closed deterministically by the `with` block
# (previously a new, never-closed session was created for every row).
with requests.Session() as session:
    for row_label, listing_id, host_url in zip(ind_s, ids.values, rel_URL.values):
        # Write the count to THIS row only. The original assigned to every row
        # in `ind_s` on each iteration, so all rows ended up with the value
        # computed for the last one.
        # NOTE(review): this counts rows by listing `id`; if the number of
        # listings per host is wanted, the comparison should presumably use
        # `host_id` - confirm.
        listings.loc[row_label, "host_listings_count"] = int((listings_orig["id"] == listing_id).sum())

        # The timeout keeps a stalled connection from blocking the notebook forever.
        html_code = session.get(host_url, timeout=30).content
        soup = bs(html_code, "html.parser")
        name_html = soup.select("._a0kct9 ._14i3z6h")
        verified_html = soup.select("._p03egf+ ._p03egf ._1ax9t0a")

        if len(name_html) == 0:
            name.append("Anonymous")
        else:
            # Drops the first 8 characters of the heading text
            # (presumably a greeting prefix in front of the name - verify).
            name.append(name_html[0].text[8:])

        id_ver.append(0 if len(verified_html) == 0 else 1)

# `.loc[rows, col]` writes into `listings` itself; chained
# `listings[col].loc[rows] = ...` may only modify a temporary copy.
listings.loc[ind_s, "host_name"] = name
#listings.loc[ind_s, "host_identity_verified"] = id_ver
#listings.loc[ind_s, "host_is_superhost"] = 0
# Missing host_since is replaced by the date of the listing's first review.
listings.loc[ind_s, "host_since"] = listings.loc[ind_s, "first_review"]

# host_about: a missing description becomes a single blank
ind = listings.index[listings["host_about"].isna()]
listings.loc[ind, "host_about"] = " "

Linear Models with beds and bedrooms

In [None]:
# TODO (translated from German): after beds, continue from availability onwards
# NOTE: not the last expression of the cell, so Jupyter does not display it.
listings.isna().sum()[["bedrooms", "beds"]]


def _predict_missing_by_ols(frame, target, predictor):
    """Predict the missing values of ``target`` from ``predictor`` with a one-variable OLS fit.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table holding both columns; it is not modified.
    target : str
        Column whose NaN entries are to be predicted.
    predictor : str
        Column used as the single regressor (plus an intercept).

    Returns
    -------
    pandas.Series
        Predictions rounded to whole numbers (dtype int), indexed by the rows
        where ``target`` is NaN. Empty when nothing is missing.
    """
    # has_constant="add" forces the intercept column to be added. With the
    # default ("skip") statsmodels omits it whenever the data already looks
    # constant - e.g. a single row to predict, or several rows sharing one
    # predictor value - and the prediction design then no longer matches the
    # fitted model.
    design = sm.add_constant(frame[[predictor]], has_constant="add")
    fitted = sm.OLS(frame[target], design, missing="drop").fit()

    missing_rows = frame.index[frame[target].isna()]
    if len(missing_rows) == 0:
        return pd.Series(index=missing_rows, dtype=int)

    new_design = sm.add_constant(frame.loc[missing_rows, [predictor]], has_constant="add")
    # NOTE: predictions are rounded to whole numbers; the original author
    # flagged the rounding as an open question.
    return fitted.predict(new_design).round().astype(int)


# Listings with a price above 2000 are left out of the scatter plots only;
# the models below are fitted on all rows.
outlier_rows = price[price > 2000].index

# accommodates vs. beds looks very close to linear
trimmed = listings.drop(outlier_rows)
plt.scatter(trimmed["accommodates"], trimmed["beds"])
plt.xlabel("accommodates")
plt.ylabel("beds")
plt.title("Accomodates vs. beds")
plt.grid()
plt.show()

# So estimate a linear model beds ~ accommodates and predict the missing beds.
beds_fill = _predict_missing_by_ols(listings, target="beds", predictor="accommodates")
ind = beds_fill.index
if len(ind) > 0:
    # `.loc[rows, col]` writes into `listings` itself; chained
    # `listings[col].loc[rows] = ...` may only modify a temporary copy.
    listings.loc[ind, "beds"] = beds_fill

# beds vs. bedrooms looks linear as well (drawn after beds has been filled)
trimmed = listings.drop(outlier_rows)
plt.scatter(trimmed["beds"], trimmed["bedrooms"])
plt.xlabel("beds")
plt.ylabel("bedrooms")
plt.title("beds vs. bedrooms")
plt.grid()
plt.show()

# Now estimate bedrooms ~ beds and predict the missing bedrooms.
bedrooms_fill = _predict_missing_by_ols(listings, target="bedrooms", predictor="beds")
ind = bedrooms_fill.index
if len(ind) > 0:
    listings.loc[ind, "bedrooms"] = bedrooms_fill


Continuing with the review dates and the review scores

In [None]:
# NOTE: not the last expression of the cell, so Jupyter does not display it.
listings.isna().sum()[['first_review', 'last_review']]

# Missing review dates are replaced by the scrape date of the raw snapshot.
# NOTE(review): assumes `listings` and `listings_orig` share the same row index - confirm.
for date_col in ("first_review", "last_review"):
    ind = listings.index[listings[date_col].isna()]
    # `.loc[rows, col]` writes into `listings` itself; chained
    # `listings[col].loc[rows] = ...` may only modify a temporary copy.
    listings.loc[ind, date_col] = listings_orig.loc[ind, "last_scraped"]


# The score distributions all look like a half-normal mirrored at the top score
review_var = ['review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness',
       'review_scores_communication', 'review_scores_location', 'review_scores_value']

for col in review_var:
    plt.hist(listings[col].dropna(), density=True)
    plt.title(f"Histogram of {col}")  # the title used to stop after "Histogram of"
    plt.xlabel(col)
    plt.ylabel("frequency")
    plt.show()


# Replace the NaNs by draws from the fitted mirrored half-normal:
#   score = TOP_SCORE - |Z| * sigma
TOP_SCORE = 5  # anchor of the mirrored half-normal, as used by the original fill formula
for col in review_var:
    ind = listings.index[listings[col].isna()]
    observed = listings[col].dropna()
    n_obs = len(observed)  # only rows that actually carry a score

    # Half-normal MLE of the scale: root mean square distance from the anchor.
    # (np.nanstd measures the spread around the MEAN and therefore
    # underestimates the half-normal scale.)
    sd = np.sqrt(np.mean((TOP_SCORE - observed) ** 2))
    # The MLE is biased by about -sigma / (4 n), so the correction ADDS
    # sd / (4 n); the original subtracted it and used a row count that
    # included the NaN rows.
    sd = sd + sd / (4 * n_obs)

    # Re-seeding per column is kept from the original: every column gets the
    # same underlying draws (scaled by its own sd) and the global RNG state
    # left behind for later cells is unchanged.
    np.random.seed(123)
    fill_ind = TOP_SCORE - halfnorm.rvs(loc=0, scale=sd, size=len(ind))
    if len(ind) > 0:
        listings.loc[ind, col] = fill_ind


In [None]:
# reviews_per_month: where missing, take the listing's number_of_reviews
# (presumably 0 for listings that were never reviewed - verify).
# `.loc[rows, col]` writes into `listings` itself; the chained form
# `listings[col].loc[rows] = ...` may only modify a temporary copy.
ind = listings.index[listings["reviews_per_month"].isna()]
listings.loc[ind, "reviews_per_month"] = listings.loc[ind, "number_of_reviews"]

# host_location_country: a missing country is set to "Ireland"
ind = listings.index[listings["host_location_country"].isna()]
listings.loc[ind, "host_location_country"] = "Ireland"

In [None]:
# Remaining columns with missing values (original note: "those are all the
# same rows" - presumably the NaNs sit in the same rows for every column; verify).
rest_var = ['Bathtub', 'Bed linens', 'Breakfast', 'Cleaning before checkout', 'Dishwasher',
       'Elevator', 'Hair dryer', 'Indoor fireplace', 'Long term stays allowed',
       'Private entrance', 'Security cameras on property', 'Single level home',
       'Special_stuff', 'TV_number', 'Outdoor_stuff_number', 'Baby_friendly',
       'sound_system_number', 'Oven_available', 'Stoves_available',
       'Refridgerator_available', 'Body_soap_available',
       'Garden_backyard_available', 'Free_parking_number',
       'Paid_parking_number', 'Children_Entertainment', 'Workspace',
       'Shampoo_Conditioner_available', 'Fast_wifi_available', 'Gym_available',
       'Coffee_machine_available', 'Dryer_available', 'Washer_available',
       'Hot_tub_available', 'Pool_available', 'Patio_balcony_available',
       'Wifi_available', 'AC_available', 'heating_available',
       'Kitchen_available', 'Safe_available', 'Water_location']
# The original note says all of these are dummies.
# NOTE(review): the "*_number" names look like counts; a column mean above 1
# would make np.random.binomial raise - confirm they are really 0/1.
# NOTE: not the last expression of the cell, so Jupyter does not display it.
listings[rest_var].describe()

# Fill each column's NaNs with Bernoulli draws whose success probability is the
# column's observed mean (i.e. the share of ones among the known values).
for col in rest_var:
    ind = listings.index[listings[col].isna()]
    m = np.nanmean(listings[col])
    draws = np.random.binomial(n=1, p=m, size=len(ind))
    if len(ind) > 0:
        # `.loc[rows, col]` writes into `listings` itself; chained
        # `listings[col].loc[rows] = ...` may only modify a temporary copy.
        listings.loc[ind, col] = draws

Which variables still have NaNs?

In [None]:
# Count the columns that still contain at least one missing value.
remaining_nans = listings.isna().sum()
len(remaining_nans[remaining_nans.values > 0])