In [109]:
import pandas as pd
import numpy as np
import os
from pathlib import Path
import sys
import re
import matplotlib.pyplot as plt
import seaborn as sns
import geopy.distance
from datetime import datetime
import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_rows', 10000)
pd.options.display.max_rows = 10000

In [110]:
data = pd.read_csv("data/december/listings.csv")

In [111]:
drops = ["calendar_updated", "neighbourhood", "host_response_time", "host_response_rate",
         "host_neighbourhood", "has_availability","host_verifications",
    "license",
    "neighbourhood_group_cleansed",
    "bathrooms",
    "host_thumbnail_url",
    "host_picture_url",
    "listing_url",
    "picture_url",
    "host_url",
    "last_scraped",
    "description", "calendar_last_scraped",
    "neighborhood_overview",
    "host_about",
    "name", "host_location"]
data.drop(columns=drops, inplace=True)

In [112]:
data['host_acceptance_rate'] = data['host_acceptance_rate'].str.rstrip("%").astype(float)/100

In [113]:
data['price'] = data['price'].str.replace(r'[$,]', '').astype(float)

In [114]:
data.columns

Index(['id', 'scrape_id', 'source', 'host_id', 'host_name', 'host_since',
       'host_acceptance_rate', 'host_is_superhost', 'host_listings_count',
       'host_total_listings_count', 'host_has_profile_pic',
       'host_identity_verified', 'neighbourhood_cleansed', 'latitude',
       'longitude', 'property_type', 'room_type', 'accommodates',
       'bathrooms_text', 'bedrooms', 'beds', 'amenities', 'price',
       'minimum_nights', 'maximum_nights', 'minimum_minimum_nights',
       'maximum_minimum_nights', 'minimum_maximum_nights',
       'maximum_maximum_nights', 'minimum_nights_avg_ntm',
       'maximum_nights_avg_ntm', 'availability_30', 'availability_60',
       'availability_90', 'availability_365', 'number_of_reviews',
       'number_of_reviews_ltm', 'number_of_reviews_l30d', 'first_review',
       'last_review', 'review_scores_rating', 'review_scores_accuracy',
       'review_scores_cleanliness', 'review_scores_checkin',
       'review_scores_communication', 'review_scores_lo

In [115]:
# prices == 0
# prices >25k
data[data["price"]>25000]["price"]

798     26696.0
1144    71536.0
Name: price, dtype: float64

In [116]:
data['host_is_superhost'] = data['host_is_superhost'].apply(lambda x: 1 if x == 't' else (0 if x == 'f' else x))

In [117]:
data["amenities"].unique().tolist()

['["Bed linens", "Dishes and silverware", "Hangers", "Wifi", "Fire extinguisher", "Microwave", "Private entrance", "Smoking allowed", "Long term stays allowed", "Shampoo", "River view", "Essentials", "Waterfront", "Luggage dropoff allowed", "Hot water", "Lock on bedroom door", "Host greets you", "Refrigerator", "Heating", "Harbor view"]',
 '["Books and reading material", "Smoke alarm", "Fire extinguisher", "Dedicated workspace", "Paid parking off premises", "Room-darkening shades", "Paid washer \\u2013 In unit", "Bed linens", "Private entrance", "Outdoor furniture", "Fast wifi \\u2013 52 Mbps", "Security cameras on property", "Mini fridge", "Essentials", "Ethernet connection", "Sony + Wireless Bluetooth Speaker(phone) Bluetooth sound system", "Central heating", "Cleaning products", "First aid kit", "Carbon monoxide alarm", "Paid parking on premises", "Extra pillows and blankets", "PH neutral and dermatologically tested shampoo", "Hot water kettle", "Clothing storage: wardrobe", "Single

In [118]:
data['host_has_profile_pic'] = data['host_has_profile_pic'].apply(lambda x: 1 if x == 't' else (0 if x == 'f' else x))

In [119]:
data['host_identity_verified'] = data['host_identity_verified'].apply(lambda x: 1 if x == 't' else (0 if x == 'f' else x))

In [120]:
data['instant_bookable'] = data['instant_bookable'].apply(lambda x: 1 if x == 't' else (0 if x == 'f' else x))

In [121]:
data['amenities'] = data['amenities'].apply(lambda x: re.sub(r'\\u\w{4}', '', re.sub(r'\\', '', re.sub(r'\[', '',  re.sub(r'\]', '', re.sub(r'"', '', re.sub(r',', '', x)))))  ))

In [122]:
words = []
data['amenities'].apply(lambda x:  words.append(x.split(" ")))
flat_list = []
for sub_list in words:
    for item in sub_list:
        flat_list.append(item)

In [123]:
ab = pd.DataFrame(pd.Series(flat_list).value_counts())
ab.columns = ["counts"]
ab = ab[ab["counts"]>600]
ba = ab.transpose()
ba = ba.drop(columns=["and", "allowed", "Private", "Essentials","silverware",\
                      "Long", "Hair", "stays", "term", "Bed", "Shampoo", "Cooking", "basics", "u2013",\
                      "Paid", "Carbon", "monoxide", "maker"
                     ], axis=1)
findd = ba.columns[:18]

In [124]:
data["review_scores_rating"].unique()

array([4.85, 4.89, 4.44, 4.94, 4.88, 4.79, 4.72, 4.92, 4.87, 4.86, 4.91,
       4.77, 4.5 , 4.59, 4.76, 5.  , 4.75, 4.55, 4.65, 4.82, 4.71, 4.84,
       4.83, 4.95, 4.51, 4.9 , 4.61, 4.33, 4.74, 4.69, 4.8 , 4.7 , 4.97,
       4.67, 4.93, 4.28, 4.96, 4.64, 4.66, 4.19,  nan, 4.81, 4.73, 4.62,
       4.43, 4.58, 4.53, 4.63, 4.56, 4.99, 4.98, 4.21, 4.78, 4.54, 4.6 ,
       4.22, 4.37, 4.68, 4.41, 4.47, 4.48, 4.45, 4.02, 4.52, 4.14, 4.57,
       4.39, 4.3 , 4.4 , 4.46, 4.42, 4.  , 4.17, 3.86, 4.24, 4.35, 4.27,
       4.11, 4.29, 4.38, 4.49, 4.32, 4.31, 3.5 , 4.36, 4.25, 0.  , 4.16,
       4.2 , 4.13, 3.43, 4.09, 3.79, 3.67, 4.34, 4.15, 2.  , 3.  , 4.05,
       4.23, 3.75, 4.26, 3.6 , 3.93, 3.92, 3.37, 4.08, 3.69, 3.33, 3.46,
       4.1 , 3.62, 3.4 , 3.9 , 3.88, 4.18, 2.75, 2.67, 4.06])

In [125]:
findd

Index(['alarm', 'Hot', 'water', 'Coffee', 'dryer', 'Wifi', 'Smoke', 'Kitchen',
       'Dishes', 'Hangers', 'parking', 'Refrigerator', 'Heating', 'Iron', 'TV',
       'premises', 'linens', 'Fire'],
      dtype='object')

In [126]:
for ind, row in data.iterrows():
    for i in list(set(row["amenities"].split(" ")) & set(findd)):
        data.at[ind, i] = 1

In [127]:
for i in findd:
    data[i].fillna(0, inplace=True)

In [128]:
data["property_type"].value_counts().head(15)

Entire rental unit                   2384
Entire condo                         1284
Private room in rental unit           520
Entire home                           432
Private room in bed and breakfast     294
Entire townhouse                      213
Entire loft                           186
Private room in condo                 135
Houseboat                             132
Private room in home                  130
Room in boutique hotel                122
Room in hotel                         117
Private room in houseboat             102
Private room in guest suite            99
Private room in townhouse              91
Name: property_type, dtype: int64

In [129]:
data.columns.to_list()

['id',
 'scrape_id',
 'source',
 'host_id',
 'host_name',
 'host_since',
 'host_acceptance_rate',
 'host_is_superhost',
 'host_listings_count',
 'host_total_listings_count',
 'host_has_profile_pic',
 'host_identity_verified',
 'neighbourhood_cleansed',
 'latitude',
 'longitude',
 'property_type',
 'room_type',
 'accommodates',
 'bathrooms_text',
 'bedrooms',
 'beds',
 'amenities',
 'price',
 'minimum_nights',
 'maximum_nights',
 'minimum_minimum_nights',
 'maximum_minimum_nights',
 'minimum_maximum_nights',
 'maximum_maximum_nights',
 'minimum_nights_avg_ntm',
 'maximum_nights_avg_ntm',
 'availability_30',
 'availability_60',
 'availability_90',
 'availability_365',
 'number_of_reviews',
 'number_of_reviews_ltm',
 'number_of_reviews_l30d',
 'first_review',
 'last_review',
 'review_scores_rating',
 'review_scores_accuracy',
 'review_scores_cleanliness',
 'review_scores_checkin',
 'review_scores_communication',
 'review_scores_location',
 'review_scores_value',
 'instant_bookable',
 'cal

correlation = data[['price', 'Barn',
 'Boat',
 'Bus',
 'Camper/RV',
 'Cave',
 'Entire cabin',
 'Entire chalet',
 'Entire condo',
 'Entire cottage',
 'Entire guest suite',
 'Entire guesthouse',
 'Entire home',
 'Entire loft',
 'Entire place',
 'Entire rental unit',
 'Entire serviced apartment',
 'Entire townhouse',
 'Entire vacation home',
 'Entire villa',
 'Houseboat',
 'Private room',
 'Private room in bed and breakfast',
 'Private room in boat',
 'Private room in bungalow',
 'Private room in cabin',
 'Private room in casa particular',
 'Private room in condo',
 'Private room in earthen home',
 'Private room in farm stay',
 'Private room in guest suite',
 'Private room in guesthouse',
 'Private room in home',
 'Private room in hostel',
 'Private room in houseboat',
 'Private room in loft',
 'Private room in nature lodge',
 'Private room in rental unit',
 'Private room in serviced apartment',
 'Private room in tiny home',
 'Private room in townhouse',
 'Private room in vacation home',
 'Private room in villa',
 'Room in aparthotel',
 'Room in bed and breakfast',
 'Room in boutique hotel',
 'Room in hostel',
 'Room in hotel',
 'Room in serviced apartment',
 'Shared room in aparthotel',
 'Shared room in bed and breakfast',
 'Shared room in boat',
 'Shared room in farm stay',
 'Shared room in home',
 'Shared room in hostel',
 'Shared room in houseboat',
 'Shared room in rental unit',
 'Tiny home',
 'Tower',
 'Yurt']].corr()
# abs(correlation).sort_values(by='price', ascending=False)


In [130]:
for_ = ['Private room in houseboat', 'Boat', 'Entire rental unit',
        'Private room in bed and breakfast', 'Entire villa','Private room in townhouse',
        'Private room in home', 'Entire condo', 'Entire home', 'Houseboat', 'Private room in guest suite',
        'Shared room in hostel', 'Entire loft']
specific_dummies = pd.get_dummies(data['property_type'][data['property_type'].isin(for_)], prefix="proptype")
data = pd.concat([data, specific_dummies], axis=1)

In [131]:
for i in for_:
    data["proptype_"+i].fillna(0, inplace=True)

In [132]:
specific_dummies = pd.get_dummies(data['room_type'], prefix="room_type", drop_first=True)
data = pd.concat([data, specific_dummies], axis=1)

In [133]:
specific_dummies = pd.get_dummies(data['neighbourhood_cleansed'], prefix="neigh", drop_first=True)
data = pd.concat([data, specific_dummies], axis=1)

In [134]:
data["bathrooms_text"].unique()

array(['1.5 baths', '1.5 shared baths', '1 private bath', '1 shared bath',
       '1 bath', '2.5 baths', '3.5 baths', '0 baths', 'Private half-bath',
       '2 baths', '3 baths', '0 shared baths', 'Half-bath', nan,
       '5 baths', 'Shared half-bath', '2 shared baths', '5.5 baths',
       '4 baths', '4.5 baths', '4 shared baths', '17 baths',
       '3 shared baths'], dtype=object)

In [135]:
data['first_review'] = pd.to_datetime(data['first_review'])
data['last_review'] = pd.to_datetime(data['last_review'])
data['host_since'] = pd.to_datetime(data['host_since'])
data['first_review_unix'] = (data['first_review'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')
data['last_review_unix'] = (data['last_review'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')
data['host_since_unix'] = (data['host_since'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')

In [136]:
data['last_review_unix'] = data['last_review_unix'].apply(lambda x: min(data['last_review_unix'][data['last_review_unix']>0]) if x <0 else x)

In [137]:
data['first_review_unix'] = data['first_review_unix'].apply(lambda x: min(data['first_review_unix'][data['first_review_unix']>0]) if x <0 else x)

In [138]:
data['host_since_unix'] = data['host_since_unix'].apply(lambda x: min(data['host_since_unix'][data['host_since_unix']>0]) if x <0 else x)

In [139]:
to_zero = ['review_scores_communication', 'host_acceptance_rate', 'review_scores_communication', 'review_scores_checkin', 'review_scores_cleanliness', 'review_scores_location', 'review_scores_value',
           'review_scores_accuracy','review_scores_rating', 'reviews_per_month', 'bedrooms', 'beds', 'host_is_superhost', 'bathrooms_text']
for i in to_zero:
  data[i].fillna(0, inplace=True)


In [140]:
data.drop(columns=["first_review", "last_review"], inplace=True)

In [141]:
missing_values = data.isna().sum().sort_values(ascending=False)
missing_values = missing_values[missing_values>0]
print(missing_values)

last_review_unix     648
first_review_unix    648
dtype: int64


In [142]:
data['bathrooms_text'] = data['bathrooms_text'].apply(lambda x: str(x).replace("half-bath", "0.5"))
data['bathrooms_text'] = data['bathrooms_text'].apply(lambda x: str(x).replace("Half-bath", "0.5"))
data['bathrooms'] = data['bathrooms_text'].apply(lambda x: re.findall(r'\d+\.\d+|\d+',x)[0])
data['bathrooms']=data['bathrooms'].astype("float")

In [143]:
for index, row in data.iterrows():
    if "share" in row['bathrooms_text']:
      data.at[index, 'bathrooms'] = 0.5*row['bathrooms']

In [167]:
data.drop(columns=["host_since", "bathrooms_text", "property_type", "amenities", "room_type"], inplace=True)

KeyError: "['host_since', 'bathrooms_text', 'property_type', 'amenities', 'room_type'] not found in axis"

In [145]:
data["aval"] = (15.66/30)*data["availability_30"] + (15.66/60)*data["availability_60"] + (15.66/90)*data["availability_90"] + (15.66/365)*data["availability_365"]

In [146]:
data.drop(columns=["availability_30", "availability_60", "availability_90", "availability_365"], inplace=True)

In [147]:
data.columns.tolist()

['id',
 'scrape_id',
 'source',
 'host_id',
 'host_name',
 'host_acceptance_rate',
 'host_is_superhost',
 'host_listings_count',
 'host_total_listings_count',
 'host_has_profile_pic',
 'host_identity_verified',
 'neighbourhood_cleansed',
 'latitude',
 'longitude',
 'accommodates',
 'bedrooms',
 'beds',
 'price',
 'minimum_nights',
 'maximum_nights',
 'minimum_minimum_nights',
 'maximum_minimum_nights',
 'minimum_maximum_nights',
 'maximum_maximum_nights',
 'minimum_nights_avg_ntm',
 'maximum_nights_avg_ntm',
 'number_of_reviews',
 'number_of_reviews_ltm',
 'number_of_reviews_l30d',
 'review_scores_rating',
 'review_scores_accuracy',
 'review_scores_cleanliness',
 'review_scores_checkin',
 'review_scores_communication',
 'review_scores_location',
 'review_scores_value',
 'instant_bookable',
 'calculated_host_listings_count',
 'calculated_host_listings_count_entire_homes',
 'calculated_host_listings_count_private_rooms',
 'calculated_host_listings_count_shared_rooms',
 'reviews_per_month

In [148]:
data["id"].unique()

array([528022,   2818,  20168, ..., 801544, 802052, 805330], dtype=int64)

In [149]:
data.shape

(6809, 102)

In [150]:
trip = pd.read_csv("data/december/trip.csv")

In [151]:
min(trip["greviews"])

20.0

In [152]:
max(trip["greviews"])+max(trip["treviews"])

140352.0

In [153]:
trip["greviews"].head()

0    72629.0
1    75732.0
2    10035.0
3    16555.0
4     1502.0
Name: greviews, dtype: float64

In [154]:
trip["g"] = (trip["greviews"] - min(trip["greviews"]))/(max(trip["greviews"] - min(trip["greviews"])))

In [155]:
trip["t"] = (trip["treviews"] - min(trip["treviews"]))/(max(trip["treviews"] - min(trip["treviews"])))

In [156]:
trip["score"] = 100*((trip["g"]+trip["t"])/2)

In [157]:
geopy.distance.distance( np.array(data[['latitude', 'longitude']])[0], trip["location"][0])

Distance(3.6628217212702485)

In [172]:
for index, row in data.iterrows():
    ament=0
  for indeks, line in trip.iterrows():
    ament += line['score']/(geopy.distance.distance(np.array(row[['latitude', 'longitude']]), line['location']).km)
  data.at[index, "ament"] = ament

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 3)

In [None]:
data["first_review_unix"]=data["first_review_unix"]/(10**6)
data["last_review_unix"]=data["last_review_unix"]/(10**6)
data["ament"]=data["ament"]/(10**6)


In [None]:
data["host_since_unix"]=data["host_since_unix"]/(10**6)

In [None]:
data.describe(include="all")

In [None]:
data["maximum_maximum_nights"].unique()

In [None]:
data.columns.tolist()

In [168]:
drops = ["source", "host_id", "latitude", "longitude", "host_total_listings_count"]
df = data.drop(columns=drops)

In [169]:
df.describe(include="all")

Unnamed: 0,id,scrape_id,host_name,host_acceptance_rate,host_is_superhost,host_listings_count,host_has_profile_pic,host_identity_verified,neighbourhood_cleansed,accommodates,...,neigh_Slotervaart,neigh_Watergraafsmeer,neigh_Westerpark,neigh_Zuid,first_review_unix,last_review_unix,host_since_unix,bathrooms,aval,ament
count,6809.0,6809.0,6809,6809.0,6809.0,6809.0,6809.0,6809.0,6809,6809.0,...,6809.0,6809.0,6809.0,6809.0,6161.0,6161.0,6809.0,6809.0,6809.0,6809.0
unique,,,2799,,,,,,22,,...,,,,,,,,,,
top,,,Peter,,,,,,De Baarsjes - Oud-West,,...,,,,,,,,,,
freq,,,41,,,,,,1066,,...,,,,,,,,,,
mean,1.543801e+17,20221210000000.0,,0.679852,0.193567,2.801146,0.993538,0.857688,,2.918784,...,0.021589,0.029226,0.068733,0.065648,1550.534709,1648.152671,1447.413938,1.233551,13.857734,8e-06
std,2.861175e+17,0.0,,0.374956,0.395123,18.066322,0.080133,0.349395,,1.401175,...,0.145348,0.168452,0.253017,0.247685,91.046065,37.436317,87.817598,0.528132,16.886821,2e-05
min,2818.0,20221210000000.0,,0.0,0.0,1.0,0.0,0.0,,0.0,...,0.0,0.0,0.0,0.0,1238.3712,1388.7936,1222.2144,0.0,0.0,1e-06
25%,15109770.0,20221210000000.0,,0.44,0.0,1.0,1.0,1.0,,2.0,...,0.0,0.0,0.0,0.0,1479.0816,1654.56,1381.6224,1.0,0.0,4e-06
50%,33730180.0,20221210000000.0,,0.86,0.0,1.0,1.0,1.0,,2.0,...,0.0,0.0,0.0,0.0,1555.8912,1663.9776,1432.08,1.0,6.507123,5e-06
75%,53137910.0,20221210000000.0,,1.0,0.0,2.0,1.0,1.0,,4.0,...,0.0,0.0,0.0,0.0,1644.7104,1668.384,1496.7936,1.5,23.380356,7e-06
