-
Notifications
You must be signed in to change notification settings - Fork 247
/
airbnb.py
206 lines (179 loc) · 7.69 KB
/
airbnb.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
# ----------------------------------------------------------------------------
# Copyright (C) 2021-2023 Deepchecks (https://www.deepchecks.com)
#
# This file is part of Deepchecks.
# Deepchecks is distributed under the terms of the GNU Affero General
# Public License (version 3 or later).
# You should have received a copy of the GNU Affero General Public License
# along with Deepchecks. If not, see <http://www.gnu.org/licenses/>.
# ----------------------------------------------------------------------------
#
"""Module for loading the New York City Airbnb 2019 Open Dataset.
The New York City Airbnb 2019 Open Data is a dataset containing various details about a listed unit, when the goal
is to predict the rental price of a unit.
This dataset contains the details for units listed in NYC during 2019, was adapted from the following open kaggle
dataset: https://www.kaggle.com/datasets/dgomonov/new-york-city-airbnb-open-data. This, in turn was downloaded from
the Airbnb data repository http://insideairbnb.com/get-the-data.
This dataset is licensed under the CC0 1.0 Universal License (https://creativecommons.org/publicdomain/zero/1.0/).
The typical ML task in this dataset is to build a model that predicts the average rental price of a unit.
Dataset Shape:
.. list-table:: Dataset Shape
:widths: 50 50
:header-rows: 1
* - Property
- Value
* - Samples Total
- 47.3K
* - Dimensionality
- 9
* - Features
- real, string
* - Targets
- int 31 - 795
Description:
.. list-table:: Dataset Description
:widths: 50 50 50
:header-rows: 1
* - Column name
- Column Role
- Description
* - timestamp
- Datetime
- The date of the observation
* - neighbourhood_group
- Feature
-
* - neighbourhood
- Feature
-
* - room_type
- Feature
-
* - minimum_nights
- Feature
-
* - number_of_reviews
- Feature
-
* - reviews_per_month
- Feature
-
* - calculated_host_listings_count
- Feature
-
* - availability_365
- Feature
-
* - has_availability
- Feature
-
* - price
- Label
- The rental price of the unit
"""
import time
import typing as t
from typing import Tuple
import numpy as np
import pandas as pd
from deepchecks.tabular.dataset import Dataset
__all__ = ['load_data', 'load_pre_calculated_prediction', 'load_pre_calculated_feature_importance']
from numpy import ndarray
_TRAIN_DATA_URL = 'https://deepchecks-datasets-public.s3.eu-west-1.amazonaws.com/airbnb/airbnb_ref_data.csv'
_TEST_DATA_URL = 'https://deepchecks-datasets-public.s3.eu-west-1.amazonaws.com/airbnb/airbnb_prod_data.csv'
_target = 'price'
_predictions = 'predictions'
_datetime = 'timestamp'
_CAT_FEATURES = ['room_type', 'neighbourhood', 'neighbourhood_group', 'has_availability']
_NUM_FEATURES = ['minimum_nights', 'number_of_reviews', 'reviews_per_month', 'calculated_host_listings_count',
'availability_365']
_FEATURES = _NUM_FEATURES + _CAT_FEATURES
def load_data(data_format: str = 'Dataset', as_train_test: bool = True, modify_timestamps: bool = True,
              data_size: t.Optional[int] = 15000) -> t.Union[t.Tuple, t.Union[Dataset, pd.DataFrame]]:
    """Load and return the Airbnb NYC 2019 dataset (regression).

    Parameters
    ----------
    data_format : str , default: Dataset
        Represent the format of the returned value. Can be 'Dataset'|'Dataframe'
        'Dataset' will return the data as a Dataset object
        'Dataframe' will return the data as a pandas Dataframe object
    as_train_test : bool , default: True
        If True, the returned data is split into train and test exactly like the toy model
        was trained. The first return value is the train data and the second is the test data.
        In order to get this model, call the load_fitted_model() function.
        Otherwise, returns a single object.
    modify_timestamps : bool , default: True
        If True, the returned data timestamp column will be for the last 30 days.
        Otherwise, the data timestamp will be for March 2023.
    data_size : t.Optional[int] , default: 15000
        The number of samples to return. If None, returns all the data.

    Returns
    -------
    dataset : Union[deepchecks.Dataset, pd.DataFrame]
        the data object, corresponding to the data_format attribute.
    train_data, test_data : Tuple[Union[deepchecks.Dataset, pd.DataFrame],Union[deepchecks.Dataset, pd.DataFrame]
        tuple if as_train_test = True. Tuple of two objects represents the dataset split to train and test sets.

    Raises
    ------
    ValueError
        If ``data_format`` is not 'Dataset' or 'Dataframe'.
    """
    # Fail fast on an unknown format instead of silently falling through
    # and returning DataFrames for e.g. a typo such as 'dataset'.
    if data_format not in ('Dataset', 'Dataframe'):
        raise ValueError(f"data_format must be 'Dataset' or 'Dataframe', got {data_format!r}")

    # The hosted CSVs also contain a pre-calculated predictions column, which is
    # served separately by load_pre_calculated_prediction() and dropped here.
    train = pd.read_csv(_TRAIN_DATA_URL, index_col=0).drop(_predictions, axis=1)
    test = pd.read_csv(_TEST_DATA_URL, index_col=0).drop(_predictions, axis=1)

    # Sub-sample with a fixed seed so the rows stay aligned with the rows
    # sampled by load_pre_calculated_prediction() (same random_state).
    if data_size is not None:
        if data_size < len(train):
            train = train.sample(data_size, random_state=42)
        if data_size < len(test):
            test = test.sample(data_size, random_state=42)

    if modify_timestamps:
        # Re-generate the test timestamps so they span the 30 days ending "now",
        # sorted ascending. NOTE(review): this uses the unseeded global numpy RNG,
        # so exact timestamps differ between calls (only the sampling above is seeded).
        current_time = int(time.time())
        time_test_start = current_time - 86400 * 30  # Span data for 30 days
        test[_datetime] = np.sort((np.random.rand(len(test)) * (current_time - time_test_start)) + time_test_start)
        test[_datetime] = test[_datetime].apply(lambda x: pd.Timestamp(x, unit='s'))

    if not as_train_test:
        # Only the test split carries the timestamp column, so drop it before merging.
        dataset = pd.concat([train, test.drop(_datetime, axis=1)], axis=0, ignore_index=True)
        if data_format == 'Dataset':
            dataset = Dataset(dataset, label=_target, cat_features=_CAT_FEATURES, features=_FEATURES)
        return dataset

    if data_format == 'Dataset':
        train = Dataset(train, label=_target, cat_features=_CAT_FEATURES,
                        features=_FEATURES)
        test = Dataset(test, label=_target, cat_features=_CAT_FEATURES,
                       datetime_name=_datetime, features=_FEATURES)
    return train, test
def load_pre_calculated_prediction(data_size: t.Optional[int] = 15000) -> Tuple[ndarray, ndarray]:
    """Load the pre-calculated prediction for the Airbnb NYC 2019 dataset.

    Parameters
    ----------
    data_size : t.Optional[int] , default: 15000
        The number of samples to return. If None, returns all the data.

    Returns
    -------
    predictions : Tuple(np.ndarray, np.ndarray)
        The first element is the pre-calculated prediction for the train set.
        The second element is the pre-calculated prediction for the test set.
    """
    # Only the label and predictions columns are needed here.
    wanted_columns = [_target, _predictions]
    splits = []
    for url in (_TRAIN_DATA_URL, _TEST_DATA_URL):
        frame = pd.read_csv(url, usecols=wanted_columns)
        # Same fixed seed (random_state=42) as load_data(), keeping the
        # sampled rows aligned between data and predictions.
        if data_size is not None and data_size < len(frame):
            frame = frame.sample(data_size, random_state=42)
        splits.append(np.asarray(frame[_predictions]))
    train_preds, test_preds = splits
    return train_preds, test_preds
def load_pre_calculated_feature_importance() -> pd.Series:
    """Load the pre-calculated feature importance for the Airbnb NYC 2019 dataset.

    Returns
    -------
    feature_importance : pd.Series
        The feature importance for a model trained on the Airbnb NYC 2019 dataset.
    """
    feature_order = [
        'neighbourhood_group',
        'neighbourhood',
        'room_type',
        'minimum_nights',
        'number_of_reviews',
        'reviews_per_month',
        'calculated_host_listings_count',
        'availability_365',
        'has_availability',
    ]
    # Hard-coded importances: 'neighbourhood' carries twice the weight of
    # every other feature, and all weights sum to 1.0.
    weights = [0.2 if name == 'neighbourhood' else 0.1 for name in feature_order]
    return pd.Series(weights, index=feature_order)