-
Notifications
You must be signed in to change notification settings - Fork 0
/
nyc_streamlit.py
449 lines (348 loc) · 19.2 KB
/
nyc_streamlit.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Setting the style 'seaborn' for your charts
plt.style.use('seaborn')
import warnings
warnings.filterwarnings('ignore')
import streamlit as st
from PIL import Image
import pydeck as pdk
# Outline:
# 0. Big question
# 1. Airbnb introduction
# 2. COVID - Why still airbnb?
# Unique places with unique experience;
# Time to relax a bit, but still follow COVID rules.
# 3. How to choose an airbnb:
# Review Scores;
# Price ranges: Neighborhood, Accommodate types;
# Locations - Safety: Crime Rate, COVID Rate.
# 4. Result: A short list of "unique" airbnb to choose
# 5. Project difficulty
# =========================================================================
# Begin
# Cache functions
@st.cache(allow_output_mutation=True)
def load_data(path):
try:
data = pd.read_excel(path)
except:
data = pd.read_csv(path)
return data
# 0. Big question
st.title('Staycation with Airbnb')
# 1. Airbnb introduction
airbnb_image = Image.open('airbnb-678x381.jpg')
st.image(airbnb_image, use_column_width = True)
st.header('1. Airbnb Introduction')
st.markdown('- **The world leader** in accommodations of the “sharing economy”, allows you to find places to stay directly from individuals in thousands of cities around the world.')
st.markdown('- Able to **rent apartments or even entire houses** from people all over the world, almost everywhere in fact.')
# 2. COVID - Why still airbnb?
st.header('2. COVID - Why choose an Airbnb?')
# NYC_Covid_Daily_CSV
nyc_covid_daily_oct_2020 = load_data('https://raw.githubusercontent.com/duongtruongtrong/CoderSchool_week_3_project_duong_huyto/master/nyc_covid_daily_oct_2020.csv')
nyc_covid_daily_oct_2020['Date'] = pd.to_datetime(nyc_covid_daily_oct_2020['DATE_OF_INTEREST'])
# NYC_Covid_Daily_Chart
plt.rc('font', size=18) #controls default text size
plt.rc('axes', titlesize=22) #fontsize of the title
plt.rc('axes', labelsize=22) #fontsize of the x and y labels
plt.rc('xtick', labelsize=22) #fontsize of the x tick labels
plt.rc('ytick', labelsize=22) #fontsize of the y tick labels
plt.rc('legend', fontsize=22) #fontsize of the legend
covid, ax = plt.subplots(figsize=(18, 8))
covid.suptitle('New York Daily New COVID-19 Cases', fontsize=25)
sns.lineplot(x='Date', y='Cases', data=nyc_covid_daily_oct_2020, ax=ax, label='Daily')
sns.lineplot(x='Date', y='7-day average', data=nyc_covid_daily_oct_2020, ax=ax, label='7-day average')
ax.grid(True)
st.pyplot(covid)
st.markdown('### Number of new COVID cases in New York is stable under 1000 cases/day, and decreasing.')
st.markdown('## It is about time to get away and relax **safely**!')
covid_image = Image.open('covid.jpg')
st.image(covid_image, use_column_width = True)
st.write('We are **restricted to travel** ever since the pandemic started. Therefore, the **best option** we have is to travel **around our area**.')
# Airbnb in COVID reasons
st.subheader('You are a New Yorker')
st.markdown('- **Rarely** go to the other side of your city;')
st.markdown('- Or **visit other cities** in New York.')
st.subheader('Live As a Local with Unique Experience')
st.markdown('- **Unique experience** with Airbnb that traditional hotels cannot provide.')
st.markdown('- **Immersing you in the local culture** than getting a traditional hotel room')
st.subheader('Meet People')
st.markdown('Airbnb gives you the option to **stay with the local host**. The host will often be happy to welcome you and to tell you every interesting thing about the place you visit.')
st.subheader('Save Money')
st.markdown('_(sometimes)_')
st.markdown('- Airbnbs are usually **cheaper** than traditional hotels _(depends on the location)_')
st.markdown('- **Travel in a group**, Airbnb is definitely cheaper than a traditional hotel.')
# 3. How to choose an airbnb:
st.header('3. How To choose the best unique Airbnb?')
# 3.1 Current number of airbnb listings in NYC: Chart and xlsx
df_meaningful_col_light = load_data('https://raw.githubusercontent.com/duongtruongtrong/CoderSchool_week_3_project_duong_huyto/master/cleansed_listings_meaningful_2020_light.csv')
total_number_aribnb, ax = plt.subplots(figsize=(2, 1))
total_number_aribnb.suptitle('Current number of Airbnb in New York\n(until 2020-09-09)', fontsize=15)
ax.text(x=0.1, y=0, s='{:,}'.format(len(df_meaningful_col_light)), fontsize=30)
ax.axis("off")
st.pyplot(total_number_aribnb)
st.markdown('# Let us help you choose in a data way!')
st.markdown('_No recommendation Airbnb places based on your interior design interest._')
# 3.2 Unique Airbnb concept
st.subheader('What is unique Airbnb?')
st.markdown('Airbnb with reviews **containing the word "unique"** and its synonyms: distinctive, special, extraordinary.')
st.markdown('**Example**:')
df_sample_review = load_data('https://raw.githubusercontent.com/duongtruongtrong/CoderSchool_week_3_project_duong_huyto/master/review_sample.csv')
st.dataframe(df_sample_review)
st.markdown('''**"Unique" Review of listing id 9576478**: Vitaliy provided very informative directions and instructions on getting into the apartment. The apartment was exactly like it was in the photos. Harlem is a **unique** neighbourhood in NYC and will give you a different flavour of the city.''')
st.markdown('''**"Unique" Review of listing id 27930717**: The location of this apartment is great - very close to the subway and it allows you to reach Manhattan quickly. Good restaurants in the neighborhood. Maxime's (Hidden by Airbnb) , who were at home when we stayed, were absolutely fantastic and they made our experience in NYC **special**.''')
# Poplular words in "unique" review
st.markdown('**Poplular words in "unique" review**')
popular_word_image = Image.open('popular_words_unique_review.png')
st.image(popular_word_image, use_column_width = True)
st.subheader('Why unique Airbnb?')
st.markdown('You are **a New Yorker**.')
st.markdown('- Already too familiar to New York.')
st.markdown('- You need something **new**, something **unique** around New York.')
# Airbnb total review distribution chart
st.subheader('Number of "unique" reviews.')
unique_review, ax = plt.subplots(figsize=(20, 5))
unique_review.suptitle('Number of "Unique" Reviews Distribution', fontsize=25)
sns.boxplot(data=df_meaningful_col_light[(df_meaningful_col_light['total_unique_reviews']!=0) & (df_meaningful_col_light['total_unique_reviews'].notnull())],
x='total_unique_reviews',
ax=ax)
ax.set_xticks(list(range(0, 8, 2)) + list(range(10, 136, 25)))
st.pyplot(unique_review)
st.markdown('Most of Aibnb has **1 or 2 "unique" reviews**, which is **too small**.')
st.markdown('The more "unique" reviews the Airbnb has, the more authentically unique the Airbnb is.')
st.markdown('The **number of "unique" reviews** should be **over the upper whisker point** to be called a unique Aribnb.')
st.markdown('Not using "unique" review rate because:')
st.markdown('- High "unique" review rate does not mean the place is really unique, there may be only a few reviews in total.')
st.markdown('- A unique listings only need to be unique to some people')
st.markdown('- In addition, if the place is unique, most of reviewers only describe the place rather than mentioning the word "unique".')
# Current number of airbnb listings in NYC: Chart and xlsx
df_unique_outliers = load_data('https://raw.githubusercontent.com/duongtruongtrong/CoderSchool_week_3_project_duong_huyto/master/nyc_unique_aibnb.csv')
total_number_unique_aribnb, ax = plt.subplots(figsize=(2, 1))
total_number_unique_aribnb.suptitle('Total number of unique Airbnb in New York', fontsize=15)
ax.text(x=0.1, y=0, s='{:,}'.format(len(df_unique_outliers)), fontsize=30)
ax.axis("off")
st.pyplot(total_number_unique_aribnb)
# 3.2 Review Score of Unique Airbnb
st.subheader('Review Scores of "Unique" Airbnb')
review_list = ['overall', 'cleanliness', 'location', 'communication', 'checkin', 'accuracy']
review_score_list = []
# review_scores_value
# review_scores_cleanliness
# review_scores_location
# review_scores_communication
# review_scores_checkin
# review_scores_accuracy
review_score_list.append(df_unique_outliers['review_scores_value'].mean())
review_score_list.append(df_unique_outliers['review_scores_cleanliness'].mean())
review_score_list.append(df_unique_outliers['review_scores_location'].mean())
review_score_list.append(df_unique_outliers['review_scores_communication'].mean())
review_score_list.append(df_unique_outliers['review_scores_checkin'].mean())
review_score_list.append(df_unique_outliers['review_scores_accuracy'].mean())
cleanliness, ax = plt.subplots(figsize=(20, 5))
cleanliness.suptitle("Average Review Scores", fontsize=25)
sns.barplot(x=review_list, y=review_score_list, ax=ax)
ax.set_ylabel('Scores')
"""Attach a text label above each bar in *rects*, displaying its height."""
for i, j in zip(range(0, len(review_list)), review_score_list):
ax.annotate(round(j, 1),
xy=(i, j),
# xytext=(2,-5), # 2 points horizontal and -5 points vertial offset
# textcoords="offset points",
ha='center')
st.pyplot(cleanliness)
st.markdown('## Quality is secured.')
st.markdown('Review score of "unique" Airbnb are high.')
# 3.3 Price Range with: Neighborhood and Room types
st.subheader('Price Ranges of "Unique" Airbnb')
price_range, ax = plt.subplots(figsize=(20, 5))
price_range.suptitle("Price Ranges", fontsize=25)
sns.boxplot(data=df_unique_outliers,
x='price',
ax=ax)
ax.set_xticks(list(range(0, 701, 100)) + list(range(700, 2701, 400)))
st.pyplot(price_range)
st.markdown('Most "unique" Airbnb have price under 200 USD.')
st.markdown('The upper whisker point is around 350 USD, rounding it down to 300 USD as the first breaking point of "unique" Airbnb price ranges:')
st.markdown('**0-300**: Reasonable Price (round down from the upper whisker point)')
st.markdown('**Over 300**: Comming to Luxury')
# 3.3.1 Price Distribution in Term of Neighborhood
st.subheader('Price Distribution')
max_price = df_unique_outliers['price'].max()
min_price = df_unique_outliers['price'].min()
price_slider = st.slider("Choose a Price Range (USD)", 0, round(int(max_price), -1), (0, 300), 10)
bins_price = list(range(price_slider[0], price_slider[1] + 1, 50))
# for clearer xticks, when over 1000 USD the intervals must be 200 (instead of 100)
if price_slider[1] > 1000 and price_slider[0] < 1000:
xticks_price = list(range(price_slider[0], 901, 100)) + list(range(900, price_slider[1], 200))
elif price_slider[1] > 1000 and price_slider[0] > 1000:
xticks_price = list(range(price_slider[0], price_slider[1] + 1, 200))
else:
xticks_price = list(range(price_slider[0], price_slider[1] + 1, 100))
price_neighborhood_room_type, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(20, 27), constrained_layout=True)
sns.histplot(data=df_unique_outliers,
x='price',
hue='neighbourhood_group_cleansed',
ax=ax1,
multiple="stack",
stat='count', bins=bins_price
)
ax1.set_title("Price vs Neighborhood", fontsize=25)
ax1.set_xticks(xticks_price)
sns.histplot(data=df_unique_outliers,
x='price',
hue='room_type',
ax=ax2,
multiple="stack",
stat='count', bins=bins_price)
ax2.set_title("Price vs Room types", fontsize=25)
ax2.set_xticks(xticks_price)
sns.histplot(data=df_unique_outliers,
x='price',
hue='bathrooms_type',
ax=ax3,
multiple="stack",
stat='count', bins=bins_price)
ax3.set_title("Price vs Bathoom types", fontsize=25)
ax3.set_xticks(xticks_price)
st.pyplot(price_neighborhood_room_type)
st.markdown('There are plenty of Airbnb in **Manhattan and Brooklyn** with **private room or entire home/appartment** with **affordable price**.')
st.markdown('**Private room and entire home/appartment** are prioritized to **minimize COVID** transmission from strangers.')
# 3.4 Location
st.subheader('Location: Where is safe?')
st.markdown('### COVID cases and Crime cases around 25$km^2$ area around an Airbnb.')
# https://medium.com/@ahmetemin.tek.66/build-a-data-science-web-app-with-streamlit-and-python-4a9bdba35449
midpoint = [np.average(df_unique_outliers['latitude']), np.average(df_unique_outliers['longitude'])]
df_unique_outliers['covid_case_rate'] = df_unique_outliers['covid_case_rate'].round(1)
df_unique_outliers['total_crime_per_day'] = df_unique_outliers['total_crime_per_day'].round(1)
avg_covid_crime, (ax1, ax2) = plt.subplots(2, 1, figsize=(2, 2), constrained_layout=True)
df_covid = load_data('https://raw.githubusercontent.com/nychealth/coronavirus-data/master/data-by-modzcta.csv')
avg_covid_case_rate = df_covid['COVID_CASE_RATE'].mean()
ax1.set_title('Average New York COVID Cases per 100k People', fontsize=15)
ax1.text(x=0.1, y=0.4, s='{:.1f}'.format(avg_covid_case_rate), fontsize=30)
ax1.axis("off")
avg_total_crime_per_day = df_unique_outliers['total_crime_per_day'].mean()
ax2.set_title('Average New York Crime Cases per Day', fontsize=15)
ax2.text(x=0.35, y=0.4, s='{:.1f}'.format(avg_total_crime_per_day), fontsize=30)
ax2.axis("off")
st.pyplot(avg_covid_crime)
st.markdown('**Overall Distribution Map**')
st.markdown('Each bar represent COVID cases and Crime cases around 25$km^2$ area around an Airbnb.')
# COVID Map
st.markdown('**COVID Map of each Airbnb**')
st.pydeck_chart(pdk.Deck(
initial_view_state={
'latitude': midpoint[0],
'longitude': midpoint[1],
'zoom': 11,
'pitch': 50
},
layers=[
pdk.Layer(
'HexagonLayer',
data=df_unique_outliers[['covid_case_rate', 'latitude', 'longitude']],
get_position=['longitude', 'latitude'],
auto_highlight=True,
radius=40,
elevation_scale=4,
elevation_range=[0, 500],
pickable=True,
extruded=True,
coverage=0.6,
getElevationWeight='covid_case_rate'
),
],
tooltip={
'html': "<b>COVID Case per 100k people:</b> {elevationValue}",
'style': {
'color': 'white'
}
}
))
# Crime map
st.markdown('**Crime Map of each Airbnb**')
st.pydeck_chart(pdk.Deck(
initial_view_state={
'latitude': midpoint[0],
'longitude': midpoint[1],
'zoom': 11,
'pitch': 50
},
layers=[
pdk.Layer(
'HexagonLayer',
data=df_unique_outliers[['total_crime_per_day', 'latitude', 'longitude']],
get_position=['longitude', 'latitude'],
auto_highlight=True,
radius=40,
elevation_scale=4,
elevation_range=[0, 500],
pickable=True,
extruded=True,
coverage=0.6,
getElevationWeight='total_crime_per_day'
),
],
tooltip={
'html': "<b>Number of crime per day:</b> {elevationValue}",
'style': {
'color': 'white'
}
}
))
# 4. Short List
st.subheader('Result: Short List of Unique Airbnb')
st.markdown('**Requirements**:')
st.markdown('- Price: **under 300 USD**, the affordable price.')
st.markdown('- Room type: **entire home/apt** or **private room** or **hotel room** with **private bathroom**, minimize COVID transmission.')
st.markdown('- COVID cases per 100k: **under New York average COVID cases** per 100k people.')
st.markdown('- Crime cases: **under the average crime cases** per day.')
st.markdown('- Host location: **local host**, for local unque experience.')
st.markdown('## Safety first!')
df_unique_outliers['cleansed_room_type'] = 'Shared'
df_unique_outliers['cleansed_room_type'][(df_unique_outliers['room_type'].isin(['Entire home/apt', 'Hotel room', 'Private room']) & (df_unique_outliers['bathrooms_type']=='private'))] = 'Private'
df_unique_outliers['is_local_host'][df_unique_outliers['is_local_host']=='t'] = 'Local Host'
df_unique_outliers['is_local_host'][df_unique_outliers['is_local_host']=='f'] = 'Not Local Host'
columns_list = ['id',
'listing_url',
'host_location',
'neighbourhood_cleansed',
'neighbourhood_group_cleansed',
'room_type',
'accommodates',
'price',
'bathrooms_number',
'bathrooms_type',
'price_per_person',
'is_local_host',
'total_unique_reviews',
'covid_case_rate',
'total_crime_per_day',
'latitude',
'longitude'
]
room_type_selector = st.selectbox("Select Room Type - Default: Private", df_unique_outliers['cleansed_room_type'].unique(), index=1)
local_host_selector = st.selectbox("Select Host Type - Default: Local Host", df_unique_outliers['is_local_host'].unique(), index=0)
price_slider = st.slider("Price Range (USD) - Default: 0-300", 0, round(int(max_price), -1), (0, 300), 10)
max_covid = df_unique_outliers['covid_case_rate'].max()
max_crime = df_unique_outliers['total_crime_per_day'].max()
covid_rate = st.slider("COVID cases (per 100k people) - Default: {:.1f}".format(avg_covid_case_rate), 0.0, max_covid, float(avg_covid_case_rate))
crime_rate = st.slider("Crime cases (per day) - Default: {:.1f}".format(avg_total_crime_per_day), 0.0, max_crime, float(avg_total_crime_per_day))
df_short_list = df_unique_outliers[columns_list][(df_unique_outliers['price'] >= price_slider[0])
& (df_unique_outliers['price'] <= price_slider[1])
& (df_unique_outliers['cleansed_room_type'] == room_type_selector)
& (df_unique_outliers['is_local_host'] == local_host_selector)
& (df_unique_outliers['covid_case_rate'] <= covid_rate)
& (df_unique_outliers['total_crime_per_day'] <= crime_rate)
]
df_short_list
# Location
st.map(df_short_list[['latitude', 'longitude']], use_container_width=True)
st.markdown('## Only 1 place can meet all of the requirements.')
# 5. Project difficulty
st.subheader('Project difficulty')
st.markdown('- Limited in showing more information in tooltips of Airbnb on map. Can only 1 aggerated number.')
st.markdown('- Detecting synonyms of the word "unique" in reviews: "special" appear in "especial" -> Had to re-run everything.')
st.markdown('- Matching COVID data and crime data to Airbnb data set via latitude and longitude, they are not 1-1 match.')