In [0]:
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [0]:
%python

df = spark.read.table("airbnb.gold.combined_listings")
df = df.toPandas() 
df.head()


In [0]:
# Factor applied to Tokyo prices. Presumably a JPY -> USD rate, since later
# charts label prices in "$" -- TODO confirm the rate and the date it refers to.
TOKYO_PRICE_TO_USD = 0.0065

tokyo_rows = df["area"] == "Tokyo"

# Keep the unconverted Tokyo prices in their own column the first time this
# cell runs. Re-running the cell then recomputes from those raw values instead
# of scaling an already-converted price a second time.
if "price_unconverted" not in df.columns:
    df["price_unconverted"] = df["price"].where(tokyo_rows)

# Missing prices stay NaN and zero prices stay 0 after scaling, so no extra
# filter on the price values is required.
df.loc[tokyo_rows, "price"] = df.loc[tokyo_rows, "price_unconverted"] * TOKYO_PRICE_TO_USD


In [0]:
# City analysed by every cell below.
city = "Tokyo"

# Boolean mask for the chosen city; .copy() gives an independent frame so
# later edits to city_df do not touch df.
in_city = df["area"].eq(city)
city_df = df.loc[in_city].copy()

print(f"Number of listings for {city}: {city_df.shape[0]}")


In [0]:
# Preview the first five rows of the city subset.
city_df.head(n=5)

In [0]:
# Headline numbers for the selected city, computed up front and then collected
# into a label -> value mapping.
host_count = city_df["host_id"].nunique()
price_mean = city_df["price"].mean()
rating_mean = city_df["review_scores_rating"].mean()
# NOTE(review): the mean of estimated_occupancy_l365d is multiplied by 100 here,
# which only reads as a percentage if the column holds a 0-1 fraction --
# confirm the column's unit (it may be a day count).
occupancy_mean = city_df["estimated_occupancy_l365d"].mean()

overview = {
    "Total listings": len(city_df),
    "Unique hosts": host_count,
    "Average price ($)": round(price_mean, 2),
    "Average review score": round(rating_mean, 2),
    "Occupancy rate (est.)": round(occupancy_mean * 100, 2),
}

# One "label: value" line per entry, in insertion order.
print("\n".join(f"{label}: {value}" for label, value in overview.items()))


In [0]:
# Keep only listings priced below 1000 so the histogram is not stretched by
# the most expensive listings (rows with a missing price are excluded too).
city_df_filtered = city_df.loc[city_df["price"].lt(1000)]

# Shared axis styling: white text, no axis line, grid or zero line.
axis_style = {"color": "white", "showline": False, "showgrid": False, "zeroline": False}

fig = px.histogram(
    city_df_filtered,
    x="price",
    nbins=50,
    title=f"Price distribution in {city} (<$1000)",
    labels={"price": "Price per night ($)"},
    color_discrete_sequence=["#97c939"],
)
# Remove the outline around each bar.
fig.update_traces(marker=dict(line=dict(width=0)))
fig.update_layout(
    plot_bgcolor="#1f1b1a",
    paper_bgcolor="#1f1b1a",
    font_color="white",
    title_font_color="white",
    xaxis=axis_style,
    yaxis=axis_style,
)
fig.show()

In [0]:
# Mean price for each distinct availability_365 value.
# plotly.express is already imported as px in the notebook's import cell, so
# it is not re-imported here. groupby returns its keys in ascending order, so
# no extra sort is needed before drawing the line.
trend_df = city_df.groupby("availability_365", as_index=False)["price"].mean()

# Shared axis styling: white text, no axis line, grid or zero line.
axis_style = dict(color="white", showline=False, showgrid=False, zeroline=False)

fig = px.line(
    trend_df,
    x="availability_365",
    y="price",
    title=f"Average price vs annual availability in {city}",
    labels={"availability_365": "Days available per year", "price": "Average price ($)"},
    # The single-colour sequence already sets the line colour.
    color_discrete_sequence=["#97c939"],
)
fig.update_layout(
    plot_bgcolor="#1f1b1a",
    paper_bgcolor="#1f1b1a",
    font_color="white",
    title_font_color="white",
    xaxis=axis_style,
    yaxis=axis_style,
)
fig.show()

In [0]:
# Ten neighbourhoods with the highest mean price, most expensive first.
top_neigh = (
    city_df.groupby("neighbourhood_cleansed")["price"]
    .mean()
    .sort_values(ascending=False)
    .head(10)
    .reset_index()
)

# Shared axis styling: white text, no axis line, grid or zero line.
axis_style = dict(color="white", showline=False, showgrid=False, zeroline=False)

fig = px.bar(
    top_neigh,
    x="neighbourhood_cleansed",
    y="price",
    text="price",
    title=f"Top 10 neighbourhoods in {city} by average price",
    # Readable axis titles instead of the raw column names, matching the
    # labelled axes of the other charts in this notebook.
    labels={"neighbourhood_cleansed": "Neighbourhood", "price": "Average price ($)"},
    color_discrete_sequence=["#97c939"],
)
fig.update_traces(
    texttemplate='%{text:.0f}',  # bar labels rounded to whole numbers
    textposition='outside',
    marker_line_width=0,
    textfont_color="white",
)
fig.update_layout(
    plot_bgcolor="#1f1b1a",
    paper_bgcolor="#1f1b1a",
    font_color="white",
    title_font_color="white",
    xaxis=axis_style,
    yaxis=axis_style,
)
fig.show()

In [0]:
# Mean estimated occupancy per neighbourhood; keep the ten highest.
occupancy_means = city_df.groupby("neighbourhood_cleansed")["estimated_occupancy_l365d"].mean()
availability_by_area = (
    occupancy_means
    .reset_index()
    .sort_values("estimated_occupancy_l365d", ascending=False)
    .head(10)
)

# Shared axis styling: white text, no axis line, grid or zero line.
axis_style = {"color": "white", "showline": False, "showgrid": False, "zeroline": False}

fig = px.bar(
    availability_by_area,
    x="neighbourhood_cleansed",
    y="estimated_occupancy_l365d",
    title=f"Top 10 neighborhoods by estimated occupancy in {city}",
    labels={"estimated_occupancy_l365d": "Average occupancy rate (year)"},
    color_discrete_sequence=["#97c939"],
)
# No bar outline; white font for any bar text.
fig.update_traces(marker=dict(line=dict(width=0)), textfont=dict(color="white"))
fig.update_layout(
    plot_bgcolor="#1f1b1a",
    paper_bgcolor="#1f1b1a",
    font_color="white",
    title_font_color="white",
    xaxis=axis_style,
    yaxis=axis_style,
)
fig.show()

In [0]:
# One point per neighbourhood: mean coordinates, mean price and mean
# estimated occupancy.
agg_df = (
    city_df.groupby("neighbourhood_cleansed")[["latitude", "longitude", "price", "estimated_occupancy_l365d"]]
    .mean()
    .reset_index()
    # A neighbourhood whose listings all lack occupancy (or coordinates) ends
    # up with NaN here. Plotly rejects NaN marker sizes and cannot place a
    # point without coordinates, so such rows are removed before plotting.
    .dropna(subset=["latitude", "longitude", "estimated_occupancy_l365d"])
)

# NOTE(review): recent Plotly releases deprecate scatter_mapbox in favour of
# scatter_map -- consider migrating once the installed version is confirmed.
fig = px.scatter_mapbox(
    agg_df,
    lat="latitude",
    lon="longitude",
    color="price",                      # colour encodes mean price
    size="estimated_occupancy_l365d",   # marker size encodes mean occupancy
    hover_name="neighbourhood_cleansed",
    color_continuous_scale=[(0, "white"), (1, "#97c939")],
    mapbox_style="carto-positron",
    zoom=9,
    title=f"Average Airbnb price by neighborhood in {city}"
)
fig.update_traces(marker=dict(opacity=0.7))
fig.update_layout(
    plot_bgcolor="#1f1b1a",
    paper_bgcolor="#1f1b1a",
    font_color="white",
    title_font_color="white"
)
fig.show()

In [0]:
# Listing count and mean price per room type, most expensive type first.
price_by_room = city_df.groupby("room_type")["price"]
room_summary = price_by_room.agg(["count", "mean"]).reset_index()
room_summary = room_summary.sort_values("mean", ascending=False)

# Shared axis styling: white text, no axis line, grid or zero line.
axis_style = {"color": "white", "showline": False, "showgrid": False, "zeroline": False}

fig = px.bar(
    room_summary,
    x="room_type",
    y="mean",
    text="count",  # each bar is annotated with its listing count
    title=f"Average price by room type in {city}",
    labels={"mean": "Average price ($)"},
    color_discrete_sequence=["#97c939"],
)
fig.update_traces(
    texttemplate="%{text}",
    textposition="outside",
    textfont=dict(color="white"),
    marker=dict(line=dict(width=0)),
)
fig.update_layout(
    plot_bgcolor="#1f1b1a",
    paper_bgcolor="#1f1b1a",
    font_color="white",
    title_font_color="white",
    xaxis=axis_style,
    yaxis=axis_style,
)
fig.show()

In [0]:
# Upper bound on plotted points, to keep the figure responsive.
MAX_SCATTER_POINTS = 5000

# Only rows with all three plotted quantities present.
scatter_df = city_df.dropna(subset=["number_of_reviews", "review_scores_rating", "price"])

# Drop the top 1% of prices so extreme listings do not dominate marker sizes.
q99 = scatter_df["price"].quantile(0.99)
scatter_df = (
    scatter_df.loc[scatter_df["price"] <= q99]
    # assign() returns a new frame, so the column is not written into a slice
    # of city_df (which would trigger SettingWithCopyWarning).
    # The square root compresses the price range used for marker size.
    .assign(scaled_price=lambda frame: np.sqrt(frame["price"]))
)

# Sample at most MAX_SCATTER_POINTS rows. Capping n at the frame length avoids
# the ValueError that sample() raises when asked for more rows than exist.
scatter_df = scatter_df.sample(
    n=min(MAX_SCATTER_POINTS, len(scatter_df)),
    random_state=42,
)

color_map = {True: "#97c939", False: "red"}

fig = px.scatter(
    scatter_df,
    x="number_of_reviews",
    y="review_scores_rating",
    size="scaled_price",
    color="host_is_superhost",
    color_discrete_map=color_map,
    hover_data=["neighbourhood_cleansed", "room_type", "price"],
    title=f"Relationship between reviews, ratings, and normalized price in {city}",
    labels={"number_of_reviews": "Number of reviews", "review_scores_rating": "Rating"},
    opacity=0.6
)

# The x axis is limited to 0-500 reviews; points beyond that are off-screen.
fig.update_xaxes(range=[0, 500], color="white", showgrid=True, gridcolor="gray")
fig.update_yaxes(color="white", showgrid=True, gridcolor="gray")
fig.update_layout(
    legend_title_text="Superhost",
    font_color="white",
    plot_bgcolor="#1f1b1a",
    paper_bgcolor="#1f1b1a",
    title_font_color="white"
)
fig.show()

In [0]:
# One row per host, so host-level shares are not weighted by how many listings
# each host has. The narrative cell reports "Superhosts share" as a percentage
# of hosts, so it must be computed over unique host_id values, not listings.
hosts_df = city_df.drop_duplicates(subset="host_id")

summary = {
    "Total listings": len(city_df),
    "Average price": city_df["price"].mean(),
    "Average rating": city_df["review_scores_rating"].mean(),
    # Mean of has_availability as a percentage, i.e. the share of listings
    # flagged as available. NOTE(review): the key says "Occupancy rate", but
    # this is an availability share -- confirm the intended metric.
    "Occupancy rate": city_df["has_availability"].mean() * 100,
    "Superhosts share": hosts_df["host_is_superhost"].mean() * 100
}

In [0]:
# Pull the figures out of `summary`; the four averages/shares are rounded to
# two decimals, the listing count is used as-is.
total_listings = summary["Total listings"]
avg_price, avg_rating, occupancy_rate, superhost_share = (
    round(summary[key], 2)
    for key in ("Average price", "Average rating", "Occupancy rate", "Superhosts share")
)

# Build the narrative sentence by sentence, then print it in one call.
narrative = "".join([
    f"In {city}, there are approximately {total_listings:,} active Airbnb listings. \n",
    f"The average nightly price is around ${avg_price}, while the typical review score is {avg_rating} out of 5. \n",
    f"On average, about {occupancy_rate}% of listings are available at any given time, ",
    f"indicating moderate occupancy throughout the year. \n",
    f"Roughly {superhost_share}% of all hosts in {city} are Superhosts, ",
    f"suggesting a strong presence of experienced and reliable hosts on the platform.",
])
print(narrative)
