In [0]:
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [0]:
%python

df = spark.read.table("airbnb.gold.combined_listings")
df = df.toPandas() 

display(df.head())


In [0]:
# Show every column label of the loaded listings frame.
display(df.columns)

In [0]:
# Convert all prices to USD.
#
# Each factor is "USD per one unit of the listing's local currency".
# NOTE(review): assumes Tokyo prices are stored in JPY and London prices in GBP —
# confirm against the upstream table definition.
# The London factor used to be 0.77, which is the USD -> GBP direction
# (1 USD ~ 0.77 GBP). Going GBP -> USD needs the reciprocal (~1.30).
USD_PER_LOCAL_UNIT = {
    "Tokyo": 0.0065,
    "London": 1 / 0.77,
}

# Keep the untouched local-currency price the first time this cell runs, so that
# re-running the cell recomputes from the original values instead of compounding
# the conversion on already-converted prices.
if "price_local" not in df.columns:
    df["price_local"] = df["price"]

# Areas without a configured rate keep their price unchanged (factor 1.0).
# Missing and zero prices need no special-casing: scaling leaves them NaN and 0.
df["price"] = df["price_local"] * df["area"].map(USD_PER_LOCAL_UNIT).fillna(1.0)


In [0]:
# Cities under comparison: London first, Tokyo second.
city = ["London", "Tokyo"]

# One independent frame per city, each tagged with a constant "city" label.
# `.assign` returns a new frame, so `df` itself is left untouched.
df_london = df.loc[df["area"].eq(city[0])].assign(city="London")
df_tokyo = df.loc[df["area"].eq(city[1])].assign(city="Tokyo")

# Stack the two frames row-wise (London rows first, then Tokyo rows).
city_df = pd.concat([df_london, df_tokyo], axis=0)

print(f"Number of listings for {city}: {len(city_df)}")


In [0]:
# City-level summary: one row per city with the headline metrics.
# `pd` is already imported in the notebook's first cell.
# Note: `listing_count` counts non-null `host_id` values within each city.
city_metrics = city_df.groupby("city", as_index=False).agg(
    avg_price=pd.NamedAgg(column="price", aggfunc="mean"),
    avg_occupancy=pd.NamedAgg(column="estimated_occupancy_l365d", aggfunc="mean"),
    listing_count=pd.NamedAgg(column="host_id", aggfunc="count"),
    avg_rating=pd.NamedAgg(column="review_scores_rating", aggfunc="mean"),
)

display(city_metrics)


In [0]:
# Pivot the summary so metrics become rows and cities become columns.
comparison = city_metrics.set_index("city").T

# Relative gap of Tokyo against London, in percent, rounded to two decimals.
comparison["% diff (Tokyo vs London)"] = (
    comparison["Tokyo"]
    .sub(comparison["London"])
    .div(comparison["London"])
    .mul(100)
    .round(2)
)

# Replace the raw metric column names with display labels. The labels are applied
# positionally, so they must stay in the same order as the aggregations that
# produced `city_metrics`.
comparison = comparison.set_axis(
    [
        "Average Price (USD)",
        "Average Occupancy (%)",
        "Listing Count",
        "Average Rating (out of 5)",
    ],
    axis="index",
)

comparison


In [0]:
# Horizontal bar chart of the Tokyo-vs-London percent difference for each metric.
# `px` is already imported in the notebook's first cell.

# Name the index before resetting it so the y column is addressed by an explicit
# name ("metric") rather than by the implicit "index" label that `reset_index`
# only produces when the index happens to be unnamed.
fig = px.bar(
    comparison.rename_axis("metric").reset_index(),
    x="% diff (Tokyo vs London)",
    y="metric",
    orientation="h",
    color="% diff (Tokyo vs London)",
    color_continuous_scale="RdYlGn",
    # The title previously contained mis-encoded bytes; this is the intended text.
    title="📊 Tokyo vs London — % Difference by Metric",
    text="% diff (Tokyo vs London)"
)

fig.update_layout(
    xaxis_title="Percent Difference (Tokyo vs London)",
    yaxis_title="Metric",
    template="plotly_white",
    title_x=0.5,
    showlegend=False
)
# Print each bar's value next to it with a trailing percent sign.
fig.update_traces(texttemplate="%{text}%", textposition="outside")

display(fig)


In [0]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Metric column in `city_metrics` -> subplot title, in left-to-right panel order.
metric_titles = {
    "avg_price": "Average Price",
    "avg_occupancy": "Average Occupancy",
    "listing_count": "Listing Count",
    "avg_rating": "Average Rating",
}

# Bar colour for each city; a single mapping keeps both traces in sync.
city_colors = {"London": "#e7191f", "Tokyo": "#97c939"}

# One row with one panel per metric.
fig = make_subplots(
    rows=1, cols=len(metric_titles),
    subplot_titles=list(metric_titles.values())
)

metrics = list(metric_titles)

# Index by city so each value is a direct label lookup.
metrics_by_city = city_metrics.set_index("city")

# Add one bar per city to every metric panel (London first, then Tokyo).
for i, metric in enumerate(metrics, start=1):
    for city_name, bar_color in city_colors.items():
        fig.add_trace(
            go.Bar(
                name=city_name,
                x=[city_name],
                y=[metrics_by_city.loc[city_name, metric]],
                marker_color=bar_color
            ),
            row=1, col=i
        )

# Layout: dark background with white text; the legend is hidden because the
# x-axis tick labels already name the city of each bar.
fig.update_layout(
    # The title previously contained a mis-encoded em dash; this is the intended text.
    title="Tokyo vs London — Key Metrics Comparison",
    barmode='group',
    showlegend=False,
    height=400,
    width=1200,
    paper_bgcolor="#1f1b1a",
    plot_bgcolor="#1f1b1a",
    font=dict(color="white"),
)

fig.show()

In [0]:
# Neighbourhood-level summary and the top-5 neighbourhoods per area.
# `pd` is already imported in the notebook's first cell.

# Work on a copy so the numeric coercion below never modifies `city_df`.
df_all = city_df.copy()

# --- Ensure data types are clean ---
# Values that cannot be parsed as numbers become NaN instead of raising.
for numeric_col in ("price", "estimated_occupancy_l365d"):
    df_all[numeric_col] = pd.to_numeric(df_all[numeric_col], errors="coerce")

# One row per (area, neighbourhood) with the same metrics as the city summary.
neigh_stats = (
    df_all.groupby(["area", "neighbourhood_cleansed"])
    .agg(
        avg_price=('price', 'mean'),
        avg_occupancy=('estimated_occupancy_l365d', 'mean'),
        listing_count=('host_id', 'count'),
        avg_rating=('review_scores_rating', 'mean')
    )
    .reset_index()
)


def _top_n_per_area(stats: pd.DataFrame, metric: str, n: int = 5) -> pd.DataFrame:
    """Return the `n` rows with the highest `metric` within each area.

    Rows are sorted by area (descending) and then by `metric` (descending), so
    taking the first `n` rows of every area group yields that area's largest
    values. The earlier version sorted the metric ascending, which selected the
    lowest-ranked neighbourhoods while calling them "top". NaN metric values sort
    last and are therefore only returned when an area has fewer than `n` valid rows.
    """
    return (
        stats
        .sort_values(["area", metric], ascending=[False, False])
        .groupby("area")
        .head(n)
        .reset_index(drop=True)
    )


top_price = _top_n_per_area(neigh_stats, "avg_price")
top_occupancy = _top_n_per_area(neigh_stats, "avg_occupancy")

display(top_price)
display(top_occupancy)


In [0]:
# Grouped bar chart of the neighbourhood rows held in `top_price`, coloured by area.
# `px` is already imported in the notebook's first cell.
fig_price = px.bar(
    data_frame=top_price,
    x="neighbourhood_cleansed",
    y="avg_price",
    color="area",
    barmode="group",
    title="Top 5 Neighborhoods by Average Price",
    color_discrete_map=dict(London="#e7191f", Tokyo="#97c939"),
)
fig_price.show()

In [0]:
# Grouped bar chart of the neighbourhood rows held in `top_occupancy`, coloured by
# area. The dark template is applied after construction via `update_layout`,
# which returns the same figure object.
fig_occ = px.bar(
    data_frame=top_occupancy,
    x="neighbourhood_cleansed",
    y="avg_occupancy",
    color="area",
    barmode="group",
    title="Top 5 Neighborhoods by Average Occupancy",
    color_discrete_map=dict(London="#e7191f", Tokyo="#97c939"),
).update_layout(template="plotly_dark")

fig_occ.show()


In [0]:
# Room-type mix per city, drawn as one pie per city.
# `px` is already imported in the notebook's first cell.

# Number of listings for every (city, room_type) pair.
room_type_counts = (
    city_df.groupby(["city", "room_type"])
    .size()
    .reset_index(name="count")
)

fig_room = px.pie(
    room_type_counts,
    names="room_type",
    values="count",
    color="room_type",
    facet_col="city",
    # The title previously contained mis-encoded bytes; this is the intended text.
    title="🏠 Room Type Distribution — Tokyo vs London"
)
# Label each slice inside the pie with its share and room type.
fig_room.update_traces(textposition="inside", textinfo="percent+label")
fig_room.update_layout(title_x=0.5)
display(fig_room)


In [0]:
def _pct_true(flags):
    """Return the percentage of entries in `flags` that compare equal to True."""
    return (flags == True).mean() * 100


# Share of listings per city whose `host_is_superhost` value equals True.
# NOTE(review): assumes the column holds real booleans; string flags such as
# "t"/"f" would never equal True and would yield 0% — confirm the dtype.
superhost_rate = (
    city_df.groupby("city")["host_is_superhost"]
    .apply(_pct_true)
    .reset_index(name="superhost_rate")
)

fig_super = px.bar(
    data_frame=superhost_rate,
    x="city",
    y="superhost_rate",
    color="city",
    title="Share of Superhosts by City",
    text="superhost_rate",
)
# Show each bar's value above it with one decimal and a percent sign.
fig_super.update_traces(texttemplate="%{text:.1f}%", textposition="outside")
fig_super.update_layout(
    yaxis_title="Superhost Share (%)",
    title_x=0.5,
    template="plotly_white",
)
display(fig_super)


In [0]:
# Estimated annual revenue per listing, by city.
# `pd` and `px` are already imported in the notebook's first cell.

# Per-city averages that feed the revenue estimate.
revenue_df = (
    city_df.groupby("city")
    .agg(
        avg_price=("price", "mean"),
        avg_occupancy=("estimated_occupancy_l365d", "mean")
    )
    .reset_index()
)

# Revenue = average price * occupancy share * 365 days.
# This is the product of the two city averages, not the average of per-listing
# revenue, so it is only an approximation.
# NOTE(review): the formula treats `estimated_occupancy_l365d` as a 0-100
# percentage. If the column is instead a count of occupied nights over the last
# 365 days (as the `_l365d` suffix may suggest), the estimate should be
# avg_price * avg_occupancy — confirm against the table definition.
revenue_df["estimated_annual_revenue"] = revenue_df["avg_price"] * (revenue_df["avg_occupancy"] / 100) * 365

fig_revenue = px.bar(
    revenue_df,
    x="city",
    y="estimated_annual_revenue",
    color="city",
    text="estimated_annual_revenue",
    title="Estimated Annual Revenue per Listing (USD)",
    # London now uses the same red (#e7191f) as every other chart in this notebook;
    # it was previously #de1f1e here only.
    color_discrete_map={"Tokyo": "#97c939", "London": "#e7191f"}
)
# Dollar-formatted value above each bar, styled for the dark background.
fig_revenue.update_traces(
    texttemplate="$%{text:,.0f}",
    textposition="outside",
    marker_line_color="#1f1b1a",
    textfont_color="white"
)
# The explicit background and font colours below override the corresponding
# settings of the "plotly_white" template.
fig_revenue.update_layout(
    yaxis_title="Revenue (USD)",
    template="plotly_white",
    title_x=0.5,
    paper_bgcolor="#1f1b1a",
    plot_bgcolor="#1f1b1a",
    font=dict(color="white")
)
display(fig_revenue)