
# Install revo package

In [0]:
%sh rm -rf ~/projects/revodataassignment &&
    # Start from a fresh clone of the setup-repo branch so reruns never reuse a stale checkout.
    git clone --single-branch --branch setup-repo https://github.com/gvalentini85/assessment-rent-airbnb.git ~/projects/revodataassignment &&
    cd ~/projects/revodataassignment &&
    # Always invoke pip as `python -m pip` so every install targets the same
    # interpreter that `python` resolves to (a bare `pip` may belong to another one).
    python -m pip install --upgrade pip &&
    python -m pip install -r requirements.txt &&
    # Build the source distribution of the revo package and install it.
    python setup.py sdist &&
    python -m pip install dist/revo-0.0.1.tar.gz &&
    cd data/ &&
    # -o overwrites existing files without prompting; the cell has no interactive stdin.
    unzip -o airbnb.zip &&
    unzip -o rentals.zip &&
    unzip -o geo/post_codes.zip &&
    # Copy the extracted datasets to the driver directory.
    cp airbnb.csv rentals.json post_codes.geojson /databricks/driver/


# Load bronze layer data

In [0]:
import os
from revo.data_loader import DataLoader

# Resolve the working directory once; every dataset is addressed as a
# "file:" URI rooted there.
working_dir = os.getcwd()

# One loader per raw dataset.
dl_airbnb = DataLoader(f"file:{working_dir}/airbnb.csv")
dl_rentals = DataLoader(f"file:{working_dir}/rentals.json")
dl_post_codes = DataLoader(f"file:{working_dir}/post_codes.geojson")

# Bronze layer: the data exactly as returned by each loader.
df_airbnb_bronze = dl_airbnb.load_data()
df_rentals_bronze = dl_rentals.load_data()
df_post_codes_bronze = dl_post_codes.load_data()


# Generate silver and gold layer data

In [0]:
from revo.data_processor import DataProcessor

# Occupancy rate handed to the processor (the 70% referred to in the conclusions).
AIRBNB_OCCUPANCY_RATE = 0.7
dp = DataProcessor(airbnb_occupancy_rate=AIRBNB_OCCUPANCY_RATE)

# Silver layer: post codes are cleaned first because both other cleaners
# take amsterdam_codes (and clean_airbnb also takes post_codes_silver).
post_codes_silver, amsterdam_codes = dp.clean_post_codes(df_post_codes_bronze)
df_airbnb_silver = dp.clean_airbnb(df_airbnb_bronze, amsterdam_codes, post_codes_silver)
df_rentals_silver = dp.clean_rentals(df_rentals_bronze, amsterdam_codes)

# Gold layer. Both calls take no arguments.
# NOTE(review): presumably they operate on state kept inside dp by the
# clean_* calls above -- confirm in revo.data_processor.
df_prices = dp.aggregate_data()
df_avg_prices = dp.compute_average_by_zipcode()

# Preview the first five rows of every derived frame.
for preview_frame in (df_airbnb_silver, df_rentals_silver, df_prices, df_avg_prices):
    preview_frame.show(5)


# Perform the analysis and visualize results

In [0]:
# Convert the aggregated frames to pandas for plotting. The names are rebound
# in place, so only convert while the object still exposes toPandas(): this
# keeps the cell safe to re-run instead of failing on the second execution.
if hasattr(df_prices, "toPandas"):
    df_prices = df_prices.toPandas()
if hasattr(df_avg_prices, "toPandas"):
    df_avg_prices = df_avg_prices.toPandas()

# Independent per-source copies used by the plots further down.
df_avg_prices_airbnb = df_avg_prices[df_avg_prices.source == "airbnb"].copy()
df_avg_prices_kamernet = df_avg_prices[df_avg_prices.source == "kamernet"].copy()

Let's have a look at the average monthly prices for AirBnB and kamernet data after removing outliers.

In [0]:
import plotly.express as px

# Airbnb rows only, ordered by ascending average monthly price so the bars
# rise from left to right.
cond = df_avg_prices.source == "airbnb"
airbnb_by_price = df_avg_prices[cond].sort_values("avg_monthly_price")

fig = px.bar(
    airbnb_by_price,
    x="zipcode",
    y="avg_monthly_price",
    color="source",
    barmode="group",
    height=400,
)
fig.show()

In [0]:
from revo.data_visualization.plot_amsterdam import plot_amsterdam

# Plot the "avg_monthly_price" column of the Airbnb-only rows; the cleaned
# post codes are passed as the third argument.
# NOTE(review): presumably plot_amsterdam draws a per-zipcode map of Amsterdam
# from the post-code geometries -- confirm in revo.data_visualization.
plot_amsterdam(df_avg_prices_airbnb, "avg_monthly_price", post_codes_silver)

We select the 10 most profitable zipcodes from the AirBnB data:

In [0]:
# Rank the Airbnb rows by average monthly price (highest first) and keep the
# top 10; the ranking is computed once and reused for both results.
cond = df_avg_prices.source == "airbnb"
airbnb_top10 = (
    df_avg_prices[cond]
    .sort_values("avg_monthly_price", ascending=False)
    .head(10)
)
airbnb_zipcodes = airbnb_top10["zipcode"]
airbnb_top10


## Select zipcodes for kamernet

In [0]:
# Kamernet rows only, ordered by ascending average monthly price.
cond = df_avg_prices.source == "kamernet"
kamernet_by_price = df_avg_prices[cond].sort_values("avg_monthly_price")

# Plot the same metric the rows are sorted by. The Airbnb chart, the map and
# the top-10 selection all use the monthly price, so this chart does too
# (it previously plotted avg_daily_price).
fig = px.bar(
    kamernet_by_price,
    x="zipcode",
    y="avg_monthly_price",
    color="source",
    barmode="group",
    height=400,
)
fig.show()

In [0]:
# Same plot as for Airbnb, but for the Kamernet-only rows: the
# "avg_monthly_price" column, with the cleaned post codes as third argument.
# NOTE(review): presumably renders a per-zipcode map of Amsterdam -- confirm
# in revo.data_visualization.
plot_amsterdam(df_avg_prices_kamernet, "avg_monthly_price", post_codes_silver)

We select the 10 most profitable zipcodes from the Kamernet data:

In [0]:
# Rank the Kamernet rows by average monthly price (highest first) and keep the
# top 10; the ranking is computed once and reused for both results.
cond = df_avg_prices.source == "kamernet"
kamernet_top10 = (
    df_avg_prices[cond]
    .sort_values("avg_monthly_price", ascending=False)
    .head(10)
)
kamernet_zipcodes = kamernet_top10["zipcode"]
kamernet_top10


## Select between Airbnb and kamernet

Finally, after selecting zipcodes, let's have a look at which areas are more profitable to rent on AirBnB with respect to Kamernet:

In [0]:
# Side-by-side bars per zipcode, one colour per source, on the daily price.
# NOTE(review): airbnb_zipcodes / kamernet_zipcodes selected earlier are not
# applied here, so every zipcode is plotted -- confirm this is intended.
comparison_options = dict(
    x="zipcode",
    y="avg_daily_price",
    color="source",
    barmode="group",
    height=400,
)
fig = px.bar(df_avg_prices, **comparison_options)
fig.show()

As we can see from the plot above, renting through AirBnB is considerably more profitable than through Kamernet. This result remains essentially the same even when further lowering the occupancy rate of AirBnB rentals below 70% and when removing studios from the Kamernet data. More factors should be considered before making a final decision (e.g., cost of cleaning agency if renting through AirBnB).