In [1]:
from utils.bronze_utils import *
from silver_main import load_clean_and_save_silver_data
from gold_main import generate_revenue_insights_and_save_gold_data
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

2024-12-10 16:14:10,103 - INFO - NumExpr defaulting to 8 threads.


In [2]:
# Build the Spark session (getOrCreate returns the already-running one, if any)
spark = (
    SparkSession.builder
    .appName("pipeline")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/10 16:14:14 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [10]:
# Define all the constants for use in the notebook.
# Every path is derived from DATA_DIR, so pointing the notebook at a different
# data folder is a one-line change instead of eight.
DATA_DIR = "../data"
OUTPUT_DIR = f"{DATA_DIR}/output"

# Raw inputs
AIRBNB_RAW_FILE_PATH = f"{DATA_DIR}/airbnb.csv"
RENTALS_RAW_FILE_PATH = f"{DATA_DIR}/rentals.json"
GEOJSON_PATH = f"{DATA_DIR}/geo/post_codes.geojson"

# Bronze layer outputs
AIRBNB_BRONZE_FILE_PATH = f"{OUTPUT_DIR}/bronze/airbnb"
RENTALS_BRONZE_FILE_PATH = f"{OUTPUT_DIR}/bronze/rentals"

# Silver layer outputs
AIRBNB_SILVER_FILE_PATH = f"{OUTPUT_DIR}/silver/airbnb"
RENTALS_SILVER_FILE_PATH = f"{OUTPUT_DIR}/silver/rentals"

# Gold layer output
OUTPUT_PATH = f"{OUTPUT_DIR}/gold/revenue_insights"

# Legacy spellings (`FILEPATH` without the underscore) kept as aliases so cells
# that still reference them keep working; prefer the `*_FILE_PATH` names.
RENTALS_RAW_FILEPATH = RENTALS_RAW_FILE_PATH
RENTALS_BRONZE_FILEPATH = RENTALS_BRONZE_FILE_PATH

In [11]:
# Bronze stage: ingest the raw Airbnb CSV file
ingest_parquet_data_from_raw_file(
    spark=spark,
    file_type="csv",
    raw_file_path=AIRBNB_RAW_FILE_PATH,
    output_file_path=AIRBNB_BRONZE_FILE_PATH,
)

2024-12-10 16:19:52,951 - INFO - CSV file saved successfully!


True

In [12]:
# Bronze stage: ingest the raw rentals JSON file (sourced from kamernet)
ingest_parquet_data_from_raw_file(
    spark=spark,
    file_type="json",
    raw_file_path=RENTALS_RAW_FILEPATH,
    output_file_path=RENTALS_BRONZE_FILEPATH,
)

2024-12-10 16:20:11,248 - INFO - JSON file saved successfully!                  


True

In [13]:
# Silver stage: clean and format the Airbnb bronze data
# NOTE(review): the last recorded run logged 9913 records loaded but 12171
# "filled" (9913 + 2258 incomplete), and "lon: 52.36... lat: 4.94..." — looks like
# row duplication and swapped lon/lat labels inside the silver code; TODO confirm.
load_clean_and_save_silver_data(
    spark=spark,
    bronze_file_path=AIRBNB_BRONZE_FILE_PATH,
    silver_file_path=AIRBNB_SILVER_FILE_PATH,
    geojson_path=GEOJSON_PATH,
    postal_code_col_name="zipcode",
    monetary_col_name="price",
)

2024-12-10 16:20:24,703 - INFO - Loaded 9913 records from ../data/output/bronze/airbnb successfully!
2024-12-10 16:20:24,741 - INFO - Cleaning postal code column...
2024-12-10 16:20:24,742 - INFO - Cleaning postal code column ...
2024-12-10 16:20:24,771 - INFO - Validating postcodes ...
2024-12-10 16:20:24,988 - INFO - Postcodes successfully validated for 9913 records!
2024-12-10 16:20:25,036 - INFO - Converting int column to float column ...
2024-12-10 16:20:25,165 - INFO - Monetary columns cleaned for 9913 successfully!
2024-12-10 16:20:25,166 - INFO - Filling missing and invalid postcodes...
2024-12-10 16:20:26,526 - INFO - Records with incomplete postal codes: 2258
2024-12-10 16:20:26,526 - INFO - Getting missing postal codes ...
2024-12-10 16:20:27,123 - INFO - Filled missing and invalid postal codes for 12171 records!
2024-12-10 16:20:27,124 - INFO - Dropping unusable records...
2024-12-10 16:20:29,365 - INFO - Point created for lon: 52.36575451 lat: 4.941419235
2024-12-10 16:20:

True

In [14]:
# Silver stage: clean and format the rentals bronze data
load_clean_and_save_silver_data(
    spark=spark,
    bronze_file_path=RENTALS_BRONZE_FILEPATH,
    silver_file_path=RENTALS_SILVER_FILE_PATH,
    geojson_path=GEOJSON_PATH,
    postal_code_col_name="postalCode",
    monetary_col_name="rent",
)

2024-12-10 16:22:07,432 - INFO - Loaded 46722 records from ../data/output/bronze/rentals successfully!
2024-12-10 16:22:07,450 - INFO - Cleaning postal code column...
2024-12-10 16:22:07,450 - INFO - Cleaning postal code column ...
2024-12-10 16:22:07,461 - INFO - Validating postcodes ...
2024-12-10 16:22:07,609 - INFO - Postcodes successfully validated for 46722 records!
2024-12-10 16:22:07,632 - INFO - Converting int column to float column ...
2024-12-10 16:22:07,844 - INFO - Monetary columns cleaned for 46722 successfully!
2024-12-10 16:22:07,845 - INFO - Filling missing and invalid postcodes...
2024-12-10 16:22:09,336 - INFO - Records with incomplete postal codes: 0
2024-12-10 16:22:09,337 - INFO - Getting missing postal codes ...
2024-12-10 16:22:10,094 - INFO - Filled missing and invalid postal codes for 46722 records!
2024-12-10 16:22:10,095 - INFO - Dropping unusable records...
2024-12-10 16:22:11,934 - INFO - Remaining usable records: 13220                
2024-12-10 16:22:13,

True

In [15]:
# Gold stage: generate the revenue insights per post code from both silver datasets
generate_revenue_insights_and_save_gold_data(
    spark=spark,
    airbnb_silver_file_path=AIRBNB_SILVER_FILE_PATH,
    rentals_silver_file_path=RENTALS_SILVER_FILE_PATH,
    gold_file_path=OUTPUT_PATH,
)

2024-12-10 16:22:17,873 - INFO - File loaded from ../data/output/silver/airbnb successfully!
2024-12-10 16:22:17,948 - INFO - File loaded from ../data/output/silver/rentals successfully!
2024-12-10 16:22:18,034 - INFO - Successfully calculated average revenue per postcode per night!
2024-12-10 16:22:21,329 - INFO - Revenue insights saved successfully!           


True