# Airbnb NYC Real Estate Analysis - ETL Pipeline

In [3]:
pip install pyspark findspark

Note: you may need to restart the kernel to use updated packages.


# 1️⃣ Environment Setup (Spark + Hadoop Configurations for Windows)

In [1]:
import os
import findspark

# Local Hadoop installation used by this notebook (Windows path).
HADOOP_HOME = r'C:\Program Files\hadoop\hadoop-3.2.2'
HADOOP_BIN = HADOOP_HOME + r'\bin'

# Set Hadoop path first (before Spark is initialised below).
os.environ['HADOOP_HOME'] = HADOOP_HOME

# Add Hadoop's bin directory to PATH only when it is not already present, so
# re-running this cell in the same kernel does not keep appending duplicates.
# Using .get() also avoids a KeyError if PATH happens to be unset.
current_path = os.environ.get('PATH', '')
if HADOOP_BIN not in current_path.split(os.pathsep):
    os.environ['PATH'] = (
        current_path + os.pathsep + HADOOP_BIN if current_path else HADOOP_BIN
    )

# Initialize Spark: findspark must run before pyspark is imported,
# which is why the pyspark import sits below this call.
findspark.init()

from pyspark.sql import SparkSession

# Build (or reuse) the session; the driver is pinned to the loopback address.
spark = (
    SparkSession.builder
    .appName("AirbnbETL")
    .config("spark.driver.host", "127.0.0.1")
    .config("spark.driver.bindAddress", "127.0.0.1")
    .getOrCreate()
)

print("✅ SparkSession started successfully!")


✅ SparkSession started successfully!


# 2️⃣ Load Cleaned Airbnb CSV Data

In [2]:
# Location of the Airbnb listings CSV on local disk.
csv_path = r'D:\Darshana\Projects\Real_Estate_Market_Data\data\processed\airbnb_cleaned.csv'

# The first row holds the column names; Spark infers each column's type.
df = spark.read.csv(
    csv_path,
    header=True,
    inferSchema=True,
)

# Preview the first five rows.
df.show(5)

+------------------+--------------------+---------+------------------+-------------------+------------------+-----------------+------------------+---------------+-----+--------------+-----------------+-----------+-----------------+------------------------------+----------------+---------------------+----------+------+--------+----+-------------+
|                id|                name|  host_id|         host_name|neighbourhood_group|     neighbourhood|         latitude|         longitude|      room_type|price|minimum_nights|number_of_reviews|last_review|reviews_per_month|calculated_host_listings_count|availability_365|number_of_reviews_ltm|   license|rating|bedrooms|beds|        baths|
+------------------+--------------------+---------+------------------+-------------------+------------------+-----------------+------------------+---------------+-----+--------------+-----------------+-----------+-----------------+------------------------------+----------------+---------------------+---

# 3️⃣ Data Cleaning and Transformation

In [3]:
# STEP 3: Apply transformations

from pyspark.sql.functions import col, regexp_replace, to_date

# Column expressions, defined up front and applied in the chain below.
# price: strip "$" and "," characters, then cast the result to double.
price_as_double = regexp_replace(col('price'), r'[\$,]', '').cast('double')
# last_review parsed with the yyyy-MM-dd pattern into a date.
last_review_as_date = to_date(col('last_review'), 'yyyy-MM-dd')

# NOTE(review): date_scraped is derived from last_review, not from a scrape
# timestamp — confirm the column name matches what downstream consumers expect.
df = (
    df
    .withColumn('price', price_as_double)
    # Copy of the cleaned price under a more descriptive name.
    .withColumn('price_per_night', col('price'))
    .withColumn('date_scraped', last_review_as_date)
    # Discard rows lacking a price or either coordinate.
    .dropna(subset=['price_per_night', 'latitude', 'longitude'])
)

# Check result
df.show(5)


+------------------+--------------------+---------+------------------+-------------------+------------------+-----------------+------------------+---------------+-----+--------------+-----------------+-----------+-----------------+------------------------------+----------------+---------------------+----------+------+--------+----+-------------+---------------+------------+
|                id|                name|  host_id|         host_name|neighbourhood_group|     neighbourhood|         latitude|         longitude|      room_type|price|minimum_nights|number_of_reviews|last_review|reviews_per_month|calculated_host_listings_count|availability_365|number_of_reviews_ltm|   license|rating|bedrooms|beds|        baths|price_per_night|date_scraped|
+------------------+--------------------+---------+------------------+-------------------+------------------+-----------------+------------------+---------------+-----+--------------+-----------------+-----------+-----------------+---------------

# 4️⃣ Save Processed Data to Parquet Format (Optimized Storage)

In [4]:
# STEP 4: Persist the transformed DataFrame as parquet (for dashboard consumption)

# Destination directory for the parquet output.
output_path = r"D:\Darshana\Projects\Real_Estate_Market_Data\data\processed\airbnb.parquet"

# mode='overwrite' replaces any output already present at output_path.
df.write.parquet(output_path, mode='overwrite')

print("✅ Airbnb ETL processing complete and saved to parquet.")


✅ Airbnb ETL processing complete and saved to parquet.
