In [2]:
# Importing necessary libraries
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [3]:
# Create a Spark session
spark = SparkSession.builder \
    .appName("EDA with PySpark") \
    .getOrCreate()

# Load the datasets
airbnb_file_path = '../data/airbnb.csv'
airbnb_df = spark.read.csv(airbnb_file_path, header=True, inferSchema=True)

rentals_file_path = '../data/rentals.json'
rentals_df = spark.read.json(rentals_file_path)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/10 00:47:28 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/12/10 00:47:30 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
                                                                                

### 1. EDA for the AirBnB dataset

In [4]:
# Display the schema of the DataFrame
airbnb_df.printSchema()

root
 |-- zipcode: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- room_type: string (nullable = true)
 |-- accommodates: integer (nullable = true)
 |-- bedrooms: double (nullable = true)
 |-- price: integer (nullable = true)
 |-- review_scores_value: double (nullable = true)



In [5]:
# Get the number of rows and columns
num_rows = airbnb_df.count()
num_columns = len(airbnb_df.columns)
print(f"The dataset contains {num_rows} rows and {num_columns} columns.")

The dataset contains 9913 rows and 8 columns.


In [6]:
# Summary statistics for numerical columns
airbnb_df.describe().show()

24/12/10 00:47:44 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
[Stage 8:>                                                          (0 + 1) / 1]

+-------+------------------+--------------------+--------------------+---------------+------------------+------------------+------------------+-------------------+
|summary|           zipcode|            latitude|           longitude|      room_type|      accommodates|          bedrooms|             price|review_scores_value|
+-------+------------------+--------------------+--------------------+---------------+------------------+------------------+------------------+-------------------+
|  count|              7660|                9913|                9913|           9913|              9913|              9899|              9913|               8202|
|   mean|1049.7217948717948|   52.36613477824987|   4.888701607375185|           NULL| 3.044789670130132|1.3891302151732499|222.19388681529304|   91.4459887832236|
| stddev| 33.18556895939768|0.015259527481107983|0.032085220077320474|           NULL|1.6378700743324222|0.8583837323489234|290.78709517530905|  8.453049089025308|
|    min|       

                                                                                

In [7]:
# Check for missing values
missing_value_count = airbnb_df.select([F.count(F.when(F.isnan(c) | F.col(c).isNull(), c)).alias(c) for c in airbnb_df.columns])
missing_value_count.show()

+-------+--------+---------+---------+------------+--------+-----+-------------------+
|zipcode|latitude|longitude|room_type|accommodates|bedrooms|price|review_scores_value|
+-------+--------+---------+---------+------------+--------+-----+-------------------+
|   2253|       0|        0|        0|           0|      14|    0|               1711|
+-------+--------+---------+---------+------------+--------+-----+-------------------+



In [8]:
# Display the distribution of a specific column (replace 'column_name' with an actual column)
airbnb_df.groupBy('bedrooms').count().show()

[Stage 12:>                                                         (0 + 1) / 1]

+--------+-----+
|bedrooms|count|
+--------+-----+
|     8.0|    3|
|     0.0|  489|
|     7.0|    5|
|    NULL|   14|
|     1.0| 6298|
|     4.0|  152|
|     3.0|  619|
|     2.0| 2268|
|    10.0|    7|
|     6.0|    9|
|     5.0|   43|
|     9.0|    6|
+--------+-----+



                                                                                

### 2. EDA for the rentals dataset

In [9]:
# Display the schema of the DataFrame
rentals_df.printSchema()

root
 |-- _id: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- additionalCostsRaw: string (nullable = true)
 |-- areaSqm: string (nullable = true)
 |-- availability: string (nullable = true)
 |-- city: string (nullable = true)
 |-- crawlStatus: string (nullable = true)
 |-- crawledAt: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- deposit: string (nullable = true)
 |-- descriptionTranslated: string (nullable = true)
 |-- detailsCrawledAt: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- energyLabel: string (nullable = true)
 |-- firstSeenAt: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- furnish: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- internet: string (nullable = true)
 |-- isRoomActive: string (nullable = true)
 |-- kitchen: string (nullable = true)
 |-- lastSeenAt: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |--

In [10]:
# Get the number of rows and columns
num_rows = rentals_df.count()
num_columns = len(rentals_df.columns)
print(f"The dataset contains {num_rows} rows and {num_columns} columns.")



The dataset contains 46722 rows and 36 columns.


                                                                                

In [11]:
# Summary statistics for numerical columns
rentals_df.describe().show()

[Stage 20:>                                                         (0 + 1) / 1]

+-------+--------------------+-------+--------------------+------------+-----------+--------------------+---------------------+-----------+-----------+-------+--------+------------+-------+------------------+-------+------------------+-------------+--------------------+--------------------+-------------------+----------+------------------+------------+--------------------+--------------------+------------------+-------+-------------+--------+-------------------+-------+
|summary|  additionalCostsRaw|areaSqm|        availability|        city|crawlStatus|             deposit|descriptionTranslated|energyLabel|    furnish| gender|internet|isRoomActive|kitchen|          latitude| living|         longitude|matchCapacity|     pageDescription|           pageTitle|               pets|postalCode|         postedAgo|propertyType|    registrationCost|                rent|         roommates| shower|smokingInside|  source|              title| toilet|
+-------+--------------------+-------+------------

                                                                                

In [12]:
# Check for missing values
columns_of_interest = ["additionalCostsRaw", "deposit", "latitude", "longitude", "postalCode", "registrationCost", "rent"]
missing_value_count = rentals_df.select([F.count(F.when(F.isnan(c) | F.col(c).isNull(), c)).alias(c) for c in columns_of_interest])
missing_value_count.show()



+------------------+-------+--------+---------+----------+----------------+----+
|additionalCostsRaw|deposit|latitude|longitude|postalCode|registrationCost|rent|
+------------------+-------+--------+---------+----------+----------------+----+
|               100|    100|       0|        0|         0|             100|   0|
+------------------+-------+--------+---------+----------+----------------+----+



                                                                                

In [13]:
airbnb_df.show(truncate=False)

+-------+-----------+-----------+---------------+------------+--------+-----+-------------------+
|zipcode|latitude   |longitude  |room_type      |accommodates|bedrooms|price|review_scores_value|
+-------+-----------+-----------+---------------+------------+--------+-----+-------------------+
|1053   |52.37302064|4.868460923|Entire home/apt|4           |2.0     |130  |100.0              |
|NULL   |52.36575451|4.941419235|Private room   |2           |1.0     |59   |100.0              |
|1053   |52.36938767|4.866972319|Entire home/apt|4           |1.0     |95   |90.0               |
|1017   |52.36190508|4.888050037|Entire home/apt|2           |1.0     |100  |100.0              |
|1016 AM|52.37153345|4.887057291|Entire home/apt|6           |2.0     |250  |60.0               |
|1016 AM|52.3713592 |4.888072287|Private room   |4           |1.0     |140  |NULL               |
|1016 AM|52.3704458 |4.889069478|Private room   |2           |1.0     |115  |90.0               |
|1071 VV|52.35564811