In [11]:
from pyspark.sql import SparkSession

# Create the notebook's Spark session (or reuse one that is already running).
spark = (
    SparkSession.builder
    .appName("Spark aggregation functions")
    .getOrCreate()
)

# Restrict Spark's own logging to ERROR so cell output stays readable.
spark.sparkContext.setLogLevel("ERROR")

In [2]:
# Load the listings CSV into a DataFrame.
# - header / inferSchema: the first row supplies column names and Spark
#   derives the column types from the data.
# - quote and escape are both '"', so a doubled quote inside a quoted field
#   is read as a literal quote.
# - multiLine lets a quoted field contain line breaks.
# - PERMISSIVE mode keeps malformed records instead of failing the read.
listings = (
    spark.read
    .options(
        header=True,
        inferSchema=True,
        sep=",",
        quote='"',
        escape='"',
        multiLine=True,
        mode="PERMISSIVE",
    )
    .csv("data/listings.csv")
)

# Print the inferred schema to confirm the column types.
listings.printSchema()

                                                                                

root
 |-- id: long (nullable = true)
 |-- listing_url: string (nullable = true)
 |-- scrape_id: long (nullable = true)
 |-- last_scraped: date (nullable = true)
 |-- source: string (nullable = true)
 |-- name: string (nullable = true)
 |-- description: string (nullable = true)
 |-- neighborhood_overview: string (nullable = true)
 |-- picture_url: string (nullable = true)
 |-- host_id: integer (nullable = true)
 |-- host_url: string (nullable = true)
 |-- host_name: string (nullable = true)
 |-- host_since: date (nullable = true)
 |-- host_location: string (nullable = true)
 |-- host_about: string (nullable = true)
 |-- host_response_time: string (nullable = true)
 |-- host_response_rate: string (nullable = true)
 |-- host_acceptance_rate: string (nullable = true)
 |-- host_is_superhost: string (nullable = true)
 |-- host_thumbnail_url: string (nullable = true)
 |-- host_picture_url: string (nullable = true)
 |-- host_neighbourhood: string (nullable = true)
 |-- host_listings_count: int

In [3]:
# Load the reviews CSV with the same parsing options used for the listings
# file: header row, inferred types, '"' as both quote and escape character,
# multi-line quoted fields, and PERMISSIVE handling of malformed records.
reviews = (
    spark.read
    .options(
        header=True,
        inferSchema=True,
        sep=",",
        quote='"',
        escape='"',
        multiLine=True,
        mode="PERMISSIVE",
    )
    .csv("data/reviews.csv")
)

# Print the inferred schema to confirm the column types.
reviews.printSchema()

[Stage 3:>                                                          (0 + 1) / 1]

root
 |-- listing_id: long (nullable = true)
 |-- id: long (nullable = true)
 |-- date: date (nullable = true)
 |-- reviewer_id: integer (nullable = true)
 |-- reviewer_name: string (nullable = true)
 |-- comments: string (nullable = true)



                                                                                

In [4]:
# 1. Count the number of reviews per listing using the "reviews" dataset
from pyspark.sql.functions import col

# One row per listing_id carrying its review count, largest counts first.
reviews_per_listing = (
    reviews
    .groupBy("listing_id")
    .count()
    .withColumnRenamed("count", "number_of_reviews")
    .sort(col("number_of_reviews").desc())
)

# Every row is one distinct listing_id found in `reviews`, so the row count is
# the number of reviewed listings (a null listing_id, if present, would count
# as one extra group).
print("Number of listings with at least one review:", reviews_per_listing.count())
reviews_per_listing.show(10, truncate=False)

                                                                                

Number of listings with at least one review: 71487


                                                                                

+----------+-----------------+
|listing_id|number_of_reviews|
+----------+-----------------+
|47408549  |1855             |
|30760930  |1682             |
|43120947  |1615             |
|19670926  |1436             |
|45006692  |1433             |
|1436172   |1195             |
|2126708   |1122             |
|1436177   |1005             |
|47438714  |978              |
|3855375   |973              |
+----------+-----------------+
only showing top 10 rows



In [5]:
# 2. Compute the total number of listings and average review score per host
from pyspark.sql.functions import countDistinct, avg

# Per host: the number of distinct listing ids and the mean of
# review_scores_rating, ordered so the hosts with the most listings come first.
# `col` is the function imported in the task 1 cell.
host_stats = (
    listings
    .groupBy("host_id")
    .agg(
        countDistinct(col("id")).alias("total_listings"),
        avg(col("review_scores_rating")).alias("avg_review_score"),
    )
    .sort(col("total_listings").desc())
)

# One row per host_id group, so the row count is the number of hosts.
print("Number of hosts with at least one listing:", host_stats.count())
host_stats.show(10, truncate=False)

[Stage 13:>                                                         (0 + 1) / 1]

Number of hosts with at least one listing: 55804


[Stage 19:>                                                         (0 + 1) / 1]

+---------+--------------+------------------+
|host_id  |total_listings|avg_review_score  |
+---------+--------------+------------------+
|446820235|495           |4.541666666666667 |
|314162972|420           |4.3734328358208945|
|28820321 |285           |4.580697674418606 |
|1432477  |246           |4.4001843317972344|
|156158778|213           |4.890319148936171 |
|33889201 |197           |4.634916666666666 |
|47609036 |142           |4.751651376146787 |
|228928499|132           |4.846607142857144 |
|124359784|128           |4.404491525423728 |
|83740964 |123           |4.4220000000000015|
+---------+--------------+------------------+
only showing top 10 rows



                                                                                

In [6]:
# 3. Find the top ten listings with the highest number of reviews

# Count reviews per listing_id, attach the listing name, and keep only the
# ten most-reviewed listings.
# - The left join keeps reviewed listings that have no match in `listings`
#   (their name is then null).
# - listing_id is a secondary sort key so ties on the count are ordered
#   deterministically.
# - limit(10) makes the DataFrame itself hold the top ten; previously only
#   show(10) truncated the display and the variable contained every listing.
top10_reviews = (
    reviews.groupBy("listing_id").count()
    .withColumnRenamed("count", "number_of_reviews")
    .join(
        listings.select(col("id").alias("listing_id"), "name"),
        on="listing_id",
        how="left"
    )
    .orderBy(col("number_of_reviews").desc(), col("listing_id").asc())
    .limit(10)
)

print("Top 10 listings by number of reviews:")
top10_reviews.select("listing_id", "name", "number_of_reviews").show(10, truncate=False)

Top 10 listings by number of reviews:


[Stage 25:>                                                         (0 + 1) / 1]

+----------+--------------------------------------------------+-----------------+
|listing_id|name                                              |number_of_reviews|
+----------+--------------------------------------------------+-----------------+
|47408549  |Double Room+ Ensuite                              |1855             |
|30760930  |Double Garden View room - London House Hotel***   |1682             |
|43120947  |Private double room with en suite facilities      |1615             |
|19670926  |Locke Studio Apartment at Leman Locke             |1436             |
|45006692  |Budget Double Room In Colliers Hotel.             |1433             |
|1436172   |Cosy Double in Kings Cross Houseshare nr Eurostar |1195             |
|2126708   |London's best transport hub 5 mins walk! Safe too!|1122             |
|1436177   |En-suite Double in Kings Cross Houseshare Eurostar|1005             |
|47438714  |KX Basic- Small Double- shared bathroom           |978              |
|3855375   |Doub

                                                                                

In [12]:
# 4. Find the top five neighborhoods with the most listings

# Count listings per neighbourhood and keep only the five largest.
# - Rows with a null neighbourhood_cleansed are dropped so they cannot form
#   a group of their own.
# - The neighbourhood name is a secondary sort key so ties on the count are
#   ordered deterministically.
# - limit(5) makes the DataFrame itself hold the top five; previously only
#   show(5) truncated the display and the variable contained every
#   neighbourhood.
top5_neighbourhoods = (
    listings
        .filter(col("neighbourhood_cleansed").isNotNull())
        .groupBy("neighbourhood_cleansed")
        .count()
        .withColumnRenamed("count", "listings_count")
        .orderBy(col("listings_count").desc(), col("neighbourhood_cleansed").asc())
        .limit(5)
)

print("Top 5 neighbourhoods by listings:")
top5_neighbourhoods.show(5, truncate=False)

Top 5 neighbourhoods by listings:


[Stage 29:>                                                         (0 + 1) / 1]

+----------------------+--------------+
|neighbourhood_cleansed|listings_count|
+----------------------+--------------+
|Westminster           |11367         |
|Tower Hamlets         |7566          |
|Camden                |6564          |
|Kensington and Chelsea|6348          |
|Hackney               |6279          |
+----------------------+--------------+
only showing top 5 rows



                                                                                

In [8]:
# 5. Get a data frame with the following four columns:
# * Listing's ID
# * Listing's name
# * Reviewer's name
# * Review's comment
# Use "join" to combine data from two datasets


In [9]:
# 6. Get the top five listings with the highest average review comment length. Only return listings with at least 5 reviews.
# Use the "length" function from "pyspark.sql.functions" to get the length of a review comment


In [10]:
# 7. Using the "join" operator, find listings without reviews.
# Hint: Use the "left" join type (then keep rows where the review side is null) or the "left_anti" join type when implementing this
