# CSCIGA 2437 Project (Real Estate)
## Chenmeinian Guo (cg3972)

# Data Profiling

In [2]:
import org.apache.spark.sql.SparkSession

// Obtain (or reuse) the Spark session used by the rest of this notebook.
val spark = SparkSession
  .builder()
  .appName("NYC Property Sales Analysis")
  .getOrCreate()

// Load the sales CSV: the first row supplies the column names and Spark
// infers each column's type from the data.
val filePath = "./nyc-property-sales.csv"
val rawData = spark.read
  .options(Map("header" -> "true", "inferSchema" -> "true"))
  .csv(filePath)

// Print the inferred schema, then display the data through z.show.
rawData.printSchema()
z.show(rawData)

In [3]:
import org.apache.spark.sql.functions.{col, count, when}

// Total number of raw rows, kept as a Double so the percentage below is
// computed with floating-point division.
val totalEntries = rawData.count().toDouble

// Count the nulls of every column in a single pass over the data instead of
// launching one full scan per column. `count` only counts non-null inputs and
// `when` without `otherwise` yields null for non-matching rows, so each
// aggregate equals the number of null values in that column.
val nullCountRow = rawData
  .select(rawData.columns.map(name => count(when(col(name).isNull, 1))): _*)
  .first()

// (column name, number of nulls, percentage of all rows that are null).
// The aggregates are read back by position, so column names containing
// spaces or hyphens need no special handling.
val nullCounts = rawData.columns.zipWithIndex.map { case (colName, idx) =>
  val nullCount = nullCountRow.getLong(idx)
  // Report 0% rather than NaN when the dataset has no rows.
  val nullPercentage = if (totalEntries == 0) 0.0 else (nullCount / totalEntries) * 100
  (colName, nullCount, nullPercentage)
}

println("Column-wise null value analysis:")
nullCounts.foreach { case (colName, nullCount, nullPercentage) =>
  println(f"$colName%-30s $nullCount%-10d ($nullPercentage%.2f%%)")
}

# Data Cleaning

In [5]:
// here we verified that the column "EASE-MENT" is useless because it's mostly either null or empty, only 14 entries are valid strings
import org.apache.spark.sql.functions._

// Preview up to five distinct non-null values stored in EASE-MENT.
rawData
  .select("EASE-MENT")
  .where(col("EASE-MENT").isNotNull)
  .distinct()
  .show(5)

// Rows whose EASE-MENT is null or blank after trimming ...
val countNullOrEmpty = rawData
  .where(col("EASE-MENT").isNull || trim(col("EASE-MENT")) === "")
  .count()

// ... and the complement: rows holding a non-blank value.
val countNonNullOrEmpty = rawData
  .where(col("EASE-MENT").isNotNull && trim(col("EASE-MENT")) =!= "")
  .count()

println(s"Number of null and empty values in EASE-MENT: $countNullOrEmpty")
println(s"Number of string values in EASE-MENT: $countNonNullOrEmpty")


In [6]:
// Show the first five values of each column that contains null values.
Seq(
  "RESIDENTIAL UNITS",
  "COMMERCIAL UNITS",
  "TOTAL UNITS",
  "GROSS SQUARE FEET",
  "LAND SQUARE FEET"
).foreach(name => rawData.select(name).show(5))

In [7]:
/*
  Cleaning steps:
    1. Rename " ZIP CODE" (note the stray leading space) to "ZIP CODE".
    2. Drop every row that has a null in one of the key columns listed in requiredColumns.
    3. Remove "APARTMENT NUMBER" and "EASE-MENT", which are not used in the analysis.
    4. Split "BUILDING CLASS CATEGORY" into its leading numeric index (CATEGORY_TYPE)
       and the remaining text (CATEGORY_FULL); category labels can differ by extra
       spaces, so the index is extracted to avoid duplicates.
*/
val requiredColumns = Seq(
  "TAX CLASS AT PRESENT",
  "BUILDING CLASS AT PRESENT",
  "ZIP CODE",
  "YEAR BUILT",
  "RESIDENTIAL UNITS",
  "COMMERCIAL UNITS"
)

val cleanedData = rawData
  .withColumnRenamed(" ZIP CODE", "ZIP CODE")
  .na.drop(requiredColumns)
  .drop("APARTMENT NUMBER", "EASE-MENT")
  // leading digits of the category label
  .withColumn("CATEGORY_TYPE", regexp_extract(col("BUILDING CLASS CATEGORY"), "^\\d+", 0))
  // trimmed label with the leading digits and following whitespace removed
  .withColumn("CATEGORY_FULL", regexp_replace(trim(col("BUILDING CLASS CATEGORY")), "^\\d+\\s*", ""))

In [8]:
// Number of rows that survived cleaning. The percentages below are relative to
// this count (not to the raw row count), because the null/empty counts are
// taken from cleanedData.
val cleanedEntries = cleanedData.count().toDouble

// Count null-or-empty values for every column in a single pass. `count` only
// counts non-null inputs and `when` without `otherwise` yields null for
// non-matching rows, so each aggregate is the number of missing values.
val missingCountRow = cleanedData
  .select(
    cleanedData.columns.map(name => count(when(col(name).isNull || col(name) === "", 1))): _*
  )
  .first()

// (column name, number of null/empty values, percentage of cleaned rows).
// Aggregates are read back by position, so unusual column names are safe.
val nullCountsAfterCleansing = cleanedData.columns.zipWithIndex.map { case (colName, idx) =>
  val nullCount = missingCountRow.getLong(idx)
  // Report 0% rather than NaN when cleaning removed every row.
  val nullPercentage = if (cleanedEntries == 0) 0.0 else (nullCount / cleanedEntries) * 100
  (colName, nullCount, nullPercentage)
}

println("Column-wise null value analysis:")
nullCountsAfterCleansing.foreach { case (colName, nullCount, nullPercentage) =>
  println(f"$colName%-30s $nullCount%-10d ($nullPercentage%.2f%%)")
}

In [9]:
/*
    LAND SQUARE FEET and CATEGORY_TYPE are important, and only a few rows still
    have a null/empty value there, so dropping those rows does not significantly
    affect the dataset.

    Some rows carry ZIP CODE == 0, so those are removed from filteredData as well.

    fullyCleanedData = cleanedData with a non-null LAND SQUARE FEET; it still
    contains zero sale prices so that those cases can be analysed.
    filteredData     = cleanedData with a non-empty CATEGORY_TYPE / CATEGORY_FULL
    and a non-null, non-zero ZIP CODE.
    Neither dataset filters on SALE PRICE in this paragraph.
*/
val fullyCleanedData = cleanedData
  .filter(col("LAND SQUARE FEET").isNotNull)

val filteredData = cleanedData
  .filter(col("CATEGORY_TYPE").isNotNull && col("CATEGORY_TYPE") =!= "")
  .filter(col("CATEGORY_FULL").isNotNull && col("CATEGORY_FULL") =!= "")
  // The zero check must test ZIP CODE itself. Comparing the text column
  // CATEGORY_FULL with 0 evaluates to null for every non-numeric label, which
  // made this filter discard all rows.
  .filter(col("ZIP CODE").isNotNull && col("ZIP CODE") =!= 0)

// We should only consider successful sales (where "SALE PRICE" is above zero),
// but a large share of rows has a zero price, so that share is measured first.
val totalCleanedEntries = fullyCleanedData.count()

// Share of fullyCleanedData rows whose sale price is zero. Numerator and
// denominator come from the same dataset; 0.0 is reported when it is empty.
val cleanedDataRatio =
  if (totalCleanedEntries == 0) 0.0
  else fullyCleanedData.filter(col("SALE PRICE") === 0).count().toDouble / totalCleanedEntries

Initially, I hypothesized that `SALE PRICE = 0` entries might be mostly from the earlier years, such as 2003, due to incomplete records or less robust data collection practices at the time. However, the analysis shows that zero-sale cases occur consistently across all years, including 19,643 instances in 2021, though with a declining trend. **(see below)**

In [11]:
// fullyCleanedData plus a YEAR column parsed from SALE DATE.
val dataWithYear = {
  val saleDay = to_date(col("SALE DATE"), "yyyy-MM-dd HH:mm:ss")
  fullyCleanedData.withColumn("YEAR", year(saleDay))
}

// Number of zero-price rows per year, largest count first.
val zeroSaleByYear = dataWithYear
  .where(col("SALE PRICE") === 0)
  .groupBy("YEAR")
  .agg(count("*").as("ZERO_SALE_COUNT"))
  .sort(col("ZERO_SALE_COUNT").desc)

// Same counts, plus each year's zero-price rows as a percentage of all
// fullyCleanedData rows, largest percentage first.
val zeroSalePercentageByYear = {
  val overallRows = fullyCleanedData.count()
  dataWithYear
    .where(col("SALE PRICE") === 0)
    .groupBy("YEAR")
    .agg(
      count("*").as("ZERO_SALE_COUNT"),
      (count("*") / overallRows * 100).as("PERCENTAGE_OF_TOTAL")
    )
    .sort(col("PERCENTAGE_OF_TOTAL").desc)
}



I noticed a substantial amount of zero sale entries. Since these do not reflect typical market transactions and may skew price trend analyses, to improve accuracy, I removed all entries with `SALE PRICE` = 0. 

In [13]:
// Mean and standard deviation of SALE PRICE over the year-enriched data
// (zero-price rows are still included at this point).
val stats = dataWithYear.select(mean("SALE PRICE"), stddev("SALE PRICE")).first()
val meanPrice = stats.getDouble(0)
val stdDevPrice = stats.getDouble(1)

// Remove all entries where the sale price is zero.
// NOTE: despite its name, zeroSaleData holds the rows with a NON-zero price;
// the name is kept so any paragraph referring to it keeps working.
// `=!=` replaces the deprecated `!==` Column operator.
val zeroSaleData = dataWithYear.filter(col("SALE PRICE") =!= 0)

// Report the size of the dataset that was just filtered (previously this
// printed the count of the unrelated filteredData).
println(s"Remaining records after filtering: ${zeroSaleData.count()}")


In [14]:
// Display dataWithYear (fullyCleanedData plus the derived YEAR column).
z.show(dataWithYear)

In [15]:
// Display fullyCleanedData, i.e. cleanedData restricted to rows with a non-null LAND SQUARE FEET.
z.show(fullyCleanedData) // this is the real fully cleaned data

# Data Analysis

## Zero Sale Analysis

After doing some research online, I found that entries with `SALE PRICE = 0` often represent non-standard transactions such as donations, inheritances, family transfers, government acquisitions, or incomplete sales. Analyzing these zero sale cases separately might reveal insights into specific property transfer patterns or anomalies, though the primary focus will remain on valid sales for investment analysis. Here I performed some simple analysis below.

In [18]:
// Zero-price rows counted per year, listed in ascending year order.
val zeroSaleByYear = dataWithYear
  .where(col("SALE PRICE") === 0)
  .groupBy("YEAR")
  .agg(count("*").as("ZERO_SALE_COUNT"))
  .sort("YEAR")

z.show(zeroSaleByYear)

In [19]:
// Zero-price rows counted per neighborhood, largest count first.
val zeroSaleByNeighborhood = dataWithYear
  .where(col("SALE PRICE") === 0)
  .groupBy("NEIGHBORHOOD")
  .agg(count("*").as("ZERO_SALE_COUNT"))
  .sort(col("ZERO_SALE_COUNT").desc)

z.show(zeroSaleByNeighborhood)


This focuses on neighborhoods in New York City with significant counts of zero-sale transactions—property transfers without a monetary sale price. These often indicate non-standard transactions like inheritances, donations, family transfers, or government acquisitions. For example, neighborhoods like Midtown West (12,217) and Flushing-North (11,537) lead in zero-sale counts, potentially reflecting high activity in corporate or institutional transfers. In contrast, areas like Bedford Stuyvesant (8,930) and Crown Heights (5,160) might reflect more localized, community-driven property exchanges, such as inheritances or family transfers.

Understanding these patterns helps identify where such non-market activities are concentrated, providing insights into urban property trends and socio-economic behaviors within different neighborhoods.

## Category

This analysis tracks the price trends over time and the percentage growth for various categories. The first table highlights the yearly average, median sale prices, and total sales across all categories. The second table focuses on comparing the overall growth rates of these categories and is sorted accordingly.

**Findings from the first table**:

1. Total sales:
   - Coops elevator apartments
   - Condos elevator apartments
   - One family dwellings
   - Walkup apartments
   - Three family dwellings

2. Median sale price:
   - Theatres
   - Rentals elevator apartments
   - Loft buildings
   - Condos elevator apartments
   - Hospital and health facilities

3. Average sale price:
   - Theatres
   - A condo rental
   - Office buildings
   - Tax class 4 utility bureau properties
   - Tax class 3 utility properties

**Some background knowledge for readers unfamiliar with these terms**:

- **Coops vs. condos**:
  - Coops: Buyers own shares in a corporation that grants rights to occupy a unit. Usually involves restrictions on renting and board approval.
  - Condos: Buyers own the unit outright and share common areas. Offers more flexibility in renting.

- **One family dwellings vs. family homes**:
  - Generally refer to standalone properties for single-family use. Differences may arise from regional naming conventions or slight design variations.

- **Tax classes**:
  - Tax class 3: Includes utility properties like power generation facilities.
  - Tax class 4: Covers commercial and industrial properties, such as office buildings or warehouses.


In [23]:
// Per building category: average sale price, approximate median sale price and
// number of rows, ordered by average sale price (highest first).
val categoryStats = fullyCleanedData
  .groupBy("CATEGORY_TYPE", "CATEGORY_FULL")
  .agg(
    avg("SALE PRICE").as("AVERAGE_SALE_PRICE"),
    expr("percentile_approx(`SALE PRICE`, 0.5)").as("MEDIAN_SALE_PRICE"),
    count("*").as("TOTAL_SALES")
  )
  .sort(col("AVERAGE_SALE_PRICE").desc)

z.show(categoryStats)


For the rest of the code,

- **First Table (yearlyGrowth)**: Tracks yearly growth rates for each category based on the percentage change in average sale prices compared to the previous year. This table provides an overview of the processed data used for analysis.

- **Second Table (categoryGrowthRate)**: Aggregates growth rates to calculate the overall average growth rate across all years for each category. This table identifies categories with consistently high growth rates. However, there are some issues with this table due to outliers, such as the first category, `Government Facilities`, which shows an extremely large growth rate of `6.6E7`. Because of this, a graph is not drawn for this table.

- **Third Table (yearlyStatsWithZip)**: Provides insights into sale price trends at the ZIP code level. For example, the top three ZIP codes are `10020`, `10038`, and `10024`, corresponding to the Midtown Rockefeller Center area and Wall Street area. This reflects the potential of building prices in these regions.

- **Fourth Table (yearlyStatsWithZip)**: Provides insights into sale price trends at the ZIP code level for each category. For example, the most expensive category is commercial condos at `10020`.


In [25]:
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._

// fullyCleanedData plus a YEAR column parsed from SALE DATE.
val enrichedData = {
  val saleDay = to_date(col("SALE DATE"), "yyyy-MM-dd HH:mm:ss")
  fullyCleanedData.withColumn("YEAR", year(saleDay))
}

// Average sale price and number of rows for every (category, year) pair.
val yearlyStats = enrichedData
  .groupBy("CATEGORY_FULL", "YEAR")
  .agg(
    avg("SALE PRICE").as("AVERAGE_SALE_PRICE"),
    count("*").as("TOTAL_SALES")
  )
  .sort("CATEGORY_FULL", "YEAR")

// Rows of one category, in ascending year order.
val windowSpec = Window.partitionBy("CATEGORY_FULL").orderBy("YEAR")

// Percentage change of the average price versus the previous row of the same
// category; rows whose growth rate is null (e.g. the first year of a category,
// which has no previous row) are dropped.
val yearlyGrowth = {
  val previousAverage = lag("AVERAGE_SALE_PRICE", 1).over(windowSpec)
  yearlyStats
    .withColumn("PREV_YEAR_AVERAGE", previousAverage)
    .withColumn(
      "GROWTH_RATE",
      (col("AVERAGE_SALE_PRICE") - col("PREV_YEAR_AVERAGE")) / col("PREV_YEAR_AVERAGE") * 100
    )
    .where(col("GROWTH_RATE").isNotNull)
}

z.show(yearlyGrowth)

// Mean of the yearly growth rates per category and the number of years that
// contributed, highest mean first.
val categoryGrowthRate = yearlyGrowth
  .groupBy("CATEGORY_FULL")
  .agg(
    avg("GROWTH_RATE").as("AVERAGE_GROWTH_RATE"),
    count("*").as("YEARS_WITH_DATA")
  )
  .sort(col("AVERAGE_GROWTH_RATE").desc)

z.show(categoryGrowthRate)

// Average sale price and number of rows per (category, year, ZIP code).
val yearlyStatsWithZip = enrichedData
  .groupBy("CATEGORY_FULL", "YEAR", "ZIP CODE")
  .agg(
    avg("SALE PRICE").as("AVERAGE_SALE_PRICE"),
    count("*").as("TOTAL_SALES")
  )
  .sort("CATEGORY_FULL", "ZIP CODE", "YEAR")

// The same table is displayed twice.
z.show(yearlyStatsWithZip) // for zip
z.show(yearlyStatsWithZip) // for zip and category


## Neighborhood

This analysis evaluates neighborhoods based on their growth rates, average sale prices, median sale prices, and total sales:

1. **Growth Rate**: 
   - The `neighborhoodGrowthRate` table identifies neighborhoods with the highest average annual growth rates (sorted in descending order). A particular outlier is the `Clinton` neighborhood, where the growth rate was unexpectedly high. Further inspection confirmed that this large number is caused by specific data points (some with growth rates as high as 1E7).

2. **Sale Price and Volume**: 
   - The `neighborhoodAnalysis` table provides insights into the average and median sale prices, as well as total sales for each neighborhood. This helps pinpoint high-value neighborhoods and those with significant market activity. For example, neighborhoods with high average sale prices may indicate premium real estate markets, while high total sales reflect areas with greater transaction activity.

Overall, this analysis is similar to the previous category analysis.


In [28]:
// Average yearly growth rate (in percent) of the mean sale price per
// neighborhood, highest first.
//
// The previous version grouped `yearlyGrowth` by NEIGHBORHOOD, but that table is
// aggregated by CATEGORY_FULL and YEAR and has no NEIGHBORHOOD column. The
// per-neighborhood yearly growth is therefore derived here from enrichedData.
// The old divide-by-100 / multiply-by-100 round trip did not change the
// average and has been removed.
val neighborhoodGrowthRate = {
  // Rows of one neighborhood, in ascending year order.
  val byNeighborhoodYear = Window.partitionBy("NEIGHBORHOOD").orderBy("YEAR")

  enrichedData
    .groupBy("NEIGHBORHOOD", "YEAR")
    .agg(avg("SALE PRICE").alias("AVERAGE_SALE_PRICE"))
    .withColumn("PREV_YEAR_AVERAGE", lag("AVERAGE_SALE_PRICE", 1).over(byNeighborhoodYear))
    .withColumn(
      "GROWTH_RATE",
      (col("AVERAGE_SALE_PRICE") - col("PREV_YEAR_AVERAGE")) / col("PREV_YEAR_AVERAGE") * 100
    )
    // Drop rows without a usable previous-year average.
    .filter(col("GROWTH_RATE").isNotNull)
    .groupBy("NEIGHBORHOOD")
    .agg(avg("GROWTH_RATE").alias("AVERAGE_GROWTH_RATE"))
    .orderBy(desc("AVERAGE_GROWTH_RATE"))
}

z.show(neighborhoodGrowthRate)

In [29]:
// The average growth rate of the CLINTON neighborhood looked implausibly large,
// so its individual yearly growth rates are listed here (largest first) to
// confirm that single years really are that extreme.
//
// The previous version ordered `filteredData` by GROWTH_RATE, a column that
// dataset does not have; the yearly growth is therefore computed here from
// enrichedData for this one neighborhood.
z.show({
  // Rows of one neighborhood, in ascending year order.
  val byNeighborhoodYear = Window.partitionBy("NEIGHBORHOOD").orderBy("YEAR")

  enrichedData
    .filter(col("NEIGHBORHOOD") === "CLINTON")
    .groupBy("NEIGHBORHOOD", "YEAR")
    .agg(
      avg("SALE PRICE").alias("AVERAGE_SALE_PRICE"),
      count("*").alias("TOTAL_SALES")
    )
    .withColumn("PREV_YEAR_AVERAGE", lag("AVERAGE_SALE_PRICE", 1).over(byNeighborhoodYear))
    .withColumn(
      "GROWTH_RATE",
      (col("AVERAGE_SALE_PRICE") - col("PREV_YEAR_AVERAGE")) / col("PREV_YEAR_AVERAGE") * 100
    )
    .filter(col("GROWTH_RATE").isNotNull)
    .orderBy(desc("GROWTH_RATE"))
})

In [30]:
// Per neighborhood: average sale price, approximate median sale price and
// number of rows, ordered by average sale price (highest first).
val neighborhoodAnalysis = filteredData
  .groupBy("NEIGHBORHOOD")
  .agg(
    avg("SALE PRICE").as("AVERAGE_SALE_PRICE"),
    expr("percentile_approx(`SALE PRICE`, 0.5)").as("MEDIAN_SALE_PRICE"),
    count("*").as("TOTAL_SALES")
  )
  .sort(col("AVERAGE_SALE_PRICE").desc)

z.show(neighborhoodAnalysis)


## Time

I analyzed the data based on time, focusing on months and quarters:

- Monthly Analysis: 
  - In terms of average sales, months `1`, `3`, `6`, and `12` show relatively high performance, with `12` (December) standing out as the highest. 
  - For total sales, mid-year months like `6` (June) and `8` (August) lead the pack, while December also shows strong performance.

- Quarterly Analysis: 
  - For average sales, the quarters exhibit consistent figures: Q1 (24%), Q2 (25%), Q3 (24%), and Q4 (27%), with Q4 slightly outperforming the others. 
  - In terms of total sales, Q3 (26%) shows the highest volume, followed closely by Q2 (25%) and Q4 (25%), while Q1 (23%) lags behind. 

In [33]:
// Average sale price and number of rows per calendar month (all years pooled),
// listed in month order.
val monthlyStats = {
  val saleDay = to_date(col("SALE DATE"), "yyyy-MM-dd HH:mm:ss")
  enrichedData
    .withColumn("MONTH", month(saleDay))
    .groupBy("MONTH")
    .agg(
      avg("SALE PRICE").as("AVERAGE_SALE_PRICE"),
      count("*").as("TOTAL_SALES")
    )
    .sort("MONTH")
}

z.show(monthlyStats)


In [34]:
// Average sale price and number of rows for every (year, quarter) pair, in
// chronological order. The quarter is derived from SALE DATE.
val quarterlyStats = {
  val saleDay = to_date(col("SALE DATE"), "yyyy-MM-dd HH:mm:ss")
  enrichedData
    .withColumn("QUARTER", quarter(saleDay))
    .groupBy("YEAR", "QUARTER")
    .agg(
      avg("SALE PRICE").as("AVERAGE_SALE_PRICE"),
      count("*").as("TOTAL_SALES")
    )
    .sort("YEAR", "QUARTER")
}

z.show(quarterlyStats)


In [35]:
// Average sale price and number of rows for every (neighborhood, year) pair.
val neighborhoodYearlyStats = enrichedData
  .groupBy("NEIGHBORHOOD", "YEAR")
  .agg(
    avg("SALE PRICE").as("AVERAGE_SALE_PRICE"),
    count("*").as("TOTAL_SALES")
  )
  .sort("NEIGHBORHOOD", "YEAR")

// Rows of one neighborhood, in ascending year order.
val neighborhoodWindowSpec = Window.partitionBy("NEIGHBORHOOD").orderBy("YEAR")

// Percentage change of the average price versus the previous row of the same
// neighborhood; rows whose growth rate is null are dropped.
val neighborhoodGrowth = {
  val previousAverage = lag("AVERAGE_SALE_PRICE", 1).over(neighborhoodWindowSpec)
  neighborhoodYearlyStats
    .withColumn("PREV_YEAR_AVERAGE", previousAverage)
    .withColumn(
      "GROWTH_RATE",
      (col("AVERAGE_SALE_PRICE") - col("PREV_YEAR_AVERAGE")) / col("PREV_YEAR_AVERAGE") * 100
    )
    .where(col("GROWTH_RATE").isNotNull)
}

z.show(neighborhoodGrowth)


## Clustering

This analysis identifies patterns in property categories by clustering based on **average sale price**, **growth rate**, and **total sales**. The data was preprocessed to remove outliers (1st and 99th percentiles) and standardized for better performance. Principal Component Analysis (PCA) reduced the features to two dimensions for visualization.

## Clustering Process

Using KMeans with `k=5`, clusters were created to group property categories with similar price and growth patterns. Each cluster’s characteristics, such as average sale price, growth rate, and dominant property categories, were analyzed. The centroids of clusters indicate the central tendencies for the features.

## Key Findings

- **Cluster Characteristics**: Each cluster represents a distinct market behavior, grouping categories with similar price and growth trends. 
- **High-Growth Categories**: Clusters with the highest growth rate centroids were identified, highlighting categories with the most promising market potential.
- **Category Distribution**: For each cluster, the dominant property categories and their average statistics reveal insights into market segmentation.
- **Visualization**: PCA-based cluster visualization shows clear separation and clustering of property categories in reduced dimensions.

## Results and Implications

This analysis provides actionable insights by highlighting high-growth property categories and segmenting the market into distinct groups. Cluster centroids summarize the tendencies of each group, offering a foundation for targeted decision-making in real estate investment.



In [38]:
import org.apache.spark.ml.clustering.KMeans
import org.apache.spark.ml.feature.{VectorAssembler, StandardScaler, PCA}
import org.apache.spark.sql.functions._

// Step 1: one row per (category, year) carrying the three clustering features.
val clusteringData = yearlyGrowth
  .where(col("GROWTH_RATE").isNotNull && col("AVERAGE_SALE_PRICE").isNotNull)
  .select("CATEGORY_FULL", "AVERAGE_SALE_PRICE", "GROWTH_RATE", "TOTAL_SALES")

// 1st and 99th percentile of every feature, computed with relativeError = 0.0.
val priceStats =
  clusteringData.stat.approxQuantile("AVERAGE_SALE_PRICE", Array(0.01, 0.99), 0.0)
val growthStats =
  clusteringData.stat.approxQuantile("GROWTH_RATE", Array(0.01, 0.99), 0.0)
val totalSalesStats =
  clusteringData.stat.approxQuantile("TOTAL_SALES", Array(0.01, 0.99), 0.0)

// Keep only rows whose three features all lie inside their percentile bounds.
val filteredClusteringData = clusteringData.where(
  col("AVERAGE_SALE_PRICE").between(priceStats(0), priceStats(1)) &&
    col("GROWTH_RATE").between(growthStats(0), growthStats(1)) &&
    col("TOTAL_SALES").between(totalSalesStats(0), totalSalesStats(1))
)

// Step 2: pack the features into one vector column and standardise it
// (centre on the mean, scale to unit standard deviation).
val assembler = new VectorAssembler()
  .setOutputCol("rawFeatures")
  .setInputCols(Array("AVERAGE_SALE_PRICE", "GROWTH_RATE", "TOTAL_SALES"))

val assembledData = assembler.transform(filteredClusteringData)

val scaler = new StandardScaler()
  .setWithStd(true)
  .setWithMean(true)
  .setOutputCol("features")
  .setInputCol("rawFeatures")

val scaledData = scaler.fit(assembledData).transform(assembledData)

// Step 3: project the standardised features onto two principal components.
val pca = new PCA()
  .setK(2)
  .setOutputCol("pcaFeatures")
  .setInputCol("features")
val pcaModel = pca.fit(scaledData)
val pcaData = pcaModel.transform(scaledData)

// Step 4: KMeans with five clusters and a fixed seed, fitted on the
// two-dimensional PCA output; the assignment is written to "cluster".
val kmeans = new KMeans()
  .setFeaturesCol("pcaFeatures")
  .setPredictionCol("cluster")
  .setSeed(1L)
  .setK(5)

val kmeansModel = kmeans.fit(pcaData)
val clusteredData = kmeansModel.transform(pcaData)

// Step 5: size, mean price, mean growth rate and member categories per cluster.
val clusterComposition = clusteredData
  .groupBy("cluster")
  .agg(
    count("*").as("COUNT"),
    avg("AVERAGE_SALE_PRICE").as("AVG_SALE_PRICE"),
    avg("GROWTH_RATE").as("AVG_GROWTH_RATE"),
    collect_list("CATEGORY_FULL").as("CATEGORY_LIST")
  )
  .sort("cluster")

z.show(clusterComposition)

// Step 6: the two coordinates of every cluster centre, keyed by cluster index.
val centroids = kmeansModel.clusterCenters
  .zipWithIndex
  .map(pair => (pair._2, pair._1(0), pair._1(1)))
  .toSeq
  .toDF("Cluster", "Centroid_1", "Centroid_2")

z.show(centroids)


In [39]:
// Display each clustered row's category, average sale price, growth rate and
// assigned cluster, ordered by cluster.
z.show(clusteredData.select("CATEGORY_FULL", "AVERAGE_SALE_PRICE", "GROWTH_RATE", "cluster")
  .orderBy("cluster"))


In [40]:
import org.apache.spark.sql.functions._

// Step 1: size, mean sale price, mean growth rate and member categories of
// every cluster.
val clusterComposition = clusteredData.groupBy("cluster")
  .agg(
    count("*").alias("COUNT"), // Total count of data points in each cluster
    avg("AVERAGE_SALE_PRICE").alias("AVG_SALE_PRICE"), // Average sale price in each cluster
    avg("GROWTH_RATE").alias("AVG_GROWTH_RATE"), // Average growth rate in each cluster
    collect_list("CATEGORY_FULL").alias("CATEGORY_LIST") // List of property categories in each cluster
  )
  .orderBy("cluster")

z.show(clusterComposition)

// Step 2: how often each category appears in each cluster, with that
// category's mean price and growth rate inside the cluster.
val clusterNeighborhoodAnalysis = clusteredData.groupBy("cluster", "CATEGORY_FULL")
  .agg(
    count("*").alias("CATEGORY_COUNT"), // Total count per category in each cluster
    avg("AVERAGE_SALE_PRICE").alias("AVG_SALE_PRICE_CATEGORY"), // Average sale price per category
    avg("GROWTH_RATE").alias("AVG_GROWTH_RATE_CATEGORY") // Average growth rate per category
  )
  .orderBy(col("cluster"), col("CATEGORY_COUNT").desc)

z.show(clusterNeighborhoodAnalysis)

// Step 3: cluster centres as (cluster index, first coordinate, second coordinate).
// The KMeans estimator is configured with featuresCol = "pcaFeatures", so these
// coordinates are principal-component values, NOT a sale price and a growth
// rate. The displayed columns are labelled accordingly.
val clusterCentroids = kmeansModel.clusterCenters.zipWithIndex.map { case (center, idx) =>
  (idx, center(0), center(1))
}.toSeq

val centroidsDF = spark.createDataFrame(clusterCentroids).toDF("Cluster", "Centroid_PC1", "Centroid_PC2")

z.show(centroidsDF)

// Step 4: inspect the cluster with the highest growth.
// The cluster is chosen by its mean GROWTH_RATE (from clusterComposition)
// rather than by a centroid coordinate, because the centroids live in PCA
// space and their second coordinate is not a growth rate.
val highestGrowthCluster: Option[Int] = clusterComposition
  .orderBy(desc("AVG_GROWTH_RATE"))
  .select("cluster")
  .take(1)
  .headOption
  .map(_.getInt(0))

val highGrowthClusters = clusteredData
  // With no clusters at all the filter matches nothing instead of failing.
  .filter(highestGrowthCluster.map(id => col("cluster") === id).getOrElse(lit(false)))
  .groupBy("CATEGORY_FULL")
  .agg(
    avg("GROWTH_RATE").alias("AVG_GROWTH_RATE"), // Average growth rate of the category within this cluster
    avg("AVERAGE_SALE_PRICE").alias("AVG_SALE_PRICE") // Average sale price of the category within this cluster
  )
  .orderBy(desc("AVG_GROWTH_RATE"))

z.show(highGrowthClusters)

// Step 5: cluster label with the original (unscaled) price and growth values,
// for plotting.
val visualizationData = clusteredData.select("cluster", "AVERAGE_SALE_PRICE", "GROWTH_RATE")

z.show(visualizationData)

