<a href="https://www.kaggle.com/code/gauravgurjar/rent-contracts-dubai?scriptVersionId=224390180" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

<a href="https://www.kaggle.com/code/gauravgurjar/rent-contracts-dubai?scriptVersionId=224066437" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Data Loading
---
Installing the libraries

In [1]:
%%bash
download_latest_file() {
    current_date=$(date +%Y-%m-%d)
    base_url="https://github.com/ggurjar333/real-estate-analysis-dubai/releases/download"
    release="release-${current_date}"
    file="dld_rent_contracts_${current_date}.parquet"

    wget -q "${base_url}/${release}/${file}" # Added -q option to make wget quiet. 
}
download_latest_file

In [2]:
# Installing required packages
!pip install pyspark
!pip install findspark
!pip install pandas

import findspark
findspark.init()

import pandas as pd
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

# Creating a spark session
spark = SparkSession \
    .builder \
    .appName("Dubai Land Development") \
    .getOrCreate()

Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl.metadata (352 bytes)
Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: findspark
Successfully installed findspark-2.0.1


In [3]:
from datetime import date

In [4]:
rent_contracts_df = spark.read.parquet(f"dld_rent_contracts_{date.today()}.parquet")
rent_contracts_df.printSchema()

root
 |-- contract_id: string (nullable = true)
 |-- contract_reg_type_id: long (nullable = true)
 |-- contract_reg_type_ar: string (nullable = true)
 |-- contract_reg_type_en: string (nullable = true)
 |-- contract_start_date: string (nullable = true)
 |-- contract_end_date: string (nullable = true)
 |-- contract_amount: long (nullable = true)
 |-- annual_amount: long (nullable = true)
 |-- no_of_prop: long (nullable = true)
 |-- line_number: long (nullable = true)
 |-- is_free_hold: long (nullable = true)
 |-- ejari_bus_property_type_id: long (nullable = true)
 |-- ejari_bus_property_type_ar: string (nullable = true)
 |-- ejari_bus_property_type_en: string (nullable = true)
 |-- ejari_property_type_id: long (nullable = true)
 |-- ejari_property_type_en: string (nullable = true)
 |-- ejari_property_type_ar: string (nullable = true)
 |-- ejari_property_sub_type_id: long (nullable = true)
 |-- ejari_property_sub_type_en: string (nullable = true)
 |-- ejari_property_sub_type_ar: string (

In [5]:
rent_contracts_df.show(5)

+-------------+--------------------+--------------------+--------------------+-------------------+-----------------+---------------+-------------+----------+-----------+------------+--------------------------+--------------------------+--------------------------+----------------------+----------------------+----------------------+--------------------------+--------------------------+--------------------------+-----------------+-----------------+--------------+--------------------+--------------------+--------------------+--------------------+-------+--------------------+--------------------+-----------+--------------------+--------------------+--------------------+--------------------+---------------+---------------+--------------+--------------+--------------+
|  contract_id|contract_reg_type_id|contract_reg_type_ar|contract_reg_type_en|contract_start_date|contract_end_date|contract_amount|annual_amount|no_of_prop|line_number|is_free_hold|ejari_bus_property_type_id|ejari_bus_property_

In [6]:
from pyspark.sql.functions import col
# Convert contract start and end dates to datetime objects if they aren't already
rent_contracts_df = rent_contracts_df.withColumn("contract_start_date", col("contract_start_date").cast("date"))
rent_contracts_df = rent_contracts_df.withColumn("contract_end_date", col("contract_end_date").cast("date"))


In [7]:
# prompt: 2. Descriptive Statistics
# Contract Amounts: Calculate the average, median, and range of contract_amount to understand pricing trends.
# Contract Duration: Analyze the duration of contracts by calculating the difference between contract_end_date and contract_start_date.
# Property Types: Count the occurrences of each property type (ejari_property_type_en) to identify the most common types of properties rented.

from pyspark.sql.functions import avg, max, min, col, datediff, count

# Calculate average, median, and range of contract_amount
contract_amount_stats = rent_contracts_df.select(
    avg("contract_amount").alias("avg_contract_amount"),
    max("contract_amount").alias("max_contract_amount"),
    min("contract_amount").alias("min_contract_amount")
)

contract_amount_stats.show()

# Calculate contract duration
rent_contracts_df = rent_contracts_df.withColumn(
    "contract_duration", datediff(col("contract_end_date"), col("contract_start_date"))
)
rent_contracts_df.show(5)

# Count occurrences of each property type
property_type_counts = rent_contracts_df.groupBy("ejari_property_type_en").agg(
    count("*").alias("property_count")
)

property_type_counts.show()


+-------------------+-------------------+-------------------+
|avg_contract_amount|max_contract_amount|min_contract_amount|
+-------------------+-------------------+-------------------+
|  731661.5617702793|         4200000003|                  0|
+-------------------+-------------------+-------------------+

+-------------+--------------------+--------------------+--------------------+-------------------+-----------------+---------------+-------------+----------+-----------+------------+--------------------------+--------------------------+--------------------------+----------------------+----------------------+----------------------+--------------------------+--------------------------+--------------------------+-----------------+-----------------+--------------+--------------------+--------------------+--------------------+--------------------+-------+--------------------+--------------------+-----------+--------------------+--------------------+--------------------+----------------

In [8]:
# prompt: 3. Trends Over Time
# Contract Start and End Dates: Analyze the distribution of contract start dates to identify peak rental periods.
# Renewals vs. New Contracts: Compare the number of new contracts versus renewals to assess tenant retention and market stability.

from pyspark.sql.functions import month, year

# Analyze contract start date distribution
start_date_counts = rent_contracts_df.groupBy(month("contract_start_date"), year("contract_start_date")).count().orderBy(year("contract_start_date"),month("contract_start_date"))
start_date_counts.show(5)


# Analyze contract renewal vs. new contracts (assuming you have a column indicating this)
#  Replace "is_renewal" with the actual column name in your DataFrame.
# If there's no such column, you need to engineer it based on your data.
if "contract_reg_type_en" in rent_contracts_df.columns:
    reg_type_counts = rent_contracts_df.groupBy("contract_reg_type_en").count()
    reg_type_counts.show()
else:
    print("No 'contract_reg_type_en' column found. Please add a column to identify contract renewals.")


+--------------------------+-------------------------+-------+
|month(contract_start_date)|year(contract_start_date)|  count|
+--------------------------+-------------------------+-------+
|                      NULL|                     NULL|8559440|
+--------------------------+-------------------------+-------+

+--------------------+-------+
|contract_reg_type_en|  count|
+--------------------+-------+
|               Renew|4094747|
|                 New|4464693|
+--------------------+-------+



In [9]:
# prompt: 4. Property Usage Analysis
# Residential vs. Commercial: Compare the average contract amounts for residential and commercial properties to understand market dynamics.
# Property Subtypes: Analyze the distribution of property subtypes (e.g., 1 bed room, 2 bed rooms) to identify popular configurations.

# Analyze residential vs. commercial property contract amounts
residential_commercial_avg = rent_contracts_df.groupBy("ejari_property_type_en").agg(
    avg("contract_amount").alias("avg_contract_amount")
)
residential_commercial_avg.show()

# Analyze the distribution of property subtypes (assuming you have a 'property_subtype' column)
if "ejari_property_sub_type_en" in rent_contracts_df.columns:
    property_subtype_counts = rent_contracts_df.groupBy("ejari_property_sub_type_en").agg(
        count("*").alias("property_subtype_count")
    )
    property_subtype_counts.show()
else:
    print("No 'ejari_property_sub_type_en' column found. Please add a relevant column to your DataFrame.")


+----------------------+-------------------+
|ejari_property_type_en|avg_contract_amount|
+----------------------+-------------------+
|               Parking|  457364.4467979068|
|           Health club| 1563110.4343845372|
|                  Bank|  3337451.146067416|
|    Resturants Complex|  813715.2333333333|
|        Medical center| 1151954.0466101696|
|                  Farm|  637142.8571428572|
|                   Spa|  643067.9722222222|
|                   ATM|  61935.57249070632|
|     Complex Warehouse| 195434.60373216245|
|            Open space| 464621.38119911175|
|         Land Parking | 147998.73684210525|
|                  Hall| 387615.95454545453|
|                Office|  304415.0374840212|
|  Supermarket, a mu...|  3225602.027586207|
|                School|  1.0064177715625E7|
|           Supermarket| 2226240.6923076925|
|                  Flat|  590509.2390139375|
|        Complex Villas|  168764.4980364289|
|               Nursery|  915195.9939393939|
|      Hot

In [10]:
# 5. Geographic Insights
# Area Analysis: Group data by area_name_en to identify which areas have the highest number of contracts and average contract amounts.
# Proximity to Landmarks: Analyze how proximity to landmarks (e.g., malls, metro stations) affects rental prices.
# nearest_landmark_en, nearest_metro_en, nearest_mall_en
area_analysis = rent_contracts_df.groupBy("area_name_en").agg(
    count("*").alias("contract_count"),
    avg("contract_amount").alias("avg_contract_amount")
)
area_analysis.show()

proximity_analysis = rent_contracts_df.groupBy("nearest_landmark_en", "nearest_metro_en", "nearest_mall_en").agg(
    avg("contract_amount").alias("avg_contract_amount")
)
proximity_analysis.show()

+--------------------+--------------+-------------------+
|        area_name_en|contract_count|avg_contract_amount|
+--------------------+--------------+-------------------+
|    Um Hurair Second|         20521|  379538.1990156425|
|         Al Khabeesi|         53918| 104672.02577988798|
|        Al Rashidiya|         19632| 101951.47376731866|
|Al Barsha South F...|         20882|   107567.288430227|
|          Al Kheeran|         15352|  704012.8445153725|
|       Al Goze Third|        132369|  2099384.749782804|
|      Al Twar Second|           747|  301633.3614457831|
|             Al Ttay|         13335|  900357.6253468316|
|     Um Suqaim First|         13624|  631046.4059013505|
|  Nad Al Shiba First|         11799| 184870.52894313078|
|       Lehbab Second|           919| 46045.532100108816|
|      Zareeba Duviya|            37|   47297.2972972973|
|      Madinat Hind 2|            27| 252629.62962962964|
|       Al Yelayiss 1|         14109| 130574.46580197038|
|       Saih S

In [11]:
# prompt: 6. Tenant Insights
# Tenant Types: Analyze the distribution of tenant types to understand the demographics of renters.
# Contract Amounts by Tenant Type: Compare average contract amounts across different tenant types to identify potential market segments.

# Tenant Insights
# Tenant Types: Analyze the distribution of tenant types to understand the demographics of renters.
tenant_type_distribution = rent_contracts_df.groupBy("tenant_type_en").count().orderBy("count", ascending=False)
tenant_type_distribution.show()

# Contract Amounts by Tenant Type: Compare average contract amounts across different tenant types to identify potential market segments.
avg_contract_by_tenant_type = rent_contracts_df.groupBy("tenant_type_en").agg(avg("contract_amount").alias("avg_contract_amount"))
avg_contract_by_tenant_type.show()


+--------------+-------+
|tenant_type_en|  count|
+--------------+-------+
|        Person|4268131|
|     Authority|3480826|
|              | 810483|
+--------------+-------+

+--------------+-------------------+
|tenant_type_en|avg_contract_amount|
+--------------+-------------------+
|        Person| 208043.31137797787|
|     Authority|  1476358.804588911|
|              | 290825.37117003073|
+--------------+-------------------+



In [12]:
# prompt: 8. Predictive Analysis
# Price Prediction: Use regression analysis to predict contract amounts based on features such as property type, area, and contract duration.
# Churn Prediction: Analyze factors that may lead to tenant churn (e.g., contract renewals) to develop strategies for tenant retention.

from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.sql.functions import col

# Filter out rows where 'ejari_property_type_en' or 'area_name_en' is null or empty
cleaned_data = rent_contracts_df.filter(
    col('ejari_property_type_en').isNotNull() & (col('ejari_property_type_en') != "") &
    col('area_name_en').isNotNull() & (col('area_name_en') != "")
)

# Handle categorical columns (ejari_property_type_en, area_name_en) with StringIndexer and OneHotEncoder

area_indexer = StringIndexer(inputCol="area_name_en", outputCol="area_index")
property_type_indexer = StringIndexer(inputCol="ejari_property_type_en", outputCol="property_type_index")

area_encoder = OneHotEncoder(inputCol="area_index", outputCol="area_encoded")
property_type_encoder = OneHotEncoder(inputCol="property_type_index", outputCol="property_type_encoded")

# Assemble features
assembler = VectorAssembler(inputCols=["area_encoded", "property_type_encoded", "contract_duration", "contract_amount"], outputCol="features")

# StandardScaler (optional but can improve model performance)
scaler = StandardScaler(inputCol="features", outputCol="scaled_features")

# Regression model
lr = LinearRegression(featuresCol="scaled_features", labelCol="contract_amount")

# Pipeline
pipeline = Pipeline(stages=[area_indexer, property_type_indexer, area_encoder, property_type_encoder, assembler, scaler, lr])

# Split the data into training and testing sets
train_data, test_data = cleaned_data.randomSplit([0.8, 0.2], seed=42)

# Train the model
model = pipeline.fit(train_data)

# Make predictions
predictions = model.transform(test_data)

# Evaluate the model
evaluator = RegressionEvaluator(labelCol="contract_amount", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE) on test data = {rmse}")

Py4JJavaError: An error occurred while calling o165.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 3 in stage 39.0 failed 1 times, most recent failure: Lost task 3.0 in stage 39.0 (TID 66) (3240f3bf19f6 executor driver): org.apache.spark.SparkException: [FAILED_EXECUTE_UDF] Failed to execute user defined function (`VectorAssembler$$Lambda$4126/0x00000008417d4040`: (struct<area_encoded:struct<type:tinyint,size:int,indices:array<int>,values:array<double>>,property_type_encoded:struct<type:tinyint,size:int,indices:array<int>,values:array<double>>,contract_duration_double_VectorAssembler_ad93cbce6720:double,contract_amount_double_VectorAssembler_ad93cbce6720:double>) => struct<type:tinyint,size:int,indices:array<int>,values:array<double>>).
	at org.apache.spark.sql.errors.QueryExecutionErrors$.failedExecuteUserDefinedFunctionError(QueryExecutionErrors.scala:198)
	at org.apache.spark.sql.errors.QueryExecutionErrors.failedExecuteUserDefinedFunctionError(QueryExecutionErrors.scala)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
	at org.apache.spark.sql.execution.aggregate.ObjectHashAggregateExec.$anonfun$doExecute$1(ObjectHashAggregateExec.scala:92)
	at org.apache.spark.sql.execution.aggregate.ObjectHashAggregateExec.$anonfun$doExecute$1$adapted(ObjectHashAggregateExec.scala:90)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsWithIndexInternal$2(RDD.scala:880)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsWithIndexInternal$2$adapted(RDD.scala:880)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:104)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:54)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: org.apache.spark.SparkException: Encountered null while assembling a row with handleInvalid = "error". Consider
removing nulls from dataset or using handleInvalid = "keep" or "skip".
	at org.apache.spark.ml.feature.VectorAssembler$.$anonfun$assemble$1(VectorAssembler.scala:291)
	at org.apache.spark.ml.feature.VectorAssembler$.$anonfun$assemble$1$adapted(VectorAssembler.scala:260)
	at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
	at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.WrappedArray.foreach(WrappedArray.scala:38)
	at org.apache.spark.ml.feature.VectorAssembler$.assemble(VectorAssembler.scala:260)
	at org.apache.spark.ml.feature.VectorAssembler.$anonfun$transform$6(VectorAssembler.scala:143)
	... 26 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2792)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2791)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2791)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1247)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
Caused by: org.apache.spark.SparkException: [FAILED_EXECUTE_UDF] Failed to execute user defined function (`VectorAssembler$$Lambda$4126/0x00000008417d4040`: (struct<area_encoded:struct<type:tinyint,size:int,indices:array<int>,values:array<double>>,property_type_encoded:struct<type:tinyint,size:int,indices:array<int>,values:array<double>>,contract_duration_double_VectorAssembler_ad93cbce6720:double,contract_amount_double_VectorAssembler_ad93cbce6720:double>) => struct<type:tinyint,size:int,indices:array<int>,values:array<double>>).
	at org.apache.spark.sql.errors.QueryExecutionErrors$.failedExecuteUserDefinedFunctionError(QueryExecutionErrors.scala:198)
	at org.apache.spark.sql.errors.QueryExecutionErrors.failedExecuteUserDefinedFunctionError(QueryExecutionErrors.scala)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
	at org.apache.spark.sql.execution.aggregate.ObjectHashAggregateExec.$anonfun$doExecute$1(ObjectHashAggregateExec.scala:92)
	at org.apache.spark.sql.execution.aggregate.ObjectHashAggregateExec.$anonfun$doExecute$1$adapted(ObjectHashAggregateExec.scala:90)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsWithIndexInternal$2(RDD.scala:880)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsWithIndexInternal$2$adapted(RDD.scala:880)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:104)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:54)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: org.apache.spark.SparkException: Encountered null while assembling a row with handleInvalid = "error". Consider
removing nulls from dataset or using handleInvalid = "keep" or "skip".
	at org.apache.spark.ml.feature.VectorAssembler$.$anonfun$assemble$1(VectorAssembler.scala:291)
	at org.apache.spark.ml.feature.VectorAssembler$.$anonfun$assemble$1$adapted(VectorAssembler.scala:260)
	at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
	at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.WrappedArray.foreach(WrappedArray.scala:38)
	at org.apache.spark.ml.feature.VectorAssembler$.assemble(VectorAssembler.scala:260)
	at org.apache.spark.ml.feature.VectorAssembler.$anonfun$transform$6(VectorAssembler.scala:143)
	... 26 more
