<a href="https://www.kaggle.com/code/gauravgurjar/rent-contracts-dubai?scriptVersionId=224066437" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Data Loading
---
Downloading the latest rent-contracts Parquet release, then installing the required libraries and starting a Spark session

In [None]:
%%bash
# Download today's rent-contracts Parquet file from the project's GitHub releases.
# Both the release tag and the file name embed the current date (YYYY-MM-DD).
# Returns wget's exit status, so a failed download still fails the cell.
download_latest_file() {
    local current_date base_url release file url
    current_date=$(date +%Y-%m-%d)
    base_url="https://github.com/ggurjar333/real-estate-analysis-dubai/releases/download"
    release="release-${current_date}"
    file="dld_rent_contracts_${current_date}.parquet"
    url="${base_url}/${release}/${file}"

    # -q keeps wget quiet, so a failure (e.g. no release published for today)
    # would otherwise produce no output at all; report it explicitly.
    wget -q "${url}" || {
        local status=$?
        echo "Failed to download ${url} (wget exit status ${status})" >&2
        return "${status}"
    }
}
download_latest_file

In [None]:
# Installing required packages
!pip install pyspark
!pip install findspark
!pip install pandas

import findspark
findspark.init()

import pandas as pd
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

# Start the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("Dubai Land Development")
    .getOrCreate()
)

In [None]:
from datetime import date

In [None]:
# The dataset file name embeds today's date in ISO format (YYYY-MM-DD).
parquet_path = f"dld_rent_contracts_{date.today()}.parquet"

# Load the contracts and print the inferred schema.
rent_contracts_df = spark.read.parquet(parquet_path)
rent_contracts_df.printSchema()

In [None]:
# Preview the first five rows of the loaded contracts.
rent_contracts_df.show(n=5)

In [None]:
from pyspark.sql.functions import col
# Cast the contract start and end columns to Spark's date type
# (a no-op for columns that already are dates).
for date_column in ("contract_start_date", "contract_end_date"):
    rent_contracts_df = rent_contracts_df.withColumn(
        date_column, col(date_column).cast("date")
    )


In [None]:
# prompt: 2. Descriptive Statistics
# Contract Amounts: Calculate the average, median, and range of contract_amount to understand pricing trends.
# Contract Duration: Analyze the duration of contracts by calculating the difference between contract_end_date and contract_start_date.
# Property Types: Count the occurrences of each property type (ejari_property_type_en) to identify the most common types of properties rented.

# NOTE: importing max/min from pyspark.sql.functions shadows Python's built-in
# max()/min() for the rest of the notebook.
from pyspark.sql.functions import avg, max, min, col, datediff, count, percentile_approx

# Summary statistics for contract_amount: mean, range (max/min) and median.
# The median comes from percentile_approx, so it is an approximate value.
# It is appended last so the positions of the existing columns do not change.
contract_amount_stats = rent_contracts_df.select(
    avg("contract_amount").alias("avg_contract_amount"),
    max("contract_amount").alias("max_contract_amount"),
    min("contract_amount").alias("min_contract_amount"),
    percentile_approx("contract_amount", 0.5).alias("median_contract_amount"),
)

contract_amount_stats.show()

# Contract duration in days: contract_end_date minus contract_start_date.
rent_contracts_df = rent_contracts_df.withColumn(
    "contract_duration", datediff(col("contract_end_date"), col("contract_start_date"))
)
rent_contracts_df.show(5)

# Number of contracts per property type.
property_type_counts = rent_contracts_df.groupBy("ejari_property_type_en").agg(
    count("*").alias("property_count")
)

property_type_counts.show()


In [None]:
# prompt: 3. Trends Over Time
# Contract Start and End Dates: Analyze the distribution of contract start dates to identify peak rental periods.
# Renewals vs. New Contracts: Compare the number of new contracts versus renewals to assess tenant retention and market stability.

from pyspark.sql.functions import month, year

# Count contracts by the month and year in which they start, ordered by year
# and then month.
start_month = month("contract_start_date")
start_year = year("contract_start_date")
start_date_counts = (
    rent_contracts_df
    .groupBy(start_month, start_year)
    .count()
    .orderBy(start_year, start_month)
)
start_date_counts.show(5)


# Count contracts per registration type (contract_reg_type_en), presumably the
# column that separates new contracts from renewals -- verify against the data.
# If the column is missing, print a message instead of failing.
if "contract_reg_type_en" not in rent_contracts_df.columns:
    print("No 'contract_reg_type_en' column found. Please add a column to identify contract renewals.")
else:
    reg_type_counts = rent_contracts_df.groupBy("contract_reg_type_en").count()
    reg_type_counts.show()


In [None]:
# prompt: 4. Property Usage Analysis
# Residential vs. Commercial: Compare the average contract amounts for residential and commercial properties to understand market dynamics.
# Property Subtypes: Analyze the distribution of property subtypes (e.g., 1 bed room, 2 bed rooms) to identify popular configurations.

# Average contract amount for each value of ejari_property_type_en.
# NOTE(review): the variable name suggests a residential/commercial split, but
# the grouping key is the property type -- confirm this is the intended column.
residential_commercial_avg = (
    rent_contracts_df
    .groupBy("ejari_property_type_en")
    .agg(avg("contract_amount").alias("avg_contract_amount"))
)
residential_commercial_avg.show()

# Number of contracts per property subtype; print a message if the column is absent.
if "ejari_property_sub_type_en" not in rent_contracts_df.columns:
    print("No 'ejari_property_sub_type_en' column found. Please add a relevant column to your DataFrame.")
else:
    property_subtype_counts = (
        rent_contracts_df
        .groupBy("ejari_property_sub_type_en")
        .agg(count("*").alias("property_subtype_count"))
    )
    property_subtype_counts.show()


In [None]:
# 5. Geographic Insights
# Area Analysis: Group data by area_name_en to identify which areas have the highest number of contracts and average contract amounts.
# Proximity to Landmarks: Analyze how proximity to landmarks (e.g., malls, metro stations) affects rental prices.
# nearest_landmark_en, nearest_metro_en, nearest_mall_en
# Per-area contract count and average contract amount.
area_analysis = rent_contracts_df.groupBy("area_name_en").agg(
    count("*").alias("contract_count"),
    avg("contract_amount").alias("avg_contract_amount")
)
# show() prints only the first 20 rows, and an unsorted aggregate has no
# defined row order. Sort at display time so the areas with the most contracts
# are the ones shown; area_analysis itself is left unsorted.
area_analysis.orderBy("contract_count", ascending=False).show()

# Average contract amount for each combination of nearest landmark, metro
# station and mall.
proximity_analysis = rent_contracts_df.groupBy("nearest_landmark_en", "nearest_metro_en", "nearest_mall_en").agg(
    avg("contract_amount").alias("avg_contract_amount")
)
# Same reasoning as above: list the most expensive combinations first.
proximity_analysis.orderBy("avg_contract_amount", ascending=False).show()

In [None]:
# prompt: 6. Tenant Insights
# Tenant Types: Analyze the distribution of tenant types to understand the demographics of renters.
# Contract Amounts by Tenant Type: Compare average contract amounts across different tenant types to identify potential market segments.

# Tenant insights.
# Number of contracts per tenant type, most frequent first.
tenant_type_distribution = (
    rent_contracts_df
    .groupBy("tenant_type_en")
    .count()
    .orderBy("count", ascending=False)
)
tenant_type_distribution.show()

# Average contract amount per tenant type.
avg_contract_by_tenant_type = (
    rent_contracts_df
    .groupBy("tenant_type_en")
    .agg(avg("contract_amount").alias("avg_contract_amount"))
)
avg_contract_by_tenant_type.show()


In [None]:
# prompt: 8. Predictive Analysis
# Price Prediction: Use regression analysis to predict contract amounts based on features such as property type, area, and contract duration.
# Churn Prediction: Analyze factors that may lead to tenant churn (e.g., contract renewals) to develop strategies for tenant retention.

from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.sql.functions import col

# Keep only rows usable for training: non-empty categorical features, plus a
# non-null label and duration (VectorAssembler raises on null inputs by default).
cleaned_data = rent_contracts_df.filter(
    col('ejari_property_type_en').isNotNull() & (col('ejari_property_type_en') != "") &
    col('area_name_en').isNotNull() & (col('area_name_en') != "") &
    col('contract_amount').isNotNull() &
    col('contract_duration').isNotNull()
)

# Index and one-hot encode the categorical columns.
# handleInvalid="keep" puts categories not seen during fitting into an extra
# bucket, so a value that appears only in the test split does not make
# transform() raise.
area_indexer = StringIndexer(inputCol="area_name_en", outputCol="area_index", handleInvalid="keep")
property_type_indexer = StringIndexer(
    inputCol="ejari_property_type_en", outputCol="property_type_index", handleInvalid="keep"
)

area_encoder = OneHotEncoder(inputCol="area_index", outputCol="area_encoded")
property_type_encoder = OneHotEncoder(inputCol="property_type_index", outputCol="property_type_encoded")

# Assemble the feature vector. The label (contract_amount) must NOT be among
# the inputs: feeding the target to the model is leakage and makes the
# reported error meaningless.
assembler = VectorAssembler(
    inputCols=["area_encoded", "property_type_encoded", "contract_duration"],
    outputCol="features",
)

# Scale the assembled features before they are passed to the regression.
scaler = StandardScaler(inputCol="features", outputCol="scaled_features")

# Regression model predicting contract_amount from the scaled features.
lr = LinearRegression(featuresCol="scaled_features", labelCol="contract_amount")

# Pipeline: index -> encode -> assemble -> scale -> fit regression.
pipeline = Pipeline(stages=[area_indexer, property_type_indexer, area_encoder, property_type_encoder, assembler, scaler, lr])

# 80/20 train/test split with a fixed seed.
train_data, test_data = cleaned_data.randomSplit([0.8, 0.2], seed=42)

# Train the model
model = pipeline.fit(train_data)

# Make predictions on the held-out split
predictions = model.transform(test_data)

# Evaluate the model with RMSE against the true contract_amount
evaluator = RegressionEvaluator(labelCol="contract_amount", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE) on test data = {rmse}")