In [0]:
from pyspark.sql.types import (
    StructType,
    StructField,
    StringType,
    DoubleType,
    DateType
)

# Path of the CSV file to ingest; printed so it shows up in the cell output.
filename = '/Volumes/caio_moreno/uk_house_advisor_bot/files/pp-complete.csv'
print(filename)

# Explicit schema, declared one field at a time in file column order.
# Every field is nullable; only price (double) and date_of_transfer (date)
# are non-string.
schema = (
    StructType()
    .add("transaction_id", StringType(), True)
    .add("price", DoubleType(), True)
    .add("date_of_transfer", DateType(), True)
    .add("postcode", StringType(), True)
    .add("property_type", StringType(), True)
    .add("old_new", StringType(), True)
    .add("duration", StringType(), True)
    .add("paon", StringType(), True)
    .add("saon", StringType(), True)
    .add("street", StringType(), True)
    .add("locality", StringType(), True)
    .add("town_city", StringType(), True)
    .add("district", StringType(), True)
    .add("county", StringType(), True)
    .add("ppd_category_type", StringType(), True)
    .add("record_status", StringType(), True)
)

# header=False: the first line is treated as data, so columns are bound to
# the schema purely by position.
df_silver = spark.read.csv(filename, schema=schema, header=False)

display(df_silver)

In [0]:
from pyspark.sql.functions import when, col

# Add a human-readable label for the old/new flag.
# HM Land Registry defines the codes as: "Y" = a newly built property,
# "N" = an established (existing) building -- note that "N" does NOT stand
# for "new". The label literals are plain ASCII so no stray/invisible
# characters end up in the stored descriptions.
df_silver = df_silver.withColumn(
    "old_new_desc",
    when(col("old_new") == "Y", "Newly built property")
    .when(col("old_new") == "N", "Existing property (not a new build)")
    .otherwise(None)  # any other code, or a null flag, gets no description
)

display(df_silver)

In [0]:
from pyspark.sql.functions import when, col

# Translate the single-letter duration code into a label.
# Codes other than "F"/"L" (and nulls) are left as null.
duration_code = col("duration")
duration_label = (
    when(duration_code == "F", "Freehold")
    .when(duration_code == "L", "Leasehold")
    .otherwise(None)
)
df_silver = df_silver.withColumn("duration_desc", duration_label)

display(df_silver)

In [0]:
from pyspark.sql.functions import when, col

# Translate the single-letter property_type code into a label.
# Codes outside D/F/O/S/T (and nulls) are left as null.
property_type_code = col("property_type")
property_type_label = (
    when(property_type_code == "D", "Detached")
    .when(property_type_code == "F", "Flat/Maisonette")
    .when(property_type_code == "O", 'Other (non-residential or "other" property type)')
    .when(property_type_code == "S", "Semi-Detached")
    .when(property_type_code == "T", "Terraced")
    .otherwise(None)
)
df_silver = df_silver.withColumn("property_type_desc", property_type_label)

display(df_silver)

In [0]:
from pyspark.sql.functions import when, col

# Translate the ppd_category_type code into a label.
# Codes other than "A"/"B" (and nulls) are left as null.
ppd_category_code = col("ppd_category_type")
ppd_category_label = (
    when(ppd_category_code == "A", "Standard Price Paid entry")
    .when(ppd_category_code == "B", "Additional Price Paid entries")
    .otherwise(None)
)
df_silver = df_silver.withColumn("ppd_category_type_desc", ppd_category_label)

display(df_silver)

In [0]:
from pyspark.sql.functions import when, col

# Translate the record_status code into a label.
# Codes other than "A"/"C"/"D" (and nulls) are left as null.
record_status_code = col("record_status")
record_status_label = (
    when(record_status_code == "A", "Addition")
    .when(record_status_code == "C", "Change")
    .when(record_status_code == "D", "Deletion")
    .otherwise(None)
)
df_silver = df_silver.withColumn("record_status_desc", record_status_label)

display(df_silver)

In [0]:
# Persist the enriched DataFrame as a Delta table.
# mode("overwrite") replaces whatever data the table currently holds.
(
    df_silver.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("caio_moreno.uk_house_advisor_bot.silver_pp_complete")
)

In [0]:
# Query every column of the silver table through Spark SQL and render it.
sql_df = spark.sql(
    "SELECT * FROM caio_moreno.uk_house_advisor_bot.silver_pp_complete"
)
display(sql_df)

In [0]:
%sql
-- List the distinct ppd_category_type values present in the silver table.
SELECT DISTINCT ppd_category_type
FROM caio_moreno.uk_house_advisor_bot.silver_pp_complete

In [0]:
%sql
-- List the distinct duration values present in the silver table.
SELECT DISTINCT duration
FROM caio_moreno.uk_house_advisor_bot.silver_pp_complete

In [0]:
from pyspark.sql.functions import col

# Row count per duration code, keeping only the codes "F" and "L".
df = spark.table("caio_moreno.uk_house_advisor_bot.silver_pp_complete")
count_df = (
    df.where(col("duration").isin(["F", "L"]))
    .groupBy("duration")
    .count()
)
display(count_df)

Databricks visualization. Run in Databricks to view.

In [0]:
from pyspark.sql.functions import col

# Row count per old_new code, keeping only the codes "N" and "Y".
df = spark.table("caio_moreno.uk_house_advisor_bot.silver_pp_complete")
count_df = (
    df.where(col("old_new").isin(["N", "Y"]))
    .groupBy("old_new")
    .count()
)
display(count_df)

Databricks visualization. Run in Databricks to view.