In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

## Reading bronze data

In [0]:
# Load the raw data from the bronze container.
# Parquet files embed their own schema and have no header row, so the CSV-only
# reader options ("inferSchema", "header") have no effect here and are omitted.
df = (
    spark.read.format("parquet")
    .load("abfss://bronze@carsalesdeltalake.dfs.core.windows.net/raw")
)

## Transforming data
### Creating a new date column

In [0]:
# Preview only: Model_Category is the part of Model_ID before the first '-'.
# The derived column is displayed but not assigned back to df.
(
    df
    .withColumn('Model_Category', split(col('Model_ID'), '-').getItem(0))
    .display()
)

In [0]:
# Build a DateType column, Date_of_Sale, from the separate Year / Month / Day columns.
# The parts are joined as 'Year-Month-Day' and the resulting string is cast to a date.
#
# An explicit to_date(..., 'yyyy-MM-dd') was also tried (kept below for reference), but it
# did not yield the desired result because the Day field is not consistently formatted:
# some records hold a single digit while others hold two. The plain cast is therefore
# the more suitable option in this specific case.
df = df.withColumn(
    'Date_of_Sale',
    concat_ws('-', col('Year'), col('Month'), col('Day')).cast(DateType()),
)

# Alternative (not used):
# df.withColumn('Date_of_Sale', to_date(concat_ws('-', col('Year'), col('Month'), col('Day')), 'yyyy-MM-dd')).display()

## Filtering bad data before writing it to silver layer

In [0]:
# Keep only rows with a non-null Date_of_Sale
# (presumably nulls are rows whose Year/Month/Day could not be cast to a date - TODO confirm).
df = df.where(col('Date_of_Sale').isNotNull())

### Performing visualization

In [0]:
# Total revenue per branch for each year, listed with the latest year first and,
# within each year, the highest-revenue branch first.
# Note: `sum` here is pyspark.sql.functions.sum (brought in by the star import),
# not Python's built-in sum.
display(
    df.groupBy('Year', 'BranchName')
    .agg(sum('Revenue').alias('Total_Revenue'))
    .orderBy(col('Year').desc(), col('Total_Revenue').desc())
)

Databricks visualization. Run in Databricks to view.

## Writing data to silver layer

In [0]:
# Write the transformed DataFrame to the silver container as Parquet,
# replacing whatever already exists at the target path.
(
    df.write
    .format('parquet')
    .mode('overwrite')
    .save('abfss://silver@carsalesdeltalake.dfs.core.windows.net/carsales')
)