In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

### Product Data

In [0]:
# Load the Product table (Delta format) from the abfss `silver` path.
# NOTE(review): "header" / "inferSchema" look like CSV reader options --
# confirm whether the Delta reader actually uses them.
product_df = (
    spark.read
    .format("delta")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(f"abfss://silver@azureprojectdatalakegen2.dfs.core.windows.net/Product")
)

In [0]:
# Inspect the loaded Product data: print its schema, then render the rows
# through the DataFrame's display() method.
product_df.printSchema()
product_df.display()

root
 |-- ProductKey: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- StandardCost: double (nullable = true)
 |-- Color: string (nullable = true)
 |-- Subcategory: string (nullable = true)
 |-- Category: string (nullable = true)



ProductKey,Product,StandardCost,Color,Subcategory,Category
210,"HL Road Frame - Black, 58",868.63,Black,Road Frames,Components
215,"Sport-100 Helmet, Black",12.03,Black,Helmets,Accessories
216,"Sport-100 Helmet, Black",13.88,Black,Helmets,Accessories
217,"Sport-100 Helmet, Black",13.09,Black,Helmets,Accessories
253,"LL Road Frame - Black, 58",176.2,Black,Road Frames,Components
254,"LL Road Frame - Black, 58",170.14,Black,Road Frames,Components
255,"LL Road Frame - Black, 58",204.63,Black,Road Frames,Components
256,"LL Road Frame - Black, 60",176.2,Black,Road Frames,Components
257,"LL Road Frame - Black, 60",170.14,Black,Road Frames,Components
258,"LL Road Frame - Black, 60",204.63,Black,Road Frames,Components


In [0]:
# Drop the two formatting columns. Column *names* are passed (not Column
# objects) because DataFrame.drop() only accepts several Column arguments on
# newer PySpark releases; the string form works on all of them.
product_df = product_df.drop('BackgroundColorFormat', 'FontColorFormat')

# Strip "$" and "," characters from StandardCost and cast the result to double.
product_df = product_df.withColumn(
    'StandardCost',
    regexp_replace('StandardCost', "[$,]", "").cast(DoubleType()),
)

# Show the resulting schema and rows for a visual check.
product_df.printSchema()
product_df.display()

root
 |-- ProductKey: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- StandardCost: double (nullable = true)
 |-- Color: string (nullable = true)
 |-- Subcategory: string (nullable = true)
 |-- Category: string (nullable = true)



ProductKey,Product,StandardCost,Color,Subcategory,Category
210,"HL Road Frame - Black, 58",868.63,Black,Road Frames,Components
215,"Sport-100 Helmet, Black",12.03,Black,Helmets,Accessories
216,"Sport-100 Helmet, Black",13.88,Black,Helmets,Accessories
217,"Sport-100 Helmet, Black",13.09,Black,Helmets,Accessories
253,"LL Road Frame - Black, 58",176.2,Black,Road Frames,Components
254,"LL Road Frame - Black, 58",170.14,Black,Road Frames,Components
255,"LL Road Frame - Black, 58",204.63,Black,Road Frames,Components
256,"LL Road Frame - Black, 60",176.2,Black,Road Frames,Components
257,"LL Road Frame - Black, 60",170.14,Black,Road Frames,Components
258,"LL Road Frame - Black, 60",204.63,Black,Road Frames,Components


In [0]:
# Persist the Product DataFrame as Delta at the silver Product path,
# overwriting existing data; "overwriteSchema" is set so the stored schema
# may be replaced as well.
(
    product_df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .option("path", f"abfss://silver@azureprojectdatalakegen2.dfs.core.windows.net/Product")
    .save()
)

### Region Data

In [0]:
# Load the Region table (Delta format) from the abfss `silver` path.
# NOTE(review): "header" / "inferSchema" look like CSV reader options --
# confirm whether the Delta reader actually uses them.
region_df = (
    spark.read
    .format("delta")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(f"abfss://silver@azureprojectdatalakegen2.dfs.core.windows.net/Region")
)

In [0]:
# Write the Region DataFrame back as Delta to the silver Region path,
# overwriting whatever is stored there.
(
    region_df.write
    .format("delta")
    .mode("overwrite")
    .option("path", f"abfss://silver@azureprojectdatalakegen2.dfs.core.windows.net/Region")
    .save()
)

### Reseller Data

In [0]:
# Load the Reseller table (Delta format) from the abfss `silver` path.
# NOTE(review): "header" / "inferSchema" look like CSV reader options --
# confirm whether the Delta reader actually uses them.
reseller_df = (
    spark.read
    .format("delta")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(f"abfss://silver@azureprojectdatalakegen2.dfs.core.windows.net/Reseller")
)

In [0]:
# Write the Reseller DataFrame back as Delta to the silver Reseller path,
# overwriting whatever is stored there.
(
    reseller_df.write
    .format("delta")
    .mode("overwrite")
    .option("path", f"abfss://silver@azureprojectdatalakegen2.dfs.core.windows.net/Reseller")
    .save()
)

### Sales Data

In [0]:
# Load the Sales table (Delta format) from the abfss `silver` path.
# NOTE(review): "header" / "inferSchema" look like CSV reader options --
# confirm whether the Delta reader actually uses them.
sales_df = (
    spark.read
    .format("delta")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(f"abfss://silver@azureprojectdatalakegen2.dfs.core.windows.net/Sales")
)

In [0]:
# Inspect the loaded Sales data: print its schema, then render the rows
# through the DataFrame's display() method.
sales_df.printSchema()
sales_df.display()

root
 |-- SalesOrderNumber: string (nullable = true)
 |-- OrderDate: date (nullable = true)
 |-- ProductKey: integer (nullable = true)
 |-- ResellerKey: integer (nullable = true)
 |-- EmployeeKey: integer (nullable = true)
 |-- SalesTerritoryKey: integer (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- Sales: double (nullable = true)
 |-- Cost: double (nullable = true)



SalesOrderNumber,OrderDate,ProductKey,ResellerKey,EmployeeKey,SalesTerritoryKey,Quantity,UnitPrice,Sales,Cost
SO43897,2017-08-25,235,312,282,4,2,28.84,57.68,63.45
SO43897,2017-08-25,351,312,282,4,2,2024.99,4049.98,3796.19
SO43897,2017-08-25,348,312,282,4,2,2024.99,4049.98,3796.19
SO43897,2017-08-25,232,312,282,4,2,28.84,57.68,63.45
SO44544,2017-11-18,292,312,282,4,2,818.7,1637.4,1413.62
SO44544,2017-11-18,220,312,282,4,2,20.19,40.38,24.06
SO44544,2017-11-18,351,312,282,4,2,2024.99,4049.98,3796.19
SO44544,2017-11-18,349,312,282,4,2,2024.99,4049.98,3796.19
SO44544,2017-11-18,344,312,282,4,2,2039.99,4079.98,3824.31
SO45321,2018-02-18,346,312,282,4,2,2039.99,4079.98,3824.31


In [0]:
# Clean the Sales columns in a single chained expression:
#   * UnitPrice / Sales / Cost -- remove "$" and "," characters, cast to double
#   * OrderDate               -- run through to_date with the dd/MM/yyyy pattern
sales_df = (
    sales_df
    .withColumn('UnitPrice', regexp_replace('UnitPrice', "[$,]", "").cast(DoubleType()))
    .withColumn('Sales', regexp_replace('Sales', "[$,]", "").cast(DoubleType()))
    .withColumn('Cost', regexp_replace('Cost', "[$,]", "").cast(DoubleType()))
    .withColumn('OrderDate', to_date(col('OrderDate'), 'dd/MM/yyyy'))
)

# Show the resulting schema and rows for a visual check.
sales_df.printSchema()
sales_df.display()

root
 |-- SalesOrderNumber: string (nullable = true)
 |-- OrderDate: date (nullable = true)
 |-- ProductKey: integer (nullable = true)
 |-- ResellerKey: integer (nullable = true)
 |-- EmployeeKey: integer (nullable = true)
 |-- SalesTerritoryKey: integer (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- Sales: double (nullable = true)
 |-- Cost: double (nullable = true)



SalesOrderNumber,OrderDate,ProductKey,ResellerKey,EmployeeKey,SalesTerritoryKey,Quantity,UnitPrice,Sales,Cost
SO43897,2017-08-25,235,312,282,4,2,28.84,57.68,63.45
SO43897,2017-08-25,351,312,282,4,2,2024.99,4049.98,3796.19
SO43897,2017-08-25,348,312,282,4,2,2024.99,4049.98,3796.19
SO43897,2017-08-25,232,312,282,4,2,28.84,57.68,63.45
SO44544,2017-11-18,292,312,282,4,2,818.7,1637.4,1413.62
SO44544,2017-11-18,220,312,282,4,2,20.19,40.38,24.06
SO44544,2017-11-18,351,312,282,4,2,2024.99,4049.98,3796.19
SO44544,2017-11-18,349,312,282,4,2,2024.99,4049.98,3796.19
SO44544,2017-11-18,344,312,282,4,2,2039.99,4079.98,3824.31
SO45321,2018-02-18,346,312,282,4,2,2039.99,4079.98,3824.31


In [0]:
# Persist the Sales DataFrame as Delta at the silver Sales path,
# overwriting existing data; "overwriteSchema" is set so the stored schema
# may be replaced as well.
(
    sales_df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .option("path", f"abfss://silver@azureprojectdatalakegen2.dfs.core.windows.net/Sales")
    .save()
)

### Salesperson Data

In [0]:
# Load the Salesperson table (Delta format) from the abfss `silver` path.
# NOTE(review): "header" / "inferSchema" look like CSV reader options --
# confirm whether the Delta reader actually uses them.
salesperson_df = (
    spark.read
    .format("delta")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(f"abfss://silver@azureprojectdatalakegen2.dfs.core.windows.net/Salesperson")
)

In [0]:
# Write the Salesperson DataFrame back as Delta to the silver Salesperson
# path, overwriting whatever is stored there.
(
    salesperson_df.write
    .format("delta")
    .mode("overwrite")
    .option("path", f"abfss://silver@azureprojectdatalakegen2.dfs.core.windows.net/Salesperson")
    .save()
)

### SalespersonRegion Data

In [0]:
# Load the SalespersonRegion table (Delta format) from the abfss `silver` path.
# NOTE(review): "header" / "inferSchema" look like CSV reader options --
# confirm whether the Delta reader actually uses them.
salespersonregion_df = (
    spark.read
    .format("delta")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(f"abfss://silver@azureprojectdatalakegen2.dfs.core.windows.net/SalespersonRegion")
)

In [0]:
# Write the SalespersonRegion DataFrame back as Delta to its own silver path,
# overwriting whatever is stored there. This cell previously re-wrote
# salesperson_df to the Salesperson path (copy-paste slip), so the
# SalespersonRegion data was never persisted.
(
    salespersonregion_df.write
    .format("delta")
    .mode("overwrite")
    .option("path", "abfss://silver@azureprojectdatalakegen2.dfs.core.windows.net/SalespersonRegion")
    .save()
)

### Targets Data

In [0]:
# Load the Targets table (Delta format) from the abfss `silver` path.
# NOTE(review): "header" / "inferSchema" look like CSV reader options --
# confirm whether the Delta reader actually uses them.
targets_df = (
    spark.read
    .format("delta")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(f"abfss://silver@azureprojectdatalakegen2.dfs.core.windows.net/Targets")
)

In [0]:
# Inspect the loaded Targets data: print its schema, then render the rows
# through the DataFrame's display() method.
targets_df.printSchema()
targets_df.display()

root
 |-- EmployeeID: integer (nullable = true)
 |-- Target: double (nullable = true)
 |-- TargetDate: string (nullable = true)



EmployeeID,Target,TargetDate
90836195,500000.0,01-12-2017
112432117,500000.0,01-07-2017
139397894,500000.0,01-12-2017
191644724,500000.0,01-09-2017
502097814,500000.0,01-07-2017
716374314,500000.0,01-12-2017
841560125,500000.0,01-08-2017
987554265,500000.0,01-12-2017
61161660,500000.0,01-02-2018
90836195,500000.0,01-05-2018


In [0]:
# Tidy the Targets columns in one chained expression:
#   * rename TargetMonth to TargetDate
#   * remove "$" and "," characters from Target and cast it to double
targets_df = (
    targets_df
    .withColumnRenamed('TargetMonth', 'TargetDate')
    .withColumn('Target', regexp_replace('Target', "[$,]", "").cast(DoubleType()))
)

# Render the result for a visual check.
targets_df.display()

EmployeeID,Target,TargetDate
90836195,500000.0,01-12-2017
112432117,500000.0,01-07-2017
139397894,500000.0,01-12-2017
191644724,500000.0,01-09-2017
502097814,500000.0,01-07-2017
716374314,500000.0,01-12-2017
841560125,500000.0,01-08-2017
987554265,500000.0,01-12-2017
61161660,500000.0,01-02-2018
90836195,500000.0,01-05-2018


In [0]:
# Persist the Targets DataFrame as Delta at the silver Targets path,
# overwriting existing data; "overwriteSchema" is set so the stored schema
# may be replaced as well.
(
    targets_df.write
    .format("delta")
    .mode("overwrite")
    .option('overwriteSchema', 'true')
    .option("path", f"abfss://silver@azureprojectdatalakegen2.dfs.core.windows.net/Targets")
    .save()
)