In [0]:
from pyspark.sql.functions import sum, col, when

In [0]:
def _mount_container(container, mount_point):
    """Mount one blob container of the azureeastus2st storage account on DBFS.

    Args:
        container: Name of the blob container (e.g. "raw-data").
        mount_point: DBFS path to mount it on (e.g. "/mnt/raw-data").

    Returns:
        False when `mount_point` is already mounted (nothing is done),
        otherwise the result of `dbutils.fs.mount`.
    """
    # dbutils.fs.mount raises when the mount point is already in use, so check
    # the existing mounts first to keep this cell safe to re-run.
    if any(existing.mountPoint == mount_point for existing in dbutils.fs.mounts()):
        return False
    return dbutils.fs.mount(
        source = f"wasbs://{container}@azureeastus2st.blob.core.windows.net/",
        mount_point = mount_point,
        # The storage account key is read from the Databricks secret scope
        # rather than being written into the notebook.
        extra_configs = {"fs.azure.account.key.azureeastus2st.blob.core.windows.net" : dbutils.secrets.get(
            "databricksScope", "qarsme"
        )}
    )


# The notebook reads its inputs from /mnt/raw-data and writes its results to
# /mnt/transformed-data, so both mounts must exist before the cells below run.
_mount_container("raw-data", "/mnt/raw-data")
_mount_container("transformed-data", "/mnt/transformed-data")

True

In [0]:
# List the files available under the raw-data mount (the notebook's inputs).
dbutils.fs.ls("/mnt/raw-data")

[FileInfo(path='dbfs:/mnt/raw-data/accounts.csv', name='accounts.csv', size=4670, modificationTime=1764545467000),
 FileInfo(path='dbfs:/mnt/raw-data/data_dictionary.csv', name='data_dictionary.csv', size=996, modificationTime=1764545483000),
 FileInfo(path='dbfs:/mnt/raw-data/products.csv', name='products.csv', size=171, modificationTime=1764545545000),
 FileInfo(path='dbfs:/mnt/raw-data/sales_pipeline.csv', name='sales_pipeline.csv', size=637773, modificationTime=1764545568000),
 FileInfo(path='dbfs:/mnt/raw-data/sales_teams.csv', name='sales_teams.csv', size=1284, modificationTime=1764545585000)]

In [0]:
# List what is currently stored under the transformed-data mount
# (the location the final cell of this notebook writes to).
dbutils.fs.ls("/mnt/transformed-data")

[]

In [0]:
def _read_csv_with_header(path):
    """Load one CSV file through `spark.read`, using its header row and schema inference."""
    reader = spark.read.format("csv")
    reader = reader.option("header", "true")
    reader = reader.option("inferSchema", "true")
    return reader.load(path)


# One DataFrame per file on the raw-data mount.
accounts_df = _read_csv_with_header("/mnt/raw-data/accounts.csv")
data_dictionary_df = _read_csv_with_header("/mnt/raw-data/data_dictionary.csv")
products_df = _read_csv_with_header("/mnt/raw-data/products.csv")
sales_pipeline_df = _read_csv_with_header("/mnt/raw-data/sales_pipeline.csv")
sales_teams_df = _read_csv_with_header("/mnt/raw-data/sales_teams.csv")

# Quick sanity check: print the first two account rows.
accounts_df.show(2)

+----------------+---------+----------------+-------+---------+---------------+-------------+
|         account|   sector|year_established|revenue|employees|office_location|subsidiary_of|
+----------------+---------+----------------+-------+---------+---------------+-------------+
|Acme Corporation|technolgy|            1996|1100.04|     2822|  United States|         NULL|
|      Betasoloin|  medical|            1999| 251.41|      495|  United States|         NULL|
+----------------+---------+----------------+-------+---------+---------------+-------------+
only showing top 2 rows


In [0]:
# Print the column names of every loaded DataFrame, one list per line,
# in the same order the frames were loaded.
for frame in (accounts_df, data_dictionary_df, products_df, sales_pipeline_df, sales_teams_df):
    print(frame.columns)

['account', 'sector', 'year_established', 'revenue', 'employees', 'office_location', 'subsidiary_of']
['Table', 'Field', 'Description']
['product', 'series', 'sales_price']
['opportunity_id', 'sales_agent', 'product', 'account', 'deal_stage', 'engage_date', 'close_date', 'close_value']
['sales_agent', 'manager', 'regional_office']


In [0]:
# accounts: rename the subsidiary_of column to parent_company.
accounts_df = accounts_df.withColumnRenamed("subsidiary_of", "parent_company")

# data_dictionary: switch the capitalised headers to lower case, one rename per pair.
for old_name, new_name in (("Table", "table"), ("Field", "field"), ("Description", "description")):
    data_dictionary_df = data_dictionary_df.withColumnRenamed(old_name, new_name)

In [0]:
# Print the first row of each renamed frame to confirm the new column names.
accounts_df.show(n=1)
data_dictionary_df.show(n=1)

+----------------+---------+----------------+-------+---------+---------------+--------------+
|         account|   sector|year_established|revenue|employees|office_location|parent_company|
+----------------+---------+----------------+-------+---------+---------------+--------------+
|Acme Corporation|technolgy|            1996|1100.04|     2822|  United States|          NULL|
+----------------+---------+----------------+-------+---------+---------------+--------------+
only showing top 1 row
+--------+-------+------------+
|   table|  field| description|
+--------+-------+------------+
|accounts|account|Company name|
+--------+-------+------------+
only showing top 1 row


In [0]:
def _null_counts(df):
    """Return a one-row DataFrame with, for each column of `df`, its number of null values.

    Every output column keeps the name of the input column it counts.
    """
    # `sum`, `when` and `col` are the pyspark.sql.functions imported at the top
    # of the notebook, so `sum` here is the Spark aggregate.
    per_column = [
        sum(when(col(name).isNull(), 1).otherwise(0)).alias(name)
        for name in df.columns
    ]
    return df.select(per_column)


null_accounts_df = _null_counts(accounts_df)
null_data_dictionary_df = _null_counts(data_dictionary_df)
null_products_df = _null_counts(products_df)
null_sales_pipeline_df = _null_counts(sales_pipeline_df)
null_sales_teams_df = _null_counts(sales_teams_df)

# Render the five null-count summaries in load order.
for null_report in (
    null_accounts_df,
    null_data_dictionary_df,
    null_products_df,
    null_sales_pipeline_df,
    null_sales_teams_df,
):
    display(null_report)

account,sector,year_established,revenue,employees,office_location,subsidiary_of
0,0,0,0,0,0,70


Table,Field,Description
0,0,0


product,series,sales_price
0,0,0


opportunity_id,sales_agent,product,account,deal_stage,engage_date,close_date,close_value
0,0,0,1425,0,500,2089,2089


sales_agent,manager,regional_office
0,0,0


In [0]:
# Render the two frames that contain nulls (sales pipeline first, then accounts)
# so the missing values can be inspected before they are filled.
for frame in (sales_pipeline_df, accounts_df):
    frame.display()

opportunity_id,sales_agent,product,account,deal_stage,engage_date,close_date,close_value
1C1I7A6R,Moses Frase,GTX Plus Basic,Cancity,Won,2016-10-20,2017-03-01,1054.0
Z063OYW0,Darcel Schlecht,GTXPro,Isdom,Won,2016-10-25,2017-03-11,4514.0
EC4QE1BX,Darcel Schlecht,MG Special,Cancity,Won,2016-10-25,2017-03-07,50.0
MV1LWRNH,Moses Frase,GTX Basic,Codehow,Won,2016-10-25,2017-03-09,588.0
PE84CX4O,Zane Levy,GTX Basic,Hatfan,Won,2016-10-25,2017-03-02,517.0
ZNBS69V1,Anna Snelling,MG Special,Ron-tech,Won,2016-10-29,2017-03-01,49.0
9ME3374G,Vicki Laflamme,MG Special,J-Texon,Won,2016-10-30,2017-03-02,57.0
7GN8Q4LL,Markita Hansen,GTX Basic,Cheers,Won,2016-11-01,2017-03-07,601.0
OLK9LKZB,Niesha Huffines,GTX Plus Basic,Zumgoity,Won,2016-11-01,2017-03-03,1026.0
HAXMC4IX,James Ascencio,MG Advanced,,Engaging,2016-11-03,,


account,sector,year_established,revenue,employees,office_location,subsidiary_of
Acme Corporation,technolgy,1996,1100.04,2822,United States,
Betasoloin,medical,1999,251.41,495,United States,
Betatech,medical,1986,647.18,1185,Kenya,
Bioholding,medical,2012,587.34,1356,Philipines,
Bioplex,medical,1991,326.82,1016,United States,
Blackzim,retail,2009,497.11,1588,United States,
Bluth Company,technolgy,1993,1242.32,3027,United States,Acme Corporation
Bubba Gump,software,2002,987.39,2253,United States,
Cancity,retail,2001,718.62,2448,United States,
Cheers,entertainment,1993,4269.9,6472,United States,Massive Dynamic


In [0]:
# Placeholder values for the null cells, keyed by column name.
accounts_fill_values = {"parent_company" : "independent"}
sales_pipeline_fill_values = {"account": "unknown"}

# Only the listed columns are filled; every other column keeps its nulls.
accounts_df = accounts_df.na.fill(accounts_fill_values)
sales_pipeline_df = sales_pipeline_df.na.fill(sales_pipeline_fill_values)



In [0]:
# Render both frames again (accounts first, then sales pipeline) to check
# the placeholder values written by the fill step.
for frame in (accounts_df, sales_pipeline_df):
    frame.display()

account,sector,year_established,revenue,employees,office_location,parent_company
Acme Corporation,technolgy,1996,1100.04,2822,United States,independent
Betasoloin,medical,1999,251.41,495,United States,independent
Betatech,medical,1986,647.18,1185,Kenya,independent
Bioholding,medical,2012,587.34,1356,Philipines,independent
Bioplex,medical,1991,326.82,1016,United States,independent
Blackzim,retail,2009,497.11,1588,United States,independent
Bluth Company,technolgy,1993,1242.32,3027,United States,Acme Corporation
Bubba Gump,software,2002,987.39,2253,United States,independent
Cancity,retail,2001,718.62,2448,United States,independent
Cheers,entertainment,1993,4269.9,6472,United States,Massive Dynamic


opportunity_id,sales_agent,product,account,deal_stage,engage_date,close_date,close_value
1C1I7A6R,Moses Frase,GTX Plus Basic,Cancity,Won,2016-10-20,2017-03-01,1054.0
Z063OYW0,Darcel Schlecht,GTXPro,Isdom,Won,2016-10-25,2017-03-11,4514.0
EC4QE1BX,Darcel Schlecht,MG Special,Cancity,Won,2016-10-25,2017-03-07,50.0
MV1LWRNH,Moses Frase,GTX Basic,Codehow,Won,2016-10-25,2017-03-09,588.0
PE84CX4O,Zane Levy,GTX Basic,Hatfan,Won,2016-10-25,2017-03-02,517.0
ZNBS69V1,Anna Snelling,MG Special,Ron-tech,Won,2016-10-29,2017-03-01,49.0
9ME3374G,Vicki Laflamme,MG Special,J-Texon,Won,2016-10-30,2017-03-02,57.0
7GN8Q4LL,Markita Hansen,GTX Basic,Cheers,Won,2016-11-01,2017-03-07,601.0
OLK9LKZB,Niesha Huffines,GTX Plus Basic,Zumgoity,Won,2016-11-01,2017-03-03,1026.0
HAXMC4IX,James Ascencio,MG Advanced,unknown,Engaging,2016-11-03,,


In [0]:
# Recount the nulls per column of accounts_df: build one aggregate expression
# per column, then evaluate them all in a single select.
accounts_null_exprs = []
for name in accounts_df.columns:
    null_flag = when(col(name).isNull(), 1).otherwise(0)
    accounts_null_exprs.append(sum(null_flag).alias(name))

null_accounts_df = accounts_df.select(accounts_null_exprs)
null_accounts_df.show()

+-------+------+----------------+-------+---------+---------------+--------------+
|account|sector|year_established|revenue|employees|office_location|parent_company|
+-------+------+----------------+-------+---------+---------------+--------------+
|      0|     0|               0|      0|        0|              0|             0|
+-------+------+----------------+-------+---------+---------------+--------------+



In [0]:
# Recount the nulls per column of sales_pipeline_df.
# This cell needs sales_pipeline_df to exist in the session; it is created by
# the CSV-loading cell earlier in the notebook.
pipeline_null_exprs = [
    sum(when(col(name).isNull(), 1).otherwise(0)).alias(name)
    for name in sales_pipeline_df.columns
]
null_sales_pipeline_df = sales_pipeline_df.select(pipeline_null_exprs)
null_sales_pipeline_df.show()

[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
File [0;32m<command-4735821366776745>, line 1[0m
[0;32m----> 1[0m null_sales_pipeline_df [38;5;241m=[39m sales_pipeline_df[38;5;241m.[39mselect([[38;5;28msum[39m(when(col(column)[38;5;241m.[39misNull(), [38;5;241m1[39m)[38;5;241m.[39motherwise([38;5;241m0[39m))[38;5;241m.[39malias(column) [38;5;28;01mfor[39;00m column [38;5;129;01min[39;00m sales_pipeline_df[38;5;241m.[39mcolumns])
[1;32m      2[0m null_sales_pipeline_df[38;5;241m.[39mshow()

[0;31mNameError[0m: name 'sales_pipeline_df' is not defined

In [0]:
# Persist every DataFrame to the transformed-data mount as CSV with a header row.
# Each target path maps to the frame written there.
transformed_outputs = {
    "/mnt/transformed-data/accounts.csv": accounts_df,
    "/mnt/transformed-data/data_dictionary.csv": data_dictionary_df,
    "/mnt/transformed-data/products.csv": products_df,
    "/mnt/transformed-data/sales_pipeline.csv": sales_pipeline_df,
    "/mnt/transformed-data/sales_teams.csv": sales_teams_df,
}

for output_path, frame in transformed_outputs.items():
    # The writer's default save mode raises when the target already exists, which
    # makes a second run of this cell fail. "overwrite" replaces the previous
    # output instead, so the notebook can be re-run end to end.
    # NOTE(review): Spark writes each target as a directory of part files, not a
    # single .csv file; confirm downstream readers expect that layout.
    frame.write.mode("overwrite").option("header", "true").csv(output_path)