In [0]:
import os

client_id = os.environ.get('client_id')
tenant_id = os.environ.get('tenant_id')
client_secret = os.environ.get('secret_value')
storage_account = "project1azure1"

spark.conf.set(f"fs.azure.account.auth.type.{storage_account}.dfs.core.windows.net", "OAuth")
spark.conf.set(f"fs.azure.account.oauth.provider.type.{storage_account}.dfs.core.windows.net", "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider")
spark.conf.set(f"fs.azure.account.oauth2.client.id.{storage_account}.dfs.core.windows.net", client_id)
spark.conf.set(f"fs.azure.account.oauth2.client.secret.{storage_account}.dfs.core.windows.net", client_secret)
spark.conf.set(f"fs.azure.account.oauth2.client.endpoint.{storage_account}.dfs.core.windows.net", f"https://login.microsoftonline.com/{tenant_id}/oauth2/token")

In [0]:
df_csv = spark.read.csv(
    "abfss://bronze@project1azure1.dfs.core.windows.net/sales/events.csv",
    header=True,
    inferSchema=True
)
from pyspark.sql.functions import col,count

df_csv.display()


timestamp,visitorid,event,itemid,transactionid
1442213697562,440915,view,296459,
1442219041500,987557,view,37371,
1442227849700,582811,view,412230,
1442211277819,1182386,view,125999,
1442211426217,51583,view,408737,
1442226937616,1016622,view,189879,
1442213133232,941344,view,294614,
1442232816731,421070,view,373384,
1442211959695,711110,view,171878,
1442226582093,111285,view,311863,


Checking duplicates

In [0]:
dupes_df = df_csv.groupBy(*df_csv.columns).count().where(col("count") > 1)

print(f"duplicates : {dupes_df.count()}")

duplicates : 458


Removing Duplicates

In [0]:
df_csv_removed_dupes = df_csv.dropDuplicates()

print(f"expected count after removing duplicates : {df_csv.distinct().count()}")
from functools import reduce
print(f"actual count after removing duplicates : {df_csv_removed_dupes.count()}")

expected count after removing duplicates : 2755641
actual count after removing duplicates : 2755641


Checking for nulls

In [0]:
from functools import reduce
columns_to_check = ["timestamp", "visitorid", "event", "itemid"]
null_conditions = [col(c).isNull() for c in columns_to_check]

#join all those conditions with OR (|) so you get: col("timestamp").isNull() | col("visitor_id").isNull() | ...
combined_condition = reduce(lambda a, b: a | b, null_conditions)

# filter the DataFrame
df_nulls = df_csv_removed_dupes.filter(combined_condition)

df_nulls.display()

timestamp,visitorid,event,itemid,transactionid


Checking for invalid values in event column

In [0]:
valid_event_types = ["view", "addtocart", "transaction"]

df_invalid_event_types = df_csv_removed_dupes.filter(~col("event").isin(valid_event_types))

df_invalid_event_types.display()


timestamp,visitorid,event,itemid,transactionid


Checking for invalid values in transactionid column

In [0]:
df_wrong_transaction_id = df_csv_removed_dupes.filter((col("transactionid").isNotNull()) & (col("event") != "transaction"))
df_wrong_transaction_id.display()

timestamp,visitorid,event,itemid,transactionid


Converting timestamp to utc format

In [0]:
from pyspark.sql.functions import from_unixtime

df_final = df_csv_removed_dupes.withColumn("timestamp", from_unixtime(col("timestamp")/1000).cast("timestamp"))





Checking for invalid timestamps

In [0]:
from pyspark.sql.functions import current_timestamp

df_future_timestamps = df_final.filter(col("timestamp") > current_timestamp())
df_future_timestamps.display()

timestamp,visitorid,event,itemid,transactionid


Saving the file as delta format in adls

In [0]:
output_path_parquet = "abfss://silver@project1azure1.dfs.core.windows.net/sales/events"
df_final.write.format("delta").mode("overwrite").save(output_path_parquet)

In [0]:
df =spark.read.format("delta").load(output_path_parquet)
df.display()

timestamp,visitorid,event,itemid,transactionid
2015-09-14T06:27:23Z,404835,view,432902,
2015-09-15T03:50:51Z,409610,view,112239,
2015-09-14T17:53:57Z,1348864,view,286804,
2015-09-14T17:37:16Z,1310032,view,409655,
2015-09-14T16:51:04Z,135727,view,42519,
2015-09-14T17:02:29Z,17738,view,118382,
2015-09-14T16:39:35Z,560891,view,400946,
2015-09-14T21:36:34Z,401658,view,81998,
2015-09-15T04:53:57Z,413162,view,131015,
2015-09-15T00:59:33Z,162285,view,361463,
