In [0]:
%pip install s3fs --quiet

[43mNote: you may need to restart the kernel using dbutils.library.restartPython() to use updated packages.[0m
[43mNote: you may need to restart the kernel using dbutils.library.restartPython() to use updated packages.[0m


In [0]:
import s3fs
from pyspark.sql import SparkSession
from pyspark.sql import DataFrame
from datetime import timedelta, datetime 
from pyspark.sql.functions import lit, col, to_date, date_format

In [0]:
# Root of the raw Parquet export. The notebook reads per-day subfolders
# beneath it whose names are dates formatted as DD-MM-YYYY.
source_path = "s3://tracking-analytics/consents-export-parquet"
# Location of the Delta output; the conversion loop appends to it,
# partitioned by the `dt` column.
target_path = "s3a://zalando-tracking/consents-export-delta"

In [0]:
# Take the schema from one sample day's export (01-05-2024). The conversion
# loop passes this same schema explicitly to every Parquet read it performs.
schema = spark.read.format("parquet").load(f"{source_path}/01-05-2024/").schema
# Bare expression: displays the schema as the cell output for inspection.
schema

StructType([StructField('browser-id', StringType(), True), StructField('client-id', StringType(), True), StructField('consents', ArrayType(StructType([StructField('action', StringType(), True), StructField('consentStatus', BooleanType(), True), StructField('dataProcessingService', StringType(), True), StructField('language', StringType(), True)]), True), True), StructField('created_at', StringType(), True), StructField('host', StringType(), True), StructField('updated_at', StringType(), True)])

In [0]:
# Build the list of per-day folder names covering an inclusive date range
def list_of_dates(start_date_str, end_date_str):
    """Return every calendar day from start to end (inclusive) as DD-MM-YYYY.

    Args:
        start_date_str: First day of the range, formatted as YYYY-MM-DD.
        end_date_str: Last day of the range, formatted as YYYY-MM-DD.

    Returns:
        A list of strings formatted as DD-MM-YYYY, in ascending order.
        The list is empty when the end date precedes the start date.

    Raises:
        ValueError: If either argument does not match YYYY-MM-DD.
    """
    input_format = "%Y-%m-%d"
    first_day = datetime.strptime(start_date_str, input_format)
    last_day = datetime.strptime(end_date_str, input_format)

    # +1 so that the end date itself is part of the result.
    day_count = (last_day - first_day).days + 1

    formatted_days = []
    for offset in range(day_count):
        current_day = first_day + timedelta(days=offset)
        formatted_days.append(current_day.strftime("%d-%m-%Y"))
    return formatted_days

In [0]:
# Expose the date range as notebook text widgets so it can be set per run.
# Values are expected as YYYY-MM-DD, the format list_of_dates parses.
# Both widgets share the same default, so an unparameterized run covers
# exactly one day.
default_date = "2023-01-01"
dbutils.widgets.text("start_date", default_date)
dbutils.widgets.text("end_date", default_date)
# Read back the current widget values (the defaults unless overridden).
start_date = dbutils.widgets.get("start_date")
end_date = dbutils.widgets.get("end_date")

In [0]:
# Expand the widget range into the DD-MM-YYYY folder names used under source_path.
date_list = list_of_dates(start_date, end_date)

In [0]:
s3 = s3fs.S3FileSystem()

# Convert each day's Parquet export into the Delta table, one date at a time.
# A failure on one date is reported and the loop moves on to the next date.
for date_str in date_list:
    # Derive the partition value (YYYY-MM-DD) before anything that can fail,
    # so every message below, including the error path, names the date that
    # is actually being processed.
    formatted_date_str = datetime.strptime(date_str, "%d-%m-%Y").strftime("%Y-%m-%d")

    try:
        files = s3.ls(f"{source_path}/{date_str}")
        combined_df = None  # becomes a DataFrame once the first file is read

        for file in files:
            # Only the final path component is used; the full path is
            # rebuilt from source_path.
            file_name = file.split('/')[-1]

            parquet_df = (
                spark.read
                .format("parquet")
                .schema(schema)
                .load(f"{source_path}/{date_str}/{file_name}")
                # Tag every row with the day it was exported for; this is
                # the Delta partition column.
                .withColumn("dt", lit(formatted_date_str))
            )

            # Combine DataFrames
            if combined_df is None:
                combined_df = parquet_df
            else:
                combined_df = combined_df.unionByName(parquet_df)

        # If at least one file was read, write the day's data to Delta
        if combined_df is not None:
            # NOTE(review): append mode means re-running the notebook for the
            # same date adds the rows again — confirm this is acceptable.
            (
                combined_df.write
                .format("delta")
                .mode("append")
                .partitionBy("dt")
                .save(target_path)
            )
            print(f"Successfully converted files for date: {formatted_date_str}")
        else:
            print(f"No files found for date: {formatted_date_str}")

    except Exception as e:
        # Best-effort by design: log the failure and continue with the
        # remaining dates.
        print(f"Error processing date {formatted_date_str}: {e}")

Successfully converted files for date: 2023-01-01
