s3selectJSON is useful for reading a subset of JSON files. For more details on the available options, or on s3selectCSV, see: https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-spark-s3select.html 

In [0]:
%pip install s3fs --quiet

In [0]:
# Turn the Databricks IO cache off for this session: automatic caching
# only makes this process slower.
spark.conf.set(
    "spark.databricks.io.cache.enabled",
    "false",
)

In [0]:
from datetime import timedelta, datetime
import s3fs

# S3 filesystem handle; used further down to write the schema files.
s3 = s3fs.S3FileSystem()

# Yesterday's date (relative to the local clock) formatted as YYYY-MM-DD,
# which is the value used for the dt=<day> partition folder when reading.
day = f"{datetime.now() - timedelta(days=1):%Y-%m-%d}"

# Names of the events whose schemas are exported.
events = [
    "shop.tracking.brand.change.v2",
    "shop.tracking.brand.load.v2",
    "shop.tracking.campaign-page.view.v2",
    "shop.tracking.campaign.click.v2",
]

In [0]:
# For every event: read the dt={day} partition from the datalake bucket, take
# the schema Spark reports for it, and store that schema as JSON in the
# teamanalytics bucket. A failure for one event is printed and does not stop
# the remaining events from being processed.
for event_name in events:
    source_path = f"s3a://datalake-eu-central-1/data/eventqueue/{event_name}/dt={day}"
    target_path = f"s3://teamanalytics/schemas/{event_name}.json"

    try:
        # NOTE(review): DataFrameReader.json(...) selects Spark's built-in JSON
        # source itself, so the preceding .format("s3selectJSON") is probably
        # overridden here - confirm which reader is actually intended.
        reader = spark.read.format("s3selectJSON")
        df_schema = reader.json(source_path).schema

        with s3.open(target_path, "w") as schema_file:
            schema_file.write(df_schema.json())
    except Exception as exc:
        # Best effort: report the problem and carry on with the next event.
        print(f"Error processing {event_name}: {exc}")

In [0]:
# List the schema folder so the exported files can be inspected in the cell output.
dbutils.fs.ls(
    "s3://teamanalytics/schemas/"
)