In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType, TimestampType, ArrayType, LongType
from pyspark.sql.functions import col, explode
from datetime import datetime

In [0]:
# Schema for the donation records.
# Every field is nullable: StructType.add() defaults to nullable=True, so the
# flag is not repeated on each line.

# One click-stream event recorded alongside a donation.
click_event_struct = (
    StructType()
    .add("source", StringType())
    .add("medium", StringType())
    .add("campaign", StringType())
    .add("content", StringType())
    .add("adset", StringType())
    .add("campaign_id", StringType())
    .add("source_platform", StringType())
    .add("referrer", StringType())
    .add("timestamp", LongType())  # presumably epoch milliseconds -- TODO confirm
    .add("path", StringType())
)

# Versioned envelope holding the array of click events.
click_stream_struct = (
    StructType()
    .add("version", StringType())
    .add("data", ArrayType(click_event_struct))
)

# Top-level donation record: flat donor/payment fields plus the nested
# click-stream payload. Opt-in flags and the submission time are typed as
# plain strings here.
json_schema = (
    StructType()
    .add("Id", IntegerType())
    .add("TransactionRef", StringType())
    .add("Amount", DoubleType())
    .add("PaymentMethod", StringType())
    .add("PaymentStatus", StringType())
    .add("Title", StringType())
    .add("Firstname", StringType())
    .add("Lastname", StringType())
    .add("Email", StringType())
    .add("AddressLine1", StringType())
    .add("AddressLine2", StringType())
    .add("Town", StringType())
    .add("County", StringType())
    .add("Postcode", StringType())
    .add("Country", StringType())
    .add("EmailOptIn", StringType())
    .add("PostOptIn", StringType())
    .add("PhoneOptIn", StringType())
    .add("SMSOptIn", StringType())
    .add("Campaign", StringType())
    .add("GiftAidOptIn", StringType())
    .add("DateTimeSubmitted", StringType())
    .add("ClickStreamData", click_stream_struct)
)

In [0]:
# Sample donation records used to build the demo DataFrame.
# Each record mirrors the donation schema: flat donor/payment fields plus a
# nested "ClickStreamData" payload holding one or more click events.
#
# NOTE: the click-event "path" values are written as "/" rather than the
# JSON-escaped form "\/". "\/" is a valid escape in JSON but not in Python,
# where it would yield a two-character string (backslash + slash) and trigger
# an invalid-escape-sequence warning.
json_data = [
    {
        "Id": 1,
        "TransactionRef": "b2px7huihdpf",
        "Amount": 2.00,
        "PaymentMethod": "Credit Card",
        "PaymentStatus": "Success",
        "Title": "Miss",
        "Firstname": "Firstname",
        "Lastname": "Lastname",
        "Email": "firstname.lastname@ageuk.org.uk",
        "AddressLine1": "Address line 1",
        "AddressLine2": "Address",
        "Town": "Town",
        "County": "County",
        "Postcode": "POSTCODE",
        "Country": "GB",
        "EmailOptIn": "false",
        "PostOptIn": "false",
        "PhoneOptIn": "false",
        "SMSOptIn": "false",
        "Campaign": "MXXX-25II05-U001",
        "GiftAidOptIn": "false",
        "DateTimeSubmitted": "2025-04-23T09:23:31.117",
        # Two click events recorded for this donation.
        "ClickStreamData": {"version": "0.0.1", "data": [
            {"source": "none", "medium": "other", "campaign": "none", "content": "", "adset": "", "campaign_id": "", "source_platform": "", "referrer": "", "timestamp": 1745397169883, "path": "/"},
            {"source": "none", "medium": "other", "campaign": "none", "content": "", "adset": "", "campaign_id": "", "source_platform": "", "referrer": "", "timestamp": 1745397169887, "path": "/"},
        ]},
    },
    {
        "Id": 2,
        "TransactionRef": "blknfskguvr",
        "Amount": 5.00,
        "PaymentMethod": "Credit Card",
        "PaymentStatus": "Success",
        "Title": "Mr",
        "Firstname": "Firstname",
        "Lastname": "Lastname",
        "Email": "firstname.lastname@ageuk.org.uk",
        "AddressLine1": "Address line 1",
        "AddressLine2": "Address",
        "Town": "Town",
        "County": "County",
        "Postcode": "POSTCODE",
        "Country": "GB",
        "EmailOptIn": "false",
        "PostOptIn": "false",
        "PhoneOptIn": "false",
        "SMSOptIn": "false",
        "Campaign": "MXXX-25II05-U001",
        "GiftAidOptIn": "false",
        "DateTimeSubmitted": "2025-04-24T09:23:31.117",
        # A single click event recorded for this donation.
        "ClickStreamData": {"version": "0.0.1", "data": [
            {"source": "none", "medium": "other", "campaign": "none", "content": "", "adset": "", "campaign_id": "", "source_platform": "", "referrer": "", "timestamp": 1745397169884, "path": "/"},
        ]},
    },
]

In [0]:
# Build the donations DataFrame from the in-memory sample records, applying
# the explicit schema rather than letting Spark infer one.
df = spark.createDataFrame(data=json_data, schema=json_schema)

# Render the full frame, nested click-stream column included.
df.display()


In [0]:
# Flat donation view: the donor/payment columns of `df`, listed explicitly,
# without the nested ClickStreamData column.
donation_columns = [
    "Id",
    "TransactionRef",
    "Amount",
    "PaymentMethod",
    "PaymentStatus",
    "Title",
    "Firstname",
    "Lastname",
    "Email",
    "AddressLine1",
    "AddressLine2",
    "Town",
    "County",
    "Postcode",
    "Country",
    "EmailOptIn",
    "PostOptIn",
    "PhoneOptIn",
    "SMSOptIn",
    "Campaign",
    "GiftAidOptIn",
    "DateTimeSubmitted",
]
df_donations = df.select(*donation_columns)

df_donations.display()

In [0]:
# Attribution view: one row per click event, keyed by the donation Id.
#
# Step 1 explodes the ClickStreamData.data array so each event becomes its own
# row (struct column "click_event"). Note that explode() emits no row for a
# donation whose array is null or empty, so such donations do not appear here.
# Step 2 flattens the event struct into top-level columns.
#
# The key column is referenced as "Id" in both steps, matching the schema's
# casing, so the lookup does not depend on spark.sql.caseSensitive being off
# and the column name lines up with df_donations.
df_donation_attribution = df.select(
    "Id",
    explode("ClickStreamData.data").alias("click_event"),
).select(
    "Id",
    "click_event.source",
    "click_event.medium",
    "click_event.campaign",
    "click_event.content",
    "click_event.adset",
    "click_event.campaign_id",
    "click_event.source_platform",
    "click_event.referrer",
    "click_event.timestamp",
    "click_event.path",
)

df_donation_attribution.display()