In [42]:
import os

from delta import configure_spark_with_delta_pip
from pyspark.sql import DataFrame, SparkSession, functions as F
from pyspark.sql.types import StructType, StructField, StringType

In [43]:
def get_session(app_name: str = "MyApp"):
    """
    Returns a SparkSession configured with the Delta Lake SQL extension and catalog.

    Args:
        app_name (str): Application name given to the session builder (default: "MyApp").

    Returns:
        The session produced by calling ``getOrCreate()`` on the Delta-configured builder.
    """
    # Settings that register Delta's SQL extension and make Delta the session catalog.
    delta_settings = {
        "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
        "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    }

    session_builder = SparkSession.builder.appName(app_name)
    for option_name, option_value in delta_settings.items():
        session_builder = session_builder.config(option_name, option_value)

    # The delta helper finishes configuring the builder before the session is created.
    delta_builder = configure_spark_with_delta_pip(session_builder)
    return delta_builder.getOrCreate()

In [44]:
# Create the Spark session (default app name "MyApp") used by every cell below.
spark = get_session()

In [None]:
# Based on: https://datamadness.medium.com/column-encryption-and-decryption-in-databricks-baf9ada3a7cf

In [45]:
# Toy records used to demonstrate column encryption; the first and last rows are identical.
sample_data = [
    ('James', 'Smith', '111-22-3333'),
    ('Michael', 'Jones', '222-33-4444'),
    ('Maria', 'Anne', '333-44-5555'),
    ('James', 'Smith', '111-22-3333'),
]

# Every column is a nullable string.
sample_schema = StructType(
    [
        StructField(field_name, StringType(), True)
        for field_name in ('FirstName', 'LastName', 'SSN')
    ]
)

df = spark.createDataFrame(data=sample_data, schema=sample_schema)

In [46]:
# Display the plaintext sample rows before any encryption is applied.
df.show()

+---------+--------+-----------+
|FirstName|LastName|        SSN|
+---------+--------+-----------+
|    James|   Smith|111-22-3333|
|  Michael|   Jones|222-33-4444|
|    Maria|    Anne|333-44-5555|
|    James|   Smith|111-22-3333|
+---------+--------+-----------+



In [47]:
# The Spark documentation for aes_encrypt and aes_decrypt says that the key needs to be
# 16, 24, or 32 bytes long.
#
# Never commit a real key to a notebook: supply it through the COLUMN_ENCRYPTION_KEY
# environment variable. The literal below is a demo-only fallback so that this notebook
# still runs end to end on the sample data when the variable is not set.
ENCRYPTION_KEY = os.environ.get("COLUMN_ENCRYPTION_KEY", "PayclipPeopleEncryptionKey123456")

# Fail fast here rather than when the first Spark action evaluates the expression.
if len(ENCRYPTION_KEY.encode("utf-8")) not in (16, 24, 32):
    raise ValueError(
        "ENCRYPTION_KEY must be 16, 24, or 32 bytes long, "
        f"got {len(ENCRYPTION_KEY.encode('utf-8'))} bytes"
    )

In [48]:
def encrypt_columns(df: DataFrame, columns: list, encryption_key: str, encryption_mode: str = "ECB") -> DataFrame:
    """
    Encrypts the specified columns in the DataFrame using AES encryption and Base64 encoding.

    Each listed column is replaced, under the same name, by the Base64 text of the
    ciphertext produced by Spark SQL's ``aes_encrypt``.

    Security note: the default mode stays "ECB" for backward compatibility. ECB is
    deterministic, so equal plaintexts produce equal ciphertexts and repeated values
    remain recognisable after encryption. Pass another mode supported by your Spark
    version (for example "GCM") when that matters.

    Args:
        df (DataFrame): The input DataFrame.
        columns (list): List of column names to encrypt. Names are treated as literal
            top-level column names, so names containing spaces or punctuation work.
        encryption_key (str): The encryption key for AES encryption. Must be 16, 24,
            or 32 bytes long when UTF-8 encoded.
        encryption_mode (str): The encryption mode for AES (default: "ECB").

    Returns:
        DataFrame: The DataFrame with encrypted columns.

    Raises:
        ValueError: If the key length is not 16, 24, or 32 bytes, or if the mode
            contains anything other than letters and digits.
    """
    key_bytes = encryption_key.encode("utf-8")
    if len(key_bytes) not in (16, 24, 32):
        raise ValueError(
            f"encryption_key must be 16, 24, or 32 bytes long, got {len(key_bytes)} bytes"
        )
    # The mode is spliced into a SQL string literal; restrict it to plain alphanumerics
    # so it can never terminate the literal or inject extra SQL.
    if not encryption_mode.isalnum():
        raise ValueError(f"invalid encryption_mode: {encryption_mode!r}")

    # Pass the key as a hex binary literal (X'..') instead of a quoted string, so that
    # quotes or backslashes inside the key cannot break or alter the SQL expression.
    key_literal = f"X'{key_bytes.hex()}'"

    for col_name in columns:
        # Backtick-quote the identifier (doubling embedded backticks) so the column
        # name is never parsed as an expression.
        quoted_col = "`" + col_name.replace("`", "``") + "`"
        encrypted_col = F.expr(
            f"aes_encrypt({quoted_col}, {key_literal}, '{encryption_mode}')"
        )
        # aes_encrypt yields binary; Base64 turns it into storable text.
        df = df.withColumn(col_name, F.base64(encrypted_col))

    return df

In [49]:
def decrypt_columns(df: DataFrame, columns: list, encryption_key: str, encryption_mode: str = "ECB") -> DataFrame:
    """
    Decrypts the specified columns in the DataFrame using AES decryption and Base64 decoding.

    Each listed column is Base64-decoded, decrypted with Spark SQL's ``aes_decrypt``,
    cast to string, and written back under the same column name. The key and mode
    must match the ones used to encrypt the data.

    Args:
        df (DataFrame): The input DataFrame.
        columns (list): List of column names to decrypt. Names are treated as literal
            top-level column names, so names containing spaces or punctuation work.
        encryption_key (str): The decryption key for AES decryption. Must be 16, 24,
            or 32 bytes long when UTF-8 encoded.
        encryption_mode (str): The decryption mode for AES (default: "ECB").

    Returns:
        DataFrame: The DataFrame with decrypted columns.

    Raises:
        ValueError: If the key length is not 16, 24, or 32 bytes, or if the mode
            contains anything other than letters and digits.
    """
    key_bytes = encryption_key.encode("utf-8")
    if len(key_bytes) not in (16, 24, 32):
        raise ValueError(
            f"encryption_key must be 16, 24, or 32 bytes long, got {len(key_bytes)} bytes"
        )
    # The mode is spliced into a SQL string literal; restrict it to plain alphanumerics
    # so it can never terminate the literal or inject extra SQL.
    if not encryption_mode.isalnum():
        raise ValueError(f"invalid encryption_mode: {encryption_mode!r}")

    # Pass the key as a hex binary literal (X'..') instead of a quoted string, so that
    # quotes or backslashes inside the key cannot break or alter the SQL expression.
    key_literal = f"X'{key_bytes.hex()}'"

    for col_name in columns:
        # Backtick-quote the identifier (doubling embedded backticks) so the column
        # name is never parsed as an expression.
        quoted_col = "`" + col_name.replace("`", "``") + "`"
        decrypted_col = F.expr(
            f"aes_decrypt(unbase64({quoted_col}), {key_literal}, '{encryption_mode}')"
        ).cast("string")  # aes_decrypt yields binary; cast back to readable text
        df = df.withColumn(col_name, decrypted_col)

    return df

In [50]:
# Encrypt only the SSN column, relying on the function's default mode ("ECB").
df_encrypted = encrypt_columns(df=df, columns=["SSN"], encryption_key=ENCRYPTION_KEY)
df_encrypted.show()

+---------+--------+--------------------+
|FirstName|LastName|                 SSN|
+---------+--------+--------------------+
|    James|   Smith|XGhgYPX7X0YEJ0XWT...|
|  Michael|   Jones|q4c7hkvFWcdkRNZ4o...|
|    Maria|    Anne|ROA+9VUHcaYShLu4C...|
|    James|   Smith|XGhgYPX7X0YEJ0XWT...|
+---------+--------+--------------------+



In [51]:
# Reverse the encryption with the same key and default mode to recover the SSN values.
df_decrypted = decrypt_columns(df=df_encrypted, columns=["SSN"], encryption_key=ENCRYPTION_KEY)
df_decrypted.show()

+---------+--------+-----------+
|FirstName|LastName|        SSN|
+---------+--------+-----------+
|    James|   Smith|111-22-3333|
|  Michael|   Jones|222-33-4444|
|    Maria|    Anne|333-44-5555|
|    James|   Smith|111-22-3333|
+---------+--------+-----------+

