In [2]:
!pip --disable-pip-version-check install -q numpy
!pip --disable-pip-version-check install -q pandas
!pip --disable-pip-version-check install -q pyhive
!pip --disable-pip-version-check install -q faker
!pip --disable-pip-version-check install -q boto3

from pyspark import SparkConf
from pyspark.sql import SparkSession
import pandas as pd
from IPython.display import display, HTML
from pyhive import presto as pres
from datetime import datetime
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import Row
import boto3

# S3 bucket used by default by the helpers in this notebook.
BUCKET = "presto-workshop-bucket-us-east-2"
# s3a:// URI of the "sandbox/" prefix in BUCKET (note the trailing slash).
SANDBOX_BASE_PATH = f"s3a://{BUCKET}/sandbox/"
# s3a:// URI of the "input/1gb/" prefix in BUCKET.
INPUT = f"s3a://{BUCKET}/input/1gb/"

def listFiles(prefixOrPath: str):
    """Display every S3 object found under a key prefix or an S3 URI.

    Args:
        prefixOrPath: Either a full ``s3://bucket/prefix`` or
            ``s3a://bucket/prefix`` URI, or a bare key prefix, in which case
            the module-level ``BUCKET`` is searched.

    The matching objects are rendered as an HTML table; if nothing matches,
    a short message is printed instead. Nothing is returned.
    """
    s3 = boto3.client('s3')
    if prefixOrPath.startswith(("s3://", "s3a://")):
        # "scheme://bucket/key" splits into ["scheme:", "", "bucket", "key"];
        # the key part is absent when the URI names only the bucket.
        parts = prefixOrPath.split("/", 3)
        bucket_name = parts[2]
        prefix = parts[3] if len(parts) > 3 else ""
    else:
        bucket_name = BUCKET
        prefix = prefixOrPath

    # A single list_objects_v2 call returns only one page of results, so
    # walk all pages to avoid silently truncating large listings.
    paginator = s3.get_paginator('list_objects_v2')
    contents = [
        obj
        for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix)
        for obj in page.get('Contents', [])
    ]

    if contents:
        df = pd.DataFrame(contents)
        display(HTML(df.to_html()))
    else:
        print("No files found in the specified path.")

def copy_files_within_bucket(source_bucket, source_prefix, destination_bucket, destination_prefix):
    """Copy every object under ``source_prefix`` to ``destination_prefix``.

    Args:
        source_bucket: Bucket to read objects from.
        source_prefix: Key prefix selecting the objects to copy.
        destination_bucket: Bucket to write the copies to.
        destination_prefix: Prefix that replaces the first occurrence of
            ``source_prefix`` in each copied key.

    Source objects are left in place. Nothing is returned.
    """
    s3 = boto3.client('s3')
    # Paginate: a single list_objects_v2 call returns only one page of keys,
    # which would leave the remaining objects uncopied.
    paginator = s3.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=source_bucket, Prefix=source_prefix):
        for obj in page.get('Contents', []):
            key = obj['Key']
            copy_source = {'Bucket': source_bucket, 'Key': key}
            new_key = key.replace(source_prefix, destination_prefix, 1)
            s3.copy_object(CopySource=copy_source, Bucket=destination_bucket, Key=new_key)

def dis(df):
    """Render the first 50 rows of a Spark DataFrame as an HTML table.

    The time spent limiting, collecting to pandas and rendering to HTML is
    printed before the table is displayed.

    Args:
        df: DataFrame exposing ``limit(n).toPandas()``.

    Returns:
        Whatever ``display`` returns for the rendered HTML.
    """
    started = datetime.now()
    preview = df.limit(50).toPandas()
    rendered = preview.to_html()
    elapsed = datetime.now() - started
    print("Time: ", elapsed)
    return display(HTML(rendered))

def presto(query):
    """Run ``query`` on Presto and display up to 50 result rows as HTML.

    Prints "No results" and returns ``None`` when the query yields no rows;
    otherwise prints the elapsed time and displays the rows as a table.

    Args:
        query: SQL text passed to the Presto cursor.
    """
    started = datetime.now()
    connection = pres.Connection(host="presto.hudi-infra.svc.cluster.local", port=9090)
    cursor = connection.cursor()
    cursor.execute(query)
    fetched = cursor.fetchmany(50)
    # First element of each description entry is the column name.
    headers = [column[0] for column in cursor.description]

    if not fetched:
        print("No results")
        return

    frame = pd.DataFrame(fetched, columns=headers)
    elapsed = datetime.now() - started
    print("Time: ", elapsed)
    return display(HTML(frame.to_html()))
    
def presto_with_metadata_enabled(query):
    """Run ``query`` on Presto after setting ``hive.hudi_metadata_enabled``.

    A ``SET session`` statement is executed on the same cursor first, then
    the query itself. Up to 50 rows are displayed as an HTML table together
    with the elapsed time (which includes the session statement). Prints
    "No results" and returns ``None`` when the query yields no rows.

    Args:
        query: SQL text passed to the Presto cursor.
    """
    started = datetime.now()
    connection = pres.Connection(host="presto.hudi-infra.svc.cluster.local", port=9090)
    cursor = connection.cursor()

    # Consume the SET statement's result before issuing the real query.
    cursor.execute("SET session hive.hudi_metadata_enabled = true")
    cursor.fetchall()

    cursor.execute(query)
    fetched = cursor.fetchmany(50)
    # First element of each description entry is the column name.
    headers = [column[0] for column in cursor.description]

    if not fetched:
        print("No results")
        return

    frame = pd.DataFrame(fetched, columns=headers)
    elapsed = datetime.now() - started
    print("Time: ", elapsed)
    return display(HTML(frame.to_html()))


# Hudi writer options selecting optimistic concurrency control with the
# ZooKeeper-based lock provider, plus lazy cleaning of failed writes.
ZOOKEEPER_LOCK_CONFIGS = {
    "hoodie.cleaner.policy.failed.writes": "LAZY",
    "hoodie.write.concurrency.mode": "optimistic_concurrency_control",
    "hoodie.write.lock.provider": "org.apache.hudi.client.transaction.lock.ZookeeperBasedLockProvider",
    "hoodie.write.lock.zookeeper.url": "zk-cs.hudi-infra.svc.cluster.local",
    "hoodie.write.lock.zookeeper.port": "2181",
    "hoodie.write.lock.zookeeper.base_path": "/test",
}

# Hudi writer options that use direct markers and switch off the embedded
# timeline server.
DISABLE_TIMELINE_CONFIGS = {
    "hoodie.write.markers.type": "direct",
    "hoodie.embed.timeline.server": "false",
}


def get_spark_session(app_name = "Jupyter Job", hudi_version="0.14.1", num_executors=1 ):
    """Create (or reuse) a Hive-enabled Spark session configured for Hudi.

    The settings below are applied on top of ``swan_spark_conf``, which is
    modified in place.
    NOTE(review): ``swan_spark_conf`` is not defined in the visible part of
    this file; presumably it is provided by the notebook environment --
    confirm.

    Args:
        app_name: Application name given to the session builder.
        hudi_version: Version of the ``hudi-spark3.1-bundle_2.12`` package
            requested through ``spark.jars.packages``.
        num_executors: Value used for ``spark.executor.instances``.

    Returns:
        The session returned by ``getOrCreate()``; the port of its web UI is
        printed first.
    """
    spark_conf = swan_spark_conf
    spark_conf.setMaster("k8s://https://kubernetes.default.svc.cluster.local")

    settings = {
        # Kubernetes deployment
        "spark.kubernetes.namespace": "spark",
        "spark.kubernetes.container.image": "itayb/spark:3.1.1-hadoop-3.2.0-aws",
        # Executor sizing
        "spark.executor.instances": num_executors,
        "spark.executor.memory": "48g",
        "spark.executor.cores": "8",
        # Driver networking
        "spark.driver.blockManager.port": "7777",
        "spark.driver.port": "2222",
        "spark.driver.host": "jupyter.spark.svc.cluster.local",
        "spark.driver.bindAddress": "0.0.0.0",
        # Hudi integration
        "spark.jars.packages": f"org.apache.hudi:hudi-spark3.1-bundle_2.12:{hudi_version}",
        "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
        "spark.sql.extensions": "org.apache.spark.sql.hudi.HoodieSparkSessionExtension",
        "spark.kryo.registrator": "org.apache.spark.HoodieSparkKryoRegistrar",
        # Hive metastore
        "spark.hadoop.hive.metastore.uris": "thrift://metastore123.hudi-infra.svc.cluster.local:9083",
    }
    for name in settings:
        spark_conf.set(name, settings[name])

    session = (
        SparkSession.builder
        .appName(app_name)
        .config(conf=spark_conf)
        .enableHiveSupport()
        .getOrCreate()
    )
    # The port is whatever follows the last ":" of the UI URL.
    ui_port = session.sparkContext.uiWebUrl.split(":")[-1]
    print("Spark Session started at port : " + ui_port)
    return session


# Hudi Quickstart

In [None]:

# Base location of the quickstart tables. SANDBOX_BASE_PATH already ends with
# "/", so strip it before joining to avoid a "sandbox//quickstart_sandbox/" URI.
QUICKSTART_BASE_PATH = SANDBOX_BASE_PATH.rstrip("/") + "/quickstart_sandbox/"

def quickstart(spark, tableName = "hudi_table", extraConfigs = None, partitioned = True, pkless = False, is_mor = False):
    """Run an insert / query / update / delete walkthrough on a sample Hudi table.

    The table is (re)created at ``QUICKSTART_BASE_PATH + tableName`` with
    five sample trip rows. The fare of rider-D is then multiplied by 10 and
    written back, and finally the rows of rider-F are deleted. Intermediate
    results are shown along the way, and the temp view ``trips_table`` is
    registered on ``spark``.

    Args:
        spark: Active SparkSession.
        tableName: Hudi table name; also the last path component of the
            table location.
        extraConfigs: Optional dict of additional Hudi write options; these
            override the options computed here.
        partitioned: When true, the table is partitioned by ``city``.
        pkless: When false, the record key (``uuid``) and precombine field
            (``ts``) are configured.
        is_mor: When true, the table is written as MERGE_ON_READ.
    """
    basePath = QUICKSTART_BASE_PATH + tableName
    columns = ["ts","uuid","rider","driver","fare","city"]
    data =[(1695159649087,"334e26e9-8355-45cc-97c6-c31daf0df330","rider-A","driver-K",19.10,"san_francisco"),
       (1695091554788,"e96c4396-3fad-413a-a942-4cb36106d721","rider-C","driver-M",27.70 ,"san_francisco"),
       (1695046462179,"9909a8b1-2d15-4d3d-8ec9-efc48c536a00","rider-D","driver-L",33.90 ,"san_francisco"),
       (1695516137016,"e3cf430c-889d-4015-bc98-59bdce1e530c","rider-F","driver-P",34.15,"sao_paulo"),
       (1695115999911,"c8abbe79-8d89-47ea-b4ce-4d224bae5bfa","rider-J","driver-T",17.85,"chennai")]
    inserts = spark.createDataFrame(data).toDF(*columns)

    hudi_options = {
        'hoodie.table.name': tableName,
        'hoodie.datasource.write.recordkey.field': 'uuid'
    }

    hudi_options.update(DISABLE_TIMELINE_CONFIGS)

    if partitioned:
        hudi_options.update({'hoodie.datasource.write.partitionpath.field': 'city'})
    # NOTE(review): the record key is already set in the base options above,
    # so pkless=True currently only omits the precombine field -- confirm
    # whether the record key should be dropped as well.
    if not pkless:
        hudi_options.update({
            'hoodie.datasource.write.recordkey.field': 'uuid',
            'hoodie.datasource.write.precombine.field': 'ts'
        })
    # Honour the is_mor flag, which was previously accepted but ignored.
    if is_mor:
        hudi_options.update({'hoodie.datasource.write.table.type': 'MERGE_ON_READ'})

    # Caller-supplied options win; None (the default) means "no extras".
    hudi_options.update(extraConfigs or {})

    inserts.write.format("hudi"). \
        options(**hudi_options). \
        mode("overwrite"). \
        save(basePath)
    print("Data generated successfully")
    tripsDF = spark.read.format("hudi").load(basePath)
    tripsDF.createOrReplaceTempView("trips_table")

    spark.sql("SELECT uuid, fare, ts, rider, driver, city FROM  trips_table WHERE fare > 20.0").show()
    spark.sql("SELECT _hoodie_commit_time, _hoodie_record_key, _hoodie_partition_path, rider, driver, fare FROM trips_table").show()

    updatesDf = spark.read.format("hudi").load(basePath).filter("rider == 'rider-D'").withColumn("fare",col("fare")*10)

    updatesDf.write.format("hudi"). \
      options(**hudi_options). \
      mode("append"). \
      save(basePath)
    dis(spark.read.format("hudi").load(basePath))

    print("Data upserted successfully")
    deletesDF = spark.read.format("hudi").load(basePath).filter("rider == 'rider-F'")

    # Reuse exactly the options of the earlier writes and only switch the
    # operation, so the delete matches the table's actual partitioning and
    # any caller overrides instead of forcing a 'city' partition path.
    hudi_delete_options = dict(hudi_options)
    hudi_delete_options['hoodie.datasource.write.operation'] = 'delete'

    deletesDF.write.format("hudi"). \
        options(**hudi_delete_options). \
        mode("append"). \
        save(basePath)
    print("Data deleted successfully")

    dis(spark.read.format("hudi").load(basePath))
