In [None]:
from pyspark.sql.functions import col, avg, max, when
from pyspark.sql.window import Window
from pyspark.sql import functions as F
from pyspark.sql.functions import col, avg, when
import time
import random
import string

def generate_table_name(length=10):
    """Return a random name made of `length` lowercase ASCII letters."""
    alphabet = string.ascii_lowercase
    picked = []
    for _ in range(length):
        picked.append(random.choice(alphabet))
    return ''.join(picked)

def clean_tables(table_name):
    """Drop each table named in `table_name`; tables that do not exist are skipped.

    Relies on a `spark` session object being available as a global (it is not
    defined in this block).

    Args:
        table_name: An iterable of table names, or a single table name given
            as a string.
    """
    # A bare string would otherwise be iterated character by character,
    # issuing a DROP for every single-letter name it contains.
    names = [table_name] if isinstance(table_name, str) else table_name
    for table in names:
        # `if exists` makes the statement a no-op for tables that are absent.
        spark.sql("\n        drop table if exists " + table)

def _save_as_temp_table(df, tables):
    """Write `df` to a randomly named Delta table and record the name in `tables`."""
    table_name = generate_table_name()
    df.write.format("delta").mode("overwrite").saveAsTable(table_name)
    tables.append(table_name)


def _run_scenarios(tables):
    """Run every query scenario, appending each temporary table name to `tables`."""
    # TODO: parametrize the Delta table names, e.g. name_of_your_lakehouse.delta_table_name
    # true_big_table is based on yellow_tripdata_2015-01.csv

    print("Scenario 1")
    q = spark.sql("""
    SELECT 
        Year, 
        VendorID, 
        total_fare 
    FROM 
        (SELECT 
            'Year1' as Year, 
            VendorID, 
            SUM(fare_amount) AS total_fare,
            ROW_NUMBER() OVER (ORDER BY SUM(fare_amount) DESC) as row_num 
        FROM Bronze.true_big_table 
        GROUP BY VendorID) 
    WHERE row_num = 1

    UNION ALL

    SELECT 
        Year, 
        VendorID, 
        total_fare 
    FROM 
        (SELECT 
            'Year2' as Year, 
            VendorID, 
            SUM(fare_amount) AS total_fare,
            ROW_NUMBER() OVER (ORDER BY SUM(fare_amount) DESC) as row_num 
        FROM Bronze.broadcast_true_big_table_52402 
        GROUP BY VendorID) 
    WHERE row_num = 1
    """)
    _save_as_temp_table(q, tables)

    # around 10 s
    print("Scenario 2")
    q = spark.sql("""
    SELECT 
        'Year1' as Year, 
        payment_type, 
        AVG(tip_amount) AS avg_tip 
    FROM Bronze.true_big_table 
    GROUP BY payment_type
    UNION
    SELECT 
        'Year2' as Year, 
        payment_type, 
        AVG(tip_amount) AS avg_tip 
    FROM Bronze.broadcast_true_big_table_52402 
    GROUP BY payment_type
    """)
    _save_as_temp_table(q, tables)

    print("Scenario 8")
    q = spark.sql("""
    SELECT t1.VendorID, t1.avg_fare, t2.max_fare
    FROM
        (SELECT VendorID, AVG(fare_amount) AS avg_fare 
        FROM Bronze.true_big_table 
        GROUP BY VendorID) t1
    JOIN 
        (SELECT VendorID, MAX(fare_amount) AS max_fare 
        FROM Bronze.broadcast_true_big_table_52402 
        GROUP BY VendorID) t2
    ON t1.VendorID = t2.VendorID
    """)
    # NOTE(review): unlike the other scenarios this writes to a fixed table
    # name that is never added to `tables`, so it is not dropped during
    # cleanup -- confirm whether that is intentional.
    q.write.format("delta").mode("overwrite").saveAsTable("dpscenario8")

    print("Scenario 8 extra ")
    q = spark.sql("""
    SELECT *
    FROM Bronze.true_big_table
    WHERE VendorID IN (SELECT VendorID FROM Bronze.broadcast_true_big_table_52402)
    """)
    _save_as_temp_table(q, tables)

    print("Scenario 9")
    q = spark.sql("""
    SELECT *
    FROM Bronze.true_big_table
    WHERE VendorID NOT IN (SELECT VendorID FROM Bronze.broadcast_true_big_table_52402)
    """)
    _save_as_temp_table(q, tables)

    print("Scenario 10")
    q = spark.sql("""
    SELECT *
    FROM Bronze.true_big_table
    CROSS JOIN Bronze.broadcast_true_big_table_52402
    """)
    # The cross join is only counted, not written to a table
    # (presumably because of the size of the result -- TODO confirm).
    q.count()

    print("Scenario 11")
    q = spark.sql("""
    SELECT /*+ BROADCAST(t1) */ *
    FROM Bronze.broadcast_true_big_table_52402 t1
    JOIN Bronze.broadcast_true_big_table_52402 t2
    ON t1.VendorID = t2.VendorID
    """)
    _save_as_temp_table(q, tables)


def queries(pause: bool) -> None:
    """Run all query scenarios, wait, then drop the temporary tables created.

    Relies on a `spark` session object being available as a global.

    Args:
        pause: When true, sleep 200 seconds after the scenarios finish and
            before the temporary tables are dropped; otherwise sleep 1 second.
    """
    tables = []
    try:
        _run_scenarios(tables)
        time.sleep(200 if pause else 1)
    finally:
        # Drop whatever was created even if a scenario raised part-way
        # through, so randomly named tables are not left behind.
        clean_tables(tables)



In [None]:
# Run every scenario once, without the long pause before cleanup.
queries(pause=False)