Start JupyterLab with the examples repository as the notebook directory:

```bash
jupyter-lab --notebook-dir=$HOME/gits/NVIDIA/spark-rapids-examples
```
Alternatively, open this notebook directly in VS Code.

In [None]:
# Easy imports
import findspark
import glob
import os
import shutil
import sys

In [None]:
# "Feature" switches 
# Regenerate the parquet input data; also selects the larger GPU allocation size.
should_gen = False
# Run the broadcast-join repro cells.
should_join = True
# Run the out-of-core sort repro cell.
should_sort = False

In [None]:
# Easy settings
# Value later passed as spark.rapids.memory.gpu.allocSize: 5000m while
# generating data, 512m otherwise.
if should_gen:
    gpu_alloc_size = '5000m'
else:
    gpu_alloc_size = '512m'
cores_per_exec = 4

# Spark master URL; the commented line is the local-cluster alternative.
# spark_master = f"local-cluster[1,{cores_per_exec},10000]"
spark_master = f"local[{cores_per_exec}]"

# data gen settings
data_gen_length = 100_000_000  # length of the single generated file
num_copies = 10  # number of on-disk replicas made of that file

# All notebook output (generated data, query results, OOM dumps) goes under work_dir.
home_dir = os.environ['HOME']
work_dir = f"{home_dir}/jupyter_run_dir"
dfgen_path = f"{work_dir}/dfgen"

# debugger string: uncomment the jdwp line to add a JDWP agent to the driver's
# Java options; the empty string adds nothing.
# jdwp = '-agentlib:jdwp=transport=dt_socket,server=n,address=localhost:5005'
jdwp = ''

In [None]:
# Environment
# Process-wide variables: a fixed UTC time zone and the Spark distribution to use.
os.environ.update({
    'TZ': 'UTC',
    'SPARK_HOME': f"{home_dir}/dist/spark-3.1.1-bin-hadoop3.2",
})
# Make the spark-rapids integration-test Python helpers importable.
sys.path.append(f"{home_dir}/gits/NVIDIA/spark-rapids/integration_tests/src/main/python")

In [None]:
# Locate the Spark installation, then register the locally built RAPIDS
# Accelerator jar so it is passed along when the session is launched.
findspark.init()
rapids_plugin_jar = f"{home_dir}/gits/NVIDIA/spark-rapids/dist/target/rapids-4-spark_2.12-22.10.0-SNAPSHOT-cuda11.jar"
findspark.add_jars(rapids_plugin_jar)

In [None]:
import pyspark
from pyspark.sql.functions import *

# Key/value pairs applied to the session. Commented-out entries are
# alternatives kept around for experimentation.
spark_settings = [
    # Driver / executor JVM
    ('spark.driver.extraJavaOptions', f"-Dai.rapids.refcount.debug=true {jdwp}"),
    ('spark.driver.memory', '8g'),
    ('spark.driver.maxResultSize', '2g'),
    ('spark.executor.memory', '8g'),
    # ('spark.executor.extraJavaOptions', jdwp ),
    # Load the RAPIDS Accelerator plugin
    ('spark.plugins', 'com.nvidia.spark.SQLPlugin'),
    ('spark.rpc.message.maxSize', 2047),
    # Fail fast: a task is not retried after its first failure
    ('spark.task.maxFailures', 1),
    # GPU memory pool sizing
    # ('spark.rapids.memory.gpu.allocFraction', 0.2),
    ('spark.rapids.memory.gpu.allocSize', gpu_alloc_size),
    ('spark.rapids.memory.gpu.oomDumpDir', f"{work_dir}/gpuOoms"),
    # ('spark.rapids.memory.gpu.minAllocFraction', 0.1),
    # ('spark.rapids.memory.gpu.maxAllocFraction', 0.5),
    # reader and target batch sizes to avoid running OOM on a single batch
    ('spark.rapids.sql.batchSizeBytes', '16m'),
    ('spark.rapids.sql.explain', 'ALL'),
    ('spark.rapids.sql.reader.batchSizeBytes', '16m'),
    ('spark.sql.adaptive.enabled', False),
]

# Start from an empty conf (no defaults loaded) and apply the settings above.
conf = pyspark.SparkConf(loadDefaults=False).setAll(spark_settings)

# Create the session, or reuse one that is already running.
spark = (
    pyspark.sql.SparkSession.builder
    .appName('Spill Experiments Notebook')
    .master(spark_master)
    .config(conf=conf)
    .getOrCreate()
)

In [None]:
# Display the SparkSession as this cell's output.
spark

In [None]:
# Display the properties loaded from cudf-java-version-info.properties
# (presumably the cudf Java build/version info bundled with the plugin jar).
spark._jvm.com.nvidia.spark.rapids.RapidsPluginUtils.loadProps("cudf-java-version-info.properties")

In [None]:
# Display the properties loaded from spark-rapids-jni-version-info.properties
# (presumably the spark-rapids-jni build/version info bundled with the plugin jar).
spark._jvm.com.nvidia.spark.rapids.RapidsPluginUtils.loadProps("spark-rapids-jni-version-info.properties")

In [None]:
# Display the properties loaded from rapids4spark-version-info.properties
# (presumably the RAPIDS Accelerator plugin's own build/version info).
spark._jvm.com.nvidia.spark.rapids.RapidsPluginUtils.loadProps("rapids4spark-version-info.properties")

In [None]:
# Enable debug for Rapids Stores
# Uncomment a class name to include it; every listed logger is set to
# log4j level ALL.
debugLogClasses = [
    # 'com.nvidia.spark.rapids.RapidsBufferStore',
    # 'com.nvidia.spark.rapids.RapidsDeviceMemoryStore',
    # 'com.nvidia.spark.rapids.RapidsHostMemoryStore',
    # 'com.nvidia.spark.rapids.RapidsDiskStore',
    # 'com.nvidia.spark.rapids.RapidsGdsStore',
    'org.apache.spark.sql.rapids.execution.SerializeConcatHostBuffersDeserializeBatch'
]

# Resolve the log4j package in the driver JVM once, then raise each logger.
log4j = spark._jvm.org.apache.log4j
for class_name in debugLogClasses:
    log4j.Logger.getLogger(class_name).setLevel(log4j.Level.ALL)

## Generate Data 

In [None]:
# Point the integration-test helpers at this notebook's session before the
# data generators are imported (presumably they read spark_init_internal._spark
# instead of creating their own session -- verify against spark_init_internal).
import spark_init_internal
spark_init_internal._spark = spark
from data_gen import *

In [None]:
if should_gen:
    # Non-nullable integers, data_gen_length of them, generated as a single slice.
    int_gen = IntegerGen(nullable=False)
    dfgen = unary_op_df(spark=spark, gen=int_gen, length=data_gen_length, num_slices=1)

In [None]:
# if should_gen: dfgen = spark.range(0, 1 << 28)

In [None]:
# Persist the generated DataFrame (when generation is enabled).
if should_gen:
    dfgen.write.mode('overwrite').parquet(dfgen_path)

# Replicate only when exactly one parquet file is present, i.e. right after a
# fresh generation; once the copies exist this cell leaves the directory alone.
generated_files = glob.glob(f"{dfgen_path}/*.parquet")
if len(generated_files) == 1:
    orig_path = generated_files[0]
    print(f"replicating generated file {orig_path}\n")
    for copy_idx in range(num_copies):
        copy_path = f"{dfgen_path}/part-00000-copy-{copy_idx}.snappy.parquet"
        shutil.copyfile(src=orig_path, dst=copy_path)

## Repro for OutOfCore Sort Spilling 

In [None]:
if should_sort:
    df = spark.read.parquet(dfgen_path)
    df.printSchema()
    # Sort descending on column 'a' and write the result out as parquet.
    descending_a = col('a').desc()
    q2 = df.orderBy(descending_a)
    q2.write.mode('overwrite').parquet(f"{work_dir}/q2")

## Broadcast Join Not Spilling and OOMing

Allocate a huge buffer

In [None]:
if should_join:
    # execute a few times
    # Deliberately unrealistic scenario: the small table is on the left.
    df = (
        spark.read.parquet(dfgen_path)
        .sample(0.0000001)
        .repartition(1)
    )

    # Bigger table on the right, with its column renamed from 'a' to 'b'.
    rhs = (
        spark.read.parquet(dfgen_path)
        .sample(0.01)
        .withColumnRenamed('a', 'b')
        .repartition(1)
    )
    # Cache the right side and mark it for broadcast.
    bdf = broadcast(rhs.cache())

In [None]:
if should_join:
    # Equi-join the small left table against the broadcast side; only the
    # left key column is kept in the output.
    join_cond = df.a == bdf.b
    q3 = df.join(bdf, join_cond).select(df.a)
    q3.printSchema()
    q3.write.mode('overwrite').parquet(f"{work_dir}/q3")

In [None]:
# Holds the device buffers allocated by the next cell so they can be closed later.
# NOTE(review): re-running this cell drops the references to any buffers that
# have not been closed yet.
hugeBuffers = []

In [None]:
# Allocate a 100 MiB cudf device buffer and keep a reference to it
# (presumably run repeatedly to pile up GPU memory pressure).
hugeBuffers.append(spark._jvm.ai.rapids.cudf.DeviceMemoryBuffer.allocate(100 * 1024 * 1024))

In [None]:
# Release the device buffers held in hugeBuffers. Each buffer is removed from
# the list before it is closed, so re-running this cell never calls close()
# on an already-closed buffer (cudf rejects closing a buffer too many times).
while hugeBuffers:
    hugeBuffers.pop().close()