Start jupyter-lab

```bash
jupyter-lab --notebook-dir=$HOME/gits/NVIDIA/spark-rapids-examples
```
Alternatively, open this notebook directly in VS Code.

In [None]:
# Easy imports
import findspark
import glob
import os
import shutil
import sys
import tempfile

In [None]:
# Environment: locations, versions and process-wide variables used by later cells
home_dir = os.environ['HOME']
work_dir = home_dir + "/jupyter_run_dir"
rapids_version = '22.12.0-SNAPSHOT'
tz = 'UTC'
# Exported in one go: time zone, the Spark distribution to use, and the
# parallelism consumed by the shim build command (${PARALLEL_LEVEL})
os.environ.update({
    'TZ': tz,
    'SPARK_HOME': home_dir + "/dist/spark-3.1.1-bin-hadoop3.2",
    'PARALLEL_LEVEL': "2",
})
# Make the spark-rapids integration-test Python modules importable
sys.path.append(home_dir + "/gits/NVIDIA/spark-rapids/integration_tests/src/main/python")

In [None]:
# "Feature" switches
# Each switch gates one or more of the cells below.
# should_gen: build the generated DataFrame and write it to dfgen_path; also
# selects the largest GPU allocation size in the settings cell.
should_gen: bool = False
# should_replicate: copy the first generated parquet file num_copies times.
should_replicate: bool = False
# should_join: run the broadcast-join cells (and use their GPU allocation size).
should_join: bool = True
# should_sort: run the sort cell (and use its GPU allocation size).
should_sort: bool = False

## build spark-rapids-jni
def _run_build_step(description, command):
    """Run ``command`` through the shell and stop the cell if it fails.

    ``os.system`` only returns a status; without checking it a failed build
    would silently fall through to the next step and, eventually, to a
    session that loads a stale or missing jar.

    Args:
        description: short human-readable name of the step, used in the error.
        command: shell command line, passed to ``os.system`` unchanged.

    Raises:
        RuntimeError: if ``os.system`` reports a non-zero status.
    """
    status = os.system(command)
    if status != 0:
        raise RuntimeError(
            f"{description} failed (os.system status {status}): {command}")


# spark-rapids-jni is built first, from its own checkout
os.chdir(f"{home_dir}/gits/NVIDIA/spark-rapids-jni")
_run_build_step(
    'spark-rapids-jni build',
    '''build/build-in-docker install -DGPU_ARCHS=NATIVE -DBUILD_TESTS=OFF -DskipTests -Dmaven.javadoc.skip''')

# Build shims
# SV lists the build versions; the shell expands it, one mvn run per version,
# with up to ${PARALLEL_LEVEL} runs in parallel via xargs.
os.environ['SV'] = '311 330'
os.chdir(f"{home_dir}/gits/NVIDIA/spark-rapids")
_run_build_step(
    'spark-rapids shim build',
    '''printf '%s\n' $SV | xargs -I% -n 1 -P${PARALLEL_LEVEL} \
    mvn -B install -pl aggregator -am -Dbuildver=% \
        -Dmaven.javadoc.skip \
        -Dskip -DskipTests -Dmaven.javadoc.skip \
        -Dmaven.test.skip''')

# Build spark-rapids dist jar
# Runs from the spark-rapids checkout entered above; SV is turned into a
# comma-separated list for -Dincluded_buildvers.
_run_build_step(
    'spark-rapids dist jar build',
    '''mvn -B package -pl dist -Ddist.jar.compress=false -Dincluded_buildvers=$(printf '%s,' $SV)''')

In [None]:
# Easy settings
# GPU allocation size per scenario; the first enabled switch wins
# (gen, then join, then sort).
if should_gen:
    gpu_alloc_size = '5000m'
elif should_join:
    gpu_alloc_size = '512m'
elif should_sort:
    gpu_alloc_size = '128m'
else:
    # No scenario selected: still define a value so the SparkConf cell does
    # not fail with NameError. The join size is reused as the fallback.
    gpu_alloc_size = '512m'

cores_per_exec = 1

# local-cluster[workers, cores per worker, memory per worker in MB]
spark_master = f"local-cluster[1,{cores_per_exec},10000]"
# spark_master = f"local[{cores_per_exec}]"

# data gen settings
data_gen_length = 100*1000*1000 # generate one file
num_copies = 20 # and replicate it this many times

dfgen_path = f"{work_dir}/dfgen"

# debugger string
# Passed to the executor JVM options; with server=n the JVM connects out to
# localhost:5005. Switch to the empty string below to run without it.
jdwp = '-agentlib:jdwp=transport=dt_socket,server=n,address=localhost:5005'
# jdwp = ''

In [None]:
# Classes whose loggers are switched to ALL, both in the generated log4j
# properties file and, later, directly on the running JVM.
debugLogClasses = [
    'com.nvidia.spark.rapids.DeviceMemoryEventHandler',
    'com.nvidia.spark.rapids.RapidsBufferStore',
    'com.nvidia.spark.rapids.RapidsDeviceMemoryStore',
    'com.nvidia.spark.rapids.RapidsHostMemoryStore',
    'com.nvidia.spark.rapids.RapidsDiskStore',
    'com.nvidia.spark.rapids.RapidsGdsStore',
]

# mkstemp creates and opens the file atomically; the deprecated mktemp only
# returned a name, leaving a window in which another process could claim it.
log4j_fd, log4j_fname = tempfile.mkstemp(suffix=".properties", prefix="log4j")
print(f"Writing log4j conf to {log4j_fname}")
with os.fdopen(log4j_fd, 'w') as log4j_fobj:
    # Base configuration: INFO on the root logger, console appender on stderr
    log4j_fobj.write('''
log4j.rootCategory=INFO, console
log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.target=System.err
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n
''')
    # One logger override per class of interest
    log4j_fobj.writelines(f"log4j.logger.{d}=ALL\n" for d in debugLogClasses)
# File name without its directory; shown as the cell output
log4j_name_comp = os.path.basename(log4j_fname)
log4j_name_comp

In [None]:
# Initialise findspark, then register the locally built RAPIDS dist jar
# (its file name is derived from rapids_version).
findspark.init()
rapids_dist_jar = f"{home_dir}/gits/NVIDIA/spark-rapids/dist/target/rapids-4-spark_2.12-{rapids_version}-cuda11.jar"
findspark.add_jars(rapids_dist_jar)

In [None]:
import pyspark
from pyspark.sql.functions import *

# Configuration for the spill experiments. loadDefaults=False: only the
# entries listed here are set on this conf object.
spark_rapids_conf = pyspark.SparkConf(loadDefaults=False)\
    .setAll([
        # Driver and executor both get refcount debugging and the generated
        # log4j properties file; the executor additionally gets the jdwp string.
        ('spark.driver.extraJavaOptions', f"-Dai.rapids.refcount.debug=true -Dlog4j.debug=true -Dlog4j.configuration=file://{log4j_fname}"),
        ('spark.driver.memory', '8g'),
        ('spark.driver.maxResultSize', '2g'),
        ('spark.executor.memory', '8g'),
        ('spark.executor.extraJavaOptions', f"-Dai.rapids.refcount.debug=true {jdwp} -Dlog4j.debug=true -Dlog4j.configuration=file://{log4j_fname}"),
        ('spark.executorEnv.TZ', tz),
        ('spark.plugins', 'com.nvidia.spark.SQLPlugin'),
        ('spark.rpc.message.maxSize', 2047),
        # Fail fast: a single task failure fails the job
        ('spark.task.maxFailures', 1),
        # ('spark.rapids.memory.gpu.allocFraction', 0.2),
        ('spark.rapids.memory.gpu.allocSize', gpu_alloc_size),
        ('spark.rapids.memory.gpu.oomDumpDir', f"{work_dir}/gpuOoms"),
        # ('spark.rapids.memory.gpu.minAllocFraction', 0.1),
        # ('spark.rapids.memory.gpu.maxAllocFraction', 0.5),
        # reader and target batch sizes to avoid running OOM on a single batch
        ('spark.rapids.sql.batchSizeBytes', '16m'),
        ('spark.rapids.sql.explain', 'ALL'),
        ('spark.rapids.sql.reader.batchSizeBytes', '16m'),
        ('spark.sql.adaptive.enabled', True),
        # Keep the core counts in step with spark_master, which is built from
        # the same cores_per_exec setting (one worker, so max == per-executor).
        ('spark.executor.cores', cores_per_exec),
        ('spark.cores.max', cores_per_exec),
    ])
spark = pyspark.sql.SparkSession.builder\
    .config(conf=spark_rapids_conf)\
    .appName('Spill Experiments Notebook')\
    .master(spark_master)\
    .getOrCreate()

In [None]:
# Bare expression: the notebook renders the session object as the cell output
spark

In [None]:
# Call RapidsPluginUtils.loadProps on the JVM for the cudf-java properties
# resource; the returned value is shown as the cell output.
# NOTE(review): presumably the version info bundled in the jar - confirm.
spark._jvm.com.nvidia.spark.rapids.RapidsPluginUtils.loadProps("cudf-java-version-info.properties")

In [None]:
# Call RapidsPluginUtils.loadProps on the JVM for the spark-rapids-jni
# properties resource; the returned value is shown as the cell output.
# NOTE(review): presumably the version info bundled in the jar - confirm.
spark._jvm.com.nvidia.spark.rapids.RapidsPluginUtils.loadProps("spark-rapids-jni-version-info.properties")

In [None]:
# Call RapidsPluginUtils.loadProps on the JVM for the rapids4spark properties
# resource; the returned value is shown as the cell output.
# NOTE(review): presumably the version info bundled in the jar - confirm.
spark._jvm.com.nvidia.spark.rapids.RapidsPluginUtils.loadProps("rapids4spark-version-info.properties")

In [None]:
# Turn on full logging for the RAPIDS store classes directly on the running
# JVM. Per the original author's note, the log4j properties file is not
# enough here because the REPL resets the log level of the root logger.
jvm_log4j = spark._jvm.org.apache.log4j
level_all = jvm_log4j.Level.ALL
for logger_name in debugLogClasses:
    jvm_log4j.Logger.getLogger(logger_name).setLevel(level_all)

## Generate Data 

In [None]:
# Hand this notebook's session to the integration-test helpers before
# data_gen is imported (presumably data_gen reads it from there - confirm).
import spark_init_internal
spark_init_internal._spark = spark
from data_gen import *

In [None]:
# Smoke test for the data generator: three non-null timestamps in one slice
smoke_df = unary_op_df(
    spark=spark,
    gen=TimestampGen(nullable=False),
    length=3,
    num_slices=1,
)
smoke_df.show(truncate=False)

In [None]:
# Build the generated DataFrame (non-null integers, a single slice) only when
# generation is switched on
if should_gen:
    int_gen = IntegerGen(nullable=False)
    dfgen = unary_op_df(spark=spark, gen=int_gen, length=data_gen_length, num_slices=1)

In [None]:
# Persist the generated data, replacing any previous output
if should_gen:
    dfgen.write.mode('overwrite').parquet(dfgen_path)

# glob returns files in arbitrary order; sorting makes the choice of the file
# to replicate reproducible between runs.
generated_files = sorted(glob.glob(f"{dfgen_path}/*.parquet"))
if should_replicate and generated_files:
    orig_path = generated_files[0]
    print(f"replicating generated file {orig_path}\n")
    for i in range(num_copies):
        copy_path = f"{dfgen_path}/part-00000-copy-{i}.snappy.parquet"
        try:
            shutil.copyfile(src=orig_path, dst=copy_path)
        except shutil.SameFileError:
            # On a re-run the glob also matches earlier copies, so the source
            # can be this very copy; nothing to do in that case.
            pass
        except OSError as err:
            # Replication stays best-effort, but a failed copy is reported
            # instead of being silently dropped.
            print(f"could not create {copy_path}: {err}")

## Repro for OutOfCore Sort Spilling 

In [None]:
# Sort scenario: read the generated data, order by column 'a' descending and
# write the result under work_dir
if should_sort:
    df = spark.read.parquet(dfgen_path)
    df.printSchema()
    descending_a = col('a').desc()
    q2 = df.orderBy(descending_a)
    q2_writer = q2.write.mode('overwrite')
    q2_writer.parquet(f"{work_dir}/q2")

## Broadcast Join Not Spilling and OOMing

In [None]:
# Inputs for the broadcast-join scenario (meant to be executed a few times)
if should_join:
    # deliberately odd setup: the small table sits on the left-hand side ...
    df = spark.read.parquet(dfgen_path).limit(100)
    # ... and the bigger table, which gets the broadcast hint, on the right
    rhs = spark.read.parquet(dfgen_path).limit(500000)
    bdf = broadcast(rhs)

In [None]:
# Chain three joins against the broadcast table, renaming its column 'a' to a
# fresh name each time; the first two join results are capped before the next
# join, the last one is not.
if should_join:
    row_cap = 500000
    joined_b = df.join(bdf.withColumnRenamed('a', 'b'), col('a') == col('b')).limit(row_cap)
    joined_c = joined_b.join(bdf.withColumnRenamed('a', 'c'), col('a') == col('c')).limit(row_cap)
    q3 = joined_c.join(bdf.withColumnRenamed('a', 'd'), col('a') == col('d'))
    q3.printSchema()
    q3.write.mode('overwrite').parquet(f"{work_dir}/q3")

Allocate a huge buffer

In [None]:
# Buffers allocated by the (commented-out) allocation cell are collected here
# so the cleanup cell can close them.
hugeBuffers: list = []

In [None]:
# repeat a few times until you see a spill message
# hugeBuffers.append(spark._jvm.ai.rapids.cudf.DeviceMemoryBuffer.allocate(100 * 1024 * 1024))

In [None]:
# Close every collected buffer. Cleanup stays best-effort (one failure must
# not prevent closing the rest), but failures are reported, and
# KeyboardInterrupt / SystemExit are no longer swallowed.
for buf in hugeBuffers:
    try:
        buf.close()
    except Exception as err:
        print(f"failed to close buffer: {err}")

If the cell above generates a heap dump, load it into VisualVM and run the following OQL query:
```oql
select dmb 
from ai.rapids.cudf.DeviceMemoryBuffer dmb
where dmb.refCount != 0 && dmb.cleaner != null && dmb.cleaner.leakExpected
``` 