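Start jupyter-lab with the environment variables this notebook relies on (`PYTHONPATH` for the spark-rapids integration-test helpers, `SPARK_HOME` for the Spark distribution, and `TZ=UTC`):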

```bash
PYTHONPATH=$HOME/gits/NVIDIA/spark-rapids/integration_tests/src/main/python \
SPARK_HOME=$HOME/dist/spark-3.1.1-bin-hadoop3.2 \
TZ=UTC \
jupyter-lab --notebook-dir=$HOME/gits/NVIDIA/spark-rapids-examples
```

In [1]:
# Stage toggles: each flag enables one section of the notebook.
#   should_gen  - (re)generate the input parquet data
#   should_join - run the broadcast-join experiment
#   should_sort - run the sort experiment
should_gen, should_join, should_sort = False, True, False

In [2]:
# GPU allocation size: large while generating data, deliberately small otherwise.
if should_gen:
    gpu_alloc_size = '5000m'
else:
    gpu_alloc_size = '128m'

cores_per_exec = 4
dfgen_path = '/tmp/dfgen'

# Generate one file of this length ...
data_gen_length = 100_000_000
# ... and replicate it this many times.
num_copies = 10

In [3]:
import os

import findspark

findspark.init()

# Locate the locally built RAPIDS Accelerator jar relative to the current
# user's home directory instead of hard-coding one developer's /home path.
# The checkout location matches the $HOME/gits/NVIDIA/spark-rapids layout
# used by the jupyter-lab launch command.
rapids_jar = os.path.expanduser(
    '~/gits/NVIDIA/spark-rapids/dist/target/'
    'rapids-4-spark_2.12-22.10.0-SNAPSHOT-cuda11.jar'
)
findspark.add_jars(rapids_jar)

In [4]:
import pyspark
from pyspark.sql.functions import *

# Start from an empty configuration (loadDefaults=False) so that only the
# settings listed here are applied.
conf = pyspark.SparkConf(loadDefaults=False).setAll([
    # JVM sizing for driver and executor
    ('spark.driver.memory', '8g'),
    ('spark.driver.maxResultSize', '2g'),
    ('spark.executor.memory', '8g'),
    # Uncomment to attach a debugger to the executor JVM:
    # ('spark.executor.extraJavaOptions',
    #      '-agentlib:jdwp=transport=dt_socket,server=y,address=localhost:5005'),
    # Enable the RAPIDS Accelerator plugin
    ('spark.plugins', 'com.nvidia.spark.SQLPlugin'),
    ('spark.rpc.message.maxSize', 2047),
    # Fail the job on the first task failure instead of retrying
    ('spark.task.maxFailures', 1),
    # GPU memory pool: fixed size taken from gpu_alloc_size; the
    # fraction-based alternatives are kept for reference.
    # ('spark.rapids.memory.gpu.allocFraction', 0.2),
    ('spark.rapids.memory.gpu.allocSize', gpu_alloc_size),
    # ('spark.rapids.memory.gpu.minAllocFraction', 0.1),
    # ('spark.rapids.memory.gpu.maxAllocFraction', 0.5),
    # Small reader and target batch sizes so that a single batch does not
    # run out of memory (OOM) on its own
    ('spark.rapids.sql.batchSizeBytes', '16m'),
    ('spark.rapids.sql.explain', 'ALL'),
    ('spark.rapids.sql.reader.batchSizeBytes', '16m'),
    ('spark.sql.adaptive.enabled', False),
])

# local-cluster master: a single worker with cores_per_exec cores and
# 10000 MB of memory.
spark = (
    pyspark.sql.SparkSession.builder
    .appName('Spill Experiments Notebook')
    .master(f"local-cluster[1,{cores_per_exec},10000]")
    .config(conf=conf)
    .getOrCreate()
)

22/09/20 20:38:03 WARN Utils: Your hostname, gshegalov-dual-5760 resolves to a loopback address: 127.0.1.1; using 10.0.0.133 instead (on interface wlp0s20f3)
22/09/20 20:38:03 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
22/09/20 20:38:03 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/09/20 20:38:04 WARN RapidsPluginUtils: RAPIDS Accelerator 22.10.0-SNAPSHOT using cudf 22.10.0-SNAPSHOT.
22/09/20 20:38:04 WARN RapidsPluginUtils: RAPIDS Accelerator is enabled, to disable GPU support set `spark.rapids.sql.enabled` to false.
22/09/20 20:38:04 WARN RapidsPluginUtils: spark.rapids.sql.explain is set to `ALL`. Set it to 'NONE' to suppress the diagnostics logging about the query plac

In [5]:
# Bare expression: shows the SparkSession created above as this cell's output.
spark

## Generate Data 

In [6]:
import spark_init_internal

# Hand this notebook's session to the integration-test helpers by setting the
# module's private `_spark` attribute.
# NOTE(review): presumably data_gen obtains its session through
# spark_init_internal - confirm against the integration_tests sources.
spark_init_internal._spark = spark

from data_gen import *

In [7]:
# Build the generated DataFrame only when data generation is enabled:
# non-nullable integers, data_gen_length long, in a single slice.
if should_gen:
    dfgen = unary_op_df(spark=spark,
                        gen=IntegerGen(nullable=False),
                        length=data_gen_length,
                        num_slices=1)

In [8]:
# if should_gen: dfgen = spark.range(0, 1 << 28)

In [9]:
import glob
import shutil

# Persist the generated DataFrame as parquet, replacing any earlier output.
if should_gen:
    dfgen.write.mode('overwrite').parquet(dfgen_path)

# Replicate the data only when the directory holds exactly one parquet file,
# so re-running this cell after the copies exist does nothing.
generated_files = glob.glob(f"{dfgen_path}/*.parquet")
if len(generated_files) == 1:
    orig_path = generated_files[0]
    print(f"replicating generated file {orig_path}\n")
    for i in range(num_copies):
        shutil.copyfile(
            src=orig_path,
            dst=f"{dfgen_path}/part-00000-copy-{i}.snappy.parquet",
        )

## Repro for OutOfCore Sort spilling 

In [10]:
# Sort experiment: read the generated data, order it by column `a` in
# descending order and write the result to /tmp/q2.
if should_sort:
    df = spark.read.parquet(dfgen_path)
    df.printSchema()
    q2 = df.orderBy(
        col('a').desc()
    )
    q2.write.mode('overwrite').parquet(
        '/tmp/q2'
    )

## Broadcast Join

In [13]:
# Broadcast-join experiment.
#
# NOTE: this cell reads /tmp/q2, which is written by the sort cell. With
# should_sort = False it relies on output left on disk by an earlier run.
if should_join:
    df = spark.read.parquet(dfgen_path)

    # Build side: a 10% sample of the sorted data with `a` renamed to `b`.
    # The seed is pinned so every run samples the same rows and the
    # experiment is reproducible.
    bdf = (
        spark.read.parquet('/tmp/q2')
        .sample(fraction=0.1, seed=42)
        .withColumnRenamed('a', 'b')
    )

    # Force a broadcast of the sampled side and join on a == b.
    q3 = df.join(broadcast(bdf), df.a == bdf.b)
    q3.printSchema()
    q3.write.mode('overwrite').parquet('/tmp/q3')

root
 |-- a: integer (nullable = true)
 |-- b: integer (nullable = true)



22/09/20 20:42:28 WARN GpuOverrides: 
*Exec <DataWritingCommandExec> will run on GPU
  *Output <InsertIntoHadoopFsRelationCommand> will run on GPU
  *Exec <BroadcastHashJoinExec> will run on GPU
    *Exec <FilterExec> will run on GPU
      *Expression <IsNotNull> isnotnull(a#20) will run on GPU
      *Exec <FileSourceScanExec> will run on GPU
    *Exec <BroadcastExchangeExec> will run on GPU
      *Exec <ProjectExec> will run on GPU
        *Expression <Alias> a#22 AS b#24 will run on GPU
        *Exec <FilterExec> will run on GPU
          *Expression <IsNotNull> isnotnull(a#22) will run on GPU
          *Exec <SampleExec> will run on GPU
            *Exec <FileSourceScanExec> will run on GPU

22/09/20 20:42:39 WARN TaskSetManager: Lost task 2.0 in stage 9.0 (TID 42) (10.0.0.133 executor 0): java.lang.OutOfMemoryError: Could not allocate native memory: std::bad_alloc: out_of_memory: RMM failure at:/home/jenkins/agent/workspace/jenkins-spark-rapids-jni_nightly-dev-219-cuda11/thirdparty

Py4JJavaError: An error occurred while calling o186.parquet.
: org.apache.spark.SparkException: Job aborted.
	at org.apache.spark.sql.rapids.GpuFileFormatWriter$.write(GpuFileFormatWriter.scala:264)
	at org.apache.spark.sql.rapids.GpuInsertIntoHadoopFsRelationCommand.runColumnar(GpuInsertIntoHadoopFsRelationCommand.scala:168)
	at com.nvidia.spark.rapids.GpuDataWritingCommandExec.sideEffectResult$lzycompute(GpuDataWritingCommandExec.scala:114)
	at com.nvidia.spark.rapids.GpuDataWritingCommandExec.sideEffectResult(GpuDataWritingCommandExec.scala:113)
	at com.nvidia.spark.rapids.GpuDataWritingCommandExec.doExecuteColumnar(GpuDataWritingCommandExec.scala:137)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeColumnar$1(SparkPlan.scala:207)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:218)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:215)
	at org.apache.spark.sql.execution.SparkPlan.executeColumnar(SparkPlan.scala:203)
	at com.nvidia.spark.rapids.GpuColumnarToRowExec.doExecute(GpuColumnarToRowExec.scala:320)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:180)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:218)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:215)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:176)
	at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:132)
	at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:131)
	at org.apache.spark.sql.DataFrameWriter.$anonfun$runCommand$1(DataFrameWriter.scala:989)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:103)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:772)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:989)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:438)
	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:415)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:293)
	at org.apache.spark.sql.DataFrameWriter.parquet(DataFrameWriter.scala:874)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:750)
Caused by: org.apache.spark.SparkException: Job aborted due to stage failure: Task 2 in stage 9.0 failed 1 times, most recent failure: Lost task 2.0 in stage 9.0 (TID 42) (10.0.0.133 executor 0): java.lang.OutOfMemoryError: Could not allocate native memory: std::bad_alloc: out_of_memory: RMM failure at:/home/jenkins/agent/workspace/jenkins-spark-rapids-jni_nightly-dev-219-cuda11/thirdparty/cudf/cpp/build/_deps/rmm-src/include/rmm/mr/device/limiting_resource_adaptor.hpp:143: Exceeded memory limit
	at ai.rapids.cudf.Rmm.allocInternal(Native Method)
	at ai.rapids.cudf.Rmm.alloc(Rmm.java:246)
	at ai.rapids.cudf.DeviceMemoryBuffer.allocate(DeviceMemoryBuffer.java:143)
	at ai.rapids.cudf.DeviceMemoryBuffer.allocate(DeviceMemoryBuffer.java:133)
	at ai.rapids.cudf.JCudfSerialization.readTableFrom(JCudfSerialization.java:1884)
	at org.apache.spark.sql.rapids.execution.SerializeConcatHostBuffersDeserializeBatch.$anonfun$batch$1(GpuBroadcastExchangeExec.scala:118)
	at org.apache.spark.sql.rapids.execution.SerializeConcatHostBuffersDeserializeBatch.$anonfun$batch$1$adapted(GpuBroadcastExchangeExec.scala:113)
	at com.nvidia.spark.rapids.Arm.withResource(Arm.scala:28)
	at com.nvidia.spark.rapids.Arm.withResource$(Arm.scala:26)
	at org.apache.spark.sql.rapids.execution.SerializeConcatHostBuffersDeserializeBatch.withResource(GpuBroadcastExchangeExec.scala:91)
	at org.apache.spark.sql.rapids.execution.SerializeConcatHostBuffersDeserializeBatch.batch(GpuBroadcastExchangeExec.scala:113)
	at org.apache.spark.sql.rapids.execution.GpuBroadcastHelper$.getBroadcastBatch(GpuBroadcastHelper.scala:44)
	at com.nvidia.spark.rapids.GpuBroadcastHashJoinExec.$anonfun$doExecuteColumnar$1(GpuBroadcastHashJoinExec.scala:171)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:863)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:863)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:750)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2253)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2202)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2201)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2201)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1078)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1078)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1078)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2440)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2382)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2371)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:868)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2202)
	at org.apache.spark.sql.rapids.GpuFileFormatWriter$.write(GpuFileFormatWriter.scala:233)
	... 39 more
Caused by: java.lang.OutOfMemoryError: Could not allocate native memory: std::bad_alloc: out_of_memory: RMM failure at:/home/jenkins/agent/workspace/jenkins-spark-rapids-jni_nightly-dev-219-cuda11/thirdparty/cudf/cpp/build/_deps/rmm-src/include/rmm/mr/device/limiting_resource_adaptor.hpp:143: Exceeded memory limit
	at ai.rapids.cudf.Rmm.allocInternal(Native Method)
	at ai.rapids.cudf.Rmm.alloc(Rmm.java:246)
	at ai.rapids.cudf.DeviceMemoryBuffer.allocate(DeviceMemoryBuffer.java:143)
	at ai.rapids.cudf.DeviceMemoryBuffer.allocate(DeviceMemoryBuffer.java:133)
	at ai.rapids.cudf.JCudfSerialization.readTableFrom(JCudfSerialization.java:1884)
	at org.apache.spark.sql.rapids.execution.SerializeConcatHostBuffersDeserializeBatch.$anonfun$batch$1(GpuBroadcastExchangeExec.scala:118)
	at org.apache.spark.sql.rapids.execution.SerializeConcatHostBuffersDeserializeBatch.$anonfun$batch$1$adapted(GpuBroadcastExchangeExec.scala:113)
	at com.nvidia.spark.rapids.Arm.withResource(Arm.scala:28)
	at com.nvidia.spark.rapids.Arm.withResource$(Arm.scala:26)
	at org.apache.spark.sql.rapids.execution.SerializeConcatHostBuffersDeserializeBatch.withResource(GpuBroadcastExchangeExec.scala:91)
	at org.apache.spark.sql.rapids.execution.SerializeConcatHostBuffersDeserializeBatch.batch(GpuBroadcastExchangeExec.scala:113)
	at org.apache.spark.sql.rapids.execution.GpuBroadcastHelper$.getBroadcastBatch(GpuBroadcastHelper.scala:44)
	at com.nvidia.spark.rapids.GpuBroadcastHashJoinExec.$anonfun$doExecuteColumnar$1(GpuBroadcastHashJoinExec.scala:171)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:863)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:863)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more


22/09/20 20:43:30 ERROR TaskSchedulerImpl: Lost executor 0 on 10.0.0.133: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
22/09/20 20:43:30 WARN TaskSetManager: Lost task 4.0 in stage 9.0 (TID 44) (10.0.0.133 executor 0): ExecutorLostFailure (executor 0 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
22/09/20 20:43:30 WARN TaskSetManager: Lost task 7.0 in stage 9.0 (TID 47) (10.0.0.133 executor 0): ExecutorLostFailure (executor 0 exited caused by one of the running tasks) Reason: Remote RPC client disassociated. Likely due to containers exceeding thresholds, or network issues. Check driver logs for WARN messages.
22/09/20 20:43:30 WARN TaskSetManager: Lost task 6.0 in stage 9.0 (TID 46) (10.0.0.133 executor 0): ExecutorLostFailure (executor 0 exited caused by one