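Start JupyterLab with the environment variables this notebook relies on (adjust the paths to your own checkout and Spark distribution):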

```bash
PYTHONPATH=$HOME/gits/NVIDIA/spark-rapids/integration_tests/src/main/python \
SPARK_HOME=$HOME/dist/spark-3.1.1-bin-hadoop3.2 \
TZ=UTC \
jupyter-lab --notebook-dir=$HOME/gits/NVIDIA/spark-rapids-examples
```

In [1]:
# Easy imports
import findspark
import glob
import os
import shutil

In [2]:
# Stage toggles, listed in the order the sections appear in this notebook.
# Each flag guards the cells of the matching section.
should_gen = False   # generate the parquet input (also selects the larger GPU pool size)
should_sort = False  # run the sort experiment
should_join = True   # run the broadcast-join experiment

In [3]:
# GPU memory pool handed to the RAPIDS plugin (spark.rapids.memory.gpu.allocSize):
# large while generating data, small for the other experiments.
# NOTE(review): the small pool is presumably what provokes spilling/OOM - confirm.
if should_gen:
    gpu_alloc_size = '5000m'
else:
    gpu_alloc_size = '128m'

cores_per_exec = 4
# Alternative master, kept for switching to a local-cluster run:
# spark_master = f"local-cluster[1,{cores_per_exec},10000]"
spark_master = f"local[{cores_per_exec}]"

# Everything the notebook writes lives under this directory.
work_dir = f"{os.environ['HOME']}/jupyter_run_dir"
dfgen_path = f"{work_dir}/dfgen"

data_gen_length = 100_000_000  # length passed to the generator; one file is generated
num_copies = 10  # and replicate it this many times

# JDWP agent options passed to the driver JVM. With server=n the JVM connects
# out to a debugger that is already listening on localhost:5005.
# NOTE(review): presumably the driver fails to start when no debugger is
# listening - confirm before running without one.
jdwp = '-agentlib:jdwp=transport=dt_socket,server=n,address=localhost:5005'

In [4]:
# Make pyspark importable from the Spark installation (findspark looks at
# SPARK_HOME, which the launch command for this notebook sets).
findspark.init()

# Build the plugin jar location from $HOME instead of a user-specific absolute
# path, consistent with the launch command and with work_dir.
rapids_jar = (
    f"{os.environ['HOME']}/gits/NVIDIA/spark-rapids/dist/target/"
    "rapids-4-spark_2.12-22.10.0-SNAPSHOT-cuda11.jar"
)
findspark.add_jars(rapids_jar)

In [5]:
import pyspark
# Wildcard import kept as is: later cells use col() and broadcast() from it.
from pyspark.sql.functions import *

# Settings for this experiment, applied in the order listed.
spark_settings = [
    ('spark.driver.extraJavaOptions', jdwp),
    ('spark.driver.memory', '8g'),
    ('spark.driver.maxResultSize', '2g'),
    ('spark.executor.memory', '8g'),
    # ('spark.executor.extraJavaOptions', jdwp),
    ('spark.plugins', 'com.nvidia.spark.SQLPlugin'),
    ('spark.rpc.message.maxSize', 2047),
    ('spark.task.maxFailures', 1),
    # ('spark.rapids.memory.gpu.allocFraction', 0.2),
    ('spark.rapids.memory.gpu.allocSize', gpu_alloc_size),
    ('spark.rapids.memory.gpu.oomDumpDir', f"{work_dir}/gpuOoms"),
    # ('spark.rapids.memory.gpu.minAllocFraction', 0.1),
    # ('spark.rapids.memory.gpu.maxAllocFraction', 0.5),
    # reader and target batch sizes to avoid running OOM on a single batch
    ('spark.rapids.sql.batchSizeBytes', '16m'),
    ('spark.rapids.sql.explain', 'ALL'),
    ('spark.rapids.sql.reader.batchSizeBytes', '16m'),
    ('spark.sql.adaptive.enabled', False),
]

# loadDefaults=False: start from an empty configuration rather than one
# pre-populated from Java system properties.
conf = pyspark.SparkConf(loadDefaults=False)
conf.setAll(spark_settings)

spark = (
    pyspark.sql.SparkSession.builder
    .appName('Spill Experiments Notebook')
    .master(spark_master)
    .config(conf=conf)
    .getOrCreate()
)

22/09/22 05:14:05 WARN Utils: Your hostname, gshegalov-dual-5760 resolves to a loopback address: 127.0.1.1; using 172.17.0.1 instead (on interface docker0)
22/09/22 05:14:05 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
22/09/22 05:14:05 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/09/22 05:14:06 WARN RapidsPluginUtils: RAPIDS Accelerator 22.10.0-SNAPSHOT using cudf 22.10.0-SNAPSHOT.
22/09/22 05:14:06 WARN RapidsPluginUtils: RAPIDS Accelerator is enabled, to disable GPU support set `spark.rapids.sql.enabled` to false.
22/09/22 05:14:06 WARN RapidsPluginUtils: spark.rapids.sql.explain is set to `ALL`. Set it to 'NONE' to suppress the diagnostics logging about the query placem

In [6]:
# Bare expression: renders the SparkSession created above as this cell's output.
spark

## Generate Data 

In [7]:
import spark_init_internal
# Store this notebook's session on the spark_init_internal module before
# data_gen is imported.
# NOTE(review): presumably the data_gen helpers obtain their session from
# spark_init_internal._spark - confirm against the integration-test sources.
spark_init_internal._spark = spark
from data_gen import *

In [8]:
# Build the source DataFrame of non-nullable integers in a single slice.
# Only runs when data generation is enabled.
if should_gen:
    dfgen = unary_op_df(
        spark=spark,
        length=data_gen_length,
        num_slices=1,
        gen=IntegerGen(nullable=False),
    )

In [9]:
# if should_gen: dfgen = spark.range(0, 1 << 28)

In [10]:
# Persist the generated DataFrame (when enabled), replacing any previous output.
if should_gen:
    dfgen.write.mode('overwrite').parquet(dfgen_path)

# Replicate only while exactly one parquet file exists, so re-running this
# cell after the copies were made does not copy again.
generated_files = glob.glob(f"{dfgen_path}/*.parquet")
if len(generated_files) == 1:
    [orig_path] = generated_files
    print(f"replicating generated file {orig_path}\n")
    for i in range(num_copies):
        copy_path = f"{dfgen_path}/part-00000-copy-{i}.snappy.parquet"
        shutil.copyfile(src=orig_path, dst=copy_path)

## Repro for Out-of-Core Sort Spilling

In [11]:
# Sort experiment: read the data back, order it by column 'a' descending,
# and write the result under the work directory.
if should_sort:
    df = spark.read.parquet(dfgen_path)
    df.printSchema()
    descending_a = col('a').desc()
    q2 = df.orderBy(descending_a)
    q2.write.mode('overwrite').parquet(f"{work_dir}/q2")

## Broadcast Join Not Spilling and OOMing

Allocate a huge buffer

In [None]:
if should_join:
    # execute this cell a few times
    # deliberately nonsensical scenario: the small table is on the left
    df = (
        spark.read.parquet(dfgen_path)
        .sample(0.0000001)
        .repartition(1)
    )

    # the bigger table is on the right and is the side that gets broadcast
    bdf = (
        spark.read.parquet(dfgen_path)
        .sample(0.001)
        .withColumnRenamed('a', 'b')
        .repartition(1)
    )
    bdf1 = broadcast(bdf)

    # equi-join on the integer column, keeping only the left side's column
    join_condition = df.a == bdf1.b
    q3 = df.join(bdf1, join_condition).select(df.a)
    q3.printSchema()
    q3.write.mode('overwrite').parquet(f"{work_dir}/q3")

root
 |-- a: integer (nullable = true)



22/09/22 05:37:53 WARN GpuOverrides: 
*Exec <DataWritingCommandExec> will run on GPU
  *Output <InsertIntoHadoopFsRelationCommand> will run on GPU
  *Exec <ProjectExec> will run on GPU
    *Exec <BroadcastHashJoinExec> will run on GPU
      *Exec <ShuffleExchangeExec> will run on GPU
        *Partitioning <RoundRobinPartitioning> will run on GPU
        *Exec <FilterExec> will run on GPU
          *Expression <IsNotNull> isnotnull(a#36) will run on GPU
          *Exec <SampleExec> will run on GPU
            *Exec <FileSourceScanExec> will run on GPU
      *Exec <BroadcastExchangeExec> will run on GPU
        *Exec <ShuffleExchangeExec> will run on GPU
          *Partitioning <RoundRobinPartitioning> will run on GPU
          *Exec <ProjectExec> will run on GPU
            *Expression <Alias> a#38 AS b#40 will run on GPU
            *Exec <FilterExec> will run on GPU
              *Expression <IsNotNull> isnotnull(a#38) will run on GPU
              *Exec <SampleExec> will run on GPU
 

In [13]:
# hugeBuffer = spark._jvm.ai.rapids.cudf.DeviceMemoryBuffer.allocate(4*1024*1024*1024)
# hugeBuffer.close()