Start jupyter-lab

```bash
jupyter-lab --notebook-dir=$HOME/gits/gerashegalov/rapids-shell/src/jupyter
```
Alternatively, open this notebook in VS Code with the Jupyter extension installed.

# Repro for [NVIDIA/spark-rapids#8625](https://github.com/NVIDIA/spark-rapids/issues/8625)

In [1]:
import datetime
import glob
import os
import time

import fastparquet
import findspark

### Versions

In [2]:
# JVM toolchain: selects the JDK directory and the Scala suffix of the RAPIDS artifact.
java_version = '8'
scala_version = '2.12'

# Spark distribution: both values appear in the name of the unpacked Spark directory.
spark_version = '3.1.1'
hadoop_version = '3.2'

# RAPIDS Accelerator build: version plus the CUDA classifier of the jar file name.
rapids_version = '23.10.0-SNAPSHOT'
cuda_version = 'cuda11'

### Environment Variables

In [3]:
# JDK used to launch Spark, chosen through `java_version` (Debian/Ubuntu OpenJDK layout).
os.environ['JAVA_HOME'] = f"/usr/lib/jvm/java-{java_version}-openjdk-amd64"
# Keep Spark on the loopback interface; this repro runs entirely on the local machine.
os.environ['SPARK_LOCAL_IP'] = '127.0.0.1'
# Run in UTC so the timestamps shown by the cells below do not depend on the host time zone.
# Processes started after this point inherit the variable from the kernel's environment.
os.environ['TZ'] = 'UTC'
# Assigning TZ alone is not guaranteed to change the local-time rules of this already
# running interpreter; tzset() re-reads TZ so Python-side conversions use UTC as well.
# (Unix only, like the JAVA_HOME path above.)
time.tzset()

In [4]:
# Everything below is resolved relative to the current user's home directory.
home_dir = os.environ['HOME']
# Scratch directory name; not referenced by any other cell visible in this notebook.
work_dir = f"{home_dir}/jupyter_run_dir"
# Local Maven repository from which the RAPIDS jar is picked up.
m2_local_repo = f"{home_dir}/.m2/repository"
# Maven coordinates of the RAPIDS Accelerator artifact.
groupId = "com.nvidia"
artifactId = f"rapids-4-spark_{scala_version}"
# Jar file name: <artifactId>-<version>-<CUDA classifier>.jar
dist_jar = f"{artifactId}-{rapids_version}-{cuda_version}.jar"
# Unpacked binary Spark distribution matching the versions chosen above.
spark_home = f"{home_dir}/dist/spark-{spark_version}-bin-hadoop{hadoop_version}"
# Alternative: uncomment to point at a Spark source checkout instead.
# spark_home = f"{home_dir}/gits/apache/spark"

### Find Spark 

In [5]:
# Point findspark at the selected Spark distribution.
findspark.init(spark_home = spark_home)

# A Maven repository stores artifacts under <groupId with '.' replaced by '/'>/<artifactId>/<version>/.
# Derive that directory from `groupId` rather than repeating it as a literal, so the
# coordinates defined above remain the single source of truth for the jar location.
group_path = groupId.replace('.', '/')
findspark.add_jars(f"{m2_local_repo}/{group_path}/{artifactId}/{rapids_version}/{dist_jar}")

# Deliberately imported here rather than in the import cell: pyspark is located through
# the findspark.init() call above.
import pyspark

### Configure Spark

In [6]:
# Local-mode master with a fixed number of worker threads.
cores_per_exec = 1
spark_master = f"local[{cores_per_exec}]"

# JDWP agent: the driver JVM listens on port 5005 for a debugger and does not
# wait for one to attach before starting (suspend=n).
jdwp = '-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=5005'

# Each config() call returns the builder itself, so the chain below configures
# the same builder object that ends up bound to `spark_builder`.
spark_builder = (
    pyspark.sql.SparkSession.builder
    .config('spark.app.name', 'RAPIDS PySpark Notebook')
    .config('spark.driver.extraJavaOptions', f"-Dai.rapids.cudf.preserve-dependencies=true {jdwp}")
    .config('spark.master', spark_master)
    # Load the RAPIDS Accelerator and have it log where every operator is placed.
    .config('spark.plugins', 'com.nvidia.spark.SQLPlugin')
    .config('spark.rapids.sql.explain', 'ALL')
)

spark = spark_builder.getOrCreate()

Listening for transport dt_socket at address: 5005


23/07/29 19:25:30 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/07/29 19:25:31 WARN RapidsPluginUtils: RAPIDS Accelerator 23.10.0-SNAPSHOT using cudf 23.10.0-SNAPSHOT.
23/07/29 19:25:31 WARN RapidsPluginUtils: RAPIDS Accelerator is enabled, to disable GPU support set `spark.rapids.sql.enabled` to false.
23/07/29 19:25:31 WARN RapidsPluginUtils: spark.rapids.sql.explain is set to `ALL`. Set it to 'NONE' to suppress the diagnostics logging about the query placement on the GPU.


# Test Scenario

In [7]:
# One-field row holding a timestamp roughly a thousand years in the future; this is
# the value the write/read round-trips below are checked against.
ts_tup = (
    # Present-day counterpart (same instant in 2023) - presumably an in-range control; swap in to compare:
    # datetime.datetime(2023, 7, 14, 7, 38, 45, 418688),
    datetime.datetime(
        year = 3023, month = 7, day = 14,
        hour = 7, minute = 38, second = 45, microsecond = 418688,
    ),
)
ts_tup

(datetime.datetime(3023, 7, 14, 7, 38, 45, 418688),)

In [8]:
# Single-row DataFrame with one timestamp column named `big_ts`; the schema is given
# as a DDL string so the column type is explicit rather than inferred.
df = spark.createDataFrame([ts_tup], schema = 'big_ts timestamp')
df.printSchema()
df.show(truncate = False)

root
 |-- big_ts: timestamp (nullable = true)



23/07/29 19:25:42 WARN GpuOverrides: 
!Exec <CollectLimitExec> cannot run on GPU because the Exec CollectLimitExec has been disabled, and is disabled by default because Collect Limit replacement can be slower on the GPU, if huge number of rows in a batch it could help by limiting the number of rows transferred from GPU to CPU. Set spark.rapids.sql.exec.CollectLimitExec to true if you wish to enable it
  @Partitioning <SinglePartition$> could run on GPU
  *Exec <ProjectExec> will run on GPU
    *Expression <Alias> cast(big_ts#0 as string) AS big_ts#3 will run on GPU
      *Expression <Cast> cast(big_ts#0 as string) will run on GPU
    ! <RDDScanExec> cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.execution.RDDScanExec
      @Expression <AttributeReference> big_ts#0 could run on GPU

[Stage 0:>                                                          (0 + 1) / 1]

+--------------------------+
|big_ts                    |
+--------------------------+
|3023-07-14 07:38:45.418688|
+--------------------------+



                                                                                

In [9]:
from enum import Enum
class Chip(Enum):
    """Processor on which a test case executes.

    `test_case` sets `spark.rapids.sql.enabled` to true only for `GPU`, and
    `test_path` embeds the member in the output path.
    """
    CPU = 1  # test_case turns RAPIDS SQL acceleration off
    GPU = 2  # test_case turns RAPIDS SQL acceleration on

def test_path(chip, type_str):
    return f"/tmp/out_{chip}_{type_str}.parquet"


In [10]:
def test_case(chip, parquet_ts_type = 'INT96'):
    """Round-trip the notebook's `df` through Parquet and dump the written file.

    The session is reconfigured first: `spark.rapids.sql.enabled` is set to whether
    `chip` is `Chip.GPU`, and `spark.sql.parquet.outputTimestampType` is set to
    `parquet_ts_type`. Both settings remain in effect after the call. `df` is then
    written (overwriting earlier output) to the location given by `test_path`, read
    back and shown through Spark, and finally inspected with fastparquet.

    Args:
        chip: `Chip` member selecting where the write and read-back run.
        parquet_ts_type: value for `spark.sql.parquet.outputTimestampType`.

    Raises:
        ValueError: if the output directory does not hold exactly one `*.parquet` file.
    """
    print(f"##### TEST chip={chip} parquet_ts_type={parquet_ts_type}\n")

    # Session-level switches that apply to both the write and the read-back.
    use_gpu = chip == Chip.GPU
    spark.conf.set('spark.rapids.sql.enabled', use_gpu)
    spark.conf.set('spark.sql.parquet.outputTimestampType', parquet_ts_type)

    # Spark-side round trip.
    path = test_path(chip, parquet_ts_type)
    print(f"IO to/from {path}")
    writer = df.write.mode('overwrite')
    writer.parquet(path)
    round_tripped = spark.read.parquet(path)
    round_tripped.show(truncate = False)

    # Independent look at the bytes on disk. The one-element pattern insists on
    # exactly one part file in the output directory.
    [part_file] = glob.glob(f"{path}/*.parquet")
    pf = fastparquet.ParquetFile(part_file)
    print(f"fastparquet metadata {pf.fmd}")
    print(f"fastparquet data: {pf.head(1)}")

### GPU

In [11]:
# GPU + INT96 is the combination under investigation: in the recorded output the value
# read back is 1854-06-04 08:29:37.999584 instead of the year-3023 input.
test_case(Chip.GPU, parquet_ts_type = 'INT96')

##### TEST on_gpu=Chip.GPU parquet_ts_type=INT96

IO to/from /tmp/out_Chip.GPU_INT96.parquet


23/07/29 19:25:43 WARN GpuOverrides: 
*Exec <DataWritingCommandExec> will run on GPU
  *Output <InsertIntoHadoopFsRelationCommand> will run on GPU
  ! <RDDScanExec> cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.execution.RDDScanExec
    @Expression <AttributeReference> big_ts#0 could run on GPU

23/07/29 19:25:44 WARN GpuOverrides: 
!Exec <CollectLimitExec> cannot run on GPU because the Exec CollectLimitExec has been disabled, and is disabled by default because Collect Limit replacement can be slower on the GPU, if huge number of rows in a batch it could help by limiting the number of rows transferred from GPU to CPU. Set spark.rapids.sql.exec.CollectLimitExec to true if you wish to enable it
  @Partitioning <SinglePartition$> could run on GPU
  *Exec <ProjectExec> will run on GPU
    *Expression <Alias> cast(big_ts#8 as string) AS big_ts#11 will run on GPU
      *Expression <Cast> cast(big_ts#8 as string) will run on GPU
    *Exec <Fi

+--------------------------+
|big_ts                    |
+--------------------------+
|1854-06-04 08:29:37.999584|
+--------------------------+

fastparquet metadata column_orders:
- TYPE_ORDER: {}
created_by: null
encryption_algorithm: null
footer_signing_key_metadata: null
key_value_metadata:
- key: b'org.apache.spark.sql.parquet.row.metadata'
  value: b'{"type":"struct","fields":[{"name":"big_ts","type":"timestamp","nullable":true,"metadata":{}}]}'
- key: b'org.apache.spark.version'
  value: b'3.1.1'
num_rows: 1
row_groups:
- columns:
  - column_index_length: null
    column_index_offset: null
    crypto_metadata: null
    encrypted_column_metadata: null
    file_offset: 0
    file_path: null
    meta_data:
      bloom_filter_offset: null
      codec: 0
      data_page_offset: 4
      dictionary_page_offset: null
      encoding_stats: null
      encodings:
      - 0
      - 3
      index_page_offset: null
      key_value_metadata: null
      num_values: 1
      path_in_schema:
    

In [12]:
# GPU + TIMESTAMP_MICROS: in the recorded output the year-3023 value is read back unchanged.
test_case(Chip.GPU, parquet_ts_type = 'TIMESTAMP_MICROS')

##### TEST on_gpu=Chip.GPU parquet_ts_type=TIMESTAMP_MICROS

IO to/from /tmp/out_Chip.GPU_TIMESTAMP_MICROS.parquet


23/07/29 19:25:45 WARN GpuOverrides: 
*Exec <DataWritingCommandExec> will run on GPU
  *Output <InsertIntoHadoopFsRelationCommand> will run on GPU
  ! <RDDScanExec> cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.execution.RDDScanExec
    @Expression <AttributeReference> big_ts#0 could run on GPU



+--------------------------+
|big_ts                    |
+--------------------------+
|3023-07-14 07:38:45.418688|
+--------------------------+

fastparquet metadata column_orders:
- TYPE_ORDER: {}
created_by: null
encryption_algorithm: null
footer_signing_key_metadata: null
key_value_metadata:
- key: b'org.apache.spark.sql.parquet.row.metadata'
  value: b'{"type":"struct","fields":[{"name":"big_ts","type":"timestamp","nullable":true,"metadata":{}}]}'
- key: b'org.apache.spark.version'
  value: b'3.1.1'
num_rows: 1
row_groups:
- columns:
  - column_index_length: null
    column_index_offset: null
    crypto_metadata: null
    encrypted_column_metadata: null
    file_offset: 0
    file_path: null
    meta_data:
      bloom_filter_offset: null
      codec: 0
      data_page_offset: 4
      dictionary_page_offset: null
      encoding_stats: null
      encodings:
      - 0
      - 3
      index_page_offset: null
      key_value_metadata: null
      num_values: 1
      path_in_schema:
    

23/07/29 19:25:45 WARN GpuOverrides: 
!Exec <CollectLimitExec> cannot run on GPU because the Exec CollectLimitExec has been disabled, and is disabled by default because Collect Limit replacement can be slower on the GPU, if huge number of rows in a batch it could help by limiting the number of rows transferred from GPU to CPU. Set spark.rapids.sql.exec.CollectLimitExec to true if you wish to enable it
  @Partitioning <SinglePartition$> could run on GPU
  *Exec <ProjectExec> will run on GPU
    *Expression <Alias> cast(big_ts#16 as string) AS big_ts#19 will run on GPU
      *Expression <Cast> cast(big_ts#16 as string) will run on GPU
    *Exec <FileSourceScanExec> will run on GPU



### CPU

In [13]:
# CPU + INT96 baseline: in the recorded output the year-3023 value is read back unchanged.
test_case(Chip.CPU, parquet_ts_type = 'INT96')

##### TEST on_gpu=Chip.CPU parquet_ts_type=INT96

IO to/from /tmp/out_Chip.CPU_INT96.parquet
+--------------------------+
|big_ts                    |
+--------------------------+
|3023-07-14 07:38:45.418688|
+--------------------------+

fastparquet metadata column_orders:
- TYPE_ORDER: {}
created_by: b'parquet-mr version 1.10.1 (build a89df8f9932b6ef6633d06069e50c9b7970bebd1)'
encryption_algorithm: null
footer_signing_key_metadata: null
key_value_metadata:
- key: b'org.apache.spark.version'
  value: b'3.1.1'
- key: b'org.apache.spark.sql.parquet.row.metadata'
  value: b'{"type":"struct","fields":[{"name":"big_ts","type":"timestamp","nullable":true,"metadata":{}}]}'
num_rows: 1
row_groups:
- columns:
  - column_index_length: null
    column_index_offset: null
    crypto_metadata: null
    encrypted_column_metadata: null
    file_offset: 4
    file_path: null
    meta_data:
      bloom_filter_offset: null
      codec: 1
      data_page_offset: 4
      dictionary_page_offset: null
     

In [14]:
# CPU + TIMESTAMP_MICROS baseline: in the recorded output the year-3023 value is read back unchanged.
test_case(Chip.CPU, parquet_ts_type = 'TIMESTAMP_MICROS')

##### TEST on_gpu=Chip.CPU parquet_ts_type=TIMESTAMP_MICROS

IO to/from /tmp/out_Chip.CPU_TIMESTAMP_MICROS.parquet
+--------------------------+
|big_ts                    |
+--------------------------+
|3023-07-14 07:38:45.418688|
+--------------------------+

fastparquet metadata column_orders:
- TYPE_ORDER: {}
created_by: b'parquet-mr version 1.10.1 (build a89df8f9932b6ef6633d06069e50c9b7970bebd1)'
encryption_algorithm: null
footer_signing_key_metadata: null
key_value_metadata:
- key: b'org.apache.spark.version'
  value: b'3.1.1'
- key: b'org.apache.spark.sql.parquet.row.metadata'
  value: b'{"type":"struct","fields":[{"name":"big_ts","type":"timestamp","nullable":true,"metadata":{}}]}'
num_rows: 1
row_groups:
- columns:
  - column_index_length: null
    column_index_offset: null
    crypto_metadata: null
    encrypted_column_metadata: null
    file_offset: 4
    file_path: null
    meta_data:
      bloom_filter_offset: null
      codec: 1
      data_page_offset: 4
      dictionary_p