Start jupyter-lab

```bash
jupyter-lab --notebook-dir=$HOME/gits/gerashegalov/rapids-shell/src/jupyter
```
Alternatively, open the notebook in VS Code with the Jupyter extension.

# Repro for [NVIDIA/spark-rapids#8625](https://github.com/NVIDIA/spark-rapids/issues/8625)

In [1]:
import datetime
import fastparquet
import findspark
import glob
import os
import pyarrow
import pyarrow.parquet

### Versions

In [2]:
# Spark distribution the notebook runs against, and the Scala line it is built for.
spark_version = '3.4.1'
scala_version = '2.12'

# RAPIDS Accelerator build under test; together with the CUDA classifier these
# determine the name of the dist jar that gets loaded.
rapids_version = '23.08.0-SNAPSHOT'
cuda_version = 'cuda11'

### Environment Variables

In [3]:
# Process environment that must be in place before the Spark session is started.
os.environ.update({
    'JAVA_HOME': '/usr/lib/jvm/java-8-openjdk-amd64',  # JDK used for the run
    'SPARK_LOCAL_IP': '127.0.0.1',                     # bind Spark to loopback
    'TZ': 'UTC',                                       # run with a UTC time zone
})

In [4]:
# Everything below is resolved relative to the current user's home directory.
home_dir = os.environ['HOME']

# Unpacked Spark binary distribution handed to findspark.
spark_home = f"{home_dir}/dist/spark-{spark_version}-bin-hadoop3"

# Local spark-rapids checkout and the name of the dist jar built from it.
rapids_home = f"{home_dir}/gits/NVIDIA/spark-rapids"
dist_jar = f"rapids-4-spark_{scala_version}-{rapids_version}-{cuda_version}.jar"

# NOTE(review): work_dir is not referenced by any other cell visible in this notebook.
work_dir = f"{home_dir}/jupyter_run_dir"

### Find Spark 

In [5]:
# Point findspark at the Spark distribution and register the RAPIDS dist jar.
findspark.init(spark_home = spark_home)
findspark.add_jars(f"{rapids_home}/dist/target/{dist_jar}")
# NOTE(review): pyspark is imported only after findspark.init(), presumably because
# init() is what makes the pyspark under spark_home importable — confirm before
# moving this import into the top imports cell.
import pyspark

### Configure Spark

In [6]:
# Single-core local master keeps the repro small.
cores_per_exec = 1
spark_master = f"local[{cores_per_exec}]"

# JDWP agent options: the driver JVM listens on port 5005 for a debugger and
# does not suspend at startup (suspend=n).
jdwp = '-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=5005'
driver_java_opts = f"-Dai.rapids.cudf.preserve-dependencies=true {jdwp}"

# Session configuration: load the RAPIDS SQL plugin, enable it, and have it
# explain the placement of every operator.
spark_conf = {
    'spark.app.name': 'RAPIDS PySpark Notebook',
    'spark.driver.extraJavaOptions': driver_java_opts,
    'spark.master': spark_master,
    'spark.plugins': 'com.nvidia.spark.SQLPlugin',
    'spark.rapids.sql.enabled': True,
    'spark.rapids.sql.explain': 'ALL',
}

spark_builder = pyspark.sql.SparkSession.builder.config(map=spark_conf)
spark = spark_builder.getOrCreate()

Listening for transport dt_socket at address: 5005


23/07/20 18:49:29 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/07/20 18:49:30 WARN RapidsPluginUtils: RAPIDS Accelerator 23.08.0-SNAPSHOT using cudf 23.08.0-SNAPSHOT.
23/07/20 18:49:30 WARN RapidsPluginUtils: RAPIDS Accelerator is enabled, to disable GPU support set `spark.rapids.sql.enabled` to false.
23/07/20 18:49:30 WARN RapidsPluginUtils: spark.rapids.sql.explain is set to `ALL`. Set it to 'NONE' to suppress the diagnostics logging about the query placement on the GPU.


# Test Scenario

In [7]:
# One present-day timestamp and the same instant exactly 1000 years later.
recent_ts = datetime.datetime(2023, 7, 14, 7, 38, 45, 418688)
ts_tup = (recent_ts, recent_ts.replace(year=3023))
ts_tup

(datetime.datetime(2023, 7, 14, 7, 38, 45, 418688),
 datetime.datetime(3023, 7, 14, 7, 38, 45, 418688))

In [8]:
# Single-row DataFrame holding both timestamps, also exposed to SQL as `df1`.
df1_schema = 'ts timestamp, big_ts timestamp'
df1 = spark.createDataFrame([ts_tup], df1_schema)
df1.createOrReplaceTempView('df1')

df1.printSchema()
df1.show(truncate=False)

root
 |-- ts: timestamp (nullable = true)
 |-- big_ts: timestamp (nullable = true)



23/07/20 18:49:42 WARN GpuOverrides: 
!Exec <CollectLimitExec> cannot run on GPU because the Exec CollectLimitExec has been disabled, and is disabled by default because Collect Limit replacement can be slower on the GPU, if huge number of rows in a batch it could help by limiting the number of rows transferred from GPU to CPU. Set spark.rapids.sql.exec.CollectLimitExec to true if you wish to enable it
  @Partitioning <SinglePartition$> could run on GPU
  *Exec <ProjectExec> will run on GPU
    *Expression <Alias> cast(ts#0 as string) AS ts#6 will run on GPU
      *Expression <Cast> cast(ts#0 as string) will run on GPU
    *Expression <Alias> cast(big_ts#1 as string) AS big_ts#7 will run on GPU
      *Expression <Cast> cast(big_ts#1 as string) will run on GPU
    ! <RDDScanExec> cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.execution.RDDScanExec
      @Expression <AttributeReference> ts#0 could run on GPU
      @Expression <AttributeRef

+--------------------------+--------------------------+
|ts                        |big_ts                    |
+--------------------------+--------------------------+
|2023-07-14 07:38:45.418688|3023-07-14 07:38:45.418688|
+--------------------------+--------------------------+



                                                                                

In [9]:
# Select the Parquet timestamp output type, then read the setting back to confirm it.
ts_type_key = 'spark.sql.parquet.outputTimestampType'
spark.conf.set(ts_type_key, 'TIMESTAMP_MICROS')
spark.conf.get(ts_type_key)

'TIMESTAMP_MICROS'

In [10]:
# Output directories for the Parquet data written with the GPU and with the CPU.
gpu_path = '/tmp/int96_out_gpu.parquet'
cpu_path = '/tmp/int96_out_cpu.parquet'

### GPU

In [11]:
# Enable RAPIDS SQL acceleration so the write below is planned on the GPU.
spark.conf.set('spark.rapids.sql.enabled', True)

In [12]:
# Aggregate both timestamp columns and write the single-row result to gpu_path,
# replacing any output left over from a previous run.
max_ts_query = "SELECT MAX(ts) as max_ts, MAX(big_ts) as max_big_ts FROM df1"
gpu_writer = spark.sql(max_ts_query).write.mode('overwrite')
gpu_writer.parquet(gpu_path)

23/07/20 18:49:44 WARN GpuOverrides: 
*Exec <DataWritingCommandExec> will run on GPU
  *Output <InsertIntoHadoopFsRelationCommand> will run on GPU
  *Exec <WriteFilesExec> will run on GPU
    *Exec <HashAggregateExec> will run on GPU
      *Expression <AggregateExpression> max(ts#0) will run on GPU
        *Expression <Max> max(ts#0) will run on GPU
      *Expression <AggregateExpression> max(big_ts#1) will run on GPU
        *Expression <Max> max(big_ts#1) will run on GPU
      *Expression <Alias> max(ts#0)#15 AS max_ts#13 will run on GPU
      *Expression <Alias> max(big_ts#1)#16 AS max_big_ts#14 will run on GPU
      *Exec <ShuffleExchangeExec> will run on GPU
        *Partitioning <SinglePartition$> will run on GPU
        *Exec <HashAggregateExec> will run on GPU
          *Expression <AggregateExpression> partial_max(ts#0) will run on GPU
            *Expression <Max> max(ts#0) will run on GPU
          *Expression <AggregateExpression> partial_max(big_ts#1) will run on GPU
     

In [13]:
# Read the GPU-written output back through Spark and display it untruncated.
gdf = spark.read.parquet(gpu_path)

gdf.printSchema()
gdf.show(truncate=False)

root
 |-- max_ts: timestamp (nullable = true)
 |-- max_big_ts: timestamp (nullable = true)



23/07/20 18:49:45 WARN GpuOverrides: 
!Exec <CollectLimitExec> cannot run on GPU because the Exec CollectLimitExec has been disabled, and is disabled by default because Collect Limit replacement can be slower on the GPU, if huge number of rows in a batch it could help by limiting the number of rows transferred from GPU to CPU. Set spark.rapids.sql.exec.CollectLimitExec to true if you wish to enable it
  @Partitioning <SinglePartition$> could run on GPU
  *Exec <ProjectExec> will run on GPU
    *Expression <Alias> cast(max_ts#65 as string) AS max_ts#71 will run on GPU
      *Expression <Cast> cast(max_ts#65 as string) will run on GPU
    *Expression <Alias> cast(max_big_ts#66 as string) AS max_big_ts#72 will run on GPU
      *Expression <Cast> cast(max_big_ts#66 as string) will run on GPU
    *Exec <FileSourceScanExec> will run on GPU



+--------------------------+--------------------------+
|max_ts                    |max_big_ts                |
+--------------------------+--------------------------+
|2023-07-14 07:38:45.418688|3023-07-14 07:38:45.418688|
+--------------------------+--------------------------+



### CPU

In [14]:
# Disable RAPIDS SQL acceleration so the following write and read run on the CPU.
spark.conf.set('spark.rapids.sql.enabled', False)

In [15]:
# Same aggregation as the GPU run, written to cpu_path and replacing any
# output left over from a previous run.
max_ts_query = "SELECT MAX(ts) as max_ts, MAX(big_ts) as max_big_ts FROM df1"
cpu_writer = spark.sql(max_ts_query).write.mode('overwrite')
cpu_writer.parquet(cpu_path)

In [16]:
# Read the CPU-written output back through Spark and display it untruncated.
# The result is bound to `cdf` rather than `gdf` so that it neither overwrites
# the GPU read-back DataFrame nor mislabels CPU data as GPU data.
cdf = spark.read.parquet(cpu_path)
cdf.printSchema()
cdf.show(truncate = False)

root
 |-- max_ts: timestamp (nullable = true)
 |-- max_big_ts: timestamp (nullable = true)

+--------------------------+--------------------------+
|max_ts                    |max_big_ts                |
+--------------------------+--------------------------+
|2023-07-14 07:38:45.418688|3023-07-14 07:38:45.418688|
+--------------------------+--------------------------+



## Other Readers

### Inspect GPU output with PyArrow

In [17]:
# Each output directory is expected to contain exactly one *.parquet part file;
# the single-element unpacking raises ValueError if there are zero or several.
[gpu_parquet_file] = glob.glob(f"{gpu_path}/*.parquet")
[cpu_parquet_file] = glob.glob(f"{cpu_path}/*.parquet")

In [18]:
# Load only the footer metadata of the GPU-written file with pyarrow and show it
# as a plain dict (row groups, per-column physical types and statistics).
pmd = pyarrow.parquet.read_metadata(gpu_parquet_file)
pmd.to_dict()

{'created_by': '',
 'num_columns': 2,
 'num_rows': 1,
 'num_row_groups': 1,
 'row_groups': [{'num_columns': 2,
   'num_rows': 1,
   'total_byte_size': 62,
   'columns': [{'file_offset': 0,
     'file_path': '',
     'physical_type': 'INT64',
     'num_values': 1,
     'path_in_schema': 'max_ts',
     'is_stats_set': True,
     'statistics': {'has_min_max': True,
      'min': datetime.datetime(2023, 7, 14, 7, 38, 45, 418688),
      'max': datetime.datetime(2023, 7, 14, 7, 38, 45, 418688),
      'null_count': 0,
      'distinct_count': 0,
      'num_values': 1,
      'physical_type': 'INT64'},
     'compression': 'UNCOMPRESSED',
     'encodings': ('PLAIN', 'RLE'),
     'has_dictionary_page': False,
     'dictionary_page_offset': None,
     'data_page_offset': 4,
     'total_compressed_size': 31,
     'total_uncompressed_size': 31},
    {'file_offset': 0,
     'file_path': '',
     'physical_type': 'INT64',
     'num_values': 1,
     'path_in_schema': 'max_big_ts',
     'is_stats_set': Tr

In [19]:
# Print the footer's key/value metadata; keys and values are bytes, so decode
# each one before printing it on its own line.
for raw_key, raw_value in pmd.metadata.items():
    print(raw_key.decode('ascii'))
    print(raw_value.decode('ascii'))

org.apache.spark.version
3.4.1
org.apache.spark.sql.parquet.row.metadata
{"type":"struct","fields":[{"name":"max_ts","type":"timestamp","nullable":true,"metadata":{}},{"name":"max_big_ts","type":"timestamp","nullable":true,"metadata":{}}]}


In [20]:
# Parquet schema of the GPU-written file, including the timestamp annotations.
pmd.schema

<pyarrow._parquet.ParquetSchema object at 0x7f3bbc190840>
required group field_id=-1 schema {
  optional int64 field_id=-1 max_ts (Timestamp(isAdjustedToUTC=true, timeUnit=microseconds, is_from_converted_type=true, force_set_converted_type=false));
  optional int64 field_id=-1 max_big_ts (Timestamp(isAdjustedToUTC=true, timeUnit=microseconds, is_from_converted_type=true, force_set_converted_type=false));
}

In [21]:
# Read the GPU-written file into a pyarrow Table to see the decoded values.
pyarrow.parquet.read_table(gpu_parquet_file)

pyarrow.Table
max_ts: timestamp[us]
max_big_ts: timestamp[us]
----
max_ts: [[2023-07-14 07:38:45.418688]]
max_big_ts: [[3023-07-14 07:38:45.418688]]

### Inspect CPU output with Fastparquet 

In [22]:
# Open the CPU-written file with fastparquet as the baseline for comparison.
cpu_fpf = fastparquet.ParquetFile(cpu_parquet_file)

In [23]:
# Column dtypes fastparquet derives for the CPU-written file.
cpu_fpf.dtypes

OrderedDict([('max_ts', dtype('<M8[us]')), ('max_big_ts', dtype('<M8[us]'))])

In [24]:
# File metadata (footer) of the CPU-written file as parsed by fastparquet.
cpu_fpf.fmd

column_orders:
- TYPE_ORDER: {}
- TYPE_ORDER: {}
created_by: b'parquet-mr version 1.12.3 (build f8dced182c4c1fbdec6ccb3185537b5a01e6ed6b)'
encryption_algorithm: null
footer_signing_key_metadata: null
key_value_metadata:
- key: b'org.apache.spark.version'
  value: b'3.4.1'
- key: b'org.apache.spark.sql.parquet.row.metadata'
  value: b'{"type":"struct","fields":[{"name":"max_ts","type":"timestamp","nullable":true,"metadata":{}},{"name":"max_big_ts","type":"timestamp","nullable":true,"metadata":{}}]}'
num_rows: 1
row_groups:
- columns:
  - column_index_length: 31
    column_index_offset: 82
    crypto_metadata: null
    encrypted_column_metadata: null
    file_offset: 4
    file_path: null
    meta_data:
      bloom_filter_offset: null
      codec: 1
      data_page_offset: 4
      dictionary_page_offset: null
      encoding_stats:
      - count: 1
        encoding: 0
        page_type: 0
      encodings:
      - 4
      - 0
      - 3
      index_page_offset: null
      key_value_metadata

In [25]:
# First row of the CPU-written file as decoded by fastparquet.
cpu_fpf.head(1)

Unnamed: 0,max_ts,max_big_ts
0,2023-07-14 07:38:45.418688,3023-07-14 07:38:45.418688


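### Inspect GPU output with Fastparquet

This is where the discrepancy shows up: for the GPU-written file fastparquet reports `<M8[ns]` columns (versus `<M8[us]` for the CPU-written file), and `max_big_ts` reads back as `1854-06-04 08:29:37.999584768` instead of `3023-07-14 07:38:45.418688`. That is consistent with the year-3023 value overflowing nanosecond-precision `datetime64`.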

In [26]:
# Open the GPU-written file with fastparquet.
gpu_fpf = fastparquet.ParquetFile(gpu_parquet_file)

In [27]:
# Column dtypes fastparquet derives for the GPU-written file.
gpu_fpf.dtypes

OrderedDict([('max_ts', dtype('<M8[ns]')), ('max_big_ts', dtype('<M8[ns]'))])

In [28]:
# File metadata (footer) of the GPU-written file as parsed by fastparquet.
gpu_fpf.fmd

column_orders:
- TYPE_ORDER: {}
- TYPE_ORDER: {}
created_by: null
encryption_algorithm: null
footer_signing_key_metadata: null
key_value_metadata:
- key: b'org.apache.spark.sql.parquet.row.metadata'
  value: b'{"type":"struct","fields":[{"name":"max_ts","type":"timestamp","nullable":true,"metadata":{}},{"name":"max_big_ts","type":"timestamp","nullable":true,"metadata":{}}]}'
- key: b'org.apache.spark.version'
  value: b'3.4.1'
num_rows: 1
row_groups:
- columns:
  - column_index_length: null
    column_index_offset: null
    crypto_metadata: null
    encrypted_column_metadata: null
    file_offset: 0
    file_path: null
    meta_data:
      bloom_filter_offset: null
      codec: 0
      data_page_offset: 4
      dictionary_page_offset: null
      encoding_stats: null
      encodings:
      - 0
      - 3
      index_page_offset: null
      key_value_metadata: null
      num_values: 1
      path_in_schema:
      - max_ts
      statistics:
        distinct_count: null
        max: null
   

In [29]:
# First row of the GPU-written file as decoded by fastparquet.
gpu_fpf.head(1)

Unnamed: 0,max_ts,max_big_ts
0,2023-07-14 07:38:45.418688,1854-06-04 08:29:37.999584768
