Start jupyter-lab

```bash
jupyter-lab --notebook-dir=$HOME/gits/gerashegalov/rapids-shell/src/jupyter
```
Or simply open this notebook in VS Code with the Jupyter extension installed.

# Repro for [TIMESTAMP_MICROS missing logicalType](https://github.com/NVIDIA/spark-rapids/issues/8778) 

In [1]:
import datetime
import numpy
import fastparquet
import findspark
import glob
import os
import pandas
import sys
import tempfile

### Versions

In [2]:
# Component versions. Later cells interpolate them into the Spark
# distribution directory name and the RAPIDS jar file name.
cuda_version = "cuda11"        # trailing classifier of the RAPIDS jar name
hadoop_version = "3"           # part of the Spark distribution directory name
rapids_version = "23.06.0"     # RAPIDS Accelerator release
scala_version = "2.12"         # suffix of the RAPIDS artifact id
spark_version = "3.3.2"        # part of the Spark distribution directory name

### Environment Variables

In [3]:
# Environment of this kernel process; set here, before the Spark session
# is created further down in the notebook.
os.environ.update({
    'JAVA_HOME': '/usr/lib/jvm/java-11-openjdk-amd64',
    'SPARK_LOCAL_IP': '127.0.0.1',
    'TZ': 'UTC',
})

In [4]:
# Locations derived from $HOME (raises KeyError if HOME is not set).
home_dir = os.environ['HOME']
work_dir = "{}/jupyter_run_dir".format(home_dir)
m2_local_repo = "{}/.m2/repository".format(home_dir)

# Maven-style coordinates and file name of the RAPIDS Accelerator jar.
groupId = "com.nvidia"
artifactId = "rapids-4-spark_{}".format(scala_version)
dist_jar = "{}-{}-{}.jar".format(artifactId, rapids_version, cuda_version)

# Root directory of the unpacked Spark binary distribution.
spark_home = "{home}/dist/spark-{spark}-bin-hadoop{hadoop}".format(
    home=home_dir,
    spark=spark_version,
    hadoop=hadoop_version,
)

### Find Spark 

In [5]:
# Point findspark at the local Spark distribution configured above.
findspark.init(spark_home = spark_home)

# Build the jar location from the Maven coordinates instead of repeating the
# group path by hand, so that `groupId` stays the single source of truth.
group_path = groupId.replace('.', '/')
dist_jar_path = f"{m2_local_repo}/{group_path}/{artifactId}/{rapids_version}/{dist_jar}"

# Fail fast with a clear message; otherwise a missing jar only shows up later
# when the session tries to load the plugin.
if not os.path.isfile(dist_jar_path):
    raise FileNotFoundError(f"RAPIDS Accelerator jar not found: {dist_jar_path}")
findspark.add_jars(dist_jar_path)

# Kept after findspark.init() on purpose: the import relies on the Spark
# distribution that findspark has just been pointed at.
import pyspark

### Configure Spark

In [6]:
# Local-mode master with `cores_per_exec` worker threads.
cores_per_exec = 1
spark_master = f"local[{cores_per_exec}]"

# JDWP agent options for the driver JVM (debugger socket on port 5005).
jdwp = '-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=5005'

# Every setting goes through the same builder object; `config` returns the
# builder itself, so the calls can be chained.
spark_builder = (
    pyspark.sql.SparkSession.builder
    .config('spark.app.name', 'RAPIDS PySpark Notebook')
    .config('spark.driver.extraJavaOptions', f"-Dai.rapids.cudf.preserve-dependencies=true {jdwp}")
    .config('spark.master', spark_master)
    .config('spark.plugins', 'com.nvidia.spark.SQLPlugin')
    .config('spark.rapids.sql.enabled', True)
    .config('spark.rapids.sql.explain', 'ALL')
)

spark = spark_builder.getOrCreate()

Listening for transport dt_socket at address: 5005
23/07/25 22:43:08 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/07/25 22:43:09 WARN RapidsPluginUtils: RAPIDS Accelerator 23.06.0 using cudf 23.06.0.
23/07/25 22:43:09 WARN RapidsPluginUtils: RAPIDS Accelerator is enabled, to disable GPU support set `spark.rapids.sql.enabled` to false.
23/07/25 22:43:09 WARN RapidsPluginUtils: spark.rapids.sql.explain is set to `ALL`. Set it to 'NONE' to suppress the diagnostics logging about the query placement on the GPU.


# Test Scenario

In [7]:
# One-row DataFrame with a single timestamp column `ts` (year 3023).
sample_ts = datetime.datetime(3023, 7, 14, 7, 38, 45, 418688)
df = spark.createDataFrame([(sample_ts,)], 'ts timestamp')

# Separate output directories for the CPU and GPU runs. The first positional
# argument of mkdtemp is the *suffix* of the generated directory name.
cpu_path = tempfile.mkdtemp(suffix="cpu_ts")
gpu_path = tempfile.mkdtemp(suffix="gpu_ts")

# Ask Spark to write timestamps as TIMESTAMP_MICROS in Parquet output.
spark.conf.set('spark.sql.parquet.outputTimestampType', 'TIMESTAMP_MICROS')

In [8]:
def spark_to_fastparquet(on_gpu=False):
    """Write `df` as Parquet with Spark, then inspect the written file.

    Steps:
      1. Set `spark.rapids.sql.enabled` to `on_gpu` (the setting is left at
         that value when the function returns).
      2. Overwrite `gpu_path` (when `on_gpu` is true) or `cpu_path` with `df`.
      3. Read the directory back through Spark and show it untruncated.
      4. Open the single `*.parquet` file with fastparquet and print its
         first row and its file metadata.

    Raises:
        ValueError: if the output directory does not contain exactly one
            `*.parquet` file (from the unpacking of the glob result).
    """
    spark.conf.set('spark.rapids.sql.enabled', on_gpu)
    if on_gpu:
        out_dir = gpu_path
    else:
        out_dir = cpu_path

    df.write.mode('overwrite').parquet(out_dir)
    spark.read.parquet(out_dir).show(truncate=False)

    # Exactly one part file is expected; anything else fails the unpacking.
    [part_file] = glob.glob(f"{out_dir}/*.parquet")
    parquet_file = fastparquet.ParquetFile(part_file)
    print(parquet_file.head(1))
    print(parquet_file.fmd)

Write with CPU Spark, then read the file back with Spark and fastparquet

In [9]:
# CPU baseline: run the write/read round trip with spark.rapids.sql.enabled set to False.
spark_to_fastparquet(on_gpu=False)

                                                                                

+--------------------------+
|ts                        |
+--------------------------+
|3023-07-14 07:38:45.418688|
+--------------------------+

                          ts
0 3023-07-14 07:38:45.418688
column_orders:
- TYPE_ORDER: {}
created_by: b'parquet-mr version 1.12.2 (build 77e30c8093386ec52c3cfa6c34b7ef3321322c94)'
encryption_algorithm: null
footer_signing_key_metadata: null
key_value_metadata:
- key: b'org.apache.spark.version'
  value: b'3.3.2'
- key: b'org.apache.spark.sql.parquet.row.metadata'
  value: b'{"type":"struct","fields":[{"name":"ts","type":"timestamp","nullable":true,"metadata":{}}]}'
num_rows: 1
row_groups:
- columns:
  - column_index_length: 31
    column_index_offset: 43
    crypto_metadata: null
    encrypted_column_metadata: null
    file_offset: 4
    file_path: null
    meta_data:
      bloom_filter_offset: null
      codec: 1
      data_page_offset: 4
      dictionary_page_offset: null
      encoding_stats:
      - count: 1
        encoding: 0
        pa

Write with GPU Spark, then read the file back with Spark and fastparquet

In [10]:
# GPU run: the same round trip with spark.rapids.sql.enabled set to True.
spark_to_fastparquet(on_gpu=True)

23/07/25 22:43:24 WARN GpuOverrides: 
*Exec <DataWritingCommandExec> will run on GPU
  *Output <InsertIntoHadoopFsRelationCommand> will run on GPU
  ! <RDDScanExec> cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.execution.RDDScanExec
    @Expression <AttributeReference> ts#0 could run on GPU

23/07/25 22:43:25 WARN GpuOverrides: 
!Exec <CollectLimitExec> cannot run on GPU because the Exec CollectLimitExec has been disabled, and is disabled by default because Collect Limit replacement can be slower on the GPU, if huge number of rows in a batch it could help by limiting the number of rows transferred from GPU to CPU. Set spark.rapids.sql.exec.CollectLimitExec to true if you wish to enable it
  @Partitioning <SinglePartition$> could run on GPU
  *Exec <ProjectExec> will run on GPU
    *Expression <Alias> cast(ts#12 as string) AS ts#15 will run on GPU
      *Expression <Cast> cast(ts#12 as string) will run on GPU
    *Exec <FileSourceScanEx

# Python Package Versions

In [11]:
# Print the name and version of each Python package used to inspect the file.
for pkg in (fastparquet, numpy, pandas):
    print("name={} version={}\n".format(pkg.__name__, pkg.__version__))

name=fastparquet version=2023.7.0

name=numpy version=1.25.1

name=pandas version=2.0.3



In [12]:
pip list

Package           Version
----------------- --------
asttokens         2.2.1
backcall          0.2.0
cfgv              3.3.1
comm              0.1.3
cramjam           2.6.2
debugpy           1.6.7
decorator         5.1.1
distlib           0.3.7
exceptiongroup    1.1.2
execnet           2.0.2
executing         1.2.0
fastparquet       2023.7.0
filelock          3.12.2
findspark         2.0.1
fsspec            2023.6.0
identify          2.5.25
iniconfig         2.0.0
ipykernel         6.24.0
ipython           8.14.0
jedi              0.18.2
jupyter_client    8.3.0
jupyter_core      5.3.1
matplotlib-inline 0.1.6
nest-asyncio      1.5.6
nodeenv           1.8.0
numpy             1.25.1
packaging         23.1
pandas            2.0.3
parso             0.8.3
pexpect           4.8.0
pickleshare       0.7.5
pip               22.0.2
platformdirs      3.9.1
pluggy            1.2.0
pre-commit        3.3.3
prompt-toolkit    3.0.39
psutil            5.9.5
ptyprocess        0.7.0
pure-eval         0.2.