# Reproduce query86 discrepancy on L40S

In [2]:
# --- Run configuration -------------------------------------------------------
# RAPIDS Accelerator version; a later cell may override it when a local jar is used.
rapids_version = '24.12.0-SNAPSHOT'

# GPU under test. Uncomment the second line to target the H100 instead.
gpu_type = 'l40s'
# gpu_type = 'h100'

# Forwarded to the NDS table setup.
use_decimal = True

# H100 has 2x memory, so only half of it is used there to equalize the
# GPU memory budget between the two cards.
if gpu_type == 'h100':
    gpu_mem_fraction = 0.5
else:
    gpu_mem_fraction = 1.0

In [3]:
from difflib import unified_diff
from glob import glob
from importlib.resources import files
import os
from pprint import pprint
import re
import sys

In [4]:
home_dir = os.getenv('HOME')

# Process environment for the Spark JVMs started from this kernel:
# blank SPARK_CONF_DIR, expose a single CUDA device ('0' when testing the
# L40S, '1' otherwise), and set TZ to UTC.
os.environ.update({
    'SPARK_CONF_DIR': "",
    'CUDA_VISIBLE_DEVICES': '0' if gpu_type == 'l40s' else '1',
    'TZ': 'UTC',
})

# Make the NDS benchmark scripts and the bundled PySpark importable.
sys.path.extend([
    f"{home_dir}/gshegalov/gits/NVIDIA/spark-rapids-benchmarks/nds",
    '/opt/spark-3.3.3-bin-hadoop3/python/lib/pyspark.zip',
])

# The py4j archive name embeds its version, so find it by glob and take the
# first match (unpacking raises ValueError when nothing matches).
py4j_src, *_ = glob('/opt/spark-3.3.3-bin-hadoop3/python/lib/py4j-*-src.zip')
sys.path.append(py4j_src)

# Kept below the sys.path changes on purpose: presumably nds_power is resolved
# through the benchmark directory appended above.
import nds_power

In [5]:
# Derive the Scala binary version (e.g. "2.12") from the file name of the
# spark-sql jar shipped with the Spark distribution.
spark_sql_jar_path, *_ = glob("/opt/spark-3.3.3-bin-hadoop3/jars/spark-sql_*jar")
spark_sql_jar = os.path.basename(spark_sql_jar_path)

# The dot between major and minor is escaped so it only matches a literal '.'.
scala_version_match = re.search(r'^spark-sql_(\d+\.\d+)-.*\.jar$', spark_sql_jar)
if scala_version_match is None:
    # Fail with a message that names the offending file instead of an
    # AttributeError on None.
    raise ValueError(f"Cannot parse Scala version from jar name: {spark_sql_jar!r}")
scala_version = scala_version_match.group(1)

In [6]:
# SNAPSHOT versions require a path since unpublished to Maven
rapids_artifact_id = f"rapids-4-spark_{scala_version}"

use_local_rapids_jar = True

if use_local_rapids_jar:
    # Binary search over dated snapshot builds. Each candidate is labelled
    # with the outcome observed for it; keep exactly one assignment
    # uncommented to select the jar under test.

    # GOOD
    # rapids_version = '24.10.0'

    # GOOD
    # rapids_version = '24.12.0-20241015.184449-18'

    # GOOD
    # rapids_version = '24.12.0-20241101.120509-33'

    # GOOD
    # rapids_version = '24.12.0-20241113.144142-42'

    # BAD
    rapids_version = '24.12.0-20241119.134850-43'

    # BAD
    # rapids_version = '24.12.0-20241122.132327-45'

    # BAD
    # rapids_version = '24.12.0-20241127.112105-48'

    # BAD
    # rapids_version = '24.12.0-20241203.104001-53'

    # BAD
    # rapids_version = '24.12.0-20241203.104001-53-cuda12'

    # Build the file name from rapids_artifact_id so the Scala suffix always
    # agrees with the Spark distribution detected earlier.
    rapids_jar_path = f"/home/spark/gshegalov/dist/rapids/{rapids_artifact_id}-{rapids_version}.jar"
else:
    # No local jar: the session cell resolves the artifact by Maven coordinates.
    rapids_jar_path = None


In [7]:
def dump_version_info(build_number):
  """Print the version-info property files bundled in a local RAPIDS jar.

  Args:
    build_number: trailing build number in the jar file name (e.g. '42');
      the jar is located with the glob ``*-{build_number}.jar``.

  The first glob match is used. If nothing matches, the tuple unpacking
  raises ValueError.
  """
  jar_path, *_ = glob(f"/home/spark/gshegalov/dist/rapids/rapids-4-spark_2.12-*-{build_number}.jar")
  # Property files recording version/revision/branch/date of each component.
  version_files = [
    'rapids4spark-version-info.properties',
    'spark320/rapids4spark-private-version-info.properties',
    'spark-rapids-jni-version-info.properties',
    'cudf-java-version-info.properties'
  ]
  # IPython shell magic: `unzip -c` writes the named archive members to stdout,
  # so their contents appear in the cell output.
  !unzip -c {jar_path} {" ".join(version_files)} 

In [8]:
# Last Good Jar: build 42 is the newest snapshot labelled GOOD in the binary search.
dump_version_info('42')


Archive:  /home/spark/gshegalov/dist/rapids/rapids-4-spark_2.12-24.12.0-20241113.144142-42.jar
  inflating: spark320/rapids4spark-private-version-info.properties  
version=24.12.0-SNAPSHOT
user=root
revision=2f08e20170b66621d1f14ee0fb351ef5630ea811
branch=HEAD
date=2024-11-13T06:34:20Z
url=https://gitlab-master.nvidia.com/nvspark/spark-rapids-private.git

  inflating: rapids4spark-version-info.properties  
version=24.12.0-SNAPSHOT
cudf_version=24.12.0-SNAPSHOT
user=root
revision=862dab0789abb95f860d1d42e06dad4690ca79e2
branch=HEAD
date=2024-11-13T14:24:52Z
url=https://github.com/NVIDIA/spark-rapids.git

  inflating: cudf-java-version-info.properties  
version=24.12.0-SNAPSHOT
user=root
revision=487f97c036ae7919e98ddc8bf5412a8002a493c5
branch=HEAD
date=2024-11-13T08:01:08Z
url=https://github.com/rapidsai/cudf.git
gpu_architectures=70;75;80;86;90

  inflating: spark-rapids-jni-version-info.properties  
version=24.12.0-SNAPSHOT
user=root
revision=ea47ecb58cab8228a81088b8ac44e3b336a9ba7e
b

In [9]:
# First Bad Jar: build 43 is the oldest snapshot labelled BAD in the binary search.
dump_version_info('43')

Archive:  /home/spark/gshegalov/dist/rapids/rapids-4-spark_2.12-24.12.0-20241119.134850-43.jar
  inflating: spark320/rapids4spark-private-version-info.properties  
version=24.12.0-SNAPSHOT
user=root
revision=2f08e20170b66621d1f14ee0fb351ef5630ea811
branch=HEAD
date=2024-11-19T06:37:56Z
url=https://gitlab-master.nvidia.com/nvspark/spark-rapids-private.git

  inflating: rapids4spark-version-info.properties  
version=24.12.0-SNAPSHOT
cudf_version=24.12.0-SNAPSHOT
user=root
revision=9b06ae3081d969abdd02686c50741e20a637ef7d
branch=HEAD
date=2024-11-19T13:34:11Z
url=https://github.com/NVIDIA/spark-rapids.git

  inflating: cudf-java-version-info.properties  
version=24.12.0-SNAPSHOT
user=root
revision=302e625bf87dce4059eb7c383dced848ad9d8f4c
branch=HEAD
date=2024-11-19T08:10:37Z
url=https://github.com/rapidsai/cudf.git
gpu_architectures=70;75;80;86;90

  inflating: spark-rapids-jni-version-info.properties  
version=24.12.0-SNAPSHOT
user=root
revision=e3fd02e4a3b7567dc08f78ee75f3c2bd0ee6d03a
b

In [10]:
# Imported in this cell rather than the imports cell: pyspark.zip is only put
# on sys.path by an earlier setup cell.
from pyspark.sql import SparkSession

num_execs = 1
cores_per_exec = 8

spark_builder = (
    SparkSession.builder
        .appName('NDS Query Notebook')
        # .master('local[96]')
        # local-cluster[<workers>,<cores per worker>,<memory per worker in MiB>]
        .master(f"local-cluster[{num_execs},{cores_per_exec},{cores_per_exec * 1024}]")
        .config('spark.driver.memory', '24g')
        .config('spark.executorEnv.TZ', 'UTC')
        .config('spark.plugins', 'com.nvidia.spark.SQLPlugin')
        # Split the per-GPU memory budget evenly across executors.
        .config('spark.rapids.memory.gpu.minAllocFraction', 0)
        .config('spark.rapids.memory.gpu.allocFraction', gpu_mem_fraction / num_execs)
        .config('spark.rapids.memory.gpu.maxAllocFraction', gpu_mem_fraction / num_execs)
        # .config('spark.rapids.sql.explain', 'ALL')
        .config('spark.sql.files.maxPartitionBytes', '128mb')
        .config("spark.sql.shuffle.partitions", 1152)
        .config('spark.worker.resource.gpu.discoveryScript', f"/opt/getGpusResources-{gpu_type}.sh")
        .config('spark.worker.resource.gpu.amount', 1)
        .config('spark.ui.showConsoleProgress', False)
)

if rapids_jar_path is not None:
    # Local jar: put it on both the driver and executor class paths.
    spark_builder.config('spark.driver.extraClassPath', rapids_jar_path)
    spark_builder.config('spark.executor.extraClassPath', rapids_jar_path)
else:
    # Resolve by Maven coordinates using the rapids_version chosen in the
    # configuration cell; it is intentionally not re-assigned here, so the
    # configured value is never silently overridden.
    spark_builder.config('spark.jars.packages', f"com.nvidia:{rapids_artifact_id}:{rapids_version}")
spark = spark_builder.getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


24/12/06 21:58:46 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/12/06 21:58:47 WARN RapidsPluginUtils: RAPIDS Accelerator 24.12.0-SNAPSHOT using cudf 24.12.0-SNAPSHOT, private revision 2f08e20170b66621d1f14ee0fb351ef5630ea811
24/12/06 21:58:47 WARN RapidsPluginUtils: RAPIDS Accelerator is enabled, to disable GPU support set `spark.rapids.sql.enabled` to false.
24/12/06 21:58:47 WARN RapidsPluginUtils: spark.rapids.sql.explain is set to `NOT_ON_GPU`. Set it to 'NONE' to suppress the diagnostics logging about the query placement on the GPU.
24/12/06 21:58:47 WARN Master: App app-20241206215847-0000 requires more resource than any of Workers could have.


In [11]:
# Bare expression so the notebook displays the session object.
spark

# Setup NDS Tables

In [12]:
nds_scale_factor = 'sf3k'

# `nds_data_root` is not assigned in any visible cell, so on a fresh kernel
# this cell would fail with NameError. Keep a value that is already defined;
# otherwise fall back to the NDS_DATA_ROOT environment variable.
if 'nds_data_root' not in globals():
    nds_data_root = os.environ.get('NDS_DATA_ROOT')
    if not nds_data_root:
        raise RuntimeError(
            "nds_data_root is not defined: assign it in a configuration cell "
            "or export NDS_DATA_ROOT before starting the kernel")

# NOTE(review): the directory suffix is always `_decimal`, regardless of
# use_decimal — confirm this matches the data layout when use_decimal is False.
nds_data_path = f"{nds_data_root}/parquet_{nds_scale_factor}_decimal"
print(f"##### Setting up tables: {use_decimal=} #####")
# Left as the last expression so the value returned by setup_tables is shown
# as the cell output.
nds_power.setup_tables(spark, nds_data_path, 'parquet', use_decimal, [])

##### Setting up tables: use_decimal=True #####
24/12/06 21:58:48 WARN DomainSocketFactory: The short-circuit local reads feature cannot be used because libhadoop cannot be loaded.
Time taken: 9314 millis for table customer_address
Time taken: 112 millis for table customer_demographics
24/12/06 21:58:58 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
Time taken: 113 millis for table date_dim
Time taken: 97 millis for table warehouse
Time taken: 87 millis for table ship_mode
Time taken: 82 millis for table time_dim
Time taken: 77 millis for table reason
Time taken: 82 millis for table income_band
Time taken: 79 millis for table item
Time taken: 84 millis for table store
Time taken: 89 millis for table call_center
Time taken: 69 millis for table customer
Time taken: 71 millis for table web_site
Time taken: 5033 millis for table store_returns
Time taken: 160 millis for table h

[('app-20241206215847-0000', 'CreateTempView customer_address', 9314),
 ('app-20241206215847-0000', 'CreateTempView customer_demographics', 112),
 ('app-20241206215847-0000', 'CreateTempView date_dim', 113),
 ('app-20241206215847-0000', 'CreateTempView warehouse', 97),
 ('app-20241206215847-0000', 'CreateTempView ship_mode', 87),
 ('app-20241206215847-0000', 'CreateTempView time_dim', 82),
 ('app-20241206215847-0000', 'CreateTempView reason', 77),
 ('app-20241206215847-0000', 'CreateTempView income_band', 82),
 ('app-20241206215847-0000', 'CreateTempView item', 79),
 ('app-20241206215847-0000', 'CreateTempView store', 84),
 ('app-20241206215847-0000', 'CreateTempView call_center', 89),
 ('app-20241206215847-0000', 'CreateTempView customer', 69),
 ('app-20241206215847-0000', 'CreateTempView web_site', 71),
 ('app-20241206215847-0000', 'CreateTempView store_returns', 5033),
 ('app-20241206215847-0000', 'CreateTempView household_demographics', 160),
 ('app-20241206215847-0000', 'CreateTem

# Verify the executor is running on the intended GPU

In [13]:
# Print the process tree rooted at this kernel (with PIDs) to see the JVMs it
# spawned; presumably these PIDs are then matched against the nvidia-smi listing.
pid = os.getpid()
!pstree -T -p {pid}

python(669569)─┬─java(669696)───java(670412)
               └─pstree(670972)


In [14]:
# Show per-GPU memory usage and compute mode to check which device is in use.
!nvidia-smi

Fri Dec  6 21:59:28 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA L40S                    Off |   00000000:01:00.0 Off |                    0 |
| N/A   47C    P0             88W /  350W |   44863MiB /  46068MiB |      0%   E. Process |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  NVIDIA H100 NVL                Off |   00

# Run Queries from internal_git/ab/-/blob/main/query_streams/nds/sf3k_query_0.sql#L841-870

In [15]:
# Query id -> SQL text; compare_cpu_gpu() runs every entry on CPU and on GPU.
queries = {

  
  # q36: gross margin (sum(ss_net_profit)/sum(ss_ext_sales_price)) over
  # store_sales for d_year 2002, grouped by rollup(i_category, i_class),
  # ranked within each rollup level; limited to 100 rows.
  'q36': '''
select
    sum(ss_net_profit)/sum(ss_ext_sales_price) as gross_margin
   ,i_category
   ,i_class
   ,grouping(i_category)+grouping(i_class) as lochierarchy
   ,rank() over (
        partition by grouping(i_category)+grouping(i_class),
        case when grouping(i_class) = 0 then i_category end
        order by sum(ss_net_profit)/sum(ss_ext_sales_price) asc) as rank_within_parent
 from
    store_sales
   ,date_dim       d1
   ,item
   ,store
 where
    d1.d_year = 2002
 and d1.d_date_sk = ss_sold_date_sk
 and i_item_sk  = ss_item_sk
 and s_store_sk  = ss_store_sk
 and s_state in ('MN','TN','MI','WV',
                 'NC','GA','TN','OH')
 group by rollup(i_category,i_class)
 order by
   lochierarchy desc
  ,case when lochierarchy = 0 then i_category end
  ,rank_within_parent
   LIMIT 100;
''',

  
   # q86: sum(ws_net_paid) over web_sales for d_month_seq 1205..1205+11,
   # grouped by rollup(i_category, i_class), ranked within each rollup level.
   # Unlike q36 this text has no LIMIT clause, so all rows are returned.
   'q86': '''
select
    sum(ws_net_paid) as total_sum
   ,i_category
   ,i_class
   ,grouping(i_category)+grouping(i_class) as lochierarchy
   ,rank() over (
        partition by grouping(i_category)+grouping(i_class),
        case when grouping(i_class) = 0 then i_category end
        order by sum(ws_net_paid) desc) as rank_within_parent
 from
    web_sales
   ,date_dim       d1
   ,item
 where
    d1.d_month_seq between 1205 and 1205+11
 and d1.d_date_sk = ws_sold_date_sk
 and i_item_sk  = ws_item_sk
 group by rollup(i_category,i_class)
 order by
   lochierarchy desc,
   case when lochierarchy = 0 then i_category end,
   rank_within_parent
'''
}

In [16]:
def _run_sorted(qid, sql_text, device, rapids_enabled):
  """Run one query with the RAPIDS plugin toggled and return its rows, sorted.

  Args:
    qid: query id, used only in the progress banner.
    sql_text: SQL to execute through the module-level ``spark`` session.
    device: label used in the banners ('CPU' or 'GPU').
    rapids_enabled: value written to ``spark.rapids.sql.enabled`` before running.

  Returns:
    The collected rows passed through ``sorted()``.
  """
  print(f"##### Runing {qid=} {use_decimal=} on {device} #####")
  spark.conf.set('spark.rapids.sql.enabled', rapids_enabled)
  df = spark.sql(sql_text)
  df.printSchema()
  # Sorted so the comparison does not depend on the order rows come back in.
  rows = sorted(df.collect())
  print(f"######### BEGIN Full {device} OUTPUT for REFERENCE #####")
  pprint(rows)
  print(f"######### END Full {device} OUTPUT for REFERENCE #####")
  return rows


def compare_cpu_gpu():
  """Run every entry of ``queries`` on CPU and on GPU and compare the results.

  For each query the full sorted CPU and GPU results are printed, followed by
  a unified diff of the two. The session is left with
  ``spark.rapids.sql.enabled`` set to True after each query.

  Raises:
    AssertionError: on the first query whose sorted GPU rows differ from the
      sorted CPU rows; later queries are not run.
  """
  for qid, sql_text in queries.items():
    cpu = _run_sorted(qid, sql_text, 'CPU', False)
    gpu = _run_sorted(qid, sql_text, 'GPU', True)

    print(f"#### checking diff {qid=} {use_decimal=} ####")
    # The diff is printed before the assert so it is visible on failure too.
    sys.stdout.writelines(unified_diff(
      a=[f"{x}\n" for x in cpu],
      b=[f"{x}\n" for x in gpu],
      fromfile='CPU OUTPUT',
      tofile='GPU OUTPUT'))
    assert gpu == cpu, f"##### Check for diff of {qid=} {use_decimal=} above #####"
    print(f"#### SUCSESS {qid=} {use_decimal=} GPU and CPU matched ####")
  print(f"##### END of REPRO on {gpu_type=} #####")


In [17]:
# Run the CPU-vs-GPU comparison for all queries; raises AssertionError on the first mismatch.
compare_cpu_gpu()

##### Runing qid='q36' use_decimal=True on CPU #####
root
 |-- gross_margin: decimal(37,20) (nullable = true)
 |-- i_category: string (nullable = true)
 |-- i_class: string (nullable = true)
 |-- lochierarchy: byte (nullable = false)
 |-- rank_within_parent: integer (nullable = false)

######### BEGIN Full CPU OUTPUT for REFERENCE #####
[Row(gross_margin=Decimal('-0.47828295516352641128'), i_category=None, i_class='audio', lochierarchy=0, rank_within_parent=1),
 Row(gross_margin=Decimal('-0.47230074709745873202'), i_category=None, i_class='history', lochierarchy=0, rank_within_parent=2),
 Row(gross_margin=Decimal('-0.46715300295799058210'), i_category=None, i_class='swimwear', lochierarchy=0, rank_within_parent=3),
 Row(gross_margin=Decimal('-0.46668415445196648666'), i_category=None, i_class='archery', lochierarchy=0, rank_within_parent=4),
 Row(gross_margin=Decimal('-0.46536094067172816315'), i_category=None, i_class='cooking', lochierarchy=0, rank_within_parent=5),
 Row(gross_margin

AssertionError: ##### Check for diff of qid='q86' use_decimal=True above #####