
# Regexes and Raw Strings at the Python, Scala, and SQL Parser Levels

Start jupyter-lab

```bash
jupyter-lab --notebook-dir=$HOME/gits/gerashegalov/rapids-shell/src/jupyter
```
and/or connect to the server URL from VS Code with the Jupyter extension.

## Start Spark

In [1]:
%%init_spark

import os
import sys
import tempfile

# Environment
home_dir = os.environ['HOME']
work_dir = f"{home_dir}/jupyter_run_dir"
tz = 'UTC'
os.environ['SPARK_HOME'] = f"{home_dir}/dist/spark-3.4.0-bin-hadoop3"
os.environ['TZ'] = tz
sys.path.append(f"{home_dir}/gits/NVIDIA/spark-rapids/integration_tests/src/main/python")
rapids_version = '23.06.0-SNAPSHOT'
locally_built = True

# Spark Master
cores_per_exec = 2
# spark_master = f"local-cluster[1,{cores_per_exec},4096]"
spark_master = f"local[{cores_per_exec}]"

# debugger string
# jdwp = '-agentlib:jdwp=transport=dt_socket,server=n,address=localhost:5005'
jdwp = ''

if locally_built:
  launcher.jars = [
    f"{home_dir}/gits/NVIDIA/spark-rapids/dist/target/rapids-4-spark_2.12-{rapids_version}-cuda11.jar"
  ]
else:
  launcher.packages = [
    f"com.nvidia:rapids-4-spark_2.12:{rapids_version}"
  ]

launcher.conf.spark.driver.extraJavaOptions = f"\"-Dai.rapids.refcount.debug=true -Dlog4j.debug=true {jdwp}\""
launcher.conf.spark.executor.extraJavaOptions = "\"-Dai.rapids.refcount.debug=true -Dlog4j.debug=true\""
launcher.conf.spark.executorEnv.TZ = tz
launcher.conf.spark.master = spark_master
launcher.conf.spark.plugins = "com.nvidia.spark.SQLPlugin"
launcher.conf.set('spark.rapids.sql.explain', 'ALL')

In [2]:
// Alias Spark's SQL `functions` object as `f` so Scala cells can write f.split(...), f.col(...), etc.
import org.apache.spark.sql.{functions => f}
// Evaluate the session so the interpreter echoes it as the cell result.
spark

Intitializing Scala interpreter ...

Spark Web UI available at http://10.0.0.132:4040
SparkContext available as 'sc' (version = 3.4.0, master = local[2], app id = local-1684264258519)
SparkSession available as 'spark'


import org.apache.spark.sql.{functions=>f}
res0: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@6d2eeaf3


In [3]:
%%python
import pyspark
# Same `f` alias as in the Scala cell, here bound to the PySpark functions module.
import pyspark.sql.functions as f
# Reference the session object that the %%python cells use as `spark`.
spark

### Define DataFrame in Python and Scala

In [4]:
%%python
df =  spark.createDataFrame(
    [
        (r'1234.5678',),
        (r'1234\5678',),
    ], 'a string')
df.show()
df.createOrReplaceTempView('df');

+---------+
|        a|
+---------+
|1234.5678|
|1234\5678|
+---------+



In [5]:
// Load the temp view `df` (registered by the preceding Python cell) into a Scala DataFrame.
val df = spark.sql("SELECT * FROM df")
df.show()

+---------+
|        a|
+---------+
|1234.5678|
|1234\5678|
+---------+



df: org.apache.spark.sql.DataFrame = [a: string]


## Use raw strings to avoid double escaping for regex containing `\`

### Original way to produce normalized regex `[\\.]`

In [6]:
%%python
# Non-raw Python literal: the 8 backslashes written here become 4 in the actual
# string value (the next cell compares it with the raw-string spelling), and the
# SQL parser then unescapes those 4 down to 2, which yields the pattern [\\.]
# visible in the physical plan below.
original_query = "SELECT SPLIT(a, '[\\\\\\\\.]', 2) as b FROM df"
df1 = spark.sql(original_query)
# Print the physical plan so the pattern that reaches `split` can be inspected.
df1.explain()
df1.show()

== Physical Plan ==
GpuColumnarToRow false
+- GpuProject [split(a#0, [\\.], 2, [\\.], true) AS b#13]
   +- GpuRowToColumnar targetsize(1073741824)
      +- *(1) Scan ExistingRDD[a#0]


+------------+
|           b|
+------------+
|[1234, 5678]|
|[1234, 5678]|
+------------+



### Using Python raw strings gets rid of the escaping between Python and the JVM

In [7]:
%%python
# Raw Python literal: backslashes are kept verbatim, so only the doubling
# required by the SQL parser remains (4 backslashes instead of 8).
raw_python_query = r"""SELECT SPLIT(a, '[\\\\.]', 2) as b FROM df""" 
# Both spellings denote the same SQL text, so this comparison evaluates to True.
original_query == raw_python_query

True

### Using SQL raw strings gets rid of the need to escape for the Spark SQL parser

In [8]:
%%python
# Raw at both levels: the Python r"""...""" literal keeps backslashes verbatim,
# and the r'...' prefix inside the SQL text marks a SQL raw string literal, so
# the pattern is written exactly as it appears in the physical plan: [\\.]
raw_python_and_sql_query = r"""SELECT SPLIT(a, r'[\\.]', 2) as b FROM df"""
df2 = spark.sql(raw_python_and_sql_query)
# Print the physical plan so the pattern that reaches `split` can be inspected.
df2.explain()
df2.show() 

== Physical Plan ==
GpuColumnarToRow false
+- GpuProject [split(a#0, [\\.], 2, [\\.], true) AS b#20]
   +- GpuRowToColumnar targetsize(1073741824)
      +- *(1) Scan ExistingRDD[a#0]


+------------+
|           b|
+------------+
|[1234, 5678]|
|[1234, 5678]|
+------------+



### Or use the PySpark DataFrame API

In [9]:
%%python
df3 = df.select(f.split('a', r'[\\.]', 2).alias('b'))
df3.explain()
df3.show()

== Physical Plan ==
GpuColumnarToRow false
+- GpuProject [split(a#0, [\\.], 2, [\\.], true) AS b#27]
   +- GpuRowToColumnar targetsize(1073741824)
      +- *(1) Scan ExistingRDD[a#0]


+------------+
|           b|
+------------+
|[1234, 5678]|
|[1234, 5678]|
+------------+



### Or use Scala raw strings in the Scala Spark DataFrame API

In [10]:
// The raw"..." interpolator leaves backslashes untouched, so the pattern is
// passed to split exactly as written.
val df4 = df.select(
  f.split(f.col("a"), raw"[\\.]", 2).alias("b")
)
df4.explain()
df4.show()

== Physical Plan ==
GpuColumnarToRow false
+- GpuProject [split(a#0, [\\.], 2, [\\.], true) AS b#34]
   +- GpuRowToColumnar targetsize(1073741824)
      +- *(1) Scan ExistingRDD[a#0]


+------------+
|           b|
+------------+
|[1234, 5678]|
|[1234, 5678]|
+------------+



_: Any = true
_7: Any = true
df4: org.apache.spark.sql.DataFrame = [b: array<string>]
