# UDF

https://datanoon.com/blog/pyspark_udf/

In [7]:
import pyspark
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.window import Window
from pyspark.sql import Row
from numpy.random import rand
import pyspark.sql.types as T
from pyspark.sql.types import IntegerType, StringType

My machine has the following configuration:
- 6 cores with 12vCores
- 32GB RAM

Spark Standalone server:
```
cd /opt/softwares/spark-3.0.1-bin-hadoop3.2/

export PYSPARK_PYTHON=/opt/envs/ai4e/bin/python
export PYSPARK_DRIVER_PYTHON=/opt/envs/ai4e/bin/python

sbin/start-all.sh
sbin/stop-all.sh
```
Spark UI: [http://localhost:8080](http://localhost:8080)   
Spark Master URL : spark://IMCHLT276:7077

In [2]:
# Connect to the standalone master started above.
# - autoBroadcastJoinThreshold = -1 turns off automatic broadcast joins.
# - Executors get 2 GB / 2 cores each; the application is capped at 6 cores.
spark = (
    SparkSession.builder
    .master("spark://IMCHLT276:7077")
    .config("spark.sql.autoBroadcastJoinThreshold", -1)
    .config("spark.executor.memory", "2g")
    .config("spark.executor.cores", "2")
    .config("spark.cores.max", "6")
    .config("spark.local.dir", "/opt/tmp/spark-temp/")
    .appName("DataSkewness")
    .getOrCreate()
)

## Case 1 : Multiple invocation of UDF

In [14]:
# Column names for the sample frame: a label plus the two UDF inputs.
schema = ['name', 'val1', 'val2']

# Five rows sharing the same label; val1/val2 feed the UDF below.
data = [
    ('Data1', 1, 2),
    ('Data1', 2, 2),
    ('Data1', 3, 6),
    ('Data1', 4, 3),
    ('Data1', 5, 2),
]

df = spark.createDataFrame(data, schema=schema)

# Show the inferred schema, then the rows themselves.
df.printSchema()
df.show()

root
 |-- name: string (nullable = true)
 |-- val1: long (nullable = true)
 |-- val2: long (nullable = true)

+-----+----+----+
| name|val1|val2|
+-----+----+----+
|Data1|   1|   2|
|Data1|   2|   2|
|Data1|   3|   6|
|Data1|   4|   3|
|Data1|   5|   2|
+-----+----+----+



In [15]:
def my_function(arg1, arg2):
    """Return the (sum, difference, product) of the two arguments as a tuple."""
    return arg1 + arg2, arg1 - arg2, arg1 * arg2

# Return type of the UDF: a struct holding one non-nullable long per metric,
# in the same order as the tuple returned by my_function.
schema = T.StructType([
    T.StructField(field_name, T.LongType(), False)
    for field_name in ('sum', 'difference', 'product')
])

my_function_udf = F.udf(my_function, returnType=schema)

# Call the UDF once in the query and keep its struct result as 'metrics' ...
metrics_col = my_function_udf(F.col('val1'), F.col('val2')).alias('metrics')

# ... then star-expand the struct into separate columns.
results_sdf = df.select(metrics_col).select(F.col('metrics.*'))

results_sdf.explain()

== Physical Plan ==
*(2) Project [pythonUDF2#94.sum AS sum#86L, pythonUDF2#94.difference AS difference#87L, pythonUDF2#94.product AS product#88L]
+- BatchEvalPython [my_function(val1#65L, val2#66L), my_function(val1#65L, val2#66L), my_function(val1#65L, val2#66L)], [pythonUDF0#92, pythonUDF1#93, pythonUDF2#94]
   +- *(1) Project [val1#65L, val2#66L]
      +- *(1) Scan ExistingRDD[name#64,val1#65L,val2#66L]




Note that the UDF is executed three times (`my_function` appears three times in the `BatchEvalPython` node):

`BatchEvalPython [my_function(val1#65L, val2#66L), my_function(val1#65L, val2#66L), my_function(val1#65L, val2#66L)], [pythonUDF0#92, pythonUDF1#93, pythonUDF2#94]`

In [16]:
# Run the query and display the three expanded metric columns.
results_sdf.show()

+---+----------+-------+
|sum|difference|product|
+---+----------+-------+
|  3|        -1|      2|
|  4|         0|      4|
|  9|        -3|     18|
|  7|         1|     12|
|  7|         3|     10|
+---+----------+-------+



That means that in order to do the star expansion on your `metrics` field, Spark will call your UDF three times:
once for each item in your schema.

This means you’ll be taking an already inefficient function and running it multiple times.

You can trick Spark into evaluating the UDF only once by making a small change to the code: wrap the UDF result in a single-element array and `explode` it before the star expansion.

In [20]:
# Put the UDF's struct result into a one-element array and explode it back
# out, so the star expansion reads from the generated 'metrics' column.
udf_result = my_function_udf(F.col('val1'), F.col('val2'))
metrics_col = F.explode(F.array(udf_result)).alias('metrics')

results_sdf = df.select(metrics_col).select(F.col('metrics.*'))
results_sdf.explain()

== Physical Plan ==
*(3) Project [metrics#115.sum AS sum#117L, metrics#115.difference AS difference#118L, metrics#115.product AS product#119L]
+- Generate explode(array(pythonUDF0#123)), false, [metrics#115]
   +- *(2) Project [pythonUDF0#123]
      +- BatchEvalPython [my_function(val1#65L, val2#66L)], [pythonUDF0#123]
         +- *(1) Project [val1#65L, val2#66L]
            +- *(1) Scan ExistingRDD[name#64,val1#65L,val2#66L]




In [21]:
# Run the explode/array version of the query and display the expanded columns.
results_sdf.show()

+---+----------+-------+
|sum|difference|product|
+---+----------+-------+
|  3|        -1|      2|
|  4|         0|      4|
|  9|        -3|     18|
|  7|         1|     12|
|  7|         3|     10|
+---+----------+-------+



## Case 2 : Using NLP in UDF

In [38]:
# !pip install spacy
# ! python -m spacy download en_core_web_sm

In [39]:
import spacy

**Naive way of loading the spaCy model on every element**

In [60]:
lang="en_core_web_sm"
def spacy_tokenize(text):
    """Tokenize `text` with spaCy and return the token strings as a list."""
    # Deliberately naive: the model is loaded on every call, which is
    # expensive. In practice you would cache the result of spacy.load
    # (e.g. the SpacyMagic pattern) so it isn't happening multiple times.
    pipeline = spacy.load(lang)
    # If you are working with Python 2 and getting regular strings,
    # convert the input with unicode(...) first.
    return [tok.text for tok in pipeline(text)]

spacy_tokenize_udf = F.udf(spacy_tokenize, returnType=T.ArrayType(T.StringType()))

In [61]:
# Read README.md as plain text: one row per line, in a single 'value' column.
df = (
    spark.read
    .format("text")
    .load("README.md")
)
df.show()

+--------------------+
|               value|
+--------------------+
|# pyspark-learnin...|
|                    |
|PySpark refresh g...|
+--------------------+



In [66]:
# Tokenize every line with the UDF and emit one output row per token.
tokens_col = F.explode(spacy_tokenize_udf(F.col("value")))
tokenized = df.select(tokens_col)

tokenized.explain()
tokenized.show()

== Physical Plan ==
Generate explode(pythonUDF0#239), false, [col#237]
+- *(1) Project [pythonUDF0#239]
   +- BatchEvalPython [spacy_tokenize(value#217)], [pythonUDF0#239]
      +- FileScan text [value#217] Batched: false, DataFilters: [], Format: Text, Location: InMemoryFileIndex[file:/opt/vlab/gyan42/pyspark-learning-ground/README.md], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<value:string>


+---------+
|      col|
+---------+
|        #|
|  pyspark|
|        -|
| learning|
|        -|
|   ground|
|  PySpark|
|  refresh|
|    guide|
|     with|
| examples|
|      and|
|reference|
|    links|
|      for|
|    quick|
|      try|
|     outs|
|        !|
+---------+



In [68]:
# spaCy isn't serializable but loading it is semi-expensive, so cache it and reuse it for every element :)
# spaCy isn't serializable but loading it is semi-expensive, so keep the
# loaded models in a module-level cache and reuse them for every element :)
NLP = None
def get_spacy_magic_for(lang):
    """Return the spaCy model for `lang`, loading it only on first request."""
    global NLP
    # The cache dict is created lazily on the first call.
    if NLP is None:
        NLP = {}
    # Guard clause: already loaded, hand back the cached model.
    if lang in NLP:
        return NLP[lang]
    loaded = spacy.load(lang)
    NLP[lang] = loaded
    return loaded

In [65]:
lang="en_core_web_sm"
def spacy_tokenize(text):
    """Tokenize `text` with the cached spaCy model and return the token strings.

    Unlike the naive version, the model is obtained through
    get_spacy_magic_for, which calls spacy.load only the first time a given
    model name is requested and returns the cached object afterwards.
    """
    # Use the module-level `lang` setting rather than repeating the model
    # name, so changing `lang` actually changes the model that is used.
    nlp = get_spacy_magic_for(lang)
    # If you are working with Python 2 and getting regular strings,
    # convert the input with unicode(...) first.
    doc = nlp(text)
    return [token.text for token in doc]

spacy_tokenize_udf = F.udf(spacy_tokenize, T.ArrayType(T.StringType()))

In [67]:
# Same query as before, now using the UDF that reuses the cached spaCy model.
tokens_col = F.explode(spacy_tokenize_udf(F.col("value")))
tokenized = df.select(tokens_col)

tokenized.explain()
tokenized.show()

== Physical Plan ==
Generate explode(pythonUDF0#250), false, [col#248]
+- *(1) Project [pythonUDF0#250]
   +- BatchEvalPython [spacy_tokenize(value#217)], [pythonUDF0#250]
      +- FileScan text [value#217] Batched: false, DataFilters: [], Format: Text, Location: InMemoryFileIndex[file:/opt/vlab/gyan42/pyspark-learning-ground/README.md], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<value:string>


+---------+
|      col|
+---------+
|        #|
|  pyspark|
|        -|
| learning|
|        -|
|   ground|
|  PySpark|
|  refresh|
|    guide|
|     with|
| examples|
|      and|
|reference|
|    links|
|      for|
|    quick|
|      try|
|     outs|
|        !|
+---------+



### References
- https://towardsdatascience.com/pyspark-udfs-and-star-expansion-b50f501dcb7b
- https://medium.com/@manuzhang/note-about-spark-python-udf-9ddc3c162f77

In [1]:
class Node:
    """A single element of a singly linked list."""

    def __init__(self, new_data):
        # A fresh node carries its payload and is not linked to anything yet.
        self.data, self.next = new_data, None
class LinkedList:
    """Singly linked list with head insertion and delete-n-th-from-end."""

    def __init__(self):
        # An empty list has no head node.
        self.head = None

    def push(self, new_data):
        """Create a node holding `new_data` and insert it at the front."""
        new_node = Node(new_data)
        new_node.next = self.head
        self.head = new_node

    def deleteNode(self, n):
        """Delete the n-th node counted from the end of the list (1-based).

        The list is left unchanged when it is empty, when `n` is smaller
        than 1, or when `n` is larger than the number of nodes.

        Returns:
            The head of the list after the operation (None if it is empty).
        """
        # Nothing to delete from an empty list, and positions start at 1.
        if self.head is None or n < 1:
            return self.head

        # Move `lead` n nodes ahead of the head.
        lead = self.head
        for _ in range(n):
            if lead is None:
                # The list has fewer than n nodes: nothing to delete.
                return self.head
            lead = lead.next

        if lead is None:
            # The list has exactly n nodes, so the target is the head itself.
            self.head = self.head.next
            return self.head

        # Walk both pointers until `lead` is the last node; `trail` then sits
        # just before the node that has to be removed.
        trail = self.head
        while lead.next is not None:
            lead = lead.next
            trail = trail.next

        trail.next = trail.next.next
        return self.head

    def printList(self):
        """Print every node's data on one line, each followed by a space."""
        tmp_head = self.head
        while tmp_head is not None:
            print(tmp_head.data, end=' ')
            tmp_head = tmp_head.next
          
# Driver Code
llist = LinkedList()
# push inserts at the front, so the list ends up as 2 3 1 7.
for value in (7, 1, 3, 2):
    llist.push(value)

print("Created Linked list is:")
llist.printList()

# Remove the last node (1st from the end).
llist.deleteNode(1)
print("\nLinked List after Deletion is:")
llist.printList()

Created Linked list is:
2 3 1 7 
Linked List after Deletion is:
2 3 1 

In [10]:
# Three nodes (1 2 3); deleting the 3rd from the end removes the head.
llist = LinkedList()
for value in (3, 2, 1):
    llist.push(value)

llist.deleteNode(3)
print("\nLinked List after Deletion is:")
llist.printList()


Linked List after Deletion is:
2 3 

In [11]:
# Two nodes (1 2); deleting the 1st from the end removes the tail.
llist = LinkedList()
for value in (2, 1):
    llist.push(value)

llist.deleteNode(1)
print("\nLinked List after Deletion is:")
llist.printList()


Linked List after Deletion is:
1 

In [12]:
# Two nodes (1 2); deleting the 2nd from the end removes the head.
llist = LinkedList()
for value in (2, 1):
    llist.push(value)

llist.deleteNode(2)
print("\nLinked List after Deletion is:")
llist.printList()


Linked List after Deletion is:
2 

In [13]:
# Single node; deleting the 1st from the end leaves the list empty.
llist = LinkedList()
for value in (1,):
    llist.push(value)

llist.deleteNode(1)
print("\nLinked List after Deletion is:")
llist.printList()


Linked List after Deletion is:
