# User Defined Functions
In this notebook you will solve 2 problems using UDFs.

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, desc, split, udf

from pyspark.sql import Window

from pyspark.sql.types import (
    ArrayType, StructType, StructField, StringType, IntegerType
)

import os
import re

In [None]:
# Reuse the active Spark session if there is one; otherwise create it
# under the application name 'UDFs I'.
spark = SparkSession.builder.appName('UDFs I').getOrCreate()

# Task I
* convert question tags to an array using UDFs

In [None]:
base_path = os.getcwd()

# The project root is three directories above the current working directory.
# Walking up with os.path (instead of splitting on a hard-coded '/') keeps
# this correct on any platform and never collapses to an empty string when
# the working directory is shallow.
project_path = os.path.normpath(
    os.path.join(base_path, os.pardir, os.pardir, os.pardir)
)

# Location of the JSON question data inside the project.
data_input_path = os.path.join(project_path, 'data', 'questions-json')

In [None]:
# Load the JSON question data from data_input_path into a DataFrame.
questionsDF = spark.read.format('json').load(data_input_path)

#### Implement UDF:

Hint:
* implement a Python function that splits the string on `><`
* remove the leading `<` from the first tag
* remove the trailing `>` from the last tag

In [None]:
@udf(ArrayType(StringType()))
def transform_tags(tags):
    """Split a bracketed tag string such as ``'<a><b>'`` into ``['a', 'b']``.

    Args:
        tags: Tag names, each wrapped in angle brackets and concatenated
            into one string. May be None when the column value is null.

    Returns:
        The list of tag names. None when ``tags`` is None, and an empty list
        when the string holds no tag names (``''`` or ``'<>'``).
    """
    # Null column values arrive as None; propagate the null instead of
    # raising AttributeError inside the executor.
    if tags is None:
        return None

    # Strip the outer brackets only when they are actually present, so a
    # malformed value does not lose real characters.
    if tags.startswith('<'):
        tags = tags[1:]
    if tags.endswith('>'):
        tags = tags[:-1]

    # ''.split('><') would yield [''], i.e. one bogus empty tag.
    if not tags:
        return []

    return tags.split('><')

#### Apply the UDF:

In [None]:
# Derive the tag array from the raw 'tags' string with the UDF, then keep
# only the columns used in the rest of the notebook.
resultDF = (
    questionsDF
    .withColumn('tags_arr', transform_tags(col('tags')))
    .select(
        col('question_id'),
        col('body'),
        col('tags'),
        col('tags_arr'),
    )
)

In [None]:
# Preview the first 5 rows, truncating long strings to 30 characters.
resultDF.show(n=5, truncate=30)

# Task II
* For each question see if tags can be matched in the text and find number of matches.

Hint:
* the function should take the message and tags as arguments
* replace '-' in multi-word tags with a space
* use regular expression to find the match
    * you can use `re.findall` and append the list of matches it returns to your result array
    * flatten the result array

In [None]:
@udf(IntegerType())
def detect_tags(message, tags):
    """Count how often the given tags occur in a message.

    Hyphens in a tag are replaced by spaces before searching, so the tag
    ``'apache-spark'`` is searched for as ``'apache spark'``. Each tag is
    matched literally and case-sensitively.

    Args:
        message: Text to search. May be None when the column value is null.
        tags: Iterable of tag strings. May be None, and may contain None or
            empty entries, which are skipped.

    Returns:
        The number of non-overlapping occurrences, summed over all tags.
        0 when ``message`` or ``tags`` is None.
    """
    # Null column values arrive as None; treat them as "no matches" rather
    # than letting re.findall raise TypeError.
    if message is None or tags is None:
        return 0

    total = 0
    for tag in tags:
        # An empty pattern matches at every position of the message and
        # would inflate the count.
        if not tag:
            continue
        # Escape the tag so regex metacharacters are matched literally:
        # unescaped, 'c++' raises re.error and '.net' matches 'xnet'.
        pattern = re.escape(tag.replace('-', ' '))
        total += len(re.findall(pattern, message))
    return total

In [None]:
# Count tag occurrences in each question body and print the questions with
# the most matches first.
(
    resultDF
    .select(col('question_id'), col('body'), col('tags_arr'))
    .withColumn('matches', detect_tags(col('body'), col('tags_arr')))
    .orderBy(col('matches').desc())
    .show(truncate=30)
)

#### Try to make this more efficient:

Hint
* consider replacing '-' with the native PySpark function `regexp_replace`, and use the UDF only to find the matches

In [None]:
# Stop the Spark session now that the notebook is finished with it.
spark.stop()