In [None]:
import findspark
# Presumably required so the pyspark imports in the next cell resolve against the
# local Spark installation — keep this cell before any pyspark import.
findspark.init()

In [None]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, desc, count, explode, split, regexp_replace, collect_list, array_sort, reverse, unix_timestamp, row_number,
    when, lit, lead, avg, udf, length
)

from pyspark.sql import Window

from pyspark.sql.types import (
    ArrayType, StructType, StructField, StringType, LongType, TimestampType, DoubleType, IntegerType
)

import os
import re
import pandas as pd

In [None]:
# Obtain the SparkSession for this notebook under the application name 'UDFs I'
# (getOrCreate returns an already running session when one exists).
spark = SparkSession.builder.appName('UDFs I').getOrCreate()

# Task I

* convert question tags to an array using UDFs

In [None]:
# Working directory of the notebook kernel.
base_path = os.getcwd()

# Project root = two directory levels above the working directory.
# os.path.dirname is used instead of splitting on '/' so the result is correct
# regardless of the platform's path separator and never collapses to ''.
project_path = os.path.dirname(os.path.dirname(base_path))

# Location of the questions dataset (JSON) inside the project.
data_input_path = os.path.join(project_path, 'data/questions-json')

In [None]:
# Load the questions dataset from data_input_path using the JSON data source.
questionsDF = spark.read.format('json').load(data_input_path)

In [None]:
# Print the schema of the loaded data (no explicit schema was passed to the reader).
questionsDF.printSchema()

In [None]:
# Look at the raw, untruncated 'tags' values before parsing them.
questionsDF.select('tags').show(truncate=False)

In [None]:
@udf(ArrayType(StringType()))
def transform_tags(tags):
    """Convert a raw tag string such as '<python><apache-spark>' into a list of tags.

    Args:
        tags: String in which every tag is wrapped in angle brackets, or None.

    Returns:
        The tag names in their original order. None when ``tags`` is None, so the
        null propagates to the result column instead of failing the task; an empty
        list when the string contains no tags.
    """
    if tags is None:
        # A null value would otherwise raise AttributeError inside the Python worker.
        return None

    # Remove the outermost brackets only when they are actually present, so a
    # malformed value does not silently lose its first or last character.
    inner = tags
    if inner.startswith('<'):
        inner = inner[1:]
    if inner.endswith('>'):
        inner = inner[:-1]

    if not inner:
        # '' or '<>' carries no tags; avoid returning [''].
        return []

    # Adjacent tags are separated by '><' once the outer brackets are gone.
    return inner.split('><')

In [None]:
# Add 'tags_arr': the raw 'tags' string parsed into an array by the transform_tags UDF.
resultDF = questionsDF.withColumn('tags_arr', transform_tags(col('tags')))

In [None]:
# Preview the result; truncate=5 shortens displayed string values to 5 characters.
resultDF.show(truncate=5)

# Task II

* For each question, check whether its tags can be matched in the body text and count the matches. Then see whether the number of matches correlates with the response time.

In [None]:
@udf(IntegerType())
def detect_tags(message, tags):
    """Count the literal, case-sensitive occurrences of the given tags in a message.

    Args:
        message: Text to search in, or None.
        tags: Iterable of tag strings, or None.

    Returns:
        Total number of non-overlapping occurrences summed over all tags.
        0 when ``message`` is None or ``tags`` is None/empty.
    """
    if message is None or not tags:
        # Nothing to search in / nothing to search for.
        return 0

    total = 0
    for tag in tags:
        if not tag:
            # An empty (or null) tag would match at every position of the message.
            continue
        # re.escape makes the tag a literal pattern: tags such as 'c++' or '.net'
        # contain regex metacharacters and would otherwise raise re.error or
        # match unrelated text.
        total += len(re.findall(re.escape(tag), message))
    return total

In [None]:
# NOTE: this variant feeds detect_tags with the output of a higher-order function
# (TRANSFORM) that is itself applied to the output of another UDF (transform_tags).
# With this UDF -> HOF -> UDF chain the result was observed to look non-deterministic.

with_matchesDF = (
    resultDF
    .select('question_id', 'body', 'tags_arr')
    .selectExpr(
        'question_id',
        'body',
        # Replace '-' with ' ' in every tag so multi-word tags can match plain text.
        'TRANSFORM(tags_arr, (value -> regexp_replace(value, "-", " "))) AS tags'
    )
    .withColumn('matches', detect_tags(col('body'), 'tags'))
    .withColumn('body_length', length('body'))
    .orderBy(desc('matches'))
)

# show() returns None, so it is called separately to keep the DataFrame
# (not None) bound to with_matchesDF.
with_matchesDF.show(truncate=30)

In [None]:
@udf(IntegerType())
def detect_t(message, tags):
    """Count literal occurrences of the tags in a message, treating '-' in a tag as ' '.

    Args:
        message: Text to search in, or None.
        tags: Iterable of tag strings (e.g. 'apache-spark'), or None.

    Returns:
        Total number of non-overlapping, case-sensitive occurrences summed over
        all tags after replacing '-' with ' '. 0 when ``message`` is None or
        ``tags`` is None/empty.
    """
    if message is None or not tags:
        # Nothing to search in / nothing to search for.
        return 0

    total = 0
    for tag in tags:
        if not tag:
            # An empty (or null) tag would match at every position of the message.
            continue
        phrase = tag.replace('-', ' ')
        # re.escape makes the phrase a literal pattern: tags such as 'c++' or
        # '.net' contain regex metacharacters and would otherwise raise re.error
        # or match unrelated text.
        total += len(re.findall(re.escape(phrase), message))
    return total

In [None]:
# Count tag matches per question with detect_t, add the body length, and display
# the questions with the most matches first.
resultDF.select(
    col('question_id'), col('body'), col('tags_arr')
).withColumn(
    'matches', detect_t(col('body'), col('tags_arr'))
).withColumn(
    'body_length', length(col('body'))
).orderBy(
    col('matches').desc()
).show(truncate=30)

In [154]:
# Stop the Spark session created at the top of the notebook.
spark.stop()