In [1]:
# findspark.init() is called in the first cell, ahead of the pyspark imports in the next cell.
# NOTE(review): presumably this is what makes the local Spark installation importable
# from this kernel — confirm against the findspark documentation.
import findspark
findspark.init()

In [19]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, desc, count, explode, split, regexp_replace, collect_list, array_sort, reverse, unix_timestamp, row_number,
    when, lit, lead, avg, udf
)

from pyspark.sql import Window

from pyspark.sql.types import ArrayType, StructType, StructField, StringType, LongType, TimestampType, DoubleType, IntegerType

import os

In [3]:
# Build the notebook's Spark session under the application name 'UDFs I';
# getOrCreate() hands back an already-running session when there is one.
spark = SparkSession.builder.appName('UDFs I').getOrCreate()

# Task I

* Convert the question tags (stored as a single string such as `<tag-a><tag-b>`) into an array of tag names using a UDF.

In [4]:
# Directory the notebook kernel is running in.
base_path = os.getcwd()

# Project root: two directory levels above the working directory.
# os.path.dirname is used instead of splitting/joining on a hard-coded '/', so the
# result is correct for the platform's path separator and never collapses to an
# empty string (which would silently turn the data path into a relative one).
project_path = os.path.dirname(os.path.dirname(base_path))

# Folder holding the questions dataset in JSON format.
data_input_path = os.path.join(project_path, 'data', 'questions-json')

In [5]:
# Load the questions dataset: JSON source, location supplied through the 'path' option.
questionsDF = spark.read.format('json').option('path', data_input_path).load()

In [7]:
# Print the raw `tags` strings in full (truncate=False keeps long values intact).
(
    questionsDF
    .select('tags')
    .show(truncate=False)
)

+------------------------------------------------------------------------------------+
|tags                                                                                |
+------------------------------------------------------------------------------------+
|<special-relativity><mass>                                                          |
|<electromagnetic-radiation><speed-of-light><velocity><history><relative-motion>     |
|<electromagnetic-radiation><condensed-matter><quantum-electrodynamics><quantum-spin>|
|<electricity>                                                                       |
|<homework-and-exercises><rotational-dynamics><reference-frames>                     |
|<epr-experiment><schroedingers-cat>                                                 |
|<homework-and-exercises><elasticity>                                                |
|<mathematical-physics>                                                              |
|<supersymmetry><group-representations><spa

In [20]:
@udf(ArrayType(StringType()))
def transform_tags(tags):
    """Turn a tag string such as '<a><b>' into the list ['a', 'b'].

    Args:
        tags: The raw tag string, where every tag is wrapped in angle
            brackets and tags follow each other without a separator.
            May be None when the column value is null.

    Returns:
        A list of tag names without the angle brackets, an empty list for
        an empty string, or None when the input is None.
    """
    # A null column value arrives as None; pass it through as null instead of
    # failing the whole job with an AttributeError on .split().
    if tags is None:
        return None

    # An empty string carries no tags; without this guard the result would be [''].
    if not tags:
        return []

    # Splitting on '><' separates the tags but leaves the opening '<' on the
    # first element and the closing '>' on the last one.
    tags_list = tags.split('><')
    tags_list[0] = tags_list[0][1:]
    tags_list[-1] = tags_list[-1][0:-1]
    return tags_list

In [23]:
# Put the array produced by transform_tags next to the raw string and preview both;
# truncate=40 shortens cell values longer than 40 characters.
questionsDF.select('tags').withColumn('tags_arr', transform_tags('tags')).show(truncate=40)

+----------------------------------------+----------------------------------------+
|                                    tags|                                tags_arr|
+----------------------------------------+----------------------------------------+
|              <special-relativity><mass>|              [special-relativity, mass]|
|<electromagnetic-radiation><speed-of-...|[electromagnetic-radiation, speed-of-...|
|<electromagnetic-radiation><condensed...|[electromagnetic-radiation, condensed...|
|                           <electricity>|                           [electricity]|
|<homework-and-exercises><rotational-d...|[homework-and-exercises, rotational-d...|
|     <epr-experiment><schroedingers-cat>|     [epr-experiment, schroedingers-cat]|
|    <homework-and-exercises><elasticity>|    [homework-and-exercises, elasticity]|
|                  <mathematical-physics>|                  [mathematical-physics]|
|<supersymmetry><group-representations...|[supersymmetry, group-representati