In [29]:
import findspark
findspark.init()

In [89]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, desc, count, explode, split, regexp_replace, collect_list, array_sort, reverse, unix_timestamp, row_number,
    when, lit, lead, avg, udf, length
)

from pyspark.sql import Window

from pyspark.sql.types import (
    ArrayType, StructType, StructField, StringType, LongType, TimestampType, DoubleType, IntegerType
)

import os
import re
import pandas as pd

In [31]:
spark = (
    SparkSession
    .builder
    .appName('UDFs I')
    .getOrCreate()
)

# Task I

* convert question tags to an array using UDFs

In [32]:
base_path = os.getcwd()

project_path = ('/').join(base_path.split('/')[0:-2]) 

data_input_path = os.path.join(project_path, 'data/questions-json')

In [33]:
questionsDF = (
    spark
    .read
    .format('json')
    .option('path', data_input_path)
    .load()
)

In [77]:
questionsDF.printSchema()

root
 |-- accepted_answer_id: long (nullable = true)
 |-- answers: long (nullable = true)
 |-- body: string (nullable = true)
 |-- comments: long (nullable = true)
 |-- creation_date: string (nullable = true)
 |-- question_id: long (nullable = true)
 |-- score: long (nullable = true)
 |-- tags: string (nullable = true)
 |-- title: string (nullable = true)
 |-- user_id: long (nullable = true)
 |-- views: long (nullable = true)



In [34]:
questionsDF.select('tags').show(truncate=False)

+------------------------------------------------------------------------------------+
|tags                                                                                |
+------------------------------------------------------------------------------------+
|<special-relativity><mass>                                                          |
|<electromagnetic-radiation><speed-of-light><velocity><history><relative-motion>     |
|<electromagnetic-radiation><condensed-matter><quantum-electrodynamics><quantum-spin>|
|<electricity>                                                                       |
|<homework-and-exercises><rotational-dynamics><reference-frames>                     |
|<epr-experiment><schroedingers-cat>                                                 |
|<homework-and-exercises><elasticity>                                                |
|<mathematical-physics>                                                              |
|<supersymmetry><group-representations><spa

In [35]:
@udf(ArrayType(StringType()))
def transform_tags(tags):
    tags_list = tags.split('><')
    tags_list[0] = tags_list[0][1:]
    tags_list[-1] = tags_list[-1][0:-1]
    return tags_list

In [41]:
resultDF = (
    questionsDF
    .withColumn('tags_arr', transform_tags('tags'))
)

In [44]:
resultDF.show(truncate=5)

+------------------+-------+-----+--------+-------------+-----------+-----+-----+-----+-------+-----+--------+
|accepted_answer_id|answers| body|comments|creation_date|question_id|score| tags|title|user_id|views|tags_arr|
+------------------+-------+-----+--------+-------------+-----------+-----+-----+-----+-------+-----+--------+
|              null|      1|<p...|       5|        20...|      24...|    3|<s...|Re...|  11...|   90|   [s...|
|              null|      2|<p...|       5|        20...|      21...|    0|<e...|Wh...|  40394|   76|   [e...|
|              null|      1|<p...|      12|        20...|      59458|    9|<e...|Do...|  20629|  753|   [e...|
|              null|      2|<p...|       0|        20...|      86252|    1|<e...|Wh...|  33579| 2365|   [e...|
|             40...|      2|<b...|       0|        20...|      40...|    0|<h...|Ho...|  18...|   45|   [h...|
|              null|      0|<p...|       3|        20...|      21...|    1|<e...|Sc...|  60176|   70|   [e...|
|

# Task II

* For each question see if tags can be matched in the text and find number of matches. Than see if this correlates with the response time

In [124]:
@udf(IntegerType())
def detect_tags(message, tags):
    matches = []
    for tag in tags:
        matches.append(re.findall(r"{}".format(tag) , message))
    #print(matches)
    return len([item for sublist in matches for item in sublist])

In [153]:
# using HOF is not working, result seems to be not deterministic
# result of UDF is argument of HOF which is argument another UDF

with_matchesDF = (
    resultDF
    .select('question_id', 'body', 'tags_arr')
    .selectExpr(
        'question_id',
        'body',
        'TRANSFORM(tags_arr, (value -> regexp_replace(value, "-", " "))) AS tags'
    )
    .withColumn('matches', detect_tags(col('body'), 'tags'))
    .withColumn('body_length', length('body'))
    .orderBy(desc('matches'))
).show(truncate=30)

+-----------+------------------------------+------------------------------+-------+-----------+
|question_id|                          body|                          tags|matches|body_length|
+-----------+------------------------------+------------------------------+-------+-----------+
|     373639|<p>I am reading <a href="ht...|[electromagnetism, forces, ...|   1457|       1456|
|     429390|<p>Traditionally for a free...|[electromagnetism, mass, ch...|   1171|       1165|
|     317957|<p>In the course of learnin...|[electromagnetism, tensor c...|    915|        914|
|     149796|<p>Moment of inertia is the...|[rotational dynamics, defin...|    452|        450|
|     411470|<p>Suppose two objects coll...|[special relativity, mass, ...|     74|       6511|
|     277347|<p>I am aware that there ap...|[newtonian mechanics, energ...|     64|       8106|
|     171914|<p>Second, related question...|    [pressure, water, density]|     60|       6309|
|     244363|<p>If time is a dimension a

In [147]:
@udf(IntegerType())
def detect_t(message, tags):
    matches = []
    t = map(lambda x: x.replace('-', ' '), tags)
    for tag in t:
        matches.append(re.findall(r"{}".format(tag) , message))
    return len([item for sublist in matches for item in sublist])

In [152]:
(
    resultDF
    .select('question_id', 'body', 'tags_arr')
    .withColumn('matches', detect_t('body', 'tags_arr'))
    .withColumn('body_length', length('body'))
    .orderBy(desc('matches'))
).show(truncate=30)

+-----------+------------------------------+------------------------------+-------+-----------+
|question_id|                          body|                      tags_arr|matches|body_length|
+-----------+------------------------------+------------------------------+-------+-----------+
|     411470|<p>Suppose two objects coll...|[special-relativity, mass, ...|     74|       6511|
|     277347|<p>I am aware that there ap...|[newtonian-mechanics, energ...|     64|       8106|
|     171914|<p>Second, related question...|    [pressure, water, density]|     60|       6309|
|     244363|<p>If time is a dimension a...|[special-relativity, spacet...|     55|       6491|
|     266261|<p>[See final additions at ...|[special-relativity, energy...|     53|      14688|
|     433680|<h1>What I'm doing</h1>

<p...|[quantum-mechanics, superpo...|     50|      19550|
|     359977|<p>The other day when I pos...|[energy, particle-physics, ...|     48|       6306|
|      55948|<p>This post is my best eff