# ETL notebook

* load json dataset
* look at the inferred schema
* define the schema explicitly
* convert the `tags` column to an array of tags using
 * split
 * explode
 * regexp_replace
 * groupBy + collect_list
 * join

In [34]:
# Run findspark before any `pyspark` import (the next cell imports pyspark).
# NOTE(review): findspark.init() is expected to locate the local Spark
# installation and make `pyspark` importable — confirm against the findspark docs.
import findspark
findspark.init()

In [35]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, desc, count, explode, split, regexp_replace, collect_list

from pyspark.sql.types import StructType, StructField, StringType, LongType, TimestampType

import os

In [36]:
# Obtain the session used by every cell below: reuse an already running
# SparkSession if there is one, otherwise start a new one named 'ETL I'.
spark = SparkSession.builder.appName('ETL I').getOrCreate()

In [37]:
# All locations are resolved relative to the project root, which is taken to be
# two directory levels above the notebook's working directory.
base_path = os.getcwd()

# os.path.dirname understands the platform's path separator. Splitting on '/'
# produced an empty project path on Windows (backslash-separated cwd) and for
# a working directory only two levels below the filesystem root.
project_path = os.path.dirname(os.path.dirname(base_path))

# Directory holding the input JSON files.
data_input_path = os.path.join(project_path, 'data/questions-json')

# Directory the transformed dataset is written to.
output_path = os.path.join(project_path, 'output/questions-transformed')

First let Spark infer the schema:

In [38]:
# Read the JSON dataset without supplying a schema, so Spark has to infer
# the column names and types from the data itself.
questionsDF = spark.read.load(data_input_path, format='json')

In [39]:
# Print the column names, types and nullability of the DataFrame loaded above.
questionsDF.printSchema()

root
 |-- accepted_answer_id: long (nullable = true)
 |-- answers: long (nullable = true)
 |-- body: string (nullable = true)
 |-- comments: long (nullable = true)
 |-- creation_date: string (nullable = true)
 |-- question_id: long (nullable = true)
 |-- score: long (nullable = true)
 |-- tags: string (nullable = true)
 |-- title: string (nullable = true)
 |-- user_id: long (nullable = true)
 |-- views: long (nullable = true)



Now define the schema:

In [40]:
# Explicit schema for the questions dataset. Every field is declared nullable.
# `creation_date` is declared as a timestamp here (Spark inferred it as string).
# NOTE(review): the inferred schema printed above also lists a `score` field;
# it is not declared here, so it is not read — confirm that this is intentional.
json_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ('question_id', LongType()),
        ('creation_date', TimestampType()),
        ('title', StringType()),
        ('body', StringType()),
        ('tags', StringType()),
        ('accepted_answer_id', LongType()),
        ('answers', LongType()),
        ('comments', LongType()),
        ('user_id', LongType()),
        ('views', LongType()),
    )
])

In [41]:
# Re-read the same JSON files, this time applying the explicit schema instead
# of letting Spark infer one. This replaces the DataFrame loaded earlier.
questionsDF = spark.read.load(data_input_path, format='json', schema=json_schema)

In [42]:
# Preview the first rows; truncate=5 shortens long cell values to 5 characters
# so that all columns fit on one line.
questionsDF.show(truncate=5)

+-----------+-------------+-----+-----+-----+------------------+-------+--------+-------+-----+
|question_id|creation_date|title| body| tags|accepted_answer_id|answers|comments|user_id|views|
+-----------+-------------+-----+-----+-----+------------------+-------+--------+-------+-----+
|      24...|        20...|Re...|<p...|<s...|              null|      1|       5|  11...|   90|
|      21...|        20...|Wh...|<p...|<e...|              null|      2|       5|  40394|   76|
|      59458|        20...|Do...|<p...|<e...|              null|      1|      12|  20629|  753|
|      86252|        20...|Wh...|<p...|<e...|              null|      2|       0|  33579| 2365|
|      40...|        20...|Ho...|<b...|<h...|             40...|      2|       0|  18...|   45|
|      21...|        20...|Sc...|<p...|<e...|              null|      0|       3|  60176|   70|
|      93196|        20...|In...|<p...|<h...|              null|      0|       4|  37167|  101|
|       9945|        20...|Ar...|<p...|<

Convert tags to an array

Hint
* use split to get an array
* explode the array
* use regexp_replace

In [43]:
# Rebuild `tags` as an array column, one array per question. The raw value is
# presumably formatted like '<tag1><tag2>' — it is split on '><' and the
# remaining angle brackets are stripped.
result = (
    questionsDF
    # '<a><b>' -> ['<a', 'b>']
    .withColumn('tags_arr', split('tags', '><'))
    # One row per tag. explode() emits no row when the array is null, so
    # questions without tags are missing from the aggregate built below.
    .withColumn('tag', explode('tags_arr'))
    # Remove the '<' / '>' left over on the first and last element.
    .withColumn('tag', regexp_replace('tag', '(<|>)', ''))
    .groupBy('question_id')
    # NOTE: collect_list does not guarantee the element order after the
    # shuffle, so the original tag order may not be preserved.
    .agg(collect_list('tag').alias('tags'))
    # Right join: keep every question from questionsDF, with `tags` null where
    # nothing was collected. The previous inner join silently dropped those
    # questions from the output. Column order is unchanged.
    .join(questionsDF.drop('tags'), 'question_id', 'right')
)

In [44]:
# Persist the transformed questions: redistribute into 4 partitions and
# overwrite whatever already exists at output_path. No format is specified,
# so the session's default data source is used.
result.repartition(4).write.save(output_path, mode='overwrite')