Processing the posts file requires at least 8 GB of driver memory. In SPARK_HOME/conf/spark-defaults.conf, set spark.driver.memory 8g

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode

import os

In [2]:
# Local-mode session using every available core. The extra package coordinate
# pulls in the Databricks spark-xml artifact; driver memory is raised to 8g.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('Analyze table')
    .config("spark.jars.packages", "com.databricks:spark-xml_2.11:0.9.0")
    .config("spark.driver.memory", "8g")
    .getOrCreate()
)

In [3]:
# Run with Spark 2.4.5.
# Spark 3.0.0-preview2 had some problem with binding expressions.

spark.version

'2.4.5'

In [4]:
# Directory the notebook is running from.
base_path = os.getcwd()

# Keep only the first three '/'-separated pieces of the working directory
# (the empty piece before the leading slash plus two directory names),
# e.g. '/home/alice/work/notebooks' -> '/home/alice'.
project_path = '/'.join(base_path.split('/', 3)[:3])

In [11]:
# Inputs: the extracted XML files are assumed to live in the Downloads folder
# under project_path.
users_input_path, posts_input_path, badges_input_path = (
    os.path.join(project_path, 'Downloads/physics.stackexchange.com/' + xml_name)
    for xml_name in ('Users.xml', 'Posts.xml', 'Badges.xml')
)

# Outputs: one directory per dataset under spark-trainings/training-data.
(
    users_output_path,
    posts_output_path,
    questions_output_path,
    questions_json_output_path,
    answers_output_path,
    badges_output_path,
) = (
    os.path.join(project_path, 'spark-trainings/training-data/' + dataset)
    for dataset in ('users', 'posts', 'questions', 'questions-json', 'answers', 'badges')
)

In [7]:
# users

# Read Users.xml, flatten the nested `row` array into one record per user,
# rename the attribute columns, and store the result as Parquet in 8 partitions.
(
    spark.read.format('xml')
    .option("rowTag", "users")
    .load(users_input_path)
    .select(explode("row"))   # one output record per array element, in column `col`
    .select("col.*")          # promote the struct fields to top-level columns
    .select(*[
        col(source).alias(target)
        for source, target in (
            ('_Id', 'user_id'),
            ('_DisplayName', 'display_name'),
            ('_AboutMe', 'about'),
            ('_Location', 'location'),
            ('_DownVotes', 'downvotes'),
            ('_UpVotes', 'upvotes'),
            ('_Reputation', 'reputation'),
            ('_Views', 'views'),
        )
    ])
    .repartition(8)
    .write.mode('overwrite')
    .parquet(users_output_path)
)

In [9]:
# badges

# Read Badges.xml, flatten the nested `row` array into one record per badge,
# parse the award date as a timestamp, and store the result as Parquet
# in 8 partitions.
(
    spark.read.format('xml')
    .option("rowTag", "badges")
    .load(badges_input_path)
    .select(explode("row"))   # one output record per array element, in column `col`
    .select("col.*")          # promote the struct fields to top-level columns
    .select(
        col('_UserId').alias('user_id'),
        col('_Date').cast('timestamp').alias('date'),
        col('_Name').alias('name'),
        col('_Class').alias('class'),
    )
    .repartition(8)
    .write.mode('overwrite')
    .parquet(badges_output_path)
)

In [6]:
# posts

# Dump every post, with its original attribute columns, to Parquet.
#
# With rowTag "posts" the reader returns the enclosing <posts> element as a
# record whose `row` field is an array (hence the explode). Repartitioning
# before the explode only shuffles those few enclosing records (presumably a
# single one per file - verify against the dump) and cannot spread the posts
# themselves, so the repartition is applied to the flattened rows instead.
(
    spark
    .read
    .format('xml')
    .option("rowTag", "posts")
    .load(posts_input_path)
    .select(explode(col("row")))
    .select(col("col.*"))
    .repartition(200)
    .write
    .mode('overwrite')
    .format("parquet")
    .option('path', posts_output_path)
    .save()
)

In [8]:
# questions

# Extract the question posts (PostTypeId == 1) from Posts.xml and store them
# as Parquet in 8 partitions.
#
# Output columns, in order: question_id, creation_date, title, body, comments,
# accepted_answer_id, answers, user_id, score, tags, views.
(
    spark
    .read
    .format('xml')
    .option("rowTag", "posts")
    .load(posts_input_path)
    .select(explode(col("row")))
    .select(col("col.*"))
    .filter(col('_PostTypeId') == 1)  # questions
    .select(
        col('_Id').alias('question_id'),
        col('_CreationDate').cast('timestamp').alias('creation_date'),
        col('_Title').alias('title'),
        col('_Body').alias('body'),
        # Spelled like the other PascalCase attributes so the lookup does not
        # depend on case-insensitive column resolution.
        col('_CommentCount').alias('comments'),
        col('_AcceptedAnswerId').alias('accepted_answer_id'),
        col('_AnswerCount').alias('answers'),
        col('_OwnerUserId').alias('user_id'),
        col('_Score').alias('score'),
        col('_Tags').alias('tags'),
        col('_ViewCount').alias('views'),
    )
    .repartition(8)
    .write
    .mode('overwrite')
    # Explicit format: the output is read back with spark.read.parquet, so it
    # must not depend on the session's default data source.
    .format("parquet")
    .option('path', questions_output_path)
    .save()
)

In [12]:
# Re-export the questions Parquet dataset as JSON, replacing any previous output.
(
    spark.read.format('parquet')
    .load(questions_output_path)
    .write
    .json(questions_json_output_path, mode='overwrite')
)

In [10]:
# answers

# Extract the answer posts (PostTypeId == 2) from Posts.xml and store them
# as Parquet in 8 partitions.
#
# Output columns, in order: answer_id, creation_date, body, comments, user_id,
# score, question_id (taken from the post's ParentId attribute).
(
    spark
    .read
    .format('xml')
    .option("rowTag", "posts")
    .load(posts_input_path)
    .select(explode(col("row")))
    .select(col("col.*"))
    .filter(col('_PostTypeId') == 2)  # answers
    .select(
        col('_Id').alias('answer_id'),
        col('_CreationDate').cast('timestamp').alias('creation_date'),
        col('_Body').alias('body'),
        # Spelled like the other PascalCase attributes so the lookup does not
        # depend on case-insensitive column resolution.
        col('_CommentCount').alias('comments'),
        col('_OwnerUserId').alias('user_id'),
        col('_Score').alias('score'),
        col('_ParentId').alias('question_id'),
    )
    .repartition(8)
    .write
    .mode('overwrite')
    # Explicit format, consistent with the other Parquet outputs, so the
    # result does not depend on the session's default data source.
    .format("parquet")
    .option('path', answers_output_path)
    .save()
)