In [1]:
import findspark
# Run findspark before the pyspark imports in the next cell so that pyspark can
# be imported like a regular library.
# NOTE(review): presumably relies on SPARK_HOME or a discoverable Spark
# install on this machine — confirm for your environment.
findspark.init()

In [5]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, when, year, size, split
)

import os

In [3]:
# Obtain the Spark session for this notebook under the application name
# 'ETL II'; getOrCreate() hands back an already-running session if there is one.
spark = SparkSession.builder.appName('ETL II').getOrCreate()

# Task

* Take the dataset with questions.
* Add a new column 'long_title' of BooleanType that is True if the 'title' has more than 5 words.
* Add a new column 'created_year' holding the year in which the question was created.
* Add a new column 'recent_question' of BooleanType that is True if the question was created in 2016 or later.
* Keep only records that either (a) have a long title and an accepted_answer_id, or (b) are recent.
* Save the result as CSV with the following columns: question_id, title, created_year, accepted_answer_id.

In [6]:
# Resolve the input location relative to the notebook's working directory.
base_path = os.getcwd()

# The project root is two directory levels above the working directory.
# os.path.dirname honours the platform's path separator, whereas splitting on a
# literal '/' yields an empty project path when the separator is a backslash.
project_path = os.path.dirname(os.path.dirname(base_path))

# Directory holding the transformed questions dataset read by the next cell.
data_input_path = os.path.join(project_path, 'output', 'questions-transformed')

In [7]:
# Load the questions dataset from data_input_path. No format is given, so the
# reader falls back to Spark's configured default data source.
# NOTE(review): that default is normally parquet — confirm the input files match.
questionsDF = spark.read.option('path', data_input_path).load()

In [8]:
# Build the export set described in the Task cell: keep questions that are
# recent, or that have both a long title and an accepted answer.
resultDF = (
    questionsDF
    .select(
        'question_id',
        'title',
        'creation_date',
        'accepted_answer_id'
    )
    .withColumn('created_year', year('creation_date'))
    # A title is "long" when it has strictly more than 5 space-separated words.
    # when/otherwise keeps the flag a non-null boolean even for a missing title.
    .withColumn(
        'long_title',
        when(size(split('title', ' ')) > 5, True).otherwise(False)
    )
    # A question is "recent" when it was created in 2016 or later.
    .withColumn(
        'recent_question',
        when(col('created_year') >= 2016, True).otherwise(False)
    )
    .filter(
        col('recent_question')
        | (col('long_title') & col('accepted_answer_id').isNotNull())
    )
    # Only these four columns are part of the exported result; the helper
    # flags above are used for filtering only.
    .select(
        'question_id',
        'title',
        'created_year',
        'accepted_answer_id'
    )
)

In [9]:
# Preview the filtered result; show() prints a sample of rows as a text table
# and abbreviates long string values (see the shortened titles in the output).
resultDF.show()

+-----------+--------------------+------------+------------------+
|question_id|               title|created_year|accepted_answer_id|
+-----------+--------------------+------------+------------------+
|     167813|Proving the Loren...|        2015|            167832|
|     360185|Can thermal mass ...|        2017|              null|
|      60913|Space time curvat...|        2013|             60920|
|     375564|Sound wave "lost"...|        2017|            375565|
|     397115|Why is a cylindri...|        2018|              null|
|     127964|Question on using...|        2014|            129655|
|     306827|How does water sl...|        2017|              null|
|     159308|Why can a particl...|        2015|            165035|
|     403447|Inertia as a fund...|        2018|              null|
|      98089|Ground State Assi...|        2014|            101131|
|     430369|What is meant by ...|        2018|            430391|
|     390718|Can a De Sitter u...|        2018|              n