# ETL II

In this notebook you will:
* transform timestamp type
* use when condition
* compose filters
* use some other DataFrame functions (split, size, substring)
* convert parquet to csv

In [None]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, when, year, size, split, substring
)

import os

In [None]:
# Obtain the Spark session for this notebook: an already running session is
# reused, otherwise a new one named 'ETL II' is started.
spark = SparkSession.builder.appName('ETL II').getOrCreate()

# Task

* take dataset with questions (converted to parquet)
* add a new col 'long_title' of BooleanType that is True if the 'title' has more than 5 words
* add a new col 'created_year' which is the year in which the question was created
* replace the col 'title' with a col of the same name that contains only the first 10 characters of the original title
* filter only records that have long title and have accepted_answer_id or records that were created in 2016 or later
* save the result as CSV with the following columns (question_id, title, created_year, accepted_answer_id)
* create only one csv file on the output (with header)

In [None]:
# Resolve the project root as the directory two levels above the current
# working directory. os.path.dirname is used instead of splitting on '/' so
# the result is correct for any OS path separator and never collapses to an
# empty string (which would silently turn the paths below into relative ones)
# when the working directory is shallow.
base_path = os.getcwd()

project_path = os.path.dirname(os.path.dirname(base_path))

# Location of the questions dataset read by this notebook.
data_input_path = os.path.join(project_path, 'output/questions-transformed')

# Directory that receives the CSV export.
data_output_path = os.path.join(project_path, 'output/etl-output/csv')

In [None]:
# Load the questions dataset. The parquet format is stated explicitly so the
# read does not depend on the session's default data source setting
# (spark.sql.sources.default).
questionsDF = (
    spark
    .read
    .format('parquet')
    .option('path', data_input_path)
    .load()
)

In [None]:
# Thresholds taken from the task description.
LONG_TITLE_WORD_LIMIT = 5   # a title is "long" when it has MORE than this many words
TITLE_PREFIX_LENGTH = 10    # number of leading characters of 'title' to keep

resultDF = (
    questionsDF
    .select(
        'question_id',
        'title',
        'creation_date',
        'accepted_answer_id'
    )
    .withColumn('created_year', year('creation_date'))
    # Count the words of the title by splitting it on single spaces.
    .withColumn('title_words', split('title', ' '))
    .withColumn('title_size', size('title_words'))
    # when/otherwise gives a strict True/False: rows where the comparison is
    # null fall through to otherwise(False) instead of producing null.
    .withColumn(
        'long_title',
        when(col('title_size') > LONG_TITLE_WORD_LIMIT, True).otherwise(False)
    )
    .select(
        'question_id',
        'title',
        'created_year',
        'long_title',
        'accepted_answer_id'
    )
    # Keep questions created in 2016 or later, or questions that have both a
    # long title and an accepted answer.
    .filter(
        (col('created_year') > 2015) |
        (col('accepted_answer_id').isNotNull() & col('long_title'))
    )
    # Shorten the title only AFTER long_title was derived from the full text.
    # substring takes a 1-based start position and a length.
    .withColumn('title', substring('title', 1, TITLE_PREFIX_LENGTH))
    # long_title was only needed for filtering; it is not part of the output.
    .drop('long_title')
)

In [None]:
# Preview the result: print the first 20 rows, truncating long cell values.
resultDF.show(n=20, truncate=True)

In [None]:
# Export the result as CSV with a header row, replacing any previous output.
# coalesce(1) reduces the data to a single partition so that only one CSV
# file is produced in the output directory.
(
    resultDF
    .coalesce(1)
    .write
    .csv(data_output_path, mode='overwrite', header=True)
)

In [None]:
# Stop the Spark session; no Spark work can be run through `spark` after this.
spark.stop()

In [None]:
|