# ELT II

In this notebook you will:
* transform timestamp type
* use when condition
* compose filters
* use some other DataFrame functions (split, size, substring)
* convert parquet to csv

In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, when, year, size, split, substring
)

import os

In [2]:
spark = (
    SparkSession
    .builder
    .appName('ETL II')
    .getOrCreate()
)

# Task

* take dataset with questions (converted to parquet)
* add a new col 'long_title' which is of BooleanType and it is True if the 'title' has more than 5 words
* add a new col 'created_year' which is the year in which the question was created
* replace the col 'title' with a col of the same name but only with substring of the first 10 letters
* filter only records that have long title and have accepted_answer_id or records that were created in 2016 or later
* save the result as CSV with following columns (question_id, title, created_year, accepted_answer_id)
* create only one csv file on the output (with header)

In [3]:
base_path = os.getcwd()

project_path = ('/').join(base_path.split('/')[0:-2]) 

data_input_path = os.path.join(project_path, 'output/questions-transformed')

data_output_path = os.path.join(project_path, 'output/etl-output/csv')

In [4]:
questionsDF = (
    spark
    .read
    .option('path', data_input_path)
    .load()
)

In [5]:
resultDF = (
    questionsDF
    .select(
        'question_id',
        'title',
        'creation_date',
        'accepted_answer_id'
    )
    .withColumn('created_year', year('creation_date'))
    .withColumn('title_words', split('title', ' '))
    .withColumn('title_size', size('title_words'))
    .withColumn('long_title', when(col('title_size') > 4, True).otherwise(False) )
    .select(
        'question_id',
        'title',
        'created_year',
        'long_title',
        'accepted_answer_id'
    )
    .filter(
        (col('created_year') > 2015) | 
        (col('accepted_answer_id').isNotNull() & col('long_title'))
    )
    .withColumn('title', substring('title', 1, 9))
    .drop('long_title')
)

In [6]:
resultDF.show()

+-----------+---------+------------+------------------+
|question_id|    title|created_year|accepted_answer_id|
+-----------+---------+------------+------------------+
|     167813|Proving t|        2015|            167832|
|     360185|Can therm|        2017|              null|
|      60913|Space tim|        2013|             60920|
|     375564|Sound wav|        2017|            375565|
|     397115|Why is a |        2018|              null|
|     127964|Question |        2014|            129655|
|     306827|How does |        2017|              null|
|     159308|Why can a|        2015|            165035|
|     403447|Inertia a|        2018|              null|
|      98089|Ground St|        2014|            101131|
|     430369|What is m|        2018|            430391|
|     390718|Can a De |        2018|              null|
|     251952|When only|        2016|              null|
|      61679|Effects o|        2013|             61706|
|     211901|How was t|        2015|            

In [9]:
(
    resultDF
    .coalesce(1)
    .write
    .mode('overwrite')
    .format('csv')
    .option('path', data_output_path)
    .option('header', True)
    .save()
)

Py4JJavaError: An error occurred while calling o81.save.
: java.util.NoSuchElementException: None.get
	at scala.None$.get(Option.scala:347)
	at scala.None$.get(Option.scala:345)
	at org.apache.spark.sql.execution.datasources.BasicWriteJobStatsTracker$.metrics(BasicWriteStatsTracker.scala:173)
	at org.apache.spark.sql.execution.command.DataWritingCommand$class.metrics(DataWritingCommand.scala:51)
	at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.metrics$lzycompute(InsertIntoHadoopFsRelationCommand.scala:47)
	at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.metrics(InsertIntoHadoopFsRelationCommand.scala:47)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.metrics$lzycompute(commands.scala:100)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.metrics(commands.scala:100)
	at org.apache.spark.sql.execution.SparkPlanInfo$.fromSparkPlan(SparkPlanInfo.scala:56)
	at org.apache.spark.sql.execution.SQLExecution$$anonfun$withNewExecutionId$1.apply(SQLExecution.scala:76)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:73)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:668)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:276)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:270)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:745)


In [8]:
spark.stop()