## Working with tables

### Task I - Table creation
* create table messages 
* take data from questions (question_id -> message_id, creation_date, body, user_id)
* partition the table by year (derived from creation_date)

### Task II - Table append
* append new data to the table
* take data from answers with the same structure
* partition by year & append to the table messages

### Task III - Partitions overwrite
* overwrite only partition for the year 2018
* take data from questions but filter only for year 2018
* use insertInto with dynamic overwrite

### Task IV - Tables management
* list all tables that we have in our database
* see the properties of the messages table
* rename the table messages -> posts
* see all partitions that the table has
* see the properties of the partition year=2018


In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, year
from pyspark.sql.types import *

import os

In [2]:
# Get (or create) the Spark session for this notebook, named 'Tables'.
# Hive support is enabled; later cells create and modify the `messages`
# table through the session catalog.
spark = SparkSession.builder.appName('Tables').enableHiveSupport().getOrCreate()

In [3]:
# The project root is taken to be three directory levels above the current
# working directory: drop the last three '/'-separated path components.
base_path = os.getcwd()
project_path = '/'.join(base_path.split('/')[:-3])

# Input and output locations, all relative to the project root.
questions_input_path, answers_input_path, messages_path = (
    os.path.join(project_path, relative)
    for relative in (
        'output/questions-transformed',
        'data/answers',
        'output/tables/messages',
    )
)

In [4]:
# Explicit schema applied when reading the parquet inputs.
# StructType.add() creates nullable fields by default, the same as
# StructField(name, dataType) does.
my_schema = (
    StructType()
    .add('question_id', LongType())
    .add('creation_date', TimestampType())
    .add('body', StringType())
    .add('user_id', LongType())
)

In [5]:
# Load the transformed questions (parquet) using the explicit schema.
questionsDF = spark.read.load(questions_input_path, format="parquet", schema=my_schema)

In [6]:
# Load the answers (parquet) using the same explicit schema as the questions.
# NOTE(review): the schema's id column is `question_id`; confirm that this is
# the column intended to identify an answer in the messages table.
answersDF = spark.read.load(answers_input_path, format="parquet", schema=my_schema)

In [7]:
# Task I: (re)create the `messages` table from the questions.
# - `year` is derived from creation_date and used as the partition column.
# - repartition by `year` before writing so rows of one year are shuffled
#   into the same task (presumably to limit the files per partition directory).
# - mode="overwrite" replaces an existing table; `path` stores the table data
#   under messages_path.
# NOTE(review): the task text mentions question_id -> message_id, but the
# column is written here under its original name `question_id`.
(
    questionsDF
    .withColumn("year", year(col("creation_date")))
    .repartition(col("year"))
    .write
    .saveAsTable(
        "messages",
        mode="overwrite",
        partitionBy="year",
        path=messages_path,
    )
)

In [8]:
# Task II: append the answers to the existing `messages` table.
# Same shape as the table-creation cell: derive `year`, repartition by it,
# and write partitioned by `year` to the same path, but with mode="append"
# so the rows already in the table are kept.
(
    answersDF
    .withColumn("year", year(col("creation_date")))
    .repartition(col("year"))
    .write
    .saveAsTable(
        "messages",
        mode="append",
        partitionBy="year",
        path=messages_path,
    )
)

In [55]:
# Show the session's current partition-overwrite mode
# (the recorded output of this cell is 'DYNAMIC').
spark.conf.get("spark.sql.sources.partitionOverwriteMode")

'DYNAMIC'

In [53]:
# Switch partition overwriting to DYNAMIC so that insertInto(..., overwrite=True)
# replaces only the partitions that receive new data.
# NOTE(review): this cell is placed after the cell that reads the setting,
# although its execution counter (53) is lower than that cell's (55); when the
# notebook is run top to bottom, the read happens before this set.
spark.conf.set("spark.sql.sources.partitionOverwriteMode", "DYNAMIC")

In [48]:
# Insert the 2018 questions into the existing `messages` table.
# insertInto resolves columns by position, so `year` is added last to line up
# with the table's trailing partition column.
# NOTE(review): overwrite=False appends, so every rerun of this cell adds the
# 2018 rows again. Task III asks for an overwrite of the 2018 partition;
# confirm this cell is kept intentionally as an append demonstration.
(
    questionsDF
    .withColumn("year", year(col("creation_date")))
    .where(col("year") == 2018)
    .repartition(col("year"))
    .write
    .insertInto("messages", overwrite=False)
)

In [14]:
# Count the rows for year 2018 by reading the table's parquet files directly
# from messages_path. `year` is not part of my_schema; presumably it is
# discovered from the year=... partition directories.
spark.read.load(messages_path, format="parquet", schema=my_schema).where(col("year") == 2018).count()

22286

In [57]:
# Behaviour of insertInto depending on spark.sql.sources.partitionOverwriteMode
#
# static:
#   .insertInto("messages", overwrite=False)  appends the data to the partitions
#       where it belongs and leaves the other partitions untouched
#   .insertInto("messages", overwrite=True)   overwrites everything - only the
#       partitions that contain the new data remain
#
# dynamic:
#   .insertInto("messages", overwrite=False)  appends only to the given
#       partition and leaves the other partitions untouched
#   .insertInto("messages", overwrite=True)   overwrites only the given partition

# Task III: replace the 2018 partition with the 2018 questions.
# insertInto resolves columns by position, so `year` is added last.
(
    questionsDF
    .withColumn("year", year(col("creation_date")))
    .where(col("year").isin(2018))
    .repartition(col("year"))
    .write
    .insertInto("messages", overwrite=True)
)

In [30]:
# Re-count the rows for year 2018 straight from the parquet files under
# messages_path. `year` is not part of my_schema; presumably it is discovered
# from the year=... partition directories.
spark.read.load(messages_path, format="parquet", schema=my_schema).where(col("year") == 2018).count()

89144