# Gold Layer
In this notebook, we are creating the final tables for the Gold Layer of our Data Pipeline. In this layer, the data is ready for consumption by all the clients.

In this layer, we will:
* Create a weekly table for the stocks
* Create a monthly table for the stocks


In [0]:
# Importing needed Pyspark functions
from pyspark.sql import functions as F
from pyspark.sql.functions import col, when
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType, DateType
from pyspark.sql.window import Window

In [0]:
# Helper that persists a dataset into the Gold Layer
def save_gold(df, name):
    """Write a dataset to the gold folder in parquet format.

    Inputs:
    df: dataset to persist; it is coalesced to a single partition before writing
    name: str = name of the target folder inside the gold layer

    Anything already stored under that folder is overwritten.
    """
    # Destination inside the Gold Layer
    file_path = f'/FileStore/tables/gold/{name}'

    # Configure the writer first (one partition, parquet, overwrite), then trigger the save
    writer = df.coalesce(1).write.format('parquet').mode('overwrite')
    writer.save(file_path)

    print(f'File {name} saved successfuly.')

In [0]:
# Load the Stocks table (parquet) from the silver folder
stocks = spark.read.format('parquet').load('/FileStore/tables/silver/stocks')

In [0]:
# Creating dataset of stock prices by week
#
# Notes on correctness:
# * weekofyear() returns the ISO 8601 week number (weeks start on Monday), so it must be
#   paired with the ISO week-numbering year rather than the calendar year. Otherwise days
#   around New Year (e.g. 2019-12-31, which belongs to week 1 of 2020) are grouped with
#   the first week of the wrong year. The ISO week-year is the calendar year of the
#   Thursday of the week: truncate to the Monday of the week and add 3 days.
#   With the correct week-year no rows need to be filtered out afterwards.
# * first()/last() depend on row order, which is not guaranteed after a groupBy shuffle.
#   Open/close are therefore taken from the earliest/latest 'date' in each group by
#   aggregating a (date, value) struct, which is ordered by its first field.
#   NOTE(review): assumes 'date' is a DateType (or an ISO 'yyyy-MM-dd' string) so that
#   ordering by it is chronological - confirm against the silver schema.
weekly_stocks = (
    stocks # dataset
    .groupBy(
        'ticker',
        F.year( F.date_add( F.to_date( F.date_trunc('week', col('date')) ), 3 ) ).alias('year'), # ISO week-numbering year
        F.weekofyear('date').alias('week') ) # ISO week number
    .agg( F.min(F.struct('date', 'open')).getField('open').alias('open'), # open of the earliest day in the week
          F.max('high').alias('high'), # get high for the week
          F.min('low').alias('low'), # get low for the week
          F.max(F.struct('date', 'close')).getField('close').alias('close'), # close of the latest day in the week
          F.sum('volume').alias('volume') ) # calc total volume for the week
    .sort('ticker', 'year', 'week')
    )

# Save dataset to the Gold Layer
save_gold(weekly_stocks, 'weekly_stocks')

File weekly_stocks saved successfuly.


In [0]:
# Creating dataset of stock prices by month
#
# first()/last() depend on row order, which is not guaranteed after a groupBy shuffle.
# Open/close are therefore taken from the earliest/latest 'date' in each group by
# aggregating a (date, value) struct, which is ordered by its first field.
# NOTE(review): assumes 'date' is a DateType (or an ISO 'yyyy-MM-dd' string) so that
# ordering by it is chronological - confirm against the silver schema.
monthly_stocks = (
    stocks # dataset
    .groupBy('ticker', F.year('date').alias('year'), F.month('date').alias('month')) # group by ticker, year and month
    .agg( F.min(F.struct('date', 'open')).getField('open').alias('open'), # open of the earliest day in the month
          F.max('high').alias('high'), # get high for the month
          F.min('low').alias('low'), # get low for the month
          F.max(F.struct('date', 'close')).getField('close').alias('close'), # close of the latest day in the month
          F.sum('volume').alias('volume') ) # calc total volume for the month
    .sort('ticker', 'year', 'month')
    )

# Save dataset to the Gold Layer
save_gold(monthly_stocks, 'monthly_stocks')

File monthly_stocks saved successfuly.
