# Silver Layer
In this notebook, we will create the Silver Layer of our Data Pipeline. In this layer, Analysts are expected to be able to work with the data already, given that it has been curated, has gone through a first pass of cleanup, and has had its data types validated.

In this Layer, we will:
* Gather the Bronze datasets and consolidate them into two datasets: (1) Stocks; (2) Economic Indices


In [0]:
# Importing needed Pyspark functions
from pyspark.sql import functions as F
from pyspark.sql.functions import col, when
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType, DateType
from pyspark.sql.window import Window

In [0]:
# Function to save datasets to the Silver Layer
def save_silver(df, name):
    '''Write a dataset to the Silver Layer as Parquet, overwriting any previous output.
    Inputs:
    df: object exposing the DataFrame writer API (coalesce / write) to be persisted
    name: str = name of the folder in the silver layer
    Returns:
    None. A confirmation line is printed after the write call returns.'''

    # Destination folder inside the Silver Layer
    target = f'/FileStore/tables/silver/{name}'

    # Collapse to a single partition before writing
    single_partition = df.coalesce(1)
    writer = single_partition.write.format('parquet').mode('overwrite')
    writer.save(target)

    print(f'File {name} saved successfuly.')

In [0]:
# Schema the consolidated stocks dataset is expected to follow (all fields nullable)
columns = (
    StructType()
    .add('date', DateType(), True)
    .add('ticker', StringType(), True)
    .add('open', DoubleType(), True)
    .add('high', DoubleType(), True)
    .add('low', DoubleType(), True)
    .add('close', DoubleType(), True)
    .add('volume', IntegerType(), True)
    .add('RSI', DoubleType(), True)
)

# Row-less RDD used as the data source for the empty frame
emp_RDD = spark.sparkContext.emptyRDD()

# Empty DataFrame carrying only the expected schema; it will hold the consolidated data
stocks = spark.createDataFrame(emp_RDD, columns)

In [0]:
# Determine the datasets to be united
tickers = ['CHTR', 'CMCSA', 'T', 'TMUS', 'VZ']

# Gather transformed datasets.
# NOTE: union() matches columns by position, not by name, so every Bronze dataset is
# assumed to share the column order of the `stocks` schema -- TODO confirm.
# NOTE: this loop appends to `stocks`; re-running this cell without first re-creating
# the empty `stocks` frame adds the same rows a second time.
for ticker in tickers:
    path = f'/FileStore/tables/bronze/{ticker}'
    dtf = spark.read.parquet(path)
    stocks = stocks.union(dtf)
    print(f'{ticker} added to dataframe')

# Sort once, after every ticker has been consolidated, instead of stacking one
# sort per loop iteration onto the query plan.
stocks = stocks.sort('ticker', 'date')

CHTR added to dataframe
CMCSA added to dataframe
T added to dataframe
TMUS added to dataframe
VZ added to dataframe


In [0]:
# Adding Moving Averages to the data
# Base window spec: one partition per ticker, rows ordered by date
ws = Window.partitionBy('ticker').orderBy('date')

# Trailing frames derived from the base spec: the current row plus the N-1 rows
# before it. Frames are counted in rows, not calendar days.
ws7 = ws.rowsBetween(-6, Window.currentRow)
ws14 = ws.rowsBetween(-13, Window.currentRow)
ws23 = ws.rowsBetween(-22, Window.currentRow)
ws180 = ws.rowsBetween(-179, Window.currentRow)

# Add moving averages of the closing price, one column at a time
stocks_silver = stocks.withColumn('ma7', F.mean('close').over(ws7))  # 7-row moving average
stocks_silver = stocks_silver.withColumn('ma23', F.mean('close').over(ws23))  # 23-row moving average
stocks_silver = stocks_silver.withColumn('ma180', F.mean('close').over(ws180))  # 180-row moving average

In [0]:
# Persist the stocks dataset (with moving averages) under the 'stocks' Silver folder
save_silver(df=stocks_silver, name='stocks')

File stocks saved successfuly.


In [0]:
# Schema the consolidated economic indicators dataset is expected to follow
columns = (
    StructType()
    .add('indicator', StringType(), True)
    .add('value', DoubleType(), True)
    .add('date', DateType(), True)
)

# Row-less RDD used as the data source for the empty frame
emp_RDD = spark.sparkContext.emptyRDD()

# Empty DataFrame carrying only the expected schema; it will hold the consolidated data
indicators = spark.createDataFrame(emp_RDD, columns)

# Bronze datasets to be united (indicator names, kept in the `tickers` variable)
tickers = ['INFLATION', 'REAL_GDP_PER_CAPITA', 'CPI']

# Read each Bronze dataset and append it to the consolidated frame.
# union() matches columns by position, so the Bronze column order is assumed to
# follow the schema above -- TODO confirm.
for ticker in tickers:
    path = f'/FileStore/tables/bronze/{ticker}'
    dtf = spark.read.parquet(path)
    indicators = indicators.union(dtf)
    print(f'{ticker} added to dataframe')

INFLATION added to dataframe
REAL_GDP_PER_CAPITA added to dataframe
CPI added to dataframe


In [0]:
# Persist the consolidated indicators under the 'indicators' Silver folder
save_silver(df=indicators, name='indicators')

File indicators saved successfuly.


In [0]:
# Open ETF dataset
etf = spark.read.parquet('/FileStore/tables/bronze/DJUSTL/')

# Keep the same column layout as the stocks schema and add a placeholder RSI column.
# The placeholder is cast to double so its type agrees with the DoubleType RSI
# declared for the stocks dataset; a bare F.lit(0) would yield an integer column.
etf = (etf
       .select('date', 'ticker', 'open', 'high', 'low', 'close', 'volume')
       .withColumn('RSI', F.lit(0).cast(DoubleType()))
       )

# Save ETF DJUSTL to Silver Layer
save_silver(etf, 'DJUSTL')
#dbutils.fs.rm('FileStore/tables/bronze/DJUSTL/', True)

File DJUSTL saved successfuly.
