# AWS Glue Job - Load into Gold Stage

## Set the Glue session parameters


In [8]:
%iam_role arn:aws:iam::212430227630:role/LabRole
%region us-east-1
%number_of_workers 2

%idle_timeout 30
%glue_version 4.0
%worker_type G.1X

%%configure 
{
  "--enable-metrics": "true",
  "--enable-observability-metrics": "true"
}

Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
Installed kernel version: 1.0.5 
Current iam_role is arn:aws:iam::212430227630:role/LabRole
iam_role has been set to arn:aws:iam::212430227630:role/LabRole.
Previous region: us-east-1
Setting new region to: us-east-1
Region is set to: us-east-1
Previous number of workers: None
Setting new number of workers to: 2
Current idle_timeout is None minutes.
idle_timeout has been set to 30 minutes.
Setting Glue version to: 4.0
Previous worker type: None
Setting new worker type to: G.1X
The following configurations have been updated: {'--enable-metrics': 'true', '--enable-observability-metrics': 'true'}


## Set up and start your interactive session


In [11]:
%load_ext autoreload
%autoreload 2

In [1]:
import sys
import boto3

from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
  
# Reuse the running SparkContext if there is one, otherwise create it.
sc = SparkContext.getOrCreate()
# Wrap the SparkContext in a GlueContext and take its SparkSession for DataFrame work.
glueContext = GlueContext(sc)
spark = glueContext.spark_session
# Job handle bound to this GlueContext.
job = Job(glueContext)

Trying to create a Glue session for the kernel.
Session Type: glueetl
Worker Type: G.1X
Number of Workers: 2
Idle Timeout: 30
Session ID: fed13227-58b5-4245-b63c-b8ca6b353da8
Applying the following default arguments:
--glue_kernel_version 1.0.5
--enable-glue-datacatalog true
--enable-metrics true
--enable-observability-metrics true
Waiting for session fed13227-58b5-4245-b63c-b8ca6b353da8 to get into ready status...
Session fed13227-58b5-4245-b63c-b8ca6b353da8 has been created.



## Save INFO data to Gold


In [2]:
from datetime import datetime, timedelta, timezone

import pyspark.sql.functions as F




### Set AWS Storage parameters


In [3]:
# S3 bucket used in every data lake path built by this notebook.
BUCKET_NAME = "cryptoengineer"

# Key prefix of each stage inside the bucket, listed bronze, silver, gold.
PREFIX_BRONZE = "datalake/bronze"
PREFIX_SILVER = "datalake/silver"
PREFIX_GOLD = "datalake/gold"




### Load job parameters

In [4]:
# Resolve the run parameters `type` and `stage` from whichever launcher started
# this code: a Glue Workflow, a stand-alone Glue Job, or an interactive session.
# NOTE: `type` shadows the Python built-in, but later cells read the parameter
# under this name, so it is kept.
glue_client = boto3.client("glue")

# Check if the params come from a Glue Workflow
if '--WORKFLOW_NAME' in sys.argv and '--WORKFLOW_RUN_ID' in sys.argv:
    print("Running in Glue Workflow")

    glue_args = getResolvedOptions(
        sys.argv, ['WORKFLOW_NAME', 'WORKFLOW_RUN_ID']
    )

    print("Reading the workflow parameters")
    workflow_args = glue_client.get_workflow_run_properties(
        Name=glue_args['WORKFLOW_NAME'], RunId=glue_args['WORKFLOW_RUN_ID']
    )["RunProperties"]

    # The asset type comes from the workflow run properties; the stage is fixed.
    type = workflow_args['type']
    stage = "silver"
else:
    # Check if the params come from a Glue Job
    try:
        args = getResolvedOptions(sys.argv,
                                  ['JOB_NAME',
                                   'type',
                                   'stage'
                                   ])

        type = args['type']
        stage = args['stage']
        print("Running as Job")
    except Exception:
        # The job arguments could not be resolved: fall back to the defaults
        # used for an interactive session. Catching `Exception` rather than
        # using a bare `except:` lets KeyboardInterrupt and SystemExit propagate.
        print("Running as an interactive session")
        type = "commodities"
        stage = "silver"


Running as an interactive session


In [5]:
# Show the run parameters resolved by the previous cell.
print("Type: ", type)
print("Stage: ", stage)

Type:  commodities
Stage:  silver


## Read data from the Bronze stage

In [6]:
# S3 location of the bronze data for the selected asset type.
# NOTE: `path` is reassigned by later cells (silver, gold), so the cells must run in order.
path=f"s3://{BUCKET_NAME}/{PREFIX_BRONZE}/{type}"
print("Path:",path)

Path: s3://cryptoengineer/datalake/bronze/commodities


In [7]:
# Read the bronze parquet dataset for the selected asset type.
df_bronze = spark.read.parquet(path)

# Inspect the schema and a five-row sample.
df_bronze.printSchema()
df_bronze.show(5)

root
 |-- datetime: string (nullable = true)
 |-- open: double (nullable = true)
 |-- low: double (nullable = true)
 |-- high: double (nullable = true)
 |-- close: double (nullable = true)
 |-- volume: long (nullable = true)
 |-- year: string (nullable = true)
 |-- month: string (nullable = true)
 |-- day: string (nullable = true)
 |-- time: string (nullable = true)
 |-- date: string (nullable = true)
 |-- base_currency: string (nullable = true)
 |-- source: string (nullable = true)
 |-- frequency: string (nullable = true)
 |-- symbol: string (nullable = true)
 |-- audit_time: timestamp (nullable = true)
 |-- type: string (nullable = true)
 |-- load_date: date (nullable = true)

+----------+------+------+------+------+------+----+-----+---+----+----------+-------------+------+---------+------+--------------------+-----------+----------+
|  datetime|  open|   low|  high| close|volume|year|month|day|time|      date|base_currency|source|frequency|symbol|          audit_time|       type| l

## Read data from the Silver stage

In [8]:
# S3 location of the silver data for the selected asset type.
# NOTE: this overwrites the `path` set for bronze by an earlier cell.
path=f"s3://{BUCKET_NAME}/{PREFIX_SILVER}/{type}"
print("Path:",path)

Path: s3://cryptoengineer/datalake/silver/commodities


In [9]:
# Read the silver parquet dataset for the selected asset type.
df_silver = spark.read.parquet(path)

# Inspect the schema and a five-row sample.
df_silver.printSchema()
df_silver.show(5)

root
 |-- DATETIME: timestamp (nullable = true)
 |-- OPEN: double (nullable = true)
 |-- LOW: double (nullable = true)
 |-- HIGH: double (nullable = true)
 |-- CLOSE: double (nullable = true)
 |-- VOLUME: double (nullable = true)
 |-- MONTH: integer (nullable = true)
 |-- DAY: integer (nullable = true)
 |-- TIME: string (nullable = true)
 |-- DATE: string (nullable = true)
 |-- BASE_CURRENCY: string (nullable = true)
 |-- SOURCE: string (nullable = true)
 |-- FREQUENCY: string (nullable = true)
 |-- AUDIT_TIME: timestamp (nullable = true)
 |-- TYPE: string (nullable = true)
 |-- LOAD_DATE: date (nullable = true)
 |-- SYMBOL: string (nullable = true)
 |-- YEAR: integer (nullable = true)

+-------------------+-----+-----+-----+-----+--------+-----+---+----+----------+-------------+------+---------+--------------------+-----------+----------+------+----+
|           DATETIME| OPEN|  LOW| HIGH|CLOSE|  VOLUME|MONTH|DAY|TIME|      DATE|BASE_CURRENCY|SOURCE|FREQUENCY|          AUDIT_TIME|    

## Generate INFO data for Bronze stage

In [11]:
# Name of the bronze table reported in the INFO rows for the selected asset type.
# Fail fast on an unsupported type so `table` is never left undefined.
if type not in ('cryptos', 'indices', 'forex', 'commodities'):
    raise ValueError(f"Unsupported asset type: {type!r}")
table = f"bronze_t_{type}"

print("Table: ", table)

Table:  bronze_t_commodities


In [11]:
"""
(
    df_bronze
    .withColumn("datetime", 
                F.when(F.col("datetime").rlike("^\d{4}-\d{2}-\d{2}$"),F.concat(F.col("datetime"), F.lit(" 00:00:00")))
                .otherwise(F.col("datetime")))
    .groupBy("symbol","frequency","base_currency")
    .agg(
        F.min('datetime').alias('INIT_DATETIME'),
        F.max('datetime').alias('END_DATETIME'),
    )
    .withColumn("STAGE", F.lit('bronze'))
    .withColumn("TYPE", F.lit(type))
    .withColumn("TABLE", F.lit(table))
    .select("STAGE", "TABLE", "SYMBOL", "BASE_CURRENCY", "TYPE", "INIT_DATETIME", "END_DATETIME", "FREQUENCY")
    .withColumn("INIT_DATETIME", F.to_timestamp(F.col("INIT_DATETIME"), 'yyyy-MM-dd HH:mm:ss'))
    .withColumn("END_DATETIME", F.to_timestamp(F.col("END_DATETIME"), 'yyyy-MM-dd HH:mm:ss'))
    .withColumnRenamed("FREQUENCY", "FREQUENCIES") 
    .orderBy("STAGE", "TYPE")
 ).show(20)
 """

+------+--------------------+------+-------------+-----------+-------------------+-------------------+-----------+
| STAGE|               TABLE|SYMBOL|BASE_CURRENCY|       TYPE|      INIT_DATETIME|       END_DATETIME|FREQUENCIES|
+------+--------------------+------+-------------+-----------+-------------------+-------------------+-----------+
|bronze|bronze_t_commodities| BZUSD|          USD|commodities|2023-09-24 18:00:00|2024-09-06 16:45:00|      15min|
|bronze|bronze_t_commodities| BZUSD|          USD|commodities|2019-09-02 00:00:00|2024-09-06 00:00:00|       1day|
|bronze|bronze_t_commodities| CLUSD|          USD|commodities|2023-10-01 18:00:00|2024-09-06 16:45:00|      15min|
|bronze|bronze_t_commodities| CLUSD|          USD|commodities|2019-09-03 00:00:00|2024-09-06 00:00:00|       1day|
|bronze|bronze_t_commodities| GCUSD|          USD|commodities|2023-09-24 18:00:00|2024-09-06 16:45:00|      15min|
|bronze|bronze_t_commodities| GCUSD|          USD|commodities|2019-09-02 00:00:0

In [12]:
    
# Build the bronze INFO rows: one row per (symbol, frequency, base_currency)
# holding the first and last datetime found in the bronze data.
info_df_bronze = (
    df_bronze
    # Bronze `datetime` is a string. Date-only values ('yyyy-MM-dd') get a
    # midnight time appended so every value has the 'yyyy-MM-dd HH:mm:ss' layout.
    # The pattern is a raw string so `\d` is not read as a Python escape sequence.
    .withColumn("datetime",
                F.when(F.col("datetime").rlike(r"^\d{4}-\d{2}-\d{2}$"), F.concat(F.col("datetime"), F.lit(" 00:00:00")))
                .otherwise(F.col("datetime")))
    .groupBy("symbol", "frequency", "base_currency")
    .agg(
        # min/max are taken on the normalised strings and parsed to timestamps below.
        F.min('datetime').alias('INIT_DATETIME'),
        F.max('datetime').alias('END_DATETIME'),
    )
    # Constant columns identifying where these rows come from.
    .withColumn("STAGE", F.lit('bronze'))
    .withColumn("TYPE", F.lit(type))
    .withColumn("TABLE", F.lit(table))
    .select("STAGE", "TABLE", "SYMBOL", "BASE_CURRENCY", "TYPE", "INIT_DATETIME", "END_DATETIME", "FREQUENCY")
    .withColumn("INIT_DATETIME", F.to_timestamp(F.col("INIT_DATETIME"), 'yyyy-MM-dd HH:mm:ss'))
    .withColumn("END_DATETIME", F.to_timestamp(F.col("END_DATETIME"), 'yyyy-MM-dd HH:mm:ss'))
    .withColumnRenamed("FREQUENCY", "FREQUENCIES")
    .orderBy("STAGE", "TYPE")
)

info_df_bronze.printSchema()
info_df_bronze.show(20)

root
 |-- STAGE: string (nullable = false)
 |-- TABLE: string (nullable = false)
 |-- SYMBOL: string (nullable = true)
 |-- BASE_CURRENCY: string (nullable = true)
 |-- TYPE: string (nullable = false)
 |-- INIT_DATETIME: timestamp (nullable = true)
 |-- END_DATETIME: timestamp (nullable = true)
 |-- FREQUENCIES: string (nullable = true)

+------+--------------------+------+-------------+-----------+-------------------+-------------------+-----------+
| STAGE|               TABLE|SYMBOL|BASE_CURRENCY|       TYPE|      INIT_DATETIME|       END_DATETIME|FREQUENCIES|
+------+--------------------+------+-------------+-----------+-------------------+-------------------+-----------+
|bronze|bronze_t_commodities| BZUSD|          USD|commodities|2023-09-24 18:00:00|2024-09-24 11:00:00|      15min|
|bronze|bronze_t_commodities| BZUSD|          USD|commodities|2019-09-02 00:00:00|2024-09-24 00:00:00|       1day|
|bronze|bronze_t_commodities| CLUSD|          USD|commodities|2023-10-01 18:00:00|202

## Generate INFO data for Silver stage

In [13]:
# Name of the silver table reported in the INFO rows for the selected asset type.
# Fail fast on an unsupported type so `table` never keeps a value assigned by an earlier cell.
if type not in ('cryptos', 'indices', 'forex', 'commodities'):
    raise ValueError(f"Unsupported asset type: {type!r}")
table = f"silver_t_{type}"

print("Table: ", table)

Table:  silver_t_commodities


In [14]:
"""
(
    df_silver
    .groupBy("symbol","frequency","base_currency")
    .agg(
        F.min('datetime').alias('INIT_DATETIME'),
        F.max('datetime').alias('END_DATETIME'),
    )
    .withColumn("STAGE", F.lit('silver'))
    .withColumn("TYPE", F.lit(type))        
    .withColumn("TABLE", F.lit(table))
    .select("STAGE", "TABLE", "SYMBOL", "BASE_CURRENCY", "TYPE", "INIT_DATETIME", "END_DATETIME", "FREQUENCY")
    .withColumnRenamed("FREQUENCY", "FREQUENCIES") 
    .orderBy("STAGE", "TYPE")
 ).show(20)
 """

+------+--------------------+------+-------------+-----------+-------------------+-------------------+-----------+
| STAGE|               TABLE|SYMBOL|BASE_CURRENCY|       TYPE|      INIT_DATETIME|       END_DATETIME|FREQUENCIES|
+------+--------------------+------+-------------+-----------+-------------------+-------------------+-----------+
|silver|silver_t_commodities| GCUSD|          USD|commodities|2019-09-02 00:00:00|2024-09-06 00:00:00|       1day|
|silver|silver_t_commodities| NGUSD|          USD|commodities|2019-09-02 00:00:00|2024-09-06 00:00:00|       1day|
|silver|silver_t_commodities| GCUSD|          USD|commodities|2023-09-24 18:00:00|2024-09-06 16:45:00|      15min|
|silver|silver_t_commodities| BZUSD|          USD|commodities|2019-09-02 00:00:00|2024-09-06 00:00:00|       1day|
|silver|silver_t_commodities| CLUSD|          USD|commodities|2023-10-01 18:00:00|2024-09-06 16:45:00|      15min|
|silver|silver_t_commodities| CLUSD|          USD|commodities|2019-09-03 00:00:0

In [14]:
    
# Build the silver INFO rows: one row per (symbol, frequency, base_currency)
# holding the first and last datetime found in the silver data.
info_df_silver = (
    df_silver
    .groupBy("symbol", "frequency", "base_currency")
    .agg(
        # The silver schema printed earlier in this notebook shows DATETIME as a
        # timestamp, so min/max are taken on it directly instead of going through
        # a string round-trip. The cast is a no-op for timestamps and only pins
        # the output column type.
        F.min('datetime').cast('timestamp').alias('INIT_DATETIME'),
        F.max('datetime').cast('timestamp').alias('END_DATETIME'),
    )
    # Constant columns identifying where these rows come from.
    .withColumn("STAGE", F.lit('silver'))
    .withColumn("TYPE", F.lit(type))
    .withColumn("TABLE", F.lit(table))
    .select("STAGE", "TABLE", "SYMBOL", "BASE_CURRENCY", "TYPE", "INIT_DATETIME", "END_DATETIME", "FREQUENCY")
    .withColumnRenamed("FREQUENCY", "FREQUENCIES")
    .orderBy("STAGE", "TYPE")
)

info_df_silver.printSchema()
info_df_silver.show(20)

root
 |-- STAGE: string (nullable = false)
 |-- TABLE: string (nullable = false)
 |-- SYMBOL: string (nullable = true)
 |-- BASE_CURRENCY: string (nullable = true)
 |-- TYPE: string (nullable = false)
 |-- INIT_DATETIME: timestamp (nullable = true)
 |-- END_DATETIME: timestamp (nullable = true)
 |-- FREQUENCIES: string (nullable = true)

+------+--------------------+------+-------------+-----------+-------------------+-------------------+-----------+
| STAGE|               TABLE|SYMBOL|BASE_CURRENCY|       TYPE|      INIT_DATETIME|       END_DATETIME|FREQUENCIES|
+------+--------------------+------+-------------+-----------+-------------------+-------------------+-----------+
|silver|silver_t_commodities| BZUSD|          USD|commodities|2023-09-24 18:00:00|2024-09-24 11:00:00|      15min|
|silver|silver_t_commodities| BZUSD|          USD|commodities|2019-09-02 00:00:00|2024-09-24 00:00:00|       1day|
|silver|silver_t_commodities| CLUSD|          USD|commodities|2023-10-01 18:00:00|202

## Update INFO data for TYPE in Gold Stage

In [15]:
# S3 location of the gold INFO data for the selected asset type.
# NOTE: this overwrites the `path` set for silver by an earlier cell.
path=f"s3://{BUCKET_NAME}/{PREFIX_GOLD}/{type}"
print("Path:",path)

Path: s3://cryptoengineer/datalake/gold/commodities


In [16]:
# Stack the bronze and silver INFO rows into the gold INFO dataset.
# unionByName matches columns by name rather than by position, so the result
# stays correct even if the two frames ever list their columns in a different order.
info_df_gold = info_df_bronze.unionByName(info_df_silver)

info_df_gold.printSchema()
info_df_gold.show(20)

root
 |-- STAGE: string (nullable = false)
 |-- TABLE: string (nullable = false)
 |-- SYMBOL: string (nullable = true)
 |-- BASE_CURRENCY: string (nullable = true)
 |-- TYPE: string (nullable = false)
 |-- INIT_DATETIME: timestamp (nullable = true)
 |-- END_DATETIME: timestamp (nullable = true)
 |-- FREQUENCIES: string (nullable = true)

+------+--------------------+------+-------------+-----------+-------------------+-------------------+-----------+
| STAGE|               TABLE|SYMBOL|BASE_CURRENCY|       TYPE|      INIT_DATETIME|       END_DATETIME|FREQUENCIES|
+------+--------------------+------+-------------+-----------+-------------------+-------------------+-----------+
|bronze|bronze_t_commodities| BZUSD|          USD|commodities|2023-09-24 18:00:00|2024-09-24 11:00:00|      15min|
|bronze|bronze_t_commodities| BZUSD|          USD|commodities|2019-09-02 00:00:00|2024-09-24 00:00:00|       1day|
|bronze|bronze_t_commodities| CLUSD|          USD|commodities|2023-10-01 18:00:00|202

In [17]:
# Overwrite the gold INFO data at `path` as parquet, partitioned by STAGE.
info_df_gold.write.save(
    path,
    format='parquet',
    mode='overwrite',
    partitionBy="STAGE",
)



