# CARGA DE DATOS INFORMATIVOS EN LA CAPA GOLD
##### Este notebook actualiza en la capa gold la información agregada que resume, por símbolo, los datos existentes en las capas bronze y silver

####  Run this cell to set up and start your interactive session.


In [30]:
#%help
#%stop_session

In [1]:
# Glue interactive-session settings; these magics are applied before the session is created.
%region us-east-1
%number_of_workers 2
# Idle timeout is expressed in minutes.
%idle_timeout 30
%worker_type G.1X
%glue_version 4.0

# Name of the S3 bucket used to build the datalake paths read and written by the cells below.
# NOTE(review): the saved output of the final write cell shows 's3://cryptoengineer-lg/...' — confirm this value is current.
BUCKET = 'cryptoengineer'

Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
Installed kernel version: 1.0.5 
Previous region: us-east-1
Setting new region to: us-east-1
Region is set to: us-east-1
Previous number of workers: None
Setting new number of workers to: 2
Current idle_timeout is None minutes.
idle_timeout has been set to 30 minutes.
Previous worker type: None
Setting new worker type to: G.1X
Setting Glue version to: 4.0
Trying to create a Glue session for the kernel.
Session Type: glueetl
Worker Type: G.1X
Number of Workers: 2
Idle Timeout: 30
Session ID: f503f0e8-d364-4efd-8807-aec07176a523
Applying the following default arguments:
--glue_kernel_version 1.0.5
--enable-glue-datacatalog true
Waiting for session f503f0e8-d364-4efd-8807-aec0

In [2]:
#Importación de librerías necesarias
from pyspark.sql.types import StructField, StructType, StringType, DoubleType, IntegerType, TimestampType, DateType
from pyspark.sql.functions import col, from_unixtime, lit, regexp_replace, current_date, min as spark_min, max as spark_max, upper
import boto3
import os
import sys

from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
  
# Reuse the active SparkContext if one exists, otherwise create it.
sc = SparkContext.getOrCreate()
# Wrap it in a GlueContext and take its SparkSession, which the cells below use for reads and writes.
glueContext = GlueContext(sc)
spark = glueContext.spark_session
# NOTE(review): `job` is created here but job.init()/job.commit() are not called in the visible cells — confirm whether that is intended.
job = Job(glueContext)




In [3]:
# Resolve the job parameters; the asset type to process arrives as the `type` argument.
glue_client = boto3.client("glue")

# A workflow run is detected by both workflow flags being present on the command line.
launched_by_workflow = all(
    flag in sys.argv for flag in ('--WORKFLOW_NAME', '--WORKFLOW_RUN_ID')
)

if not launched_by_workflow:
    print("Running as Job")
    args = getResolvedOptions(sys.argv, ['JOB_NAME', 'type'])
    asset_type = args['type']
else:
    print("Running in Glue Workflow")
    glue_args = getResolvedOptions(
        sys.argv, ['WORKFLOW_NAME', 'WORKFLOW_RUN_ID', 'type']
    )
    print(glue_args)
    asset_type = glue_args['type']

GlueArgumentError: the following arguments are required: --JOB_NAME, --type


In [6]:
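# Uncomment to set the asset type by hand when job arguments are not available (e.g. in an interactive session).
#asset_type = 'cryptos'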

In [4]:
# Read the bronze-layer parquet data for the selected asset type.
# The 'header' option was dropped: it is a CSV option and has no effect on a Parquet read.
bronze_path = f's3://{BUCKET}/datalake/bronze/{asset_type}'
df_bronze = spark.read.parquet(bronze_path)

# Quick sanity check of the schema and a few rows.
df_bronze.printSchema()
df_bronze.show(5)

root
 |-- TIMESTAMP: string (nullable = true)
 |-- OPEN: double (nullable = true)
 |-- HIGH: double (nullable = true)
 |-- LOW: double (nullable = true)
 |-- CLOSE: double (nullable = true)
 |-- VOLUME: double (nullable = true)
 |-- TRADES: integer (nullable = true)
 |-- ORIGIN: string (nullable = true)
 |-- SYMBOL: string (nullable = true)
 |-- DATETIME: timestamp (nullable = true)
 |-- YEAR: integer (nullable = true)
 |-- load_date: date (nullable = true)

+----------+-------+-------+-------+-------+----------+------+--------+-------+-------------------+----+----------+
| TIMESTAMP|   OPEN|   HIGH|    LOW|  CLOSE|    VOLUME|TRADES|  ORIGIN| SYMBOL|           DATETIME|YEAR| load_date|
+----------+-------+-------+-------+-------+----------+------+--------+-------+-------------------+----+----------+
|1578496500| 8300.0| 8300.0| 8300.0| 8300.0|     0.002|     1|historic|XBTUSDC|2020-01-08 15:15:00|2020|2024-08-23|
|1578497400| 8300.0| 8300.0| 8300.0| 8300.0| 4.0963E-4|     1|historic|XB

In [5]:
# Read the silver-layer parquet data for the selected asset type.
# The 'header' option was dropped: it is a CSV option and has no effect on a Parquet read.
silver_path = f's3://{BUCKET}/datalake/silver/{asset_type}'
df_silver = spark.read.parquet(silver_path)

# Quick sanity check of the schema and a few rows.
df_silver.printSchema()
df_silver.show(5)

root
 |-- BASE_CURRENCY: string (nullable = true)
 |-- TYPE: string (nullable = true)
 |-- DATETIME: timestamp (nullable = true)
 |-- DATE: string (nullable = true)
 |-- TIME: string (nullable = true)
 |-- FREQUENCY: string (nullable = true)
 |-- MONTH: integer (nullable = true)
 |-- DAY: integer (nullable = true)
 |-- OPEN: double (nullable = true)
 |-- HIGH: double (nullable = true)
 |-- LOW: double (nullable = true)
 |-- CLOSE: double (nullable = true)
 |-- VOLUME: double (nullable = true)
 |-- TRADE: integer (nullable = true)
 |-- AUDIT_TIME: date (nullable = true)
 |-- SYMBOL: string (nullable = true)
 |-- YEAR: integer (nullable = true)

+-------------+-------+-------------------+----------+--------+---------+-----+---+--------+--------+--------+--------+----------+-----+----------+-------+----+
|BASE_CURRENCY|   TYPE|           DATETIME|      DATE|    TIME|FREQUENCY|MONTH|DAY|    OPEN|    HIGH|     LOW|   CLOSE|    VOLUME|TRADE|AUDIT_TIME| SYMBOL|YEAR|
+-------------+-------+---

In [6]:
# Aggregate the bronze data per symbol and add the descriptive columns stored in gold.

# Table label for each supported asset type. An unsupported type now fails here
# with a clear message instead of raising NameError on `table` (or silently
# reusing a stale `table` left over from an earlier cell run).
bronze_tables = {
    'cryptos': 'bronze_t_cryptos',
    'indices': 'bronze_t_indices',
}
if asset_type not in bronze_tables:
    raise ValueError(
        f"Unsupported asset type {asset_type!r}; expected one of {sorted(bronze_tables)}"
    )
table = bronze_tables[asset_type]

info_df_bronze = (
    df_bronze
    # Column names follow the casing of the bronze schema (SYMBOL, DATETIME).
    .groupBy("SYMBOL")
    .agg(
        # First and last datetime seen for each symbol.
        spark_min("DATETIME").alias("INIT_DATETIME"),
        spark_max("DATETIME").alias("END_DATETIME"),
    )
    # NOTE(review): base currency and frequency are constants here, not derived from the data — confirm.
    .withColumn("BASE_CURRENCY", lit('USD'))
    .withColumn("TYPE", lit(asset_type))
    .withColumn("STAGE", lit('bronze'))
    .withColumn("TABLE", lit(table))
    .withColumn("FREQUENCIES", lit('15min'))
    .select("STAGE", "TABLE", "SYMBOL", "BASE_CURRENCY", "TYPE", "INIT_DATETIME", "END_DATETIME", "FREQUENCIES")
    .orderBy("STAGE", "TYPE")
)

info_df_bronze.printSchema()
info_df_bronze.show(5)

root
 |-- STAGE: string (nullable = false)
 |-- TABLE: string (nullable = false)
 |-- SYMBOL: string (nullable = true)
 |-- BASE_CURRENCY: string (nullable = false)
 |-- TYPE: string (nullable = false)
 |-- INIT_DATETIME: timestamp (nullable = true)
 |-- END_DATETIME: timestamp (nullable = true)
 |-- FREQUENCIES: string (nullable = false)

+------+---------+--------+-------------+-------+-------------------+-------------------+-----------+
| STAGE|    TABLE|  SYMBOL|BASE_CURRENCY|   TYPE|      INIT_DATETIME|       END_DATETIME|FREQUENCIES|
+------+---------+--------+-------------+-------+-------------------+-------------------+-----------+
|bronze|T_CRYPTOS| XBTUSDC|          USD|cryptos|2020-01-08 15:15:00|2024-03-31 23:45:00|      15min|
|bronze|T_CRYPTOS|1INCHUSD|          USD|cryptos|2021-08-10 15:30:00|2024-03-31 23:45:00|      15min|
+------+---------+--------+-------------+-------+-------------------+-------------------+-----------+


In [7]:
# Aggregate the silver data per symbol and add the descriptive columns stored in gold.

# Table label for each supported asset type. An unsupported type now fails here
# with a clear message instead of raising NameError on `table` (or silently
# reusing a stale `table` left over from an earlier cell run).
silver_tables = {
    'cryptos': 'silver_t_cryptos',
    'indices': 'silver_t_indices',
}
if asset_type not in silver_tables:
    raise ValueError(
        f"Unsupported asset type {asset_type!r}; expected one of {sorted(silver_tables)}"
    )
table = silver_tables[asset_type]

info_df_silver = (
    df_silver
    # Column names follow the casing of the silver schema (SYMBOL, DATETIME).
    .groupBy("SYMBOL")
    .agg(
        # First and last datetime seen for each symbol.
        spark_min("DATETIME").alias("INIT_DATETIME"),
        spark_max("DATETIME").alias("END_DATETIME"),
    )
    # NOTE(review): base currency and frequency are constants here although the silver
    # schema has BASE_CURRENCY and FREQUENCY columns — confirm this is intended.
    .withColumn("BASE_CURRENCY", lit('USD'))
    .withColumn("TYPE", lit(asset_type))
    .withColumn("STAGE", lit('silver'))
    .withColumn("TABLE", lit(table))
    .withColumn("FREQUENCIES", lit('15min'))
    .select("STAGE", "TABLE", "SYMBOL", "BASE_CURRENCY", "TYPE", "INIT_DATETIME", "END_DATETIME", "FREQUENCIES")
    .orderBy("STAGE", "TYPE")
)

info_df_silver.printSchema()
info_df_silver.show(5)

root
 |-- STAGE: string (nullable = false)
 |-- TABLE: string (nullable = false)
 |-- SYMBOL: string (nullable = true)
 |-- BASE_CURRENCY: string (nullable = false)
 |-- TYPE: string (nullable = false)
 |-- INIT_DATETIME: timestamp (nullable = true)
 |-- END_DATETIME: timestamp (nullable = true)
 |-- FREQUENCIES: string (nullable = false)

+------+---------+--------+-------------+-------+-------------------+-------------------+-----------+
| STAGE|    TABLE|  SYMBOL|BASE_CURRENCY|   TYPE|      INIT_DATETIME|       END_DATETIME|FREQUENCIES|
+------+---------+--------+-------------+-------+-------------------+-------------------+-----------+
|silver|T_CRYPTOS| XBTUSDC|          USD|cryptos|2020-01-08 15:15:00|2024-03-31 23:45:00|      15min|
|silver|T_CRYPTOS|1INCHUSD|          USD|cryptos|2021-08-10 15:30:00|2024-03-31 23:45:00|      15min|
+------+---------+--------+-------------+-------+-------------------+-------------------+-----------+


In [8]:
# Stack the bronze and silver summaries into the single gold dataset.
# unionByName matches columns by name rather than by position, so the two
# summaries cannot be silently misaligned if their select lists ever diverge.
info_df_gold = info_df_bronze.unionByName(info_df_silver)

info_df_gold.printSchema()
info_df_gold.show(5)

root
 |-- STAGE: string (nullable = false)
 |-- TABLE: string (nullable = false)
 |-- SYMBOL: string (nullable = true)
 |-- BASE_CURRENCY: string (nullable = false)
 |-- TYPE: string (nullable = false)
 |-- INIT_DATETIME: timestamp (nullable = true)
 |-- END_DATETIME: timestamp (nullable = true)
 |-- FREQUENCIES: string (nullable = false)

+------+---------+--------+-------------+-------+-------------------+-------------------+-----------+
| STAGE|    TABLE|  SYMBOL|BASE_CURRENCY|   TYPE|      INIT_DATETIME|       END_DATETIME|FREQUENCIES|
+------+---------+--------+-------------+-------+-------------------+-------------------+-----------+
|bronze|T_CRYPTOS| XBTUSDC|          USD|cryptos|2020-01-08 15:15:00|2024-03-31 23:45:00|      15min|
|bronze|T_CRYPTOS|1INCHUSD|          USD|cryptos|2021-08-10 15:30:00|2024-03-31 23:45:00|      15min|
|silver|T_CRYPTOS| XBTUSDC|          USD|cryptos|2020-01-08 15:15:00|2024-03-31 23:45:00|      15min|
|silver|T_CRYPTOS|1INCHUSD|          USD|crypt

In [9]:
# Overwrite the gold summary for this asset type, partitioned by STAGE.
gold_path = f's3://{BUCKET}/datalake/gold/{asset_type}'

gold_writer = info_df_gold.write.format('parquet').mode('overwrite').partitionBy("STAGE")
gold_writer.save(gold_path)

print(f'Datos guardados en {gold_path}')

Datos guardados en s3://cryptoengineer-lg/datalake/gold/cryptos
