# CARGA DE DATOS INFORMATIVOS EN LA CAPA SILVER
##### Este notebook tiene como objetivo actualizar la información de la capa silver a partir de la máxima fecha de carga encontrada en bronze

####  Run this cell to set up and start your interactive session.


In [29]:
#%help
%stop_session

Stopping session: 44cd6e96-ed9a-4e6d-84e2-f176c3efdcdf
Stopped session.


In [1]:
%region us-east-1
%number_of_workers 2
%idle_timeout 30
%worker_type G.1X
%glue_version 4.0

BUCKET = 'cryptoengineer'

Previous region: us-east-1
Setting new region to: us-east-1
Region is set to: us-east-1
Previous number of workers: 2
Setting new number of workers to: 2
Current idle_timeout is 30 minutes.
idle_timeout has been set to 30 minutes.
Previous worker type: G.1X
Setting new worker type to: G.1X
Setting Glue version to: 4.0
Trying to create a Glue session for the kernel.
Session Type: glueetl
Worker Type: G.1X
Number of Workers: 2
Idle Timeout: 30
Session ID: 858ec864-ab7c-4527-b4ee-cbe9b8fd8819
Applying the following default arguments:
--glue_kernel_version 1.0.5
--enable-glue-datacatalog true
Waiting for session 858ec864-ab7c-4527-b4ee-cbe9b8fd8819 to get into ready status...
Session 858ec864-ab7c-4527-b4ee-cbe9b8fd8819 has been created.



In [3]:
#Importación de librerías necesarias
from pyspark.sql.types import StructField, StructType, StringType, DoubleType, IntegerType, TimestampType, DateType
from pyspark.sql.functions import col, from_unixtime, lit, regexp_replace, current_date, min as spark_min, max as spark_max, upper, date_format
import boto3
import os
import sys

from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
  
sc = SparkContext.getOrCreate()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)




In [3]:
#Load job parameters
glue_client = boto3.client("glue")

if '--WORKFLOW_NAME' in sys.argv and '--WORKFLOW_RUN_ID' in sys.argv:
    print("Running in Glue Workflow")
    
    glue_args = getResolvedOptions(
        sys.argv, ['WORKFLOW_NAME', 'WORKFLOW_RUN_ID', 'type']
    )
    
    print(glue_args)
    
    asset_type = glue_args['type']

else:
    print("Running as Job")
    args = getResolvedOptions(sys.argv,
                              ['JOB_NAME',
                               'type'])

    asset_type = args['type']

GlueArgumentError: the following arguments are required: --JOB_NAME, --type


In [35]:
#asset_type = 'cryptos'

In [5]:
# Carga las particiones disponibles
partitions_df = spark.read.parquet('s3://' + BUCKET + f'/datalake/bronze/{asset_type}').select('load_date').distinct()

# Encuentra el último load_date
max_load_date = partitions_df.agg(spark_max('load_date')).collect()[0][0]

print(f"Último load_date encontrado: {max_load_date}")

Último load_date encontrado: 2024-08-23


In [6]:
#Definición del esquema
schema = StructType([
    StructField("SYMBOL", StringType(), False),
    StructField("BASE_CURRENCY", StringType(), True),
    StructField("TYPE", StringType(), True),
    StructField("DATETIME", TimestampType(), True),
    StructField("DATE", DateType(), True),
    StructField("TIME", StringType(), True),
    StructField("FREQUENCY", StringType(), True),
    StructField("YEAR", IntegerType(), False),
    StructField("MONTH", IntegerType(), True),
    StructField("DAY", IntegerType(), True),
    StructField("OPEN", DoubleType(), True),
    StructField("HIGH", DoubleType(), True),
    StructField("LOW", DoubleType(), True),
    StructField("CLOSE", DoubleType(), True),
    StructField("VOLUME", DoubleType(), True),
    StructField("TRADES", IntegerType(), True),
    StructField("AUDIT_TIME", DateType(), True)
])




In [7]:
# Leer los datos correspondientes al último load_date
latest_data_df = (
    spark.read
    .schema(schema)
    .parquet('s3://' + BUCKET + f'/datalake/bronze/{asset_type}')
    .filter(col('load_date') == max_load_date)
)

# Verifica la estructura y muestra algunas filas
latest_data_df.printSchema()
latest_data_df.show(5)

Py4JJavaError: An error occurred while calling o121.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 4.0 failed 4 times, most recent failure: Lost task 0.3 in stage 4.0 (TID 10) (172.39.254.209 executor 1): org.apache.spark.sql.execution.QueryExecutionException: Parquet column cannot be converted in file s3://cryptoengineer-lg/datalake/bronze/cryptos/load_date=2024-08-23/part-00008-d79ee3bc-10d5-487c-b9d1-2a26696122ba.c000.snappy.parquet. Column: [TRADES], Expected: int, Found: INT64
	at org.apache.spark.sql.errors.QueryExecutionErrors$.unsupportedSchemaColumnConvertError(QueryExecutionErrors.scala:706)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:397)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:227)
	at org.apache.spark.sql.execution.FileSourceScanExec$$anon$1.hasNext(DataSourceScanExec.scala:702)
	at org.apache.spark.sql.catalyst.expres

In [7]:
#transformo datos para obtener la estructura de silver
transformed_data_df = (
    latest_data_df
    .withColumn('year', col('datetime').substr(0, 4))
    .withColumn('month', col('datetime').substr(6, 2))
    .withColumn('day', col('datetime').substr(9, 2))
    .withColumn('frequency', lit('15min'))
    .withColumn('base_currency', lit('USD'))
    .withColumn("date", date_format(col("datetime"), "yyyy-MM-dd"))
    .withColumn("time", date_format(col("datetime"), "HH:mm:ss"))
    .select(
        col('symbol').alias('SYMBOL'),
        col('base_currency').alias('BASE_CURRENCY'),
        lit(asset_type).alias('TYPE'),
        col('datetime').alias('DATETIME'),
        col('date').alias('DATE'),
        col('time').alias('TIME'),
        col('frequency').alias('FREQUENCY'),
        col('year').cast(IntegerType()).alias('YEAR'),
        col('month').cast(IntegerType()).alias('MONTH'),
        col('day').cast(IntegerType()).alias('DAY'),
        col('open').alias('OPEN'),
        col('high').alias('HIGH'),
        col('low').alias('LOW'),
        col('close').alias('CLOSE'),
        col('volume').alias('VOLUME'),
        col('trades').alias('TRADES'),
        current_date().alias('AUDIT_TIME')
    )
)
transformed_data_df.printSchema()
transformed_data_df.show(5)

root
 |-- SYMBOL: string (nullable = true)
 |-- BASE_CURRENCY: string (nullable = false)
 |-- TYPE: string (nullable = false)
 |-- DATETIME: timestamp (nullable = true)
 |-- DATE: string (nullable = true)
 |-- TIME: string (nullable = true)
 |-- FREQUENCY: string (nullable = false)
 |-- YEAR: integer (nullable = true)
 |-- MONTH: integer (nullable = true)
 |-- DAY: integer (nullable = true)
 |-- OPEN: double (nullable = true)
 |-- HIGH: double (nullable = true)
 |-- LOW: double (nullable = true)
 |-- CLOSE: double (nullable = true)
 |-- VOLUME: double (nullable = true)
 |-- TRADE: integer (nullable = true)
 |-- AUDIT_TIME: date (nullable = false)

+-------+-------------+-------+-------------------+----------+--------+---------+----+-----+---+-------+-------+-------+-------+----------+-----+----------+
| SYMBOL|BASE_CURRENCY|   TYPE|           DATETIME|      DATE|    TIME|FREQUENCY|YEAR|MONTH|DAY|   OPEN|   HIGH|    LOW|  CLOSE|    VOLUME|TRADE|AUDIT_TIME|
+-------+-------------+-------

In [8]:
#persistimos los datos en la capa silver, reparticionando por simbolo y año, en modalidad append
(
    transformed_data_df
    .write
    .format('parquet')
    .mode("append")
    .partitionBy("SYMBOL", "YEAR")
    .save('s3://' + BUCKET + f'/datalake/silver/{asset_type}')
)

print('Datos guardados en s3://' + BUCKET +  f'/datalake/silver/{asset_type}')

Datos guardados en s3://cryptoengineer-lg/datalake/silver/cryptos
