# CARGA HISTÓRICA DE DATOS
##### Como continuación del notebook "Historic_load", este notebook tiene como finalidad buscar la fecha máxima cargada hasta el momento para cada symbol mediante el procesado del fichero .csv, y rellenar el hueco restante hasta la fecha actual realizando las llamadas necesarias a las APIs.


####  Run this cell to set up and start your interactive session.


In [11]:
#%help
#%stop_session

In [1]:
# Glue interactive-session settings: AWS region, worker count and type,
# idle timeout in minutes, and Glue version.
%region us-east-1
%number_of_workers 2
%idle_timeout 30
%worker_type G.1X
%glue_version 4.0

# Ship the two Kraken helper modules (imported later as KrakenApiTrades / KrakenApiOHLC)
# to the session as extra Python files.
%extra_py_files s3://cryptoengineer/gluejobs-py-modules/KrakenApiTrades.py, s3://cryptoengineer/gluejobs-py-modules/KrakenApiOHLC.py

%load_ext autoreload
%autoreload 2

# S3 bucket that holds the datalake; every read/write path below is built from it.
BUCKET = 'cryptoengineer'

Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
Installed kernel version: 1.0.5 
Previous region: us-east-1
Setting new region to: us-east-1
Region is set to: us-east-1
Previous number of workers: None
Setting new number of workers to: 2
Current idle_timeout is None minutes.
idle_timeout has been set to 30 minutes.
Previous worker type: None
Setting new worker type to: G.1X
Setting Glue version to: 4.0
Extra py files to be included:
s3://cryptoengineer/gluejobs-py-modules/KrakenApiTrades.py
s3://cryptoengineer/gluejobs-py-modules/KrakenApiOHLC.py
Trying to create a Glue session for the kernel.
Session Type: glueetl
Worker Type: G.1X
Number of Workers: 2
Idle Timeout: 30
Session ID: de71fa32-2817-49d3-ac1a-e942e8e1d1f6
Ap

In [2]:
#Importación de librerías necesarias
from pyspark.sql.types import StructField, StructType, StringType, DoubleType, IntegerType, TimestampType, DateType
from pyspark.sql.functions import col, from_unixtime, lit, regexp_replace, current_date, min as spark_min, max as spark_max, date_format, datediff
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
import boto3
import os
from datetime import datetime
import KrakenApiTrades, KrakenApiOHLC

# Reuse the running SparkContext (or create one) and wrap it in a GlueContext;
# `spark` is the SparkSession used by every later cell.
sc = SparkContext.getOrCreate()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
# Glue Job handle bound to this context; it is not referenced again in the visible cells.
job = Job(glueContext)




In [3]:
# Read the load summary stored as metadata in the gold layer and keep only the rows
# that describe the bronze stage of the crypto tables.
# END_DATE_STRING and CURRENT_DATE_STRING are 'dd-MM-yyyy' renderings of the last
# loaded datetime and of today's date respectively.
# The CSV-only `header` option was dropped: it has no effect on a parquet read.
resumen_bronze_df = (
    spark.read
    .format('parquet')
    .load('s3://' + BUCKET + '/datalake/gold/cryptos')
    .filter(col('STAGE') == 'bronze')
    .filter(col('TYPE') == 'cryptos')
    .withColumn('END_DATE_STRING', date_format(col('END_DATETIME'), 'dd-MM-yyyy'))
    .withColumn('CURRENT_DATE_STRING', date_format(current_date(), 'dd-MM-yyyy'))
)

resumen_bronze_df.printSchema()
resumen_bronze_df.show(5)

root
 |-- TABLE: string (nullable = true)
 |-- SYMBOL: string (nullable = true)
 |-- BASE_CURRENCY: string (nullable = true)
 |-- TYPE: string (nullable = true)
 |-- INIT_DATETIME: timestamp (nullable = true)
 |-- END_DATETIME: timestamp (nullable = true)
 |-- FREQUENCIES: string (nullable = true)
 |-- STAGE: string (nullable = true)
 |-- END_DATE_STRING: string (nullable = true)
 |-- CURRENT_DATE_STRING: string (nullable = false)

+----------------+-------+-------------+-------+-------------------+-------------------+-----------+------+---------------+-------------------+
|           TABLE| SYMBOL|BASE_CURRENCY|   TYPE|      INIT_DATETIME|       END_DATETIME|FREQUENCIES| STAGE|END_DATE_STRING|CURRENT_DATE_STRING|
+----------------+-------+-------------+-------+-------------------+-------------------+-----------+------+---------------+-------------------+
|bronze_t_cryptos| XBTUSD|          USD|cryptos|2013-10-06 21:30:00|2024-04-30 18:30:00|      15min|bronze|     30-04-2024|         

In [4]:
# Target schema of the bronze table, declared as (column name, Spark type) pairs in
# column order. Every column is nullable.
_BRONZE_COLUMNS = [
    ('TIMESTAMP', StringType()),
    ('OPEN', DoubleType()),
    ('HIGH', DoubleType()),
    ('LOW', DoubleType()),
    ('CLOSE', DoubleType()),
    ('VOLUME', DoubleType()),
    ('TRADES', IntegerType()),
    ('ORIGIN', StringType()),
    ('LOAD_DATE', DateType()),
    ('SYMBOL', StringType()),
    ('DATETIME', TimestampType()),
    ('YEAR', IntegerType()),
]

esquema_bronze_df = StructType(
    [StructField(column_name, column_type, True) for column_name, column_type in _BRONZE_COLUMNS]
)





In [5]:
# Gaps shorter than this many days are fetched from the OHLC endpoint; longer gaps are
# rebuilt from the trades endpoint.
# NOTE(review): presumably tied to how far back Kraken's OHLC endpoint reaches - confirm.
MAX_OHLC_API_DAYS = 7

# Python equivalent of the 'dd-MM-yyyy' pattern used for the *_DATE_STRING columns.
DATE_FORMAT = '%d-%m-%Y'


def _fetch_gap(pair, start_date, end_date, delta):
    """Call the Kraken helper that matches the gap size.

    Args:
        pair: symbol to fetch (value of the SYMBOL column).
        start_date: first day to request, as a 'dd-mm-yyyy' string.
        end_date: last day to request, as a 'dd-mm-yyyy' string.
        delta: number of days between start_date and end_date.

    Returns:
        A ``(data, origin)`` tuple: whatever the helper module returned and the tag
        ('ApiOHLC' or 'ApiTrades') stored in the ORIGIN column.
    """
    if delta < MAX_OHLC_API_DAYS:
        print(f"Traza: El delta es menor a {MAX_OHLC_API_DAYS} días. start_date: {start_date}, end_date: {end_date}, delta: {delta} días")
        return KrakenApiOHLC.get_ohlc_data_for_date_range(pair, '15', start_date, end_date), 'ApiOHLC'

    print(f"Traza: El delta es mayor o igual a {MAX_OHLC_API_DAYS} días. start_date: {start_date}, end_date: {end_date}, delta: {delta} días")
    return KrakenApiTrades.get_ohlc_data_for_date_range(pair, start_date, end_date, '15min'), 'ApiTrades'


def _has_rows(data):
    """Return False for None or zero-length API results, True otherwise."""
    if data is None:
        return False
    try:
        return len(data) > 0
    except TypeError:
        # Unsized input: leave the decision to Spark.
        return True


# Start from an empty frame with the bronze schema so the cell is re-runnable.
ohlc_df_total = spark.createDataFrame([], esquema_bronze_df)

# The summary has one row per symbol, so collecting it on the driver is cheap.
rows = resumen_bronze_df.collect()

for row in rows:
    pair = row['SYMBOL']
    last_loaded = row['END_DATETIME']
    start_date = row['END_DATE_STRING']
    end_date = row['CURRENT_DATE_STRING']

    # Without a last-loaded datetime there is no gap to compute; skip instead of
    # crashing in strptime.
    if last_loaded is None or start_date is None:
        print(f"Traza: {pair} no tiene END_DATETIME; se omite.")
        continue

    delta = (datetime.strptime(end_date, DATE_FORMAT) - datetime.strptime(start_date, DATE_FORMAT)).days

    ohlc_df, origin = _fetch_gap(pair, start_date, end_date, delta)

    # spark.createDataFrame cannot infer a schema from an empty result.
    if not _has_rows(ohlc_df):
        print(f"Traza: sin datos nuevos para {pair}; se omite.")
        continue

    ohlc_spark_df = (
        spark.createDataFrame(ohlc_df)
        .withColumn('origin', lit(origin))
        .withColumn('load_date', current_date())
        .withColumn('symbol', lit(pair))
        .withColumn('datetime', from_unixtime(col('timestamp')).cast('timestamp'))
        .withColumn('year', col('datetime').substr(0, 4).cast('int'))
        # The request starts at the calendar day of END_DATETIME, so it re-fetches
        # candles that are already in bronze. Keep only the strictly newer ones so
        # the append does not duplicate them.
        .filter(col('datetime') > lit(last_loaded))
    )

    # union() matches columns by position, not by name: the API columns followed by
    # the five added above must line up with esquema_bronze_df.
    ohlc_df_total = ohlc_df_total.union(ohlc_spark_df)

Traza: El delta es menor a 7 días. start_date: 22-08-2024, end_date: 24-08-2024, delta: 2 días
Traza: El delta es menor a 7 días. start_date: 22-08-2024, end_date: 24-08-2024, delta: 2 días


In [6]:
# Inspect the accumulated frame before casting. The recorded output shows the union
# did not keep the declared bronze types (OPEN..VOLUME are string, TRADES is long),
# which is why an explicit cast step follows.
ohlc_df_total.printSchema()
ohlc_df_total.show(5)

root
 |-- TIMESTAMP: string (nullable = true)
 |-- OPEN: string (nullable = true)
 |-- HIGH: string (nullable = true)
 |-- LOW: string (nullable = true)
 |-- CLOSE: string (nullable = true)
 |-- VOLUME: string (nullable = true)
 |-- TRADES: long (nullable = true)
 |-- ORIGIN: string (nullable = true)
 |-- LOAD_DATE: date (nullable = true)
 |-- SYMBOL: string (nullable = true)
 |-- DATETIME: timestamp (nullable = true)
 |-- YEAR: integer (nullable = true)

+----------+-----+-----+-----+-----+-------------+------+-------+----------+--------+-------------------+----+
| TIMESTAMP| OPEN| HIGH|  LOW|CLOSE|       VOLUME|TRADES| ORIGIN| LOAD_DATE|  SYMBOL|           DATETIME|YEAR|
+----------+-----+-----+-----+-----+-------------+------+-------+----------+--------+-------------------+----+
|1724285700|0.267|0.267|0.267|0.267|   0.00000000|     0|ApiOHLC|2024-08-24|1INCHUSD|2024-08-22 00:15:00|2024|
|1724286600|0.267|0.267|0.267|0.267|   0.00000000|     0|ApiOHLC|2024-08-24|1INCHUSD|2024-08-22 

In [7]:
# Cast every column back to the type declared in esquema_bronze_df, walking the
# schema fields in order so the result has the same columns in the same order.
columnas_casteadas = [
    col(campo.name).cast(campo.dataType)
    for campo in esquema_bronze_df.fields
]
ohlc_df_total_casted = ohlc_df_total.select(columnas_casteadas)

ohlc_df_total_casted.printSchema()
ohlc_df_total_casted.show(5)

root
 |-- TIMESTAMP: string (nullable = true)
 |-- OPEN: double (nullable = true)
 |-- HIGH: double (nullable = true)
 |-- LOW: double (nullable = true)
 |-- CLOSE: double (nullable = true)
 |-- VOLUME: double (nullable = true)
 |-- TRADES: integer (nullable = true)
 |-- ORIGIN: string (nullable = true)
 |-- LOAD_DATE: date (nullable = true)
 |-- SYMBOL: string (nullable = true)
 |-- DATETIME: timestamp (nullable = true)
 |-- YEAR: integer (nullable = true)

+----------+-----+-----+-----+-----+-------------+------+-------+----------+--------+-------------------+----+
| TIMESTAMP| OPEN| HIGH|  LOW|CLOSE|       VOLUME|TRADES| ORIGIN| LOAD_DATE|  SYMBOL|           DATETIME|YEAR|
+----------+-----+-----+-----+-----+-------------+------+-------+----------+--------+-------------------+----+
|1724285700|0.267|0.267|0.267|0.267|          0.0|     0|ApiOHLC|2024-08-24|1INCHUSD|2024-08-22 00:15:00|2024|
|1724286600|0.267|0.267|0.267|0.267|          0.0|     0|ApiOHLC|2024-08-24|1INCHUSD|2024-08-

In [8]:
# Persist the casted candles: append them to the bronze parquet dataset,
# partitioned by load date.
ruta_bronze = 's3://' + BUCKET + '/datalake/bronze/cryptos'

escritor_bronze = (
    ohlc_df_total_casted.write
    .format('parquet')
    .partitionBy('LOAD_DATE')
    .mode('append')
)
escritor_bronze.save(ruta_bronze)


