# AWS Glue Studio Notebook
##### You are now running an AWS Glue Studio notebook. To start using your notebook, you need to start an AWS Glue Interactive Session.


#### Optional: Run this cell to see available notebook commands ("magics").


In [None]:
%help

####  Run this cell to set up and start your interactive session.


In [1]:
# Glue Interactive Sessions settings: AWS region, worker count, idle timeout
# (in minutes), worker type and Glue runtime version for the session.
%region us-east-1
%number_of_workers 2
%idle_timeout 30
%worker_type G.1X
%glue_version 4.0

# Load the IPython autoreload extension and enable mode 2.
%load_ext autoreload
%autoreload 2

# S3 bucket name used to build every data lake path read or written below.
BUCKET = 'cryptoengineer'

Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
Installed kernel version: 1.0.5 
Previous region: us-east-1
Setting new region to: us-east-1
Region is set to: us-east-1
Previous number of workers: None
Setting new number of workers to: 2
Current idle_timeout is None minutes.
idle_timeout has been set to 30 minutes.
Previous worker type: None
Setting new worker type to: G.1X
Setting Glue version to: 4.0
Trying to create a Glue session for the kernel.
Session Type: glueetl
Worker Type: G.1X
Number of Workers: 2
Idle Timeout: 30
Session ID: 612690bb-cfdd-4282-ae10-44436ef3728e
Applying the following default arguments:
--glue_kernel_version 1.0.5
--enable-glue-datacatalog true
Waiting for session 612690bb-cfdd-4282-ae10-4443

In [2]:
#Importación de librerías necesarias
from pyspark.sql.types import StructField, StructType, StringType, DoubleType, IntegerType, TimestampType, DateType
from pyspark.sql.functions import col, from_unixtime, lit, regexp_replace, current_date, min as spark_min, max as spark_max, date_format, datediff
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
import boto3
import os
from datetime import datetime

# Reuse the active SparkContext (or create one) and wrap it in a GlueContext.
sc = SparkContext.getOrCreate()
glueContext = GlueContext(sc)
# Spark session used by the reads and writes in the cells below.
spark = glueContext.spark_session
# NOTE(review): `job` is not referenced by any cell visible here — confirm it is still needed.
job = Job(glueContext)




In [3]:
# Load the current contents of the silver-layer crypto dataset (Parquet).
# The previous `.option('header', 'true')` was removed: it is a CSV reader
# option and has no effect on a Parquet read.
silver_df = (
    spark.read
    .format('parquet')
    .load('s3://' + BUCKET + '/datalake/silver/cryptos')
)

# Inspect schema, a small sample and the total row count of the loaded data.
silver_df.printSchema()
silver_df.show(5)
print(f"Count of rows in silver_df: {silver_df.count()}")

root
 |-- BASE_CURRENCY: string (nullable = true)
 |-- TYPE: string (nullable = true)
 |-- DATETIME: timestamp (nullable = true)
 |-- DATE: string (nullable = true)
 |-- TIME: string (nullable = true)
 |-- FREQUENCY: string (nullable = true)
 |-- MONTH: integer (nullable = true)
 |-- DAY: integer (nullable = true)
 |-- OPEN: double (nullable = true)
 |-- HIGH: double (nullable = true)
 |-- LOW: double (nullable = true)
 |-- CLOSE: double (nullable = true)
 |-- VOLUME: double (nullable = true)
 |-- TRADES: integer (nullable = true)
 |-- AUDIT_TIME: date (nullable = true)
 |-- SYMBOL: string (nullable = true)
 |-- YEAR: integer (nullable = true)

+-------------+-------+-------------------+----------+--------+---------+-----+---+------+------+------+------+------------+------+----------+------+----+
|BASE_CURRENCY|   TYPE|           DATETIME|      DATE|    TIME|FREQUENCY|MONTH|DAY|  OPEN|  HIGH|   LOW| CLOSE|      VOLUME|TRADES|AUDIT_TIME|SYMBOL|YEAR|
+-------------+-------+--------------

In [4]:
# Isolate the rows whose AUDIT_TIME equals `audit_time` and whose DATE equals
# `wrong_date`; these are the rows excluded from the dataset in the next step.
wrong_date = '2024-03-31'
audit_time = '2024-09-21'
filtered_df = silver_df.filter((col("AUDIT_TIME") == audit_time) & (col("DATE") == wrong_date))

filtered_df.printSchema()
filtered_df.show(5)
# The label now names the frame that is actually being counted.
print(f"Count of rows in filtered_df: {filtered_df.count()}")

root
 |-- BASE_CURRENCY: string (nullable = true)
 |-- TYPE: string (nullable = true)
 |-- DATETIME: timestamp (nullable = true)
 |-- DATE: string (nullable = true)
 |-- TIME: string (nullable = true)
 |-- FREQUENCY: string (nullable = true)
 |-- MONTH: integer (nullable = true)
 |-- DAY: integer (nullable = true)
 |-- OPEN: double (nullable = true)
 |-- HIGH: double (nullable = true)
 |-- LOW: double (nullable = true)
 |-- CLOSE: double (nullable = true)
 |-- VOLUME: double (nullable = true)
 |-- TRADES: integer (nullable = true)
 |-- AUDIT_TIME: date (nullable = true)
 |-- SYMBOL: string (nullable = true)
 |-- YEAR: integer (nullable = true)

+-------------+-------+-------------------+----------+--------+---------+-----+---+-------+-------+-------+-------+------------+------+----------+------+----+
|BASE_CURRENCY|   TYPE|           DATETIME|      DATE|    TIME|FREQUENCY|MONTH|DAY|   OPEN|   HIGH|    LOW|  CLOSE|      VOLUME|TRADES|AUDIT_TIME|SYMBOL|YEAR|
+-------------+-------+------

In [5]:
# Keep every row of silver_df except the ones selected into filtered_df.
#
# DataFrame.subtract() has EXCEPT DISTINCT semantics: besides removing the
# unwanted rows it also collapses any duplicate rows in the rest of the
# dataset and requires a full shuffle. A negated filter removes only the
# targeted rows.
# NOTE(review): if de-duplicating the whole dataset was intended, add an
# explicit .dropDuplicates() instead of relying on subtract().
#
# eqNullSafe() never evaluates to NULL, so rows with a NULL AUDIT_TIME or DATE
# are kept (a plain `==` would yield NULL and the filter would drop them).
is_wrong_row = col("AUDIT_TIME").eqNullSafe(audit_time) & col("DATE").eqNullSafe(wrong_date)
complementary_df = silver_df.filter(~is_wrong_row)
complementary_df.printSchema()
complementary_df.show(5)
print(f"Count of rows in complementary_df: {complementary_df.count()}")

root
 |-- BASE_CURRENCY: string (nullable = true)
 |-- TYPE: string (nullable = true)
 |-- DATETIME: timestamp (nullable = true)
 |-- DATE: string (nullable = true)
 |-- TIME: string (nullable = true)
 |-- FREQUENCY: string (nullable = true)
 |-- MONTH: integer (nullable = true)
 |-- DAY: integer (nullable = true)
 |-- OPEN: double (nullable = true)
 |-- HIGH: double (nullable = true)
 |-- LOW: double (nullable = true)
 |-- CLOSE: double (nullable = true)
 |-- VOLUME: double (nullable = true)
 |-- TRADES: integer (nullable = true)
 |-- AUDIT_TIME: date (nullable = true)
 |-- SYMBOL: string (nullable = true)
 |-- YEAR: integer (nullable = true)

+-------------+-------+-------------------+----------+--------+---------+-----+---+------+------+------+------+------------+------+----------+------+----+
|BASE_CURRENCY|   TYPE|           DATETIME|      DATE|    TIME|FREQUENCY|MONTH|DAY|  OPEN|  HIGH|   LOW| CLOSE|      VOLUME|TRADES|AUDIT_TIME|SYMBOL|YEAR|
+-------------+-------+--------------

In [40]:
# Schema definition: column name, Spark type and nullability for each field.
# Only SYMBOL and YEAR are declared non-nullable.
schema = (
    StructType()
    .add("SYMBOL", StringType(), False)
    .add("BASE_CURRENCY", StringType(), True)
    .add("TYPE", StringType(), True)
    .add("DATETIME", TimestampType(), True)
    .add("DATE", DateType(), True)
    .add("TIME", StringType(), True)
    .add("FREQUENCY", StringType(), True)
    .add("YEAR", IntegerType(), False)
    .add("MONTH", IntegerType(), True)
    .add("DAY", IntegerType(), True)
    .add("OPEN", DoubleType(), True)
    .add("HIGH", DoubleType(), True)
    .add("LOW", DoubleType(), True)
    .add("CLOSE", DoubleType(), True)
    .add("VOLUME", DoubleType(), True)
    .add("TRADES", IntegerType(), True)
    .add("AUDIT_TIME", DateType(), True)
)




In [41]:
# Cast every column of complementary_df to the type declared for it in
# `schema`. The tuple below fixes the output column order.
complementary_df_casted = complementary_df.select(*[
    col(column_name).cast(schema[column_name].dataType)
    for column_name in (
        'BASE_CURRENCY',
        'TYPE',
        'DATETIME',
        'DATE',
        'TIME',
        'FREQUENCY',
        'MONTH',
        'DAY',
        'OPEN',
        'HIGH',
        'LOW',
        'CLOSE',
        'VOLUME',
        'TRADES',
        'AUDIT_TIME',
        'SYMBOL',
        'YEAR',
    )
])




In [42]:
# Preview the casted frame and print its own schema. The original printed the
# schema of the un-casted complementary_df, so the result of the cast was
# never actually checked.
complementary_df_casted.show()
complementary_df_casted.printSchema()

+-------------+-------+-------------------+----------+--------+---------+-----+---+------+------+------+------+------------+------+----------+------+----+
|BASE_CURRENCY|   TYPE|           DATETIME|      DATE|    TIME|FREQUENCY|MONTH|DAY|  OPEN|  HIGH|   LOW| CLOSE|      VOLUME|TRADES|AUDIT_TIME|SYMBOL|YEAR|
+-------------+-------+-------------------+----------+--------+---------+-----+---+------+------+------+------+------------+------+----------+------+----+
|          USD|cryptos|2018-04-04 11:15:00|2018-04-04|11:15:00|    15min|    4|  4|7049.2|7058.6|7000.1|7049.3|102.19471522|   282|2024-09-21|XBTUSD|2018|
|          USD|cryptos|2018-04-04 12:00:00|2018-04-04|12:00:00|    15min|    4|  4|7061.1|7075.0|7044.0|7071.9| 47.88071896|   251|2024-09-21|XBTUSD|2018|
|          USD|cryptos|2018-04-04 12:45:00|2018-04-04|12:45:00|    15min|    4|  4|7030.0|7079.2|7030.0|7054.6| 90.57475819|   282|2024-09-21|XBTUSD|2018|
|          USD|cryptos|2018-04-04 14:00:00|2018-04-04|14:00:00|    15m

In [7]:
# Persist the data to a separate silver-layer location (cryptos2), partitioned
# by symbol and year, in overwrite mode.
# NOTE(review): this writes complementary_df, not complementary_df_casted —
# confirm that the un-casted frame is the one meant to be persisted.
(
    complementary_df
    .write
    .mode("overwrite")
    .partitionBy("SYMBOL", "YEAR")
    .parquet(f's3://{BUCKET}/datalake/silver/cryptos2')
)

print(f'Datos guardados en s3://{BUCKET}/datalake/silver/cryptos2')

Datos guardados en s3://cryptoengineer/datalake/silver/cryptos2


In [8]:
# Read back the copy written to the cryptos2 location.
# The CSV-only `header` option was removed: it has no effect on a Parquet read.
silver_df2 = (
    spark.read
    .format('parquet')
    .load('s3://' + BUCKET + '/datalake/silver/cryptos2')
)

# Inspect the frame that was just loaded. The original inspected silver_df
# here, so the cryptos2 copy was never checked before being written onwards.
silver_df2.printSchema()
silver_df2.show(5)
print(f"Count of rows in silver_df2: {silver_df2.count()}")

root
 |-- BASE_CURRENCY: string (nullable = true)
 |-- TYPE: string (nullable = true)
 |-- DATETIME: timestamp (nullable = true)
 |-- DATE: string (nullable = true)
 |-- TIME: string (nullable = true)
 |-- FREQUENCY: string (nullable = true)
 |-- MONTH: integer (nullable = true)
 |-- DAY: integer (nullable = true)
 |-- OPEN: double (nullable = true)
 |-- HIGH: double (nullable = true)
 |-- LOW: double (nullable = true)
 |-- CLOSE: double (nullable = true)
 |-- VOLUME: double (nullable = true)
 |-- TRADES: integer (nullable = true)
 |-- AUDIT_TIME: date (nullable = true)
 |-- SYMBOL: string (nullable = true)
 |-- YEAR: integer (nullable = true)

+-------------+-------+-------------------+----------+--------+---------+-----+---+------+------+------+------+------------+------+----------+------+----+
|BASE_CURRENCY|   TYPE|           DATETIME|      DATE|    TIME|FREQUENCY|MONTH|DAY|  OPEN|  HIGH|   LOW| CLOSE|      VOLUME|TRADES|AUDIT_TIME|SYMBOL|YEAR|
+-------------+-------+--------------

In [9]:
# Overwrite the original silver-layer location (cryptos) with the frame read
# back from cryptos2, partitioned by symbol and year.
(
    silver_df2
    .write
    .mode("overwrite")
    .partitionBy("SYMBOL", "YEAR")
    .parquet(f's3://{BUCKET}/datalake/silver/cryptos')
)

print(f'Datos guardados en s3://{BUCKET}/datalake/silver/cryptos')

Datos guardados en s3://cryptoengineer/datalake/silver/cryptos
