# CARGA HISTÓRICA DE DATOS
##### El presente notebook tiene por objeto procesar los ficheros .csv obtenidos mediante scraping de INVESTING y persistirlos en formato Parquet en la capa bronze del datalake


####  Run this cell to set up and start your interactive session.


In [7]:
#%help

In [1]:
# Glue interactive-session configuration magics: AWS region, number of workers,
# idle timeout (in minutes), worker type and Glue version.
%region us-east-1
%number_of_workers 2
%idle_timeout 30
%worker_type G.1X
%glue_version 4.0

# S3 bucket used for both the input CSV files and the parquet output.
# NOTE(review): the saved output of the last cell prints 'cryptoengineer-lg' —
# confirm which bucket name is the current one.
BUCKET = 'cryptoengineer'

Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
Installed kernel version: 1.0.5 
Previous region: us-east-1
Setting new region to: us-east-1
Region is set to: us-east-1
Previous number of workers: None
Setting new number of workers to: 2
Current idle_timeout is None minutes.
idle_timeout has been set to 30 minutes.
Previous worker type: None
Setting new worker type to: G.1X
Setting Glue version to: 4.0
Trying to create a Glue session for the kernel.
Session Type: glueetl
Worker Type: G.1X
Number of Workers: 2
Idle Timeout: 30
Session ID: 6c13aea1-a4d6-4d81-be91-3080bb95fe57
Applying the following default arguments:
--glue_kernel_version 1.0.5
--enable-glue-datacatalog true
Waiting for session 6c13aea1-a4d6-4d81-be91-3080

In [8]:
#Importación de librerías necesarias
from pyspark.sql.types import StructField, StructType, StringType, DoubleType, IntegerType, TimestampType, DateType
from pyspark.sql.functions import col, from_unixtime, lit, regexp_replace, current_date, min as spark_min, max as spark_max, regexp_extract
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
import boto3
import os

# Reuse the active SparkContext (or create one) and wrap it in a GlueContext.
sc = SparkContext.getOrCreate()
glueContext = GlueContext(sc)
# SparkSession used by the following cells to read the CSV files and build DataFrames.
spark = glueContext.spark_session
# NOTE(review): `job` is not referenced again in the visible cells — confirm it is needed.
job = Job(glueContext)




In [3]:
# Helper that determines which files have to be processed
def list_s3_files(bucket_name, folder_prefix):
    """Return the base names of the objects stored under an S3 folder.

    Args:
        bucket_name: Name of the S3 bucket to list.
        folder_prefix: Key prefix of the folder, with or without a trailing
            '/'. An empty prefix lists the whole bucket.

    Returns:
        A list with the last path component of every matching key, in the
        order the listing yields them. Keys ending in '/' (folder
        placeholders) are skipped. Objects in nested sub-folders are also
        returned, reduced to their base name.
    """
    # Anchor the prefix on a folder boundary: S3 prefixes are plain string
    # prefixes, so 'a/indices' would otherwise also match 'a/indices_old/...'.
    if folder_prefix and not folder_prefix.endswith('/'):
        folder_prefix += '/'

    s3 = boto3.client('s3')
    paginator = s3.get_paginator('list_objects_v2')
    pages = paginator.paginate(Bucket=bucket_name, Prefix=folder_prefix)

    return [
        os.path.basename(obj['Key'])
        for page in pages
        # A page has no 'Contents' entry when nothing matched.
        for obj in page.get('Contents', [])
        if not obj['Key'].endswith('/')
    ]




In [4]:
# Discover the CSV files that will be processed
files = list_s3_files(
    bucket_name=BUCKET,
    folder_prefix='datalake/historic_data/indices',
)

# Show the file names so the run can be checked by eye
print('Los ficheros a procesar son: ')
for file_name in files:
    print(file_name)

Los ficheros a procesar son: 
DAX_20240904.csv
DowJones_20240904.csv
EuroStoxx50_20240904.csv
IBEX35_20240904.csv
Nasdaq_20240904.csv


In [5]:
# Schema shared by the CSV reader and the destination DataFrame.
# Every field is nullable. The last five fields (ORIGIN ... YEAR) are the ones
# the load loop overwrites with withColumn.
historic_schema = (
    StructType()
    .add('TIMESTAMP', StringType(), True)
    .add('OPEN', DoubleType(), True)
    .add('HIGH', DoubleType(), True)
    .add('LOW', DoubleType(), True)
    .add('CLOSE', DoubleType(), True)
    .add('VOLUME', DoubleType(), True)
    .add('TRADES', IntegerType(), True)
    .add('ORIGIN', StringType(), True)
    .add('LOAD_DATE', DateType(), True)
    .add('SYMBOL', StringType(), True)
    .add('DATETIME', TimestampType(), True)
    .add('YEAR', IntegerType(), True)
)

# Empty destination DataFrame that the per-file results are appended to
historic_df = spark.createDataFrame([], schema=historic_schema)




In [12]:
# Read every file, enrich it and stack the results into a single DataFrame.
#
# The accumulation starts from a fresh, empty frame on every run, so re-running
# this cell rebuilds historic_df instead of appending the same rows a second
# time (which the append-mode write further down would then persist).
combined_df = spark.createDataFrame([], historic_schema)

for file in files:
    print('Procesado el fichero: ' + file)
    # Read the file with the shared schema (no header row expected)
    file_df = (
        spark.read
        .format("csv")
        .schema(historic_schema)
        .option('header', 'false')
        .load('s3://' + BUCKET + '/datalake/historic_data/indices/' + file)

        # Basic transformations: fill in the columns that are set by the load
        .withColumn('origin', lit('scraping'))
        .withColumn('load_date', current_date())
        # Symbol = file name up to the first underscore (e.g. 'DAX_20240904.csv' -> 'DAX')
        .withColumn('symbol', regexp_extract(lit(file), r'^([^_]+)', 1))
        # Epoch seconds -> timestamp
        .withColumn('datetime', from_unixtime(col('timestamp')).cast('timestamp'))
        # First four characters of the timestamp's string form, i.e. the year
        # (Spark substring positions are 1-based)
        .withColumn('year', col('datetime').substr(1, 4).cast('int'))

    )
    # unionAll matches columns by position; file_df keeps the schema's column order
    combined_df = combined_df.unionAll(file_df)

historic_df = combined_df

Procesado el fichero: DAX_20240904.csv
Procesado el fichero: DowJones_20240904.csv
Procesado el fichero: EuroStoxx50_20240904.csv
Procesado el fichero: IBEX35_20240904.csv
Procesado el fichero: Nasdaq_20240904.csv


In [13]:
# Print the column names and types of the combined DataFrame as a sanity check
historic_df.printSchema()

root
 |-- TIMESTAMP: string (nullable = true)
 |-- OPEN: double (nullable = true)
 |-- HIGH: double (nullable = true)
 |-- LOW: double (nullable = true)
 |-- CLOSE: double (nullable = true)
 |-- VOLUME: double (nullable = true)
 |-- TRADES: integer (nullable = true)
 |-- ORIGIN: string (nullable = true)
 |-- LOAD_DATE: date (nullable = true)
 |-- SYMBOL: string (nullable = true)
 |-- DATETIME: timestamp (nullable = true)
 |-- YEAR: integer (nullable = true)


In [14]:
# Preview the first 5 rows of the combined DataFrame
historic_df.show(5)

+----------+----------------+----------------+----------------+----------------+---------+------+--------+----------+------+-------------------+----+
| TIMESTAMP|            OPEN|            HIGH|             LOW|           CLOSE|   VOLUME|TRADES|  ORIGIN| LOAD_DATE|SYMBOL|           DATETIME|YEAR|
+----------+----------------+----------------+----------------+----------------+---------+------+--------+----------+------+-------------------+----+
|1630911600| 15828.900390625| 15836.400390625| 15817.349609375| 15833.900390625|1615827.0|  null|scraping|2024-09-04|   DAX|2021-09-06 07:00:00|2021|
|1630912500|15849.5498046875|15883.4501953125| 15845.900390625| 15874.900390625|1741831.0|  null|scraping|2024-09-04|   DAX|2021-09-06 07:15:00|2021|
|1630913400|15878.4501953125|15886.4501953125| 15875.400390625|15880.9501953125| 882622.0|  null|scraping|2024-09-04|   DAX|2021-09-06 07:30:00|2021|
|1630914300|15884.4501953125|15884.4501953125|15873.9501953125|15873.9501953125| 701598.0|  null|scr

In [15]:
# Persist the data as parquet in the bronze layer
bronze_path = 's3://' + BUCKET + '/datalake/bronze/indices'

# One partition directory per LOAD_DATE; the original also kept
# partitionBy('symbol','year') as a commented-out alternative.
# NOTE(review): mode('append') adds the data again on every run — confirm that
# re-running this cell for the same LOAD_DATE is acceptable.
bronze_writer = (
    historic_df.write
    .format('parquet')
    .partitionBy('LOAD_DATE')
    .mode('append')
)
bronze_writer.save(bronze_path)

print('Datos guardados en ' + bronze_path)

Datos guardados en s3://cryptoengineer-lg/datalake/bronze/indices
