# AWS Glue Job - Load Commodities data to Bronze Stage from API source

## Set the Glue session parameters


In [8]:
%iam_role arn:aws:iam::212430227630:role/LabRole
%region us-east-1
%number_of_workers 2

%idle_timeout 30
%glue_version 4.0
%worker_type G.1X

%%configure 
{
  "--enable-metrics": "true",
  "--enable-observability-metrics": "true"
}

Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
Installed kernel version: 1.0.5 
Current iam_role is arn:aws:iam::212430227630:role/LabRole
iam_role has been set to arn:aws:iam::212430227630:role/LabRole.
Previous region: us-east-1
Setting new region to: us-east-1
Region is set to: us-east-1
Previous number of workers: None
Setting new number of workers to: 2
Current idle_timeout is None minutes.
idle_timeout has been set to 30 minutes.
Setting Glue version to: 4.0
Previous worker type: None
Setting new worker type to: G.1X
The following configurations have been updated: {'--enable-metrics': 'true', '--enable-observability-metrics': 'true'}


In [11]:
%extra_py_files s3://cryptoengineer/gluejobs-py-modules/load.py, s3://cryptoengineer/gluejobs-py-modules/storage.py
%additional_python_modules yfinance

Extra py files to be included:
s3://cryptoengineer/gluejobs-py-modules/load.py
s3://cryptoengineer/gluejobs-py-modules/storage.py
Additional python modules to be included:
yfinance


##  Set up and start your interactive session.


In [14]:
%load_ext autoreload
%autoreload 2

In [1]:
import sys

import boto3
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.transforms import *
from awsglue.utils import GlueArgumentError, getResolvedOptions
from pyspark.context import SparkContext
  
# Obtain the SparkContext for this session (created on first use).
sc = SparkContext.getOrCreate()
# Glue wrapper built on top of the Spark context.
glueContext = GlueContext(sc)
# SparkSession used by later cells (spark.read / spark.createDataFrame).
spark = glueContext.spark_session
# Glue Job handle bound to this Glue context.
job = Job(glueContext)

Trying to create a Glue session for the kernel.
Session Type: glueetl
Worker Type: G.1X
Number of Workers: 2
Idle Timeout: 30
Session ID: 8b861921-2460-4817-be65-f3e8435ddfb9
Applying the following default arguments:
--glue_kernel_version 1.0.5
--enable-glue-datacatalog true
--enable-metrics true
--enable-observability-metrics true
--extra-py-files s3://cryptoengineer/gluejobs-py-modules/load.py,s3://cryptoengineer/gluejobs-py-modules/storage.py
--additional-python-modules yfinance
Waiting for session 8b861921-2460-4817-be65-f3e8435ddfb9 to get into ready status...
Session 8b861921-2460-4817-be65-f3e8435ddfb9 has been created.



## Batch Load - COMMODITIES

### Load the libraries and dependencies

In [2]:
from datetime import datetime, timedelta, timezone
import pyspark.sql.functions as F

import pandas as pd
import load




### Set AWS Storage parameters


In [3]:
# S3 bucket that holds the data lake.
BUCKET_NAME = "cryptoengineer"
# Key prefix of the bronze-stage commodities table (the write destination of this job).
PREFIX = "datalake/bronze/commodities"




### Load job parameters

In [5]:
# Resolve the run parameters (base, time_frame, freq, symbols, api_key) from,
# in order of precedence:
#   1. the Glue Workflow run properties, when launched by a workflow;
#   2. the Glue Job arguments, when launched as a standalone job;
#   3. hard-coded defaults, when run interactively as a notebook.
# NOTE(review): api_key travels as a plain job/workflow argument - consider a
# secrets store instead.
glue_client = boto3.client("glue")

# Check if params come from a GLUE workflow
if '--WORKFLOW_NAME' in sys.argv and '--WORKFLOW_RUN_ID' in sys.argv:
    print("Running in Glue Workflow")

    glue_args = getResolvedOptions(
        sys.argv, ['WORKFLOW_NAME', 'WORKFLOW_RUN_ID']
    )

    print("Reading the workflow parameters")
    workflow_args = glue_client.get_workflow_run_properties(
        Name=glue_args['WORKFLOW_NAME'], RunId=glue_args['WORKFLOW_RUN_ID']
    )["RunProperties"]

    base = workflow_args['base']
    time_frame = int(workflow_args['time_frame'])
    freq = workflow_args['freq']
    symbols = workflow_args['symbols']
    api_key = workflow_args['api_key']

else:
    # Check if params come from a Glue Job.
    # Only a missing/invalid argument list means "not a job run"; any other
    # failure (e.g. a non-numeric time_frame) must surface instead of silently
    # falling back to the notebook defaults.
    try:
        args = getResolvedOptions(sys.argv,
                                  ['JOB_NAME',
                                   'base',
                                   'time_frame',
                                   'freq',
                                   'symbols',
                                   'api_key'])
    except GlueArgumentError:
        args = None

    if args is not None:
        print("Running as Job")
        base = args['base']
        time_frame = int(args['time_frame'])
        freq = args['freq']
        symbols = args['symbols']
        api_key = args['api_key']
    else:
        print("Running as Notebook")
        base = "USD"
        time_frame = 408 #48
        freq = '15min' #'1day'
        symbols = "CLUSD,GCUSD,NGUSD"
        api_key = ""

Running as Notebook


In [None]:
# Echo the resolved run parameters for traceability.
print("base: ", base)
print("Time Frame: ", time_frame)
print("Frequency: ", freq)
print("Symbols: ", symbols)
# Never write the credential itself to notebook output or job logs;
# only report whether one was provided.
print("API Key: ", "********" if api_key else "(not set)")

#### Set the start and end dates for the data you want to load

In [7]:
# Start date
start_date = (datetime.utcnow() - timedelta(hours=time_frame)).strftime("%Y-%m-%d")
end_date = datetime.utcnow().strftime("%Y-%m-%d")

print("Start date; ",start_date," End date: ",end_date)

Start date;  2024-09-07  End date:  2024-09-24


## Read max date loaded from INFO table

In [8]:
# Key prefix of the INFO table read in the next cell to find what was already loaded.
PREFIX_INFO_TABLE='datalake/gold/commodities'
# Full S3 location of the INFO table.
path=f"s3://{BUCKET_NAME}/{PREFIX_INFO_TABLE}"
print("Path:", path)

Path: s3://cryptoengineer/datalake/gold/commodities


In [9]:
# Read the INFO table and keep only the rows describing the bronze stage.
bronze_info = spark.read.parquet(path).filter(F.col("stage") == 'bronze')

# Pull the per-symbol / per-frequency end datetimes into pandas for the
# lookups done while loading each symbol.
df_max_dates = (
    bronze_info
    .select('symbol', 'base_currency', 'frequencies', 'end_datetime')
    .toPandas()
)

  series = series.astype(t, copy=False)


In [21]:
# Preview the end datetimes read from the INFO table.
df_max_dates.head(10)

  symbol base_currency frequencies        end_datetime
0  BZUSD           USD       15min 2024-09-24 11:00:00
1  BZUSD           USD        1day 2024-09-24 00:00:00
2  CLUSD           USD       15min 2024-09-06 16:45:00
3  CLUSD           USD        1day 2024-09-06 00:00:00
4  GCUSD           USD       15min 2024-09-06 16:45:00
5  GCUSD           USD        1day 2024-09-06 00:00:00
6  NGUSD           USD       15min 2024-09-06 16:45:00
7  NGUSD           USD        1day 2024-09-06 00:00:00


In [11]:
# Check column dtypes and null counts of the INFO table extract.
df_max_dates.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   symbol         8 non-null      object        
 1   base_currency  8 non-null      object        
 2   frequencies    8 non-null      object        
 3   end_datetime   8 non-null      datetime64[ns]
dtypes: datetime64[ns](1), object(3)
memory usage: 384.0+ bytes


## Load the historical commodities data (15min or daily frequency)

Set some config values

In [12]:
# Asset type passed to load.set_schema_table in the load cell.
# NOTE(review): this name shadows the builtin `type` for the rest of the session;
# renaming it requires updating the load cell as well.
type='COMMODITIES'
# Data source identifier passed to the `load` module functions.
source='FMP'




In [22]:
# Fetch the rates for every requested symbol, keep only the records newer than
# what the INFO table reports as already loaded, and collect them in `df`.
#
# The intraday ('15min') and daily paths differ only in the loader they call,
# so they share a single loop.
is_intraday = freq == '15min'
if is_intraday:
    print("Reading historical data by frequency")
else:
    print("Reading daily historical data")

# Per-symbol frames are gathered here and concatenated once after the loop.
new_frames = []
for symbol in symbols.split(","):
    print("Loading: ", symbol)
    if is_intraday:
        symbol_df = load.load_batch_freq_rates(base=base,
                                               start_date=start_date,
                                               end_date=end_date,
                                               freq=freq,
                                               symbol=symbol,
                                               api_key=api_key,
                                               source=source)
    else:
        symbol_df = load.load_historical_rates(base=base,
                                               start_date=start_date,
                                               end_date=end_date,
                                               symbol=symbol,
                                               api_key=api_key,
                                               source=source)
    print("Records loaded: ", len(symbol_df))

    # Get the max date time already loaded for this symbol/base/frequency
    already_loaded = (
        (df_max_dates['symbol'] == symbol)
        & (df_max_dates['base_currency'] == base)
        & (df_max_dates['frequencies'] == freq)
    )
    max_datetime = df_max_dates.loc[already_loaded, 'end_datetime'].max()

    # Extract only the new data.
    # When the symbol has no row in the INFO table, max_datetime is NaT and any
    # comparison against NaT is False, which would discard every record of a
    # never-loaded symbol. In that case keep everything.
    if pd.notna(max_datetime):
        symbol_df = symbol_df[pd.to_datetime(symbol_df['date']) > max_datetime]
    print("New records: ", len(symbol_df))

    # Complete the table schema
    if len(symbol_df) > 0:
        symbol_df = load.set_schema_table(symbol_df, symbol, source, freq, base, type)
        print("Records: ", len(symbol_df))
        new_frames.append(symbol_df)
    else:
        print("No data for: ", symbol)

# pd.concat rejects an empty list, so fall back to an empty frame when nothing is new.
df = pd.concat(new_frames) if new_frames else pd.DataFrame()


Reading daily historical data
Loading:  CLUSD
https://financialmodelingprep.com/api/v3/historical-price-full
https://financialmodelingprep.com/api/v3/historical-price-full/CLUSD?apikey=<REDACTED>
Lectura API correcta
Records loaded:  15
New records:  15
Records:  15
Loading:  GCUSD
https://financialmodelingprep.com/api/v3/historical-price-full
https://financialmodelingprep.com/api/v3/historical-price-full/GCUSD?apikey=<REDACTED>
Lectura API correcta
Records loaded:  15
New records:  15
Records:  15
Loading:  NGUSD
https://financialmodelingprep.com/api/v3/historical-price-full
https://financialmodelingprep.com/api/v3/historical-price-full/NGUSD?apikey=<REDACTED>
Lectura API correcta
Records loaded:  15
New records:  15
Records:  15


In [23]:
# Total number of new records collected across all symbols.
print("Records: ", len(df))

Records:  45


In [24]:
# Check column dtypes and null counts of the batch before writing it.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45 entries, 0 to 14
Data columns (total 25 columns):
 #   Column            Non-Null Count  Dtype              
---  ------            --------------  -----              
 0   datetime          45 non-null     object             
 1   open              45 non-null     float64            
 2   high              45 non-null     float64            
 3   low               45 non-null     float64            
 4   close             45 non-null     float64            
 5   adjClose          45 non-null     float64            
 6   volume            45 non-null     int64              
 7   unadjustedVolume  45 non-null     int64              
 8   change            45 non-null     float64            
 9   changePercent     45 non-null     float64            
 10  vwap              45 non-null     float64            
 11  label             45 non-null     object             
 12  changeOverTime    45 non-null     float64            
 13  year   

In [25]:
# Show the last rows of the batch (rows of the last symbol appended).
df.tail(15)

      datetime   open  ...   load_date         type
0   2024-09-24  2.634  ...  2024-09-24  COMMODITIES
1   2024-09-23  2.462  ...  2024-09-24  COMMODITIES
2   2024-09-22  2.462  ...  2024-09-24  COMMODITIES
3   2024-09-20  2.358  ...  2024-09-24  COMMODITIES
4   2024-09-19  2.288  ...  2024-09-24  COMMODITIES
5   2024-09-18  2.312  ...  2024-09-24  COMMODITIES
6   2024-09-17  2.391  ...  2024-09-24  COMMODITIES
7   2024-09-16  2.298  ...  2024-09-24  COMMODITIES
8   2024-09-15  2.298  ...  2024-09-24  COMMODITIES
9   2024-09-13  2.363  ...  2024-09-24  COMMODITIES
10  2024-09-12  2.286  ...  2024-09-24  COMMODITIES
11  2024-09-11  2.236  ...  2024-09-24  COMMODITIES
12  2024-09-10  2.149  ...  2024-09-24  COMMODITIES
13  2024-09-09  2.220  ...  2024-09-24  COMMODITIES
14  2024-09-08  2.220  ...  2024-09-24  COMMODITIES

[15 rows x 25 columns]


## Append the batch data to RAW table

Set the destination raw table

In [26]:
# NOTE: `path` is rebound here - it previously pointed at the INFO table and now
# points at the bronze destination table built from BUCKET_NAME and PREFIX.
path=f"s3://{BUCKET_NAME}/{PREFIX}"
print("Path:",path)

Path: s3://cryptoengineer/datalake/bronze/commodities


Save data to datalake, bronze stage, in parquet format and partitioned by load_date

In [27]:
# Persist the batch only when there is something new to write.
if len(df) > 0:
    print("Saving data to: ", path)

    # Convert the pandas batch to Spark and group rows by load_date so each
    # partition directory is written from a single Spark partition.
    batch_sdf = spark.createDataFrame(df).repartition("load_date")

    # Append as parquet, partitioned by load_date.
    bronze_writer = (
        batch_sdf.write
        .format("parquet")
        .mode("append")
        .partitionBy(['load_date'])
    )
    bronze_writer.save(path)

Saving data to:  s3://cryptoengineer/datalake/bronze/commodities
  for column, series in pdf.iteritems():
  for column, series in pdf.iteritems():
