# Introduction
This notebook fetches stock data from an external API, processes it, and loads it into an S3 bucket that serves as a staging area.

In [13]:
import requests
import pandas as pd
from datetime import datetime
import json
import configparser
import boto3
import os

In [14]:
# Credentials live in a config file outside the notebook so that no secrets
# are hardcoded in a cell.
config = configparser.ConfigParser()

# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Check that list so a missing/misplaced file
# fails here with a clear message instead of an opaque KeyError below.
if not config.read("../config.cfg"):
    raise FileNotFoundError("Could not read config file: ../config.cfg")

# retrieve access key and secret key
aws_access_key_id = config['AWS']['aws_access_key_id']
aws_secret_access_key = config['AWS']['aws_secret_access_key']

## Fetch stock data from RapidAPI

In [15]:
def get_stock_data(symbol, rangeData = "1mo", interval = "1h", timeout = 30):
    """Fetch chart data for one ticker symbol from the RapidAPI Yahoo Finance endpoint.

    Parameters
    ----------
    symbol : str
        Ticker symbol sent as the ``symbol`` query parameter (e.g. "AAPL").
    rangeData : str
        Value sent as the ``range`` query parameter.
    interval : str
        Value sent as the ``interval`` query parameter.
    timeout : float
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the kernel indefinitely.

    Returns
    -------
    dict or None
        The first element of ``chart.result`` in the JSON payload, or None
        when the HTTP response is not OK or the payload carries no result.

    The host and API key headers are read from the module-level ``config``
    object (section "RapidAPI").
    """
    base_url = "https://apidojo-yahoo-finance-v1.p.rapidapi.com/market/get-charts"
    querystring = {
        "region":"US",
        "lang":"en",
        "symbol": symbol,
        "range": rangeData,
        "interval": interval
    }
    headers = {
        'x-rapidapi-host': config["RapidAPI"]['x-rapidapi-host'],
        'x-rapidapi-key': config['RapidAPI']['x-rapidapi-key'],
        'Content-Type': "application/json"
    }
    response = requests.get(base_url, headers=headers, params=querystring, timeout=timeout)
    if not response.ok:
        return None

    data = json.loads(response.content)
    # A successful HTTP status does not guarantee a populated payload: guard
    # against a missing, null or empty "result" so the caller gets None (same
    # as the non-OK branch) rather than a TypeError/IndexError.
    chart = data.get('chart') if isinstance(data, dict) else None
    results = chart.get('result') if isinstance(chart, dict) else None
    if not results:
        return None
    return results[0]

In [16]:
# Fetch AAPL chart data using the function's default range and interval.
# get_stock_data returns None when the HTTP response is not OK, in which case
# the cells below (which index into `response`) cannot run.
response = get_stock_data("AAPL")
response

{'meta': {'currency': 'USD',
  'symbol': 'AAPL',
  'exchangeName': 'NMS',
  'instrumentType': 'EQUITY',
  'firstTradeDate': 345479400,
  'regularMarketTime': 1637949603,
  'gmtoffset': -18000,
  'timezone': 'EST',
  'exchangeTimezoneName': 'America/New_York',
  'regularMarketPrice': 156.81,
  'chartPreviousClose': 149.32,
  'previousClose': 161.94,
  'scale': 3,
  'priceHint': 2,
  'currentTradingPeriod': {'pre': {'timezone': 'EST',
    'start': 1637917200,
    'end': 1637937000,
    'gmtoffset': -18000},
   'regular': {'timezone': 'EST',
    'start': 1637937000,
    'end': 1637949600,
    'gmtoffset': -18000},
   'post': {'timezone': 'EST',
    'start': 1637949600,
    'end': 1637974800,
    'gmtoffset': -18000}},
  'tradingPeriods': [[{'timezone': 'EDT',
     'start': 1635341400,
     'end': 1635364800,
     'gmtoffset': -14400}],
   [{'timezone': 'EDT',
     'start': 1635427800,
     'end': 1635451200,
     'gmtoffset': -14400}],
   [{'timezone': 'EDT',
     'start': 1635514200,
   

## Transform raw JSON data into DataFrames

In [17]:
# Split the chart payload into the three pieces used by the cells below.
metadata = response['meta']
timestamps = response['timestamp']
# Only the first entry of the 'quote' list is used.
indicators = response['indicators']['quote'][0]

In [18]:
# One column per field; the insertion order below is the column order of the
# resulting frame.
table = dict(
    Timestamps=timestamps,
    Volume=indicators['volume'],
    Low=indicators["low"],
    Open=indicators["open"],
    High=indicators["high"],
    Close=indicators["close"],
)
df_indicators = pd.DataFrame(table)

# Human-readable copy of each epoch timestamp. datetime.fromtimestamp() is
# called without a tz argument, so the strings are in the local time of the
# machine running the kernel.
df_indicators['Datetime'] = df_indicators['Timestamps'].apply(
    lambda epoch: datetime.fromtimestamp(int(epoch)).strftime("%Y-%m-%d %H:%M:%S")
)

# Tag every row with the ticker symbol taken from the metadata.
df_indicators['symbol'] = metadata['symbol']
df_indicators.head()

Unnamed: 0,Timestamps,Volume,Low,Open,High,Close,Datetime,symbol
0,1635341400,3081418,148.850006,149.375,149.550003,148.929993,2021-10-27 08:30:00,AAPL
1,1635341700,1599312,148.5,149.0,149.0,148.695007,2021-10-27 08:35:00,AAPL
2,1635342000,1191193,148.720001,148.769806,148.970001,148.759995,2021-10-27 08:40:00,AAPL
3,1635342300,1248374,148.902496,148.929993,149.380005,149.249207,2021-10-27 08:45:00,AAPL
4,1635342600,1073784,149.031998,149.199799,149.509995,149.5,2021-10-27 08:50:00,AAPL


In [9]:
def get_trading_period(trade_period):
    """Summarise the distinct daily trading windows as "HH:MM:SS - HH:MM:SS" strings.

    Parameters
    ----------
    trade_period : iterable of list of dict
        One list per trading day; only the first dict of each list is read,
        and it must provide 'start' and 'end' epoch seconds.

    Returns
    -------
    list of str
        The unique "start - end" windows, sorted so the result is the same on
        every run (plain set iteration order varies between interpreter
        sessions because of string hash randomization).

    Note: datetime.fromtimestamp() is called without a tz argument, so the
    times are rendered in the local time of the machine running the kernel.
    """
    trading_periods = set()
    for period in trade_period:
        # A day with no session entries has nothing to report; skip it rather
        # than fail on period[0].
        if not period:
            continue
        session = period[0]
        start = datetime.fromtimestamp(int(session['start'])).strftime("%H:%M:%S")
        end = datetime.fromtimestamp(int(session['end'])).strftime("%H:%M:%S")
        trading_periods.add(f"{start} - {end}")
    return sorted(trading_periods)

# Keep only the metadata fields of interest. 'trade_period' is the one
# list-valued entry, so the frame built below gets one row per distinct
# trading window with the scalar fields repeated on each row.
impt_metadata = dict(
    currency=metadata['currency'],
    symbol=metadata['symbol'],
    instrumentType=metadata['instrumentType'],
    # Rendered without a tz argument, i.e. in the kernel machine's local time.
    firstTradeDate=datetime.fromtimestamp(metadata['firstTradeDate']).strftime("%Y-%m-%d %H:%M:%S"),
    exchangeTimezoneName=metadata["exchangeTimezoneName"],
    timezone=metadata['timezone'],
    trade_period=get_trading_period(metadata['tradingPeriods']),
    range=metadata['range'],
    interval=metadata['dataGranularity'],
    # Earliest epoch timestamp in the indicators frame.
    start_date=df_indicators['Timestamps'].min(),
)

df_meta = pd.DataFrame(impt_metadata)
df_meta.head()

Unnamed: 0,currency,symbol,instrumentType,firstTradeDate,exchangeTimezoneName,timezone,trade_period,range,interval,start_date
0,USD,AAPL,EQUITY,1980-12-12 08:30:00,America/New_York,EST,08:30:00 - 12:00:00,1mo,5m,1635341400
1,USD,AAPL,EQUITY,1980-12-12 08:30:00,America/New_York,EST,08:30:00 - 15:00:00,1mo,5m,1635341400


## Upload CSV files to S3 bucket

In [10]:
# Create the boto3 S3 resource used for the uploads below, authenticated with
# the access key pair read from the config file and pinned to us-west-2.
s3 = boto3.resource("s3", 
                    region_name = 'us-west-2', 
                    aws_access_key_id = aws_access_key_id,
                    aws_secret_access_key = aws_secret_access_key
                   )

# function to upload local file object to S3 bucket
def upload_object(s3, local_path, tag, obj, bucket, keys, ext="csv"):
    """Write ``obj`` to a local file and upload that file to an S3 bucket.

    The local file and the S3 key share the same layout:
    ``<symbol>/<tag>/<range>/<interval>/<start_date>.<ext>``
    (the local file is additionally rooted at ``local_path``).

    Parameters
    ----------
    s3 : object
        S3 resource exposing ``Bucket(name).upload_file(path, key)``.
    local_path : str
        Root directory for the local copy.
    tag : str
        Path component placed after the symbol (e.g. 'metadata').
    obj : object
        For ext="csv", an object with a ``to_csv`` method (written without
        header and index); for ext="json", a JSON-serialisable object.
    bucket : str
        Name of the destination bucket.
    keys : mapping
        Must provide 'start_date', 'symbol', 'range' and 'interval'.
    ext : str
        "csv" or "json".

    Returns
    -------
    str or None
        The S3 key prefix ``<symbol>/<tag>/<range>/<interval>``, or None when
        ``ext`` is not supported. The prefix is returned even if the upload
        itself failed; the failure is reported on stdout.
    """
    # Local import: S3 keys always use "/" separators, whereas os.path.join
    # would produce backslashes on Windows.
    import posixpath

    # Validate up front so an unsupported extension has no side effects.
    if ext not in ("csv", "json"):
        print("Invalid file extension")
        return

    file_name = str(keys['start_date']) + f".{ext}"
    partition = (keys['symbol'], tag, keys['range'], keys['interval'])
    directory = os.path.join(local_path, *partition)
    file_path = os.path.join(directory, file_name)

    # write the file locally; the directory is needed for both formats
    os.makedirs(directory, exist_ok=True)
    if ext == "csv":
        obj.to_csv(file_path, header=False, index=False)
    else:
        with open(file_path, 'w') as file:
            json.dump(obj, file)

    # upload file to s3
    key_prefix = posixpath.join(*partition)
    key = posixpath.join(key_prefix, file_name)
    try:
        s3.Bucket(bucket).upload_file(file_path, key)
        print(f"Successfully uploaded an object @ {local_path} to S3 @ s3://{bucket}/{key}")
    except Exception as e:
        # Best-effort upload: report the failure with enough context to act
        # on it, but do not abort the notebook.
        print(f"Failed to upload {file_path} to s3://{bucket}/{key}: {e}")

    return key_prefix

In [11]:
# retrieve bucket_name from config object
bucket_name = config['S3']['bucket_name']

# The first metadata row supplies the fields upload_object reads to build the
# file name and directory layout.
keys = dict(df_meta.iloc[0, :])

# Upload the metadata frame first, then the indicators frame, with otherwise
# identical arguments; each call returns the S3 key prefix it used.
key_path_meta, key_path_indicators = [
    upload_object(
        s3 = s3,
        local_path = "../data/s3",
        tag = tag,
        obj = frame,
        bucket = bucket_name,
        keys = keys,
        ext = "csv"
    )
    for tag, frame in (('metadata', df_meta), ('indicators', df_indicators))
]

Successfully uploaded an object @ ../data/s3 to S3 @ s3://nyse-stock-data-dngo/AAPL/metadata/1mo/5m/1635341400.csv
Successfully uploaded an object @ ../data/s3 to S3 @ s3://nyse-stock-data-dngo/AAPL/indicators/1mo/5m/1635341400.csv


### Store data to share among notebooks

In [12]:
# Persist the two S3 key prefixes with IPython's %store magic so that another
# notebook can restore them with `%store -r`.
%store key_path_meta
%store key_path_indicators

Stored 'key_path_meta' (str)
Stored 'key_path_indicators' (str)
