# Introduction
This notebook will fetch data from an external API, process them, and load them to S3 bucket as a staging area.

In [2]:
import requests
import pandas as pd
from datetime import datetime
import json
import configparser
import boto3
import os
import io

In [3]:
config = configparser.ConfigParser()
config.read("../config.cfg")

['../config.cfg']

## Fetch quick data from RapidAPI

In [4]:
def get_stock_data(symbol, rangeData = "1mo", interval = "1h"):
    base_url = "https://yh-finance.p.rapidapi.com/stock/v3/get-chart"
    querystring = {
        "region":"US", 
        "lang":"en", 
        "symbol": symbol,
        "range": rangeData, 
        "interval": interval,
        "events":"capitalGain,div,split"
    }
    headers = {
        'x-rapidapi-host': config["RapidAPI"]['x-rapidapi-host'],
        'x-rapidapi-key': config['RapidAPI']['x-rapidapi-key'],
        'Content-Type': "application/json"
    }
    response = requests.request("GET", base_url, headers=headers, params=querystring)
    if response.ok:
        data = json.loads(response.content)
        return data['chart']['result'][0]
    return None

In [5]:
response = get_stock_data("AAPL", rangeData = "1d", interval = "1m")
response

{'meta': {'currency': 'USD',
  'symbol': 'AAPL',
  'exchangeName': 'NMS',
  'instrumentType': 'EQUITY',
  'firstTradeDate': 345479400,
  'regularMarketTime': 1646427603,
  'gmtoffset': -18000,
  'timezone': 'EST',
  'exchangeTimezoneName': 'America/New_York',
  'regularMarketPrice': 163.17,
  'chartPreviousClose': 166.23,
  'previousClose': 166.23,
  'scale': 3,
  'priceHint': 2,
  'currentTradingPeriod': {'pre': {'timezone': 'EST',
    'end': 1646404200,
    'start': 1646384400,
    'gmtoffset': -18000},
   'regular': {'timezone': 'EST',
    'end': 1646427600,
    'start': 1646404200,
    'gmtoffset': -18000},
   'post': {'timezone': 'EST',
    'end': 1646442000,
    'start': 1646427600,
    'gmtoffset': -18000}},
  'tradingPeriods': [[{'timezone': 'EST',
     'end': 1646427600,
     'start': 1646404200,
     'gmtoffset': -18000}]],
  'dataGranularity': '1m',
  'range': '1d',
  'validRanges': ['1d',
   '5d',
   '1mo',
   '3mo',
   '6mo',
   '1y',
   '2y',
   '5y',
   '10y',
   'ytd',


## Transform raw json data to Dataframes

In [6]:
def create_metadata(response):
    metadata = response['meta']
    timestamps = response['timestamp']
    indicators = response['indicators']['quote'][0]
    
    def get_trading_period(trade_period):
        trading_periods = set([])
        for period in trade_period:
            start = datetime.fromtimestamp(int(period[0]['start'])).strftime("%H:%M:%S")
            end = datetime.fromtimestamp(int(period[0]['end'])).strftime("%H:%M:%S")
            date = f"{start} - {end}"
            trading_periods.add(date)
        return list(trading_periods)

    tss = pd.Series(timestamps).apply(lambda x: datetime.fromtimestamp(int(x)).strftime("%Y-%m-%d %H:%M:%S"))
    impt_metadata = {
        "currency": metadata['currency'],
        "symbol": metadata['symbol'],
        "instrumentType": metadata['instrumentType'],
        "firstTradeDate": datetime.fromtimestamp(metadata['firstTradeDate']).strftime("%Y-%m-%d %H:%M:%S"),
        "exchangeTimezoneName": metadata["exchangeTimezoneName"],
        'timezone': metadata['timezone'],
        'trade_period': get_trading_period(metadata['tradingPeriods']),
        'range': metadata['range'],
        'interval': metadata['dataGranularity'],
        'start_date': tss.min()
    }

    df_meta = pd.DataFrame(impt_metadata)
    return df_meta

def create_indicators(response):
    metadata = response['meta']
    timestamps = response['timestamp']
    indicators = response['indicators']['quote'][0]
    
    table = {
        "Timestamps": timestamps, 
        "Volume": indicators['volume'], 
        "Low": indicators["low"],
        "Open": indicators["open"], 
        "High": indicators["high"], 
        "Close": indicators["close"]
    }
    
    tss = pd.Series(timestamps).apply(lambda x: datetime.fromtimestamp(int(x)).strftime("%Y-%m-%d %H:%M:%S"))
    df_indicators = pd.DataFrame(table)
    df_indicators['Datetime'] = tss
    df_indicators['symbol'] = metadata['symbol']
    return df_indicators

In [7]:
create_metadata(response).head()

Unnamed: 0,currency,symbol,instrumentType,firstTradeDate,exchangeTimezoneName,timezone,trade_period,range,interval,start_date
0,USD,AAPL,EQUITY,1980-12-12 08:30:00,America/New_York,EST,08:30:00 - 15:00:00,1d,1m,2022-03-04 08:30:00


In [8]:
create_indicators(response)

Unnamed: 0,Timestamps,Volume,Low,Open,High,Close,Datetime,symbol
0,1646404200,3088473,164.922806,164.922806,164.940002,164.940002,2022-03-04 08:30:00,AAPL
1,1646404260,371482,164.740005,164.899994,165.169998,165.000000,2022-03-04 08:31:00,AAPL
2,1646404320,320367,164.990005,164.990005,165.320007,165.220093,2022-03-04 08:32:00,AAPL
3,1646404380,346026,164.940002,165.240005,165.300003,165.289993,2022-03-04 08:33:00,AAPL
4,1646404440,314123,165.029999,165.300003,165.399994,165.279999,2022-03-04 08:34:00,AAPL
...,...,...,...,...,...,...,...,...
386,1646427360,270991,163.039993,163.139999,163.190002,163.160004,2022-03-04 14:56:00,AAPL
387,1646427420,291904,163.074997,163.149994,163.300003,163.267700,2022-03-04 14:57:00,AAPL
388,1646427480,359805,163.199997,163.259995,163.389999,163.369995,2022-03-04 14:58:00,AAPL
389,1646427540,935945,163.139999,163.371094,163.529999,163.220001,2022-03-04 14:59:00,AAPL


## Upload CSV files to S3 bucket

In [9]:
# function to upload local file object to S3 bucket
def upload_object(s3, bucket, data, tag, functions):
    df_meta = create_metadata(data)
    keys = dict(df_meta.iloc[0,:])
    
    function = functions[tag]
    df = function(response)
    
    # upload file to s3
    file_name = str(keys['start_date']) + f".csv"
    key = os.path.join(keys['symbol'], tag, keys['range'], keys['interval'], file_name)
    meta_key = os.path.join(keys['symbol'], 'metadata', keys['range'], keys['interval'], file_name)
    
    try:
        with io.StringIO() as csv_buffer:
            df_meta.to_csv(csv_buffer, index=False)
            s3.put_object(Bucket=bucket, Body=csv_buffer.getvalue(), Key=meta_key)
            print(f"Successfully uploaded an object to S3 @ s3://{bucket}/{meta_key}")
            
        with io.StringIO() as csv_buffer:
            df.to_csv(csv_buffer, index=False)  
            s3.put_object(Bucket=bucket, Body=csv_buffer.getvalue(), Key=key)
            print(f"Successfully uploaded an object to S3 @ s3://{bucket}/{key}")
            
    except Exception as e:
        print(e)
        
    return os.path.join(keys['symbol'], tag, keys['range'], keys['interval'])

In [10]:
# instantiate s3 objects
s3_client = boto3.client("s3", 
                         region_name = 'us-west-2', 
                         aws_access_key_id = config['AWS']['aws_access_key_id'],
                         aws_secret_access_key = config['AWS']['aws_secret_access_key'])

# upload data to S3
key_path_meta = upload_object(
    s3 = s3_client,
    bucket = config['S3']['bucket_name'],
    data = response,
    tag = 'indicators',
    functions = {'indicators':create_indicators, 'meta':create_metadata}
)    

Successfully uploaded an object to S3 @ s3://nyse-stock-data/AAPL/metadata/1d/1m/2022-03-04 08:30:00.csv
Successfully uploaded an object to S3 @ s3://nyse-stock-data/AAPL/indicators/1d/1m/2022-03-04 08:30:00.csv


### Store data to share among notebooks

In [None]:
%store key_path_meta
%store key_path_indicators