
# Part 2: Let's work with a real example

## Download market data per underlying index

## Standard programming solution

- Extract index composition from Stooq
- Download daily market data for each symbol
- Resample daily candles to weekly and monthly
- Save all results to Parquet files

In [1]:
import pandas as pd
from typing import List
import os
import time
import pathlib


def index_to_stooq_id(index_name:str)-> str:
    """Translate an index ticker into the identifier used in Stooq URLs.

    Args:
        index_name: One of the supported tickers: "DJI", "NDX" or "HSI".

    Returns:
        The Stooq identifier, as a string.

    Raises:
        KeyError: If ``index_name`` is not one of the supported tickers.
    """
    stooq_ids = {
        "DJI": "578",
        "NDX": "580",
        "HSI": "616",
    }
    return stooq_ids[index_name]

def get_index_composition(index_id: str) -> pd.DataFrame:
    """Download the composition table of an index from Stooq.

    Args:
        index_id: Stooq identifier of the index, inserted into the page URL.

    Returns:
        The first HTML table on the page whose ``id`` attribute is "fth1".
    """
    page_url = f"https://stooq.com/t/?i={index_id}"
    # read_html returns every matching table; only the first one is used.
    matching_tables = pd.read_html(page_url, attrs={"id": "fth1"})
    return matching_tables[0]

def get_symbols(df_index_composition: pd.DataFrame) -> List[str]:
    """Return the values of the "Symbol" column as a plain list.

    Args:
        df_index_composition: Table that contains a "Symbol" column.

    Returns:
        The symbols, in row order.

    Raises:
        KeyError: If the table has no "Symbol" column.
    """
    symbol_column = df_index_composition["Symbol"]
    return list(symbol_column)

def get_historical_data(symbol: str, time_frame: str="d") -> pd.DataFrame:
    """Download the historical candles of one symbol from Stooq as CSV.

    Args:
        symbol: Stooq symbol, inserted into the ``s`` query parameter.
        time_frame: Value of the ``i`` query parameter; defaults to "d"
            (presumably daily candles — confirm against the Stooq API).

    Returns:
        The downloaded table, with its "Date" column converted to datetimes.

    Raises:
        KeyError: If the downloaded CSV has no "Date" column.
    """
    csv_url = f"https://stooq.com/q/d/l/?s={symbol}&i={time_frame}"
    candles = pd.read_csv(csv_url, parse_dates=True)
    # Dates are expected in ISO form (YYYY-MM-DD) and parsed explicitly.
    candles["Date"] = pd.to_datetime(candles["Date"], format="%Y-%m-%d")
    return candles

In [2]:
def resample_candle(df_daily:pd.DataFrame, resampling: str = "W") -> pd.DataFrame:
    """Aggregate candles into a coarser period.

    Args:
        df_daily: Candles with "Date", "Open", "High", "Low", "Close" and
            "Volume" columns; "Date" must hold datetimes.
        resampling: pandas resampling rule, "W" by default.

    Returns:
        One row per period, with "Date" restored as a regular column: first
        Open, highest High, lowest Low, last Close and mean Volume.
    """
    # NOTE(review): Volume is averaged over the period rather than summed —
    # confirm this is the intended definition.
    column_rules = {
        "Open": "first",
        "High": "max",
        "Low": "min",
        "Close": "last",
        "Volume": "mean",
    }
    by_period = df_daily.set_index("Date").resample(resampling)
    aggregated = by_period.agg(column_rules)
    return aggregated.reset_index()

def save_to_parquest(df: pd.DataFrame, path: str):
    """Write a DataFrame to a Parquet file, creating parent folders as needed.

    Args:
        df: Table to persist.
        path: Destination file. A leading "~" is expanded to the user's home
            directory.
    """
    # pathlib does not expand "~" on its own: without expanduser(), mkdir
    # would create a literal "~" directory under the current working directory
    # instead of the home-relative folder the file is meant to live in.
    target = pathlib.Path(path).expanduser()
    target.parent.mkdir(parents=True, exist_ok=True)
    # Write to the very same expanded location whose parent was just created.
    df.to_parquet(str(target))

In [9]:
index_name = "DJI"
root_market_data = "~/git/prefect_presentation/market_data"


# Resolve the Stooq identifier of the index, then fetch its composition table.
print(f"=> Extract composition for {index_name}")
index_id = index_to_stooq_id(index_name)
df_composition = get_index_composition(index_id)


# Demo run: only the first six symbols of the index are processed, one after
# the other.
for symbol in get_symbols(df_composition)[:6]:
    start = time.time()
    # All files of one symbol share this prefix; only the timeframe suffix
    # ("d", "W", "M") differs.
    symbol_prefix = f"{root_market_data}/{index_name}/{symbol}"

    print(f"==> Extract market data for {symbol}", end =" ")
    df_OHLC_daily = get_historical_data(symbol)
    save_to_parquest(df_OHLC_daily, f"{symbol_prefix}_d.parquet")

    print("Resample W.", end =" ")
    df_OHLC_weekly = resample_candle(df_OHLC_daily, "W")
    save_to_parquest(df_OHLC_weekly, f"{symbol_prefix}_W.parquet")

    print("M", end =" ")
    df_OHLC_monthly = resample_candle(df_OHLC_daily, "M")
    save_to_parquest(df_OHLC_monthly, f"{symbol_prefix}_M.parquet")

    end = time.time()
    print(f"took: {end - start:2.2f}s")

=> Extract composition for DJI
==> Extract market data for AAPL.US Resample W. M took: 3.85s
==> Extract market data for AXP.US Resample W. M took: 4.26s
==> Extract market data for BA.US Resample W. M took: 5.00s
==> Extract market data for CAT.US Resample W. M took: 5.10s
==> Extract market data for CSCO.US Resample W. M took: 4.51s
==> Extract market data for CVX.US Resample W. M took: 4.55s


## Limitation

- linear, hard to scale out
- Lots of edge cases to handle to be resilient


## How can we do better? Let's look at Prefect



### A few important base components to know

1. Task: Low level execution unit
1. Flow: Container of tasks with their dependencies

In [None]:
from prefect import task, Flow, Parameter, unmapped
import pandas as pd
from typing import List
import os
import time
import pathlib

@task
def index_to_stooq_id(index_name:str)-> str:
    """Translate an index ticker into the identifier used in Stooq URLs.

    Args:
        index_name: One of the supported tickers: "DJI", "NDX" or "HSI".

    Returns:
        The Stooq identifier, as a string.

    Raises:
        KeyError: If ``index_name`` is not one of the supported tickers.
    """
    stooq_ids = {
        "DJI": "578",
        "NDX": "580",
        "HSI": "616",
    }
    return stooq_ids[index_name]

@task
def get_index_composition(index_id: str) -> pd.DataFrame:
    """Download the composition table of an index from Stooq.

    Args:
        index_id: Stooq identifier of the index, inserted into the page URL.

    Returns:
        The first HTML table on the page whose ``id`` attribute is "fth1".
    """
    page_url = f"https://stooq.com/t/?i={index_id}"
    # read_html returns every matching table; only the first one is used.
    matching_tables = pd.read_html(page_url, attrs={"id": "fth1"})
    return matching_tables[0]

@task
def get_symbols(df_index_composition: pd.DataFrame) -> List[str]:
    """Return the values of the "Symbol" column as a plain list.

    Args:
        df_index_composition: Table that contains a "Symbol" column.

    Returns:
        The symbols, in row order.

    Raises:
        KeyError: If the table has no "Symbol" column.
    """
    symbol_column = df_index_composition["Symbol"]
    return list(symbol_column)

@task
def get_historical_data(symbol: str, time_frame: str="d") -> pd.DataFrame:
    """Download the historical candles of one symbol from Stooq as CSV.

    Args:
        symbol: Stooq symbol, inserted into the ``s`` query parameter.
        time_frame: Value of the ``i`` query parameter; defaults to "d"
            (presumably daily candles — confirm against the Stooq API).

    Returns:
        The downloaded table, with its "Date" column converted to datetimes.

    Raises:
        KeyError: If the downloaded CSV has no "Date" column.
    """
    csv_url = f"https://stooq.com/q/d/l/?s={symbol}&i={time_frame}"
    candles = pd.read_csv(csv_url, parse_dates=True)
    # Dates are expected in ISO form (YYYY-MM-DD) and parsed explicitly.
    candles["Date"] = pd.to_datetime(candles["Date"], format="%Y-%m-%d")
    return candles

@task
def resample_candle(df_daily:pd.DataFrame, resampling: str = "W") -> pd.DataFrame:
    """Aggregate candles into a coarser period.

    Args:
        df_daily: Candles with "Date", "Open", "High", "Low", "Close" and
            "Volume" columns; "Date" must hold datetimes.
        resampling: pandas resampling rule, "W" by default.

    Returns:
        One row per period, with "Date" restored as a regular column: first
        Open, highest High, lowest Low, last Close and mean Volume.
    """
    # NOTE(review): Volume is averaged over the period rather than summed —
    # confirm this is the intended definition.
    column_rules = {
        "Open": "first",
        "High": "max",
        "Low": "min",
        "Close": "last",
        "Volume": "mean",
    }
    by_period = df_daily.set_index("Date").resample(resampling)
    aggregated = by_period.agg(column_rules)
    return aggregated.reset_index()

@task
def get_save_path(root:str, index_name: str, symbol:str, timeframe:str)-> str:
    """Build the Parquet file path for one symbol and timeframe.

    Args:
        root: Root folder of the market data store.
        index_name: Name of the index, used as a sub-folder.
        symbol: Symbol, used as the first part of the file name.
        timeframe: Timeframe suffix appended to the file name.

    Returns:
        A path of the form ``<root>/<index_name>/<symbol>_<timeframe>.parquet``.
    """
    file_name = f"{symbol}_{timeframe}.parquet"
    return f"{root}/{index_name}/{file_name}"

@task
def save_to_parquest(df: pd.DataFrame, path: str):
    """Write a DataFrame to a Parquet file, creating parent folders as needed.

    Args:
        df: Table to persist.
        path: Destination file. A leading "~" is expanded to the user's home
            directory.
    """
    # pathlib does not expand "~" on its own: without expanduser(), mkdir
    # would create a literal "~" directory under the current working directory
    # instead of the home-relative folder the file is meant to live in.
    target = pathlib.Path(path).expanduser()
    target.parent.mkdir(parents=True, exist_ok=True)
    # Write to the very same expanded location whose parent was just created.
    df.to_parquet(str(target))


In [None]:
with Flow("extract_market_data") as f:

    # Parameters of the flow, each with a default value.
    index_name = Parameter("index", default="DJI")
    root_market_data = Parameter("root_folder", default="~/git/prefect_presentation/market_data")

    # Index name -> Stooq id -> composition table -> list of symbols.
    index_id = index_to_stooq_id(index_name)
    df_composition = get_index_composition(index_id)
    symbols = get_symbols(df_composition)

    # Daily candles: mapped over the symbols; `unmapped` arguments are passed
    # unchanged to every mapped run.
    df_OHLC_daily = get_historical_data.map(symbol=symbols)
    daily_path = get_save_path.map(
        root=unmapped(root_market_data),
        index_name=unmapped(index_name),
        symbol=symbols,
        timeframe=unmapped("d"),
    )
    save_to_parquest.map(df=df_OHLC_daily, path=daily_path)

    # Weekly candles derived from the daily ones.
    df_weekly = resample_candle.map(df_daily=df_OHLC_daily, resampling=unmapped("W"))
    weekly_path = get_save_path.map(
        root=unmapped(root_market_data),
        index_name=unmapped(index_name),
        symbol=symbols,
        timeframe=unmapped("W"),
    )
    save_to_parquest.map(df=df_weekly, path=weekly_path)

    # Monthly candles derived from the daily ones.
    df_monthly = resample_candle.map(df_daily=df_OHLC_daily, resampling=unmapped("M"))
    monthly_path = get_save_path.map(
        root=unmapped(root_market_data),
        index_name=unmapped(index_name),
        symbol=symbols,
        timeframe=unmapped("M"),
    )
    save_to_parquest.map(df=df_monthly, path=monthly_path)
    


In [None]:
# Draw the task graph of the flow defined above
# (presumably requires Graphviz to be installed — TODO confirm).
f.visualize()

In [None]:
# Execute the flow with its default parameter values; no executor is passed,
# so Prefect's default one is used.
f.run()

In [None]:
from prefect.executors import LocalDaskExecutor

# Run the flow again on a local thread pool so mapped task runs can overlap.
# The pool size is set through dask's `num_workers` option and must be an
# integer; the previous `nb_threads="8"` is not an option the executor
# recognises, so it did not set the thread count.
executor = LocalDaskExecutor(scheduler="threads", num_workers=8)
f.run(executor=executor)