# Homework Starter — Stage 15: Orchestration & System Design
Complete the sections below. Keep your answers concise and focused on orchestration readiness.

## 1) Project Task Decomposition
List 4–8 tasks. Add more rows as needed.

In [4]:
from pathlib import Path
import pandas as pd

tasks = pd.DataFrame({
    'task': [
        'ingest_data',       # download or load raw market data
        'clean_data',        # clean and feature-engineer SPX/VIX/macroeconomic data
        'train_model',       # train regression model for SPX return
        'save_model',        # save trained model to disk
        'run_flask_api',     # deploy Flask API for prediction
        'run_streamlit_dashboard',  # launch Streamlit dashboard
        'generate_report'    # generate analysis/plots report
    ],
    'inputs': [
        '/data/raw_market.csv',          # raw CSV of SPX, VIX, macro data
        'raw_market.csv',                # input for cleaning
        'cleaned_data.csv',              # cleaned dataset for training
        'trained_model.pkl',             # input/output for save step
        'trained_model.pkl',             # model input for API
        'trained_model.pkl',             # model input for dashboard
        'model.pkl, cleaned_data.csv'    # inputs for generating report
    ],
    'outputs': [
        'raw_market.csv',
        'cleaned_data.csv',
        'trained_model.pkl',
        'trained_model.pkl',
        'flask_api_running.txt',         # placeholder to indicate API is running
        'streamlit_dashboard_running.txt', # placeholder for dashboard
        'analysis_report.pdf'
    ],
    'idempotent': [
        True, True, True, True, True, True, True
    ]
})
tasks

Unnamed: 0,task,inputs,outputs,idempotent
0,ingest_data,/data/raw_market.csv,raw_market.csv,True
1,clean_data,raw_market.csv,cleaned_data.csv,True
2,train_model,cleaned_data.csv,trained_model.pkl,True
3,save_model,trained_model.pkl,trained_model.pkl,True
4,run_flask_api,trained_model.pkl,flask_api_running.txt,True
5,run_streamlit_dashboard,trained_model.pkl,streamlit_dashboard_running.txt,True
6,generate_report,"model.pkl, cleaned_data.csv",analysis_report.pdf,True


## 2) Dependencies (DAG)
Describe dependencies and paste a small diagram if you have one.

In [5]:
dag = {
    'ingest_data': [],
    'clean_data': ['ingest_data'],
    'train_model': ['clean_data'],
    'save_model': ['train_model'],
    'run_flask_api': ['save_model'],
    'run_streamlit_dashboard': ['save_model'],
    'generate_report': ['save_model', 'clean_data']  # report depends on model and cleaned data
}
dag

{'ingest_data': [],
 'clean_data': ['ingest_data'],
 'train_model': ['clean_data'],
 'save_model': ['train_model'],
 'run_flask_api': ['save_model'],
 'run_streamlit_dashboard': ['save_model'],
 'generate_report': ['save_model', 'clean_data']}

## 3) Logging & Checkpoints Plan
Specify what you will log and where you will checkpoint for each task.

In [6]:
logging_plan = pd.DataFrame({
    'task': [
        'ingest_data',
        'clean_data',
        'train_model',
        'save_model',
        'run_flask_api',
        'run_streamlit_dashboard',
        'generate_report'
    ],
    'log_messages': [
        'start/end, rows downloaded, source URI',
        'start/end, rows in/out, null counts, schema validation',
        'start/end, hyperparameters, train/test metrics (R², RMSE)',
        'start/end, model path saved, size',
        'start/end, API start status, request count, errors',
        'start/end, dashboard launch status, user interactions',
        'start/end, artifact paths, number of plots, summary stats'
    ],
    'checkpoint_artifact': [
        'raw_market.csv',
        'cleaned_data.csv',
        'trained_model.pkl',
        'trained_model.pkl',
        'flask_api_running.txt',
        'streamlit_dashboard_running.txt',
        'analysis_report.pdf'
    ]
})
logging_plan

Unnamed: 0,task,log_messages,checkpoint_artifact
0,ingest_data,"start/end, rows downloaded, source URI",raw_market.csv
1,clean_data,"start/end, rows in/out, null counts, schema va...",cleaned_data.csv
2,train_model,"start/end, hyperparameters, train/test metrics...",trained_model.pkl
3,save_model,"start/end, model path saved, size",trained_model.pkl
4,run_flask_api,"start/end, API start status, request count, er...",flask_api_running.txt
5,run_streamlit_dashboard,"start/end, dashboard launch status, user inter...",streamlit_dashboard_running.txt
6,generate_report,"start/end, artifact paths, number of plots, su...",analysis_report.pdf


## 4) Right-Sizing Automation
Which parts will you automate now? Which stay manual? Why?

*(Write your rationale here.)*

Automation focuses on **repetitive, deterministic tasks** such as data pipelines, feature computation, model training, and deployment. Manual work is reserved for **tasks needing judgment, interpretation, or business context**, ensuring that automation enhances efficiency without replacing critical human decision-making.

## 5) (Stretch) Refactor One Task into a Function + CLI
Use the templates below.

In [10]:
import argparse, json, logging, sys
from pathlib import Path
from datetime import datetime

def my_task(input_path: str, output_path: str) -> None:
    '''Example task template: read → transform → write JSON.'''
    logging.info('[my_task] start')
    
    # TODO: implement your logic
    result = {'run_at': datetime.utcnow().isoformat(), 'note': 'replace with real output'}
    
    # Ensure output folder exists, relative to the parent of notebooks folder
    notebooks_folder = Path.cwd()           # current notebook folder
    project_root = notebooks_folder.parent  # sibling to notebooks
    full_output_path = project_root / output_path
    full_output_path.parent.mkdir(parents=True, exist_ok=True)
    
    full_output_path.write_text(json.dumps(result, indent=2))
    logging.info('[my_task] wrote %s', full_output_path)

def main(argv=None):
    parser = argparse.ArgumentParser(description='Homework task wrapper')
    parser.add_argument('--input', required=True)
    parser.add_argument('--output', required=True)
    args = parser.parse_args(argv)
    logging.basicConfig(level=logging.INFO, handlers=[logging.StreamHandler(sys.stdout)])
    my_task(args.input, args.output)

if __name__ == '__main__':
    # Example simulated CLI in notebook:
    main(['--input', 'data/in.ext', '--output', 'data/out.json'])

INFO:root:[my_task] start
INFO:root:[my_task] wrote /Users/yihanyao/bootcamp_yihan_yao/homework/homework15/data/out.json


### Optional: Simple Retry Wrapper (fill in)
Add a small retry with linear backoff to harden a task.

In [11]:
import time
def retry(n_tries=3, delay=0.2):
    def wrapper(fn, *args, **kwargs):
        # TODO: implement try/except loop with sleep backoff
        return fn(*args, **kwargs)
    return wrapper