# Question 5: Async Data Pipeline
Modify Question 3 to write data to the database asynchronously. \
Read from the database 5 times concurrantly using async (hint: asyncio.gather())

In [1]:
import os
from datetime import timedelta
import asyncio
import aiosqlite
import time

import pandas as pd
import numpy as np

### Functions to calculate the indicators ###


def calculate_MACD(time_data, short_window=12, long_window=26):
    """Calculate the Moving Average Convergence Divergence (MACD) for a given time series.
    INPUTS:
    - data: pd.Series, the time series data
    - short_window, long_window: int, the short and long window periods for MACD line (EMA_short - EMA_long).
    OUTPUTS:
    - macd: pd.Series, as per definition
    """
    short_EMA = time_data.ewm(span=short_window, adjust=False).mean()
    long_EMA = time_data.ewm(span=long_window, adjust=False).mean()
    return short_EMA - long_EMA


def calculate_RSI(time_data, window=14):
    """Calculate the Relative Strength Index (RSI) for a given time series.
    The Relative Strenght (RS) is a EMA of the gains and losses with smoothing factor based on 'window'.
    INPUTS:
    - data: pd.Series, the time series data
    - window: int, the window period for RSI
    OUTPUTS:
    - rsi: pd.Series, as per definition
    """
    delta = time_data.diff()
    gain = delta.where(delta > 0, other=0)
    loss = -delta.where(delta < 0, other=0)
    avg_gain = gain.ewm(com=window).mean()
    avg_loss = loss.ewm(com=window).mean()
    rs = avg_gain / avg_loss
    rsi = 100 - (100 / (1 + rs))
    return rsi


### Functions to handle the database ###


def log_execution(func):
    async def wrapper(*args, **kwargs):
        if func.__name__ == "update_metal_indicators":
            print(
                f"Updating metal indicators for {args[1]} from {args[2]} to {args[3]}."
            )
        else:
            print(
                f"Running function {func.__name__} with arguments: {args} and keyword arguments: {kwargs}."
            )
        start_execution = time.time()
        result = await func(
            *args, **kwargs
        )  # Await the function call, otherwise it will return a coroutine and not run the function
        end_execution = time.time()
        if func.__name__ == "update_metal_indicators":
            print(
                f"Finished updating metal indicators for {args[1]} in {end_execution - start_execution:.2f} seconds."
            )
        else:
            print(
                f"Finished execution of {func.__name__} in {end_execution - start_execution:.2f} seconds."
            )
        return result

    return wrapper


@log_execution
async def update_metal_indicators(df, metal, start_date, end_date):
    async with aiosqlite.connect(f"{os.pardir}/market_data.db") as conn:
        async with conn.cursor() as cur:
            # Select only dates of interest
            df_filtered = df[(df["Dates"] >= start_date) & (df["Dates"] <= end_date)]

            # Loop through rows and update MACD and RSI for the specified metal
            for _, row in df_filtered.iterrows():
                # Convert the date to ISO format (YYYY-MM-DD)
                date_str = row["Dates"].strftime("%Y-%m-%d")

                await cur.execute(
                    """
                    UPDATE MetalPrices
                    SET MACD = ?, RSI = ?
                    WHERE Date = ? AND Metal = ?;
                """,
                    (row[f"MACD_{metal}"], row[f"RSI_{metal}"], date_str, metal),
                )

            await conn.commit()

In [2]:

### Parameters ###

data_path = f"{os.pardir}/data/MarketData.csv"
# Metals to select
metals = ["COPPER", "ZINC"]
# Use a padding to calculate the EMA and RSI without a starting bias
padding = 50
# Select only 2020 and 2021
start_date = pd.to_datetime("01/01/2020", format="%d/%m/%Y")
end_date = pd.to_datetime("31/12/2021", format="%d/%m/%Y")

### Load the data ###

raw_df = pd.read_csv(data_path)

### Build a selected dataframe ###

# Get columns names with selected metals
columns_metals = raw_df.iloc[2, 1:-1].values
columns_metals = [col.split(" ")[1] for col in columns_metals]
# Get the mask of the columns to select, the first column is the date
mask_columns = np.concatenate(([True], np.isin(columns_metals, metals), [False]))
# Select only columns in the mask, also the first 6 rows are headers
df = raw_df.iloc[6:, mask_columns].copy()
df.columns = np.concatenate((["Dates"], metals))
# Ensure the data types are correct. the format dd/mm/yyyy works better for pandas, but needs to be converted to yyyy-mm-dd in the database
df["Dates"] = pd.to_datetime(df["Dates"], format="%d/%m/%Y", errors="coerce")
for metal in metals:
    df[metal] = pd.to_numeric(df[metal], errors="coerce")
# Select only the indicated period with a padding at the beginning
cutoff_date = start_date - timedelta(days=padding)
df = df[(df["Dates"] > cutoff_date) & (df["Dates"] <= end_date)]
# Check the data doesn't have any missing values, otherwise it needs addressing
if not (df.isna().sum() == 0).all():
    raise ValueError(
        "There are NaN values in the data. Please check the data and try again."
    )

### Calculate the indicators ###

for metal in metals:
    df[f"MACD_{metal}"] = calculate_MACD(df[metal])
    df[f"RSI_{metal}"] = calculate_RSI(df[metal])

### Update the database ###

# Create a list of tasks to update indicators for each metal concurrently
tasks = [update_metal_indicators(df, metal, start_date, end_date) for metal in metals]
# Run all the tasks concurrently
asyncio.gather(*tasks)

<_GatheringFuture pending>

Updating metal indicators for COPPER from 2020-01-01 00:00:00 to 2021-12-31 00:00:00.
Updating metal indicators for ZINC from 2020-01-01 00:00:00 to 2021-12-31 00:00:00.
Finished updating metal indicators for COPPER in 0.07 seconds.
Finished updating metal indicators for ZINC in 0.13 seconds.
