# Question 3: Data Pipeline and Transformation

Objective:
- Show understanding of creating data pipelines and transformations.

Task:
- Using the CSV file from Question 1, filter the data to include only 'Copper' and 'Zinc' for the years 2020 and 2021.
- Calculate MACD (slow/medium/fast) and RSI for each metal historically.
- Use SQL inserts to populate the SQL table created in Question 2 with this generated data.
- Demonstrate the use of a decorator to log the execution of the SQL inserts.

In [2]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Location of the market-data CSV. os.pardir is the parent-directory component,
# so the path is relative to the directory the notebook is run from.
data_path = f'{os.pardir}/data/MarketData.csv'

In [3]:
# Load the CSV as-is. As the preview below shows, the leading rows contain
# descriptive header text (date range, instrument names, tickers, field names)
# rather than prices, so the frame still needs cleaning before it can be used.
raw_df = pd.read_csv(data_path)
raw_df

Unnamed: 0,Start Date,01/01/2010,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6
0,End Date,01/01/2023,,,,,
1,,,,,,,
2,,LME COPPER 3MO ($),LME ALUMINUM 3MO ($),LME ZINC 3MO ($),LME LEAD 3MO ($),LME TIN 3MO ($),Generic 1st 'CL' Future
3,,LMCADS03 Comdty,LMAHDS03 Comdty,LMZSDS03 Comdty,LMPBDS03 Comdty,LMSNDS03 Comdty,CL1 Comdty
4,,Settlement Price,Settlement Price,Settlement Price,Settlement Price,Settlement Price,Settlement Price
...,...,...,...,...,...,...,...
3392,26/12/2022,8349.5,2389.5,2965,2273.5,23934,79.56
3393,27/12/2022,8349.5,2389.5,2965,2273.5,23934,79.53
3394,28/12/2022,8443,2381,3005.5,2218,24734,78.96
3395,29/12/2022,8418,2405,2984.5,2272.5,24915,78.4


In [4]:
# Rows 0-5 of the raw frame are descriptive header rows, so price data starts at position 6.
# Positional columns 0, 1 and 3 are kept and relabelled as Dates, Copper and Zinc.
# TODO: do this by using the column names
df = raw_df.iloc[6:, [0, 1, 3]].set_axis(['Dates', 'Copper', 'Zinc'], axis=1)
# Parse dates and prices; anything that fails to parse is coerced to NaT/NaN.
df = df.assign(
    Dates=pd.to_datetime(df['Dates'], format='%d/%m/%Y', errors='coerce'),
    **{metal: pd.to_numeric(df[metal], errors='coerce') for metal in ('Copper', 'Zinc')},
)
# The 2020/2021 year filter is left disabled in this cell:
# df = df[(df['Dates'].dt.year == 2020) | (df['Dates'].dt.year == 2021)]
# Count missing values per column to confirm nothing was coerced to NaN
print(df.isna().sum())
df

Dates     0
Copper    0
Zinc      0
dtype: int64


Unnamed: 0,Dates,Copper,Zinc
6,2010-01-01,7375.0,2560.0
7,2010-01-04,7500.0,2574.0
8,2010-01-05,7485.0,2575.0
9,2010-01-06,7660.0,2718.0
10,2010-01-07,7535.0,2607.0
...,...,...,...
3392,2022-12-26,8349.5,2965.0
3393,2022-12-27,8349.5,2965.0
3394,2022-12-28,8443.0,3005.5
3395,2022-12-29,8418.0,2984.5


In [9]:
# Number of prior observations needed before the EMA can be considered accurate.
# With span N the smoothing factor is alpha = 2 / (N + 1), and the weight carried by an
# observation k steps back is alpha * (1 - alpha)**k. Requiring that weight to fall
# below `precision` gives
#     k > log(precision * (N + 1) / 2) / log((N - 1) / (N + 1)),
# which is rounded up to a whole number of observations.
precision = 0.05  # a single observation's weight must drop below 5%
ema_window = 26   # span of the slowest EMA
N_padding = np.ceil(
    np.log((ema_window + 1) * (precision / 2))
    / np.log((ema_window - 1) / (ema_window + 1))
)
N_padding

np.float64(6.0)

In [6]:
# Calculate the MACD for Copper and Zinc

def calculate_macd(data: pd.Series, short_window: int = 12, long_window: int = 26) -> pd.Series:
    """Calculate the Moving Average Convergence Divergence (MACD) line of a time series.

    The MACD line is the difference between a short-span and a long-span
    exponential moving average (EMA) of the same series: EMA_short - EMA_long.

    INPUTS:
    - data: pd.Series, the time series data (e.g. daily settlement prices).
    - short_window, long_window: int, the spans of the short and long EMAs.
    OUTPUTS:
    - macd: pd.Series aligned to the index of `data`, holding EMA_short - EMA_long.
    """
    # Series.ewm is pandas' exponentially weighted window; adjust=False selects the
    # recursive form y_t = (1 - alpha) * y_{t-1} + alpha * x_t with alpha = 2 / (span + 1).
    short_EMA = data.ewm(span=short_window, adjust=False).mean()
    long_EMA = data.ewm(span=long_window, adjust=False).mean()
    return short_EMA - long_EMA

