In [18]:
# import packages
import pandas as pd
import os

# Load utilities
from etl_utils import load_query

# Load file path from .env
from dotenv import load_dotenv
load_dotenv()

sp_csv_path = os.getenv("SP_CSV_PATH")

# Create db connection string
db_username = os.getenv("DB_USERNAME")
db_password = os.getenv("DB_PASSWORD")
db_host = os.getenv("DB_HOST")
db_port = os.getenv("DB_PORT")
db_conn_str = f"mysql+pymysql://{db_username}:{db_password}@{db_host}:{db_port}/stocks_etl_ml_db"

# Set table name
table_name = "sp_constituents"

In [2]:
# Date to load for - hard-coded here, but Airflow would pass it in
load_date = "2018-05-01"

In [3]:
# Expected start of the constituents file name
file_prefix = "S&P 500 Historical Components & Changes"

# List everything in the source folder, then keep only the csv files
# (the load step below checks how many candidates there are)
files = os.listdir(sp_csv_path)
csv_files = list(filter(lambda name: name.endswith(".csv"), files))

In [11]:
# Load the file only when it is the single csv in the folder and carries the
# expected prefix. Fail loudly otherwise: merely printing a message would leave
# `df` undefined (or stale from an earlier run) for the cells below.
if not csv_files:
    # Without this guard, csv_files[0] below would raise a bare IndexError
    raise FileNotFoundError(f"No csv file found in {sp_csv_path}.")
if len(csv_files) > 1:
    raise ValueError("More than one csv file in folder. Make sure only the csv file needed to update is in folder.")
if not csv_files[0].startswith(file_prefix):
    raise ValueError("The file prefix is not the expected prefix. Are you sure the file is correct?")

full_path = os.path.join(sp_csv_path, csv_files[0])
df = pd.read_csv(full_path)
print("File loaded successfully.")

File loaded successfully.


In [12]:
# Parse the date column into datetimes so it can be compared against the load date
df = df.assign(date=pd.to_datetime(df['date']))

In [13]:
# There may not be an exact match for the date we pass - take the most recent
# row dated on or before it.
on_or_before = df[df['date'] <= load_date]
if on_or_before.empty:
    # Otherwise an empty frame would flow through and "upload" zero rows
    raise ValueError(f"No rows dated on or before {load_date}.")

# Sort first so the result does not depend on the row order of the csv; the
# stable sort keeps file order among rows sharing a date. Copy so later cells
# can modify the row without touching `df`.
last_row = on_or_before.sort_values('date', kind='mergesort').tail(1).copy()

In [14]:
# Split the comma-separated tickers and give each ticker its own row.
# The split happens on a new frame (assign) rather than by writing back into
# `last_row`: that keeps the cell re-runnable (a second run would otherwise
# call .str.split on lists and produce NaN) and avoids SettingWithCopyWarning.
df_melted = (
    last_row
    .assign(tickers=last_row['tickers'].str.split(","))
    .explode('tickers')
    .reset_index(drop=True)
)

In [15]:
# Output column names, in the order they are kept
column_names = {"date": "Actual_Date", "tickers": "Tickers"}
column_order = ["Actual_Date", "Load_Date", "Tickers"]

# Stamp every row with the load date, rename the source columns, then reorder
df_melted = (
    df_melted
    .assign(Load_Date=pd.to_datetime(load_date))
    .rename(columns=column_names)
    .loc[:, column_order]
)

In [16]:
# Display the reshaped frame for a visual check
df_melted

Unnamed: 0,Actual_Date,Load_Date,Tickers
0,2018-04-24,2018-05-01,A
1,2018-04-24,2018-05-01,AAL
2,2018-04-24,2018-05-01,AAP
3,2018-04-24,2018-05-01,AAPL
4,2018-04-24,2018-05-01,ABBV
...,...,...,...
501,2018-04-24,2018-05-01,XYL
502,2018-04-24,2018-05-01,YUM
503,2018-04-24,2018-05-01,ZBH
504,2018-04-24,2018-05-01,ZION


In [19]:
# Upload the reshaped frame to the database table
load_query(
    df=df_melted,
    table_name=table_name,
    db_conn_str=db_conn_str,
)

Table columns align: continuing to data upload.
506 rows uploaded successfully to sp_constituents.
