# SPDR/Index ETF Data Gathering and PreProcessing

<hr style="border: 4px solid royalblue">

### Disclaimer: This notebook should not be considered any kind of financial advice. It exists only for the purposes of practicing modeling and making predictions

In [5]:
from statsmodels.regression.rolling import RollingOLS
import yfinance as yf
import matplotlib.pyplot as plt
import pandas_datareader.data as web
import pandas as pd
import numpy as np
import datetime as dt
import warnings
import pandas_ta
import statsmodels.api as sm
warnings.filterwarnings('ignore')
pd.set_option("display.max_columns", 25)
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Download Data
For this guide, we download data for the Dow Jones, S&P 500, Nasdaq, and Russell 2000. Not every index has values for the whole date range, so a ticker only appears in the data from the first date for which yfinance has values for it. The process is the same for other stocks or ETFs: simply update the tickers in the cell below.

In [6]:
# Pull the price history of the four index tickers from yfinance
indeces = [
    "^RUT",
    "^IXIC",
    "^GSPC",
    "^DJI",
]
download_window = dict(start='1964-01-02', end='2023-11-03')
index_df = yf.download(indeces, **download_window)

[*********************100%%**********************]  4 of 4 completed


In [31]:
# The download comes back with multi-index columns; .stack() moves the ticker level
# of the columns into the row index so each row is one (Date, ticker) observation.
# The guard makes the cell safe to re-run: once the columns are single-level,
# stacking again would collapse the frame into a Series.
if isinstance(index_df.columns, pd.MultiIndex):
    index_df = index_df.stack()
index_df.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Adj Close,Close,High,Low,Open,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1964-01-02,^GSPC,75.43,75.43,75.790001,74.82,0.0,4680000.0
1964-01-03,^GSPC,75.5,75.5,76.040001,75.089996,0.0,5550000.0
1964-01-06,^GSPC,75.669998,75.669998,76.120003,75.18,0.0,5480000.0


In [32]:
# Move the index levels into ordinary columns so the dates can be accessed,
# make sure "Date" holds real datetimes, then index the rows by "Date" alone
index_df = index_df.reset_index()
index_df = index_df.assign(Date=pd.to_datetime(index_df["Date"])).set_index("Date")

In [35]:
# The ticker column is called "level_1" after the index reset; give it a descriptive name
index_df = index_df.rename(columns={"level_1": "Ticker"})
index_df.head(2)

Unnamed: 0_level_0,Ticker,Adj Close,Close,High,Low,Open,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1964-01-02,^GSPC,75.43,75.43,75.790001,74.82,0.0,4680000.0
1964-01-03,^GSPC,75.5,75.5,76.040001,75.089996,0.0,5550000.0


# Data outside of yfinance

In [13]:
# Both FRED requests share the same end date
fred_end = "2024-01-01"

# Historical GDP data from Federal Reserve Economic Data (F.R.E.D.)
gdp_data = web.get_data_fred("GDP", start="1964-01-01", end=fred_end)

# Historical interest rates from the Federal Reserve (FRED series "DTB3")
interest_rate_data = web.get_data_fred("DTB3", start="1964-01-02", end=fred_end)

# Fama-French factors: csv file downloaded manually from
# https://mba.tuck.dartmouth.edu/pages/faculty/ken.french/data_library.html
# NOTE(review): this is an absolute, machine-specific path; anyone else running the
# notebook has to point it at their own copy of the file.
ff_csv_path = '/Users/samalainabayeva/Desktop/Capstone Project/Dataset/F-F_Research_Data_5_Factors_2x3_daily.CSV'
ff = pd.read_csv(ff_csv_path, skiprows=2)

### Fama-French Preprocessing:

In [14]:
ff.head(2)

Unnamed: 0.1,Unnamed: 0,Mkt-RF,SMB,HML,RMW,CMA,RF
0,19630701,-0.67,0.02,-0.35,0.03,0.13,0.012
1,19630702,0.79,-0.28,0.28,-0.08,-0.21,0.012


In [26]:
# The first column holds the observation date as a yyyymmdd number (e.g. 19630701).
# Parse it with an explicit format instead of slicing the string by hand, then build
# the cleaned frame in one chain so no half-processed state is left behind:
#   * "Date" becomes the datetime index, matching the other data sets
#   * the raw date column and "RF" are removed as undesired features
#   * percentages are converted into decimal values (might be unnecessary with future scaling)
ff = (
    ff.assign(Date=pd.to_datetime(ff["Unnamed: 0"].astype(str), format="%Y%m%d"))
    .drop(columns=["Unnamed: 0", "RF"])
    .set_index("Date")
    .div(100)
)

### GDP Preprocessing: a little tricky because of missing dates
GDP is only reported once per quarter, so most calendar dates have no GDP value. To line it up with the daily price data, I created a complete daily date range and merged the known GDP values onto it.

In [19]:
gdp_data.head(2)

Unnamed: 0_level_0,GDP
DATE,Unnamed: 1_level_1
1964-01-01,669.822
1964-04-01,678.674


In [20]:
# Daily calendar that starts on the date of the first GDP observation
first_gdp_date = gdp_data.index.min()
start_date = pd.Timestamp(year=first_gdp_date.year,
                          month=first_gdp_date.month,
                          day=first_gdp_date.day)

# ... and stops at a known reasonable end date
end_date = pd.Timestamp(year=2023, month=7, day=1)

# One entry per calendar day, still without any GDP values
date_range = pd.date_range(start=start_date, end=end_date, freq='D')

# Left-merge the GDP observations onto the calendar: every day is kept, and days
# without a GDP observation get NaN in the "GDP" column
calendar_frame = date_range.to_frame()
merged_data = calendar_frame.merge(gdp_data, how='left', left_on=date_range, right_index=True)

In [22]:
# Now the dates have been expanded to match our stock data dates
merged_data.head(3)

Unnamed: 0,0,GDP
1964-01-01,1964-01-01,669.822
1964-01-02,1964-01-02,
1964-01-03,1964-01-03,
1964-01-04,1964-01-04,
1964-01-05,1964-01-05,


In [23]:
# Carry each GDP observation forward until the next observation appears, so every
# calendar day holds the most recent value at or before that day. Only the filled
# column is kept; the helper date column (0) and the sparse "GDP" column are dropped.
# NOTE(review): each quarterly value is stamped with the first day of its quarter
# (1964-01-01, 1964-04-01, ...). If the figure is only published after the quarter
# has ended, forward filling from that stamp still exposes the rows to information
# that was not yet known -- confirm whether a publication lag should be applied.
merged_data = (
    merged_data
    .assign(GDP_Filled=merged_data["GDP"].ffill())
    .drop(columns=[0, "GDP"])
)
merged_data.head(3)

Unnamed: 0,GDP_Filled
1964-01-01,669.822
1964-01-02,669.822
1964-01-03,669.822


In [27]:
ff.head(3)

Unnamed: 0_level_0,Mkt-RF,SMB,HML,RMW,CMA
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1963-07-01,-0.0067,0.0002,-0.0035,0.0003,0.0013
1963-07-02,0.0079,-0.0028,0.0028,-0.0008,-0.0021
1963-07-03,0.0063,-0.0018,-0.001,0.0013,-0.0025


### Interest rates

In [114]:
# Give the FRED series column ("DTB3") a descriptive name
interest_rate_data = interest_rate_data.rename(columns={"DTB3": "Interest_Rates"})
interest_rate_data.head(3)

Unnamed: 0_level_0,Interest_Rates
DATE,Unnamed: 1_level_1
1964-01-02,3.53
1964-01-03,3.53
1964-01-06,3.53


In [115]:
index_df.head(3)

Unnamed: 0_level_0,Ticker,Adj Close,Close,High,Low,Open,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1964-01-02,^GSPC,75.43,75.43,75.790001,74.82,0.0,4680000.0
1964-01-03,^GSPC,75.5,75.5,76.040001,75.089996,0.0,5550000.0
1964-01-06,^GSPC,75.669998,75.669998,76.120003,75.18,0.0,5480000.0


##### All of our data sets are indexed by date, but they do not all reach back to the same start date. It is therefore important to join them correctly onto the data with the most restrictive date range.

In [116]:
# inspecting dates and shape
index_df.index.min(), index_df.index.max(), index_df.shape

(Timestamp('1964-01-02 00:00:00'),
 Timestamp('2023-11-02 00:00:00'),
 (45496, 7))

In [117]:
# Left-join each auxiliary table onto the index prices, one after another,
# matching rows on the shared date index
df = index_df
for extra_table in (merged_data, interest_rate_data, ff):
    df = df.join(extra_table)

# first date, last date and shape of the combined frame
df.index.min(), df.index.max(), df.shape

(Timestamp('1964-01-02 00:00:00'),
 Timestamp('2023-11-02 00:00:00'),
 (45496, 14))

In [120]:
# Inspecting the data
df.head()

Unnamed: 0,index,Ticker,Adj Close,Close,High,Low,Open,Volume,GDP_Filled,Interest_Rates,Mkt-RF,SMB,HML,RMW,CMA
0,1964-01-02,^GSPC,75.43,75.43,75.790001,74.82,0.0,4680000.0,669.822,3.53,0.006,0.0065,0.0057,-0.004,0.0017
1,1964-01-03,^GSPC,75.5,75.5,76.040001,75.089996,0.0,5550000.0,669.822,3.53,0.0017,0.0022,0.0036,-0.0031,0.0022
2,1964-01-06,^GSPC,75.669998,75.669998,76.120003,75.18,0.0,5480000.0,669.822,3.53,0.0023,0.0008,0.0002,-0.0023,0.0035
3,1964-01-07,^GSPC,75.690002,75.690002,76.239998,75.25,0.0,5700000.0,669.822,3.53,0.0004,0.0019,0.0073,-0.0041,0.008
4,1964-01-08,^GSPC,76.0,76.0,76.349998,75.389999,0.0,5380000.0,669.822,3.54,0.0034,0.0006,-0.0014,0.0024,0.0005


# Technical Indicator Additions

In [123]:
# Re-structure the joined frame so that rows are keyed by (Date, Ticker).
# Depending on the state of df, reset_index() produces different column names:
#   * a named date index comes out as "Date"
#   * an unnamed date index comes out as "index"
#   * if df had already been reset once, a leftover "level_0" counter appears too
# Dropping "level_0" only when it exists and renaming "index" only when present
# lets the cell run on a fresh kernel (where "level_0" never exists) as well as
# on a re-run.
df = df.reset_index()
df = df.drop(columns="level_0", errors="ignore")
df = df.rename(columns={'index': "Date"})
df = df.set_index(["Date", 'Ticker'])

In [124]:
df.head(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,Adj Close,Close,High,Low,Open,Volume,GDP_Filled,Interest_Rates,Mkt-RF,SMB,HML,RMW,CMA
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1964-01-02,^GSPC,75.43,75.43,75.790001,74.82,0.0,4680000.0,669.822,3.53,0.006,0.0065,0.0057,-0.004,0.0017
1964-01-03,^GSPC,75.5,75.5,76.040001,75.089996,0.0,5550000.0,669.822,3.53,0.0017,0.0022,0.0036,-0.0031,0.0022


### Garman-Klass Volatility

In [125]:
# Garman-Klass volatility measure:
#   (ln(High) - ln(Low))^2 / 2  -  (2*ln(2) - 1) * (ln(Adj Close) - ln(Open))^2
high_low_spread = np.log(df['High']) - np.log(df["Low"])
close_open_spread = np.log(df["Adj Close"]) - np.log(df['Open'])
close_open_weight = 2*np.log(2) - 1

df["garman_klass_vol"] = (high_low_spread ** 2)/2 - close_open_weight * close_open_spread**2

In [126]:
# Early-dated rows have Open == 0, so np.log(df['Open']) is -inf and the volatility
# measure becomes infinite. Replace infinities of either sign with NaN and leave the
# value missing for imputation later on.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on df["garman_klass_vol"]: the in-place form acts on a
# selected column, which is not guaranteed to write through to df.
df["garman_klass_vol"] = df["garman_klass_vol"].replace([np.inf, -np.inf], np.nan)
df.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Adj Close,Close,High,Low,Open,Volume,GDP_Filled,Interest_Rates,Mkt-RF,SMB,HML,RMW,CMA,garman_klass_vol
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1964-01-02,^GSPC,75.43,75.43,75.790001,74.82,0.0,4680000.0,669.822,3.53,0.006,0.0065,0.0057,-0.004,0.0017,
1964-01-03,^GSPC,75.5,75.5,76.040001,75.089996,0.0,5550000.0,669.822,3.53,0.0017,0.0022,0.0036,-0.0031,0.0022,
1964-01-06,^GSPC,75.669998,75.669998,76.120003,75.18,0.0,5480000.0,669.822,3.53,0.0023,0.0008,0.0002,-0.0023,0.0035,


### pandas_ta methods -> technical analysis metrics

In [127]:
# Relative strength indicator per ticker -> the first 20 sessions of each ticker stay empty

def _rsi_20_sessions(adj_close):
    """Return the 20-session RSI of one ticker's adjusted close series."""
    return pandas_ta.rsi(close=adj_close, length=20)


df["RSI"] = df.groupby(level=1)['Adj Close'].transform(_rsi_20_sessions)

In [128]:
df.head(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,Adj Close,Close,High,Low,Open,Volume,GDP_Filled,Interest_Rates,Mkt-RF,SMB,HML,RMW,CMA,garman_klass_vol,RSI
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1964-01-02,^GSPC,75.43,75.43,75.790001,74.82,0.0,4680000.0,669.822,3.53,0.006,0.0065,0.0057,-0.004,0.0017,,
1964-01-03,^GSPC,75.5,75.5,76.040001,75.089996,0.0,5550000.0,669.822,3.53,0.0017,0.0022,0.0036,-0.0031,0.0022,,


In [129]:
# These also require the length of 20 sessions to show up

def _bollinger_column(close, position, std):
    """Return one column of the 20-session Bollinger band table for a single ticker.

    close    -- adjusted close series of one ticker
    position -- column position in the table returned by pandas_ta.bbands
                (0 is used for the lower band, 1 for the moving average, 2 for the upper band)
    std      -- number of standard deviations the band lies from the moving average

    pandas_ta.bbands is called once per request. When it returns None, a series of
    NaN with the same index as `close` is returned so the result always lines up
    with the input rows.
    """
    bands = pandas_ta.bbands(close=close, length=20, std=std)
    if bands is None:
        return pd.Series(np.nan, index=close.index)
    return bands.iloc[:, position]


# feature name -> (column position in the bbands table, number of standard deviations)
bollinger_features = {
    'lowest_bollinger_band': (0, 2),   # 2 STD DEVs below 20-day SMA
    'lower_bollinger_band': (0, 1),    # 1 STD DEV below 20-day SMA
    '20_day_SMA': (1, 2),              # 20-day simple moving average
    'one_up_bollinger_band': (2, 1),   # 1 STD DEV above SMA
    'upper_bollinger_band': (2, 2),    # 2 STD DEVs above SMA
}

for feature_name, (position, std) in bollinger_features.items():
    # default arguments pin the loop values to this particular lambda
    df[feature_name] = df.groupby(level=1)["Adj Close"].transform(
        lambda x, position=position, std=std: _bollinger_column(x, position, std)
    )

In [130]:
df.head(1)

Unnamed: 0_level_0,Unnamed: 1_level_0,Adj Close,Close,High,Low,Open,Volume,GDP_Filled,Interest_Rates,Mkt-RF,SMB,HML,RMW,CMA,garman_klass_vol,RSI,lowest_bollinger_band,lower_bollinger_band,20_day_SMA,one_up_bollinger_band,upper_bollinger_band
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1964-01-02,^GSPC,75.43,75.43,75.790001,74.82,0.0,4680000.0,669.822,3.53,0.006,0.0065,0.0057,-0.004,0.0017,,,,,,,


In [131]:
# The ATR needs several columns at once (High, Low, Close). .transform() only hands a
# single column to the function, so this helper is meant for groupby().apply() instead.

def compute_atr(stock_data):
    """Return pandas_ta's 14-period ATR for the rows in `stock_data`.

    `stock_data` must provide the columns "High", "Low" and "Close".
    """
    price_inputs = {
        "high": stock_data["High"],
        "low": stock_data["Low"],
        "close": stock_data["Close"],
    }
    return pandas_ta.atr(length=14, **price_inputs)

In [132]:
# group_keys=False keeps the original (Date, Ticker) index on the result; without it
# the group key would be added again as an extra index level (a triple index)
atr_by_ticker = df.groupby(level=1, group_keys=False).apply(compute_atr)
df["ATR"] = atr_by_ticker

In [134]:
df.head(1)

Unnamed: 0_level_0,Unnamed: 1_level_0,Adj Close,Close,High,Low,Open,Volume,GDP_Filled,Interest_Rates,Mkt-RF,SMB,HML,RMW,CMA,garman_klass_vol,RSI,lowest_bollinger_band,lower_bollinger_band,20_day_SMA,one_up_bollinger_band,upper_bollinger_band,ATR
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1964-01-02,^GSPC,75.43,75.43,75.790001,74.82,0.0,4680000.0,669.822,3.53,0.006,0.0065,0.0057,-0.004,0.0017,,,,,,,,


In [135]:
# Custom function for adding a new feature, utilizing pandas_ta

def compute_MACD(close, fast=12, slow=26, signal=9):
    """Return the MACD line of a price series, aligned to the index of `close`.

    close  -- price series of a single ticker
    fast   -- span of the fast moving average
    slow   -- span of the slow moving average
    signal -- span of the signal line

    The spans are passed to pandas_ta.macd explicitly. The earlier `length=20`
    argument is not one of macd's parameters, so it had no effect on the result;
    the defaults here are meant to reproduce the previous output.
    NOTE(review): confirm that 12/26/9 are the defaults of the installed pandas_ta.

    When pandas_ta returns nothing (or an empty table), a series of NaN with the
    same index as `close` is returned so the groupby().apply() result always lines
    up with the input rows.
    """
    macd_df = pandas_ta.macd(close=close, fast=fast, slow=slow, signal=signal)

    if macd_df is None or macd_df.empty:
        return pd.Series(np.nan, index=close.index)

    # the first column of the returned table is used as the MACD value
    return macd_df.iloc[:, 0]

In [136]:
# Moving Average Convergence Divergence, computed separately for every ticker
macd_by_ticker = df.groupby(level=1, group_keys=False)["Adj Close"].apply(compute_MACD)
df["MACD"] = macd_by_ticker

In [137]:
df.head(1)

Unnamed: 0_level_0,Unnamed: 1_level_0,Adj Close,Close,High,Low,Open,Volume,GDP_Filled,Interest_Rates,Mkt-RF,SMB,HML,RMW,CMA,garman_klass_vol,RSI,lowest_bollinger_band,lower_bollinger_band,20_day_SMA,one_up_bollinger_band,upper_bollinger_band,ATR,MACD
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
1964-01-02,^GSPC,75.43,75.43,75.790001,74.82,0.0,4680000.0,669.822,3.53,0.006,0.0065,0.0057,-0.004,0.0017,,,,,,,,,


In [138]:
# Dollar volume: amount of dollars worth of stock/etf/index traded in a day,
# expressed in millions and rounded to four decimals
traded_dollars = df['Adj Close'] * df["Volume"]
df["dollar_volume(M)"] = (traded_dollars / 1000000).round(4)

In [139]:
# Inspection
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Adj Close,Close,High,Low,Open,Volume,GDP_Filled,Interest_Rates,Mkt-RF,SMB,HML,RMW,CMA,garman_klass_vol,RSI,lowest_bollinger_band,lower_bollinger_band,20_day_SMA,one_up_bollinger_band,upper_bollinger_band,ATR,MACD,dollar_volume(M)
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
1964-01-02,^GSPC,75.43,75.43,75.790001,74.82,0.0,4680000.0,669.822,3.53,0.006,0.0065,0.0057,-0.004,0.0017,,,,,,,,,,353.0124
1964-01-03,^GSPC,75.5,75.5,76.040001,75.089996,0.0,5550000.0,669.822,3.53,0.0017,0.0022,0.0036,-0.0031,0.0022,,,,,,,,,,419.025
1964-01-06,^GSPC,75.669998,75.669998,76.120003,75.18,0.0,5480000.0,669.822,3.53,0.0023,0.0008,0.0002,-0.0023,0.0035,,,,,,,,,,414.6716
1964-01-07,^GSPC,75.690002,75.690002,76.239998,75.25,0.0,5700000.0,669.822,3.53,0.0004,0.0019,0.0073,-0.0041,0.008,,,,,,,,,,431.433
1964-01-08,^GSPC,76.0,76.0,76.349998,75.389999,0.0,5380000.0,669.822,3.54,0.0034,0.0006,-0.0014,0.0024,0.0005,,,,,,,,,,408.88


# Functions for target dates -> generate future dates and retrieve values

In [140]:
# These functions look up target dates. By default they read the known dates from the
# global `df` (which must have a "Date" column); pass `dates=` to use another source.

def one_month_later(date, months=1, dates=None):
    """Return the first known date on or after `date` + `months` calendar months.

    date   -- starting date (anything pd.Timestamp accepts)
    months -- number of calendar months to move forward
    dates  -- optional collection of known dates to search; when omitted, the
              unique values of df["Date"] are used

    Returns a pd.Timestamp, or np.nan when `date` is missing or when the shifted
    date lies beyond the last known date.

    The known dates are searched with a binary search instead of stepping forward
    one day at a time and scanning the whole date array at every step. A missing
    `date` returns immediately; the day-by-day loop would never finish for it.
    """
    if pd.isna(date):
        return np.nan

    if dates is None:
        dates = df["Date"].unique()

    known_dates = pd.DatetimeIndex(dates).dropna()
    if not known_dates.is_monotonic_increasing:
        known_dates = known_dates.sort_values()

    # shift by whole calendar months; this may land on a date without data
    target = pd.Timestamp(date) + pd.DateOffset(months=months)

    # position of the first known date that is >= target
    position = known_dates.searchsorted(target, side="left")
    if position >= len(known_dates):
        return np.nan

    return known_dates[position]


def three_months_later(date, months=3, dates=None):
    """Same as one_month_later, with a default shift of three months."""
    return one_month_later(date, months=months, dates=dates)


def six_months_later(date, months=6, dates=None):
    """Same as one_month_later, with a default shift of six months."""
    return one_month_later(date, months=months, dates=dates)


def twelve_months_later(date, months=12, dates=None):
    """Same as one_month_later, with a default shift of twelve months."""
    return one_month_later(date, months=months, dates=dates)



In [146]:
# Gaining access to Date-Times: move the index levels into ordinary columns.
# Only reset while "Date" is still part of the index, so that re-running this cell
# does not add a stray "index" column to df.
if "Date" not in df.columns:
    df = df.reset_index()

In [143]:
# Calendar features: quarter, month, year of the presidential cycle and day of week.
# All are stored as strings so they can be categorically encoded later.
date_parts = df["Date"].dt
weekday_names = {'0':'Monday','1':'Tuesday','2':'Wednesday','3':'Thursday','4':'Friday'}

df["Quarter"] = date_parts.quarter.astype(str)
df["Month"] = date_parts.month.astype(str)
# year % 4 yields 0..3; the 0 is relabelled as 4 so the cycle runs from 1 to 4
df["cycle_year"] = (date_parts.year % 4).replace(0, 4).astype(str)
# weekday numbers 0-4 become names; any other weekday number becomes NaN
df["day_of_week"] = date_parts.day_of_week.astype(str).map(weekday_names)

In [149]:
df.head(2)

Unnamed: 0,Date,Ticker,Adj Close,Close,High,Low,Open,Volume,GDP_Filled,Interest_Rates,Mkt-RF,SMB,...,upper_bollinger_band,ATR,MACD,dollar_volume(M),Quarter,Month,cycle_year,day_of_week,one_months_later,three_months_later,six_months_later,twelve_months_later
0,1964-01-02,^GSPC,75.43,75.43,75.790001,74.82,0.0,4680000.0,669.822,3.53,0.006,0.0065,...,,,,353.0124,1,1,4,Thursday,1964-02-03,1964-04-02,1964-07-02,1965-01-04
1,1964-01-03,^GSPC,75.5,75.5,76.040001,75.089996,0.0,5550000.0,669.822,3.53,0.0017,0.0022,...,,,,419.025,1,1,4,Friday,1964-02-03,1964-04-03,1964-07-06,1965-01-04


In [148]:
# Target dates 1, 3, 6 and 12 months after every row's date.
# For each row the date is shifted by whole calendar months and then moved forward to
# the first date that actually occurs in df["Date"]; rows whose shifted date lies
# beyond the last known date get NaT.
# All rows are resolved at once with a binary search over the sorted unique dates,
# which replaces four row-by-row .apply() passes.
known_dates = pd.DatetimeIndex(df["Date"].unique()).sort_values()
row_dates = pd.DatetimeIndex(df["Date"])

for target_column, months_ahead in (
    ("one_months_later", 1),
    ("three_months_later", 3),
    ("six_months_later", 6),
    ("twelve_months_later", 12),
):
    shifted = row_dates + pd.DateOffset(months=months_ahead)
    # position of the first known date >= shifted date (== len(known_dates) if none)
    positions = known_dates.searchsorted(shifted, side="left")
    has_match = positions < len(known_dates)
    # clip so the lookup below is always valid; rows without a match are blanked right after
    matched = known_dates[np.minimum(positions, len(known_dates) - 1)]
    df[target_column] = pd.Series(matched.to_numpy(), index=df.index).where(has_match)

In [150]:
# These functions add changes in percentage at the target dates, requires date to be index

def one_month_price_change(df, num="One"):
    """Add the fractional price change between each row's date and its target date.

    df  -- frame indexed by date with the columns "Ticker", "Adj Close" and
           f"{num.lower()}_months_later" (the target date of every row)
    num -- horizon label ("One", "Three", "Six", "Twelve"); it selects the target
           date column and names the result column f"{num}_Month_Change"

    For every ticker the "Adj Close" on the target date is looked up among that
    ticker's own rows and (future - current) / current is stored. Rows whose target
    date is missing, or for which the ticker has no row on the target date, get NaN.

    The lookup is done for all rows of a ticker at once with reindex, instead of
    iterating row by row and writing into a filtered copy of the frame.
    Assumes each ticker has at most one row per date.

    `df` is modified in place and also returned.
    """
    target_column = f"{num.lower()}_months_later"
    change_column = f"{num}_Month_Change"

    for ticker in df["Ticker"].unique():
        ticker_rows = (df["Ticker"] == ticker).to_numpy()
        prices = df.loc[ticker_rows, "Adj Close"]
        target_dates = pd.DatetimeIndex(df.loc[ticker_rows, target_column])

        # price on the target date; NaN where the date is NaT or absent for this ticker
        future_prices = prices.reindex(target_dates).to_numpy()
        current_prices = prices.to_numpy()

        # written back by position, so the dates repeated across tickers cannot misalign
        df.loc[ticker_rows, change_column] = (future_prices - current_prices) / current_prices

    return df


def three_month_price_change(df, num="Three"):
    """Same as one_month_price_change, for the three-month horizon."""
    return one_month_price_change(df, num)


def six_month_price_change(df, num="Six"):
    """Same as one_month_price_change, for the six-month horizon."""
    return one_month_price_change(df, num)


def twelve_month_price_change(df, num="Twelve"):
    """Same as one_month_price_change, for the twelve-month horizon."""
    return one_month_price_change(df, num)

In [151]:
# the next set of functions looks prices up by date, so "Date" goes back into the index
df = df.set_index("Date")

In [152]:
# adding changes in percentage for inferential purposes - takes a couple of minutes to run
# (each function adds its column to df in place)
for add_change_column in (
    one_month_price_change,
    three_month_price_change,
    six_month_price_change,
    twelve_month_price_change,
):
    add_change_column(df)

df

Unnamed: 0_level_0,Ticker,Adj Close,Close,High,Low,Open,Volume,GDP_Filled,Interest_Rates,Mkt-RF,SMB,HML,...,Quarter,Month,cycle_year,day_of_week,one_months_later,three_months_later,six_months_later,twelve_months_later,One_Month_Change,Three_Month_Change,Six_Month_Change,Twelve_Month_Change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
1964-01-02,^GSPC,75.430000,75.430000,75.790001,74.820000,0.000000,4.680000e+06,669.822,3.53,0.0060,0.0065,0.0057,...,1,1,4,Thursday,1964-02-03,1964-04-02,1964-07-02,1965-01-04,0.020416,0.056609,0.095055,0.116664
1964-01-03,^GSPC,75.500000,75.500000,76.040001,75.089996,0.000000,5.550000e+06,669.822,3.53,0.0017,0.0022,0.0036,...,1,1,4,Friday,1964-02-03,1964-04-03,1964-07-06,1965-01-04,0.019470,0.058808,0.099073,0.115629
1964-01-06,^GSPC,75.669998,75.669998,76.120003,75.180000,0.000000,5.480000e+06,669.822,3.53,0.0023,0.0008,0.0002,...,1,1,4,Monday,1964-02-06,1964-04-06,1964-07-06,1965-01-06,0.016651,0.057486,0.096604,0.121845
1964-01-07,^GSPC,75.690002,75.690002,76.239998,75.250000,0.000000,5.700000e+06,669.822,3.53,0.0004,0.0019,0.0073,...,1,1,4,Tuesday,1964-02-07,1964-04-07,1964-07-07,1965-01-07,0.019686,0.053508,0.098164,0.126437
1964-01-08,^GSPC,76.000000,76.000000,76.349998,75.389999,0.000000,5.380000e+06,669.822,3.54,0.0034,0.0006,-0.0014,...,1,1,4,Wednesday,1964-02-10,1964-04-08,1964-07-08,1965-01-08,0.013816,0.049342,0.093684,0.123290
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-11-01,^RUT,1669.699951,1669.699951,1669.790039,1649.060059,1661.410034,4.224900e+09,,5.31,,,,...,4,11,3,Wednesday,NaT,NaT,NaT,NaT,,,,
2023-11-02,^DJI,33839.078125,33839.078125,33852.960938,33450.031250,33457.820312,3.043500e+08,,5.28,,,,...,4,11,3,Thursday,NaT,NaT,NaT,NaT,,,,
2023-11-02,^GSPC,4317.779785,4317.779785,4319.720215,4268.259766,4268.259766,4.669780e+09,,5.28,,,,...,4,11,3,Thursday,NaT,NaT,NaT,NaT,,,,
2023-11-02,^IXIC,13294.190430,13294.190430,13302.179688,13177.639648,13230.490234,4.962950e+09,,5.28,,,,...,4,11,3,Thursday,NaT,NaT,NaT,NaT,,,,


In [153]:
# This function creates binary "No-Gain:0, Gain:1" columns

def one_month_gainer(df, num="One"):
    """Add a 0/1 flag telling whether the price is higher on the target date.

    df  -- frame indexed by date with the columns "Ticker", "Adj Close" and
           f"{num.lower()}_months_later" (the target date of every row)
    num -- horizon label ("One", "Three", "Six", "Twelve"); it selects the target
           date column and names the result column f"{num}_Month_Positive"

    For every ticker the "Adj Close" on the target date is looked up among that
    ticker's own rows. The flag is 1.0 when the current price is strictly below the
    future price, 0.0 otherwise, and NaN when the target date is missing or the
    ticker has no row on that date.

    The lookup is done for all rows of a ticker at once with reindex, instead of
    iterating row by row and writing into a filtered copy of the frame.
    Assumes each ticker has at most one row per date.

    `df` is modified in place and also returned.
    """
    target_column = f"{num.lower()}_months_later"
    flag_column = f"{num}_Month_Positive"

    for ticker in df["Ticker"].unique():
        ticker_rows = (df["Ticker"] == ticker).to_numpy()
        prices = df.loc[ticker_rows, "Adj Close"]
        target_dates = pd.DatetimeIndex(df.loc[ticker_rows, target_column])

        future_prices = prices.reindex(target_dates).to_numpy()
        # True only where this ticker really has a row on the target date
        has_future_row = target_dates.isin(prices.index)

        gained = (prices.to_numpy() < future_prices).astype(float)
        # written back by position, so the dates repeated across tickers cannot misalign
        df.loc[ticker_rows, flag_column] = np.where(has_future_row, gained, np.nan)

    return df


def three_month_gainer(df, num="Three"):
    """Same as one_month_gainer, for the three-month horizon."""
    return one_month_gainer(df, num)


def six_month_gainer(df, num="Six"):
    """Same as one_month_gainer, for the six-month horizon."""
    return one_month_gainer(df, num)


def twelve_month_gainer(df, num="Twelve"):
    """Same as one_month_gainer, for the twelve-month horizon."""
    return one_month_gainer(df, num)

In [154]:
# Adding the binary columns which will be our target
# (each function adds its column to df in place)
for add_target_column in (
    one_month_gainer,
    three_month_gainer,
    six_month_gainer,
    twelve_month_gainer,
):
    add_target_column(df)

df

Unnamed: 0_level_0,Ticker,Adj Close,Close,High,Low,Open,Volume,GDP_Filled,Interest_Rates,Mkt-RF,SMB,HML,...,one_months_later,three_months_later,six_months_later,twelve_months_later,One_Month_Change,Three_Month_Change,Six_Month_Change,Twelve_Month_Change,One_Month_Positive,Three_Month_Positive,Six_Month_Positive,Twelve_Month_Positive
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
1964-01-02,^GSPC,75.430000,75.430000,75.790001,74.820000,0.000000,4.680000e+06,669.822,3.53,0.0060,0.0065,0.0057,...,1964-02-03,1964-04-02,1964-07-02,1965-01-04,0.020416,0.056609,0.095055,0.116664,1.0,1.0,1.0,1.0
1964-01-03,^GSPC,75.500000,75.500000,76.040001,75.089996,0.000000,5.550000e+06,669.822,3.53,0.0017,0.0022,0.0036,...,1964-02-03,1964-04-03,1964-07-06,1965-01-04,0.019470,0.058808,0.099073,0.115629,1.0,1.0,1.0,1.0
1964-01-06,^GSPC,75.669998,75.669998,76.120003,75.180000,0.000000,5.480000e+06,669.822,3.53,0.0023,0.0008,0.0002,...,1964-02-06,1964-04-06,1964-07-06,1965-01-06,0.016651,0.057486,0.096604,0.121845,1.0,1.0,1.0,1.0
1964-01-07,^GSPC,75.690002,75.690002,76.239998,75.250000,0.000000,5.700000e+06,669.822,3.53,0.0004,0.0019,0.0073,...,1964-02-07,1964-04-07,1964-07-07,1965-01-07,0.019686,0.053508,0.098164,0.126437,1.0,1.0,1.0,1.0
1964-01-08,^GSPC,76.000000,76.000000,76.349998,75.389999,0.000000,5.380000e+06,669.822,3.54,0.0034,0.0006,-0.0014,...,1964-02-10,1964-04-08,1964-07-08,1965-01-08,0.013816,0.049342,0.093684,0.123290,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-11-01,^RUT,1669.699951,1669.699951,1669.790039,1649.060059,1661.410034,4.224900e+09,,5.31,,,,...,NaT,NaT,NaT,NaT,,,,,,,,
2023-11-02,^DJI,33839.078125,33839.078125,33852.960938,33450.031250,33457.820312,3.043500e+08,,5.28,,,,...,NaT,NaT,NaT,NaT,,,,,,,,
2023-11-02,^GSPC,4317.779785,4317.779785,4319.720215,4268.259766,4268.259766,4.669780e+09,,5.28,,,,...,NaT,NaT,NaT,NaT,,,,,,,,
2023-11-02,^IXIC,13294.190430,13294.190430,13302.179688,13177.639648,13230.490234,4.962950e+09,,5.28,,,,...,NaT,NaT,NaT,NaT,,,,,,,,


### Handling nulls
If you want to keep more of the data, you can handle these null values differently. My intention was to model over different time frames. The earliest rows contain nulls because the technical indicators need a certain number of prior sessions before they produce a value, and the latest rows contain nulls because the target needs a price a given number of months in the future to tell whether the price increased from that date. I therefore dropped every row that has a null in any column other than garman_klass_vol.

In [163]:
# Drop every row that has a null in any column except "garman_klass_vol"
required_columns = [column for column in df.columns if column != "garman_klass_vol"]
df = df.dropna(subset=required_columns)

In [164]:
df.isna().sum()

Ticker                      0
Adj Close                   0
Close                       0
High                        0
Low                         0
Open                        0
Volume                      0
GDP_Filled                  0
Interest_Rates              0
Mkt-RF                      0
SMB                         0
HML                         0
RMW                         0
CMA                         0
garman_klass_vol         4491
RSI                         0
lowest_bollinger_band       0
lower_bollinger_band        0
20_day_SMA                  0
one_up_bollinger_band       0
upper_bollinger_band        0
ATR                         0
MACD                        0
dollar_volume(M)            0
Quarter                     0
Month                       0
cycle_year                  0
day_of_week                 0
one_months_later            0
three_months_later          0
six_months_later            0
twelve_months_later         0
One_Month_Change            0
Three_Mont

In [165]:
# Inspection: how many rows fall into each class of the four target columns
tuple(
    df[f"{horizon}_Month_Positive"].value_counts()
    for horizon in ("One", "Three", "Six", "Twelve")
)

(One_Month_Positive
 1.0    26891
 0.0    17075
 Name: count, dtype: int64,
 Three_Month_Positive
 1.0    28489
 0.0    15477
 Name: count, dtype: int64,
 Six_Month_Positive
 1.0    30033
 0.0    13933
 Name: count, dtype: int64,
 Twelve_Month_Positive
 1.0    32561
 0.0    11405
 Name: count, dtype: int64)

In [166]:
# Info
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 43966 entries, 1964-02-06 to 2022-11-02
Data columns (total 40 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   Ticker                 43966 non-null  object        
 1   Adj Close              43966 non-null  float64       
 2   Close                  43966 non-null  float64       
 3   High                   43966 non-null  float64       
 4   Low                    43966 non-null  float64       
 5   Open                   43966 non-null  float64       
 6   Volume                 43966 non-null  float64       
 7   GDP_Filled             43966 non-null  float64       
 8   Interest_Rates         43966 non-null  float64       
 9   Mkt-RF                 43966 non-null  float64       
 10  SMB                    43966 non-null  float64       
 11  HML                    43966 non-null  float64       
 12  RMW                    43966 non-null  floa

# Final Data
Next Steps would be to add more technical indicators or add more commodity prices, etc.

In [167]:
df

Unnamed: 0_level_0,Ticker,Adj Close,Close,High,Low,Open,Volume,GDP_Filled,Interest_Rates,Mkt-RF,SMB,HML,...,one_months_later,three_months_later,six_months_later,twelve_months_later,One_Month_Change,Three_Month_Change,Six_Month_Change,Twelve_Month_Change,One_Month_Positive,Three_Month_Positive,Six_Month_Positive,Twelve_Month_Positive
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
1964-02-06,^GSPC,76.930000,76.930000,77.260002,76.470001,0.000000,4.110000e+06,669.822,3.50,0.0024,0.0006,-0.0023,...,1964-03-06,1964-05-06,1964-08-06,1965-02-08,0.017938,0.053685,0.057325,0.130248,1.0,1.0,1.0,1.0
1964-02-07,^GSPC,77.180000,77.180000,77.510002,76.660004,0.000000,4.710000e+06,669.822,3.52,0.0037,-0.0006,0.0014,...,1964-03-09,1964-05-07,1964-08-07,1965-02-08,0.014900,0.051438,0.060637,0.126587,1.0,1.0,1.0,1.0
1964-02-10,^GSPC,77.050003,77.050003,77.769997,76.830002,0.000000,4.150000e+06,669.822,3.53,-0.0013,0.0017,-0.0020,...,1964-03-10,1964-05-11,1964-08-10,1965-02-10,0.019987,0.049968,0.061389,0.122128,1.0,1.0,1.0,1.0
1964-02-11,^GSPC,77.330002,77.330002,77.650002,76.809998,0.000000,4.040000e+06,669.822,3.52,0.0033,-0.0003,-0.0014,...,1964-03-11,1964-05-11,1964-08-11,1965-02-11,0.020949,0.046166,0.057287,0.106168,1.0,1.0,1.0,1.0
1964-02-13,^GSPC,77.519997,77.519997,77.930000,77.099998,0.000000,4.820000e+06,669.822,3.52,0.0008,-0.0001,-0.0007,...,1964-03-13,1964-05-13,1964-08-13,1965-02-15,0.020898,0.044505,0.063081,0.110294,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-11-01,^RUT,1851.390015,1851.390015,1868.750000,1845.930054,1864.949951,4.481210e+09,26408.405,4.06,-0.0035,0.0044,0.0078,...,2022-12-01,2023-02-01,2023-05-01,2023-11-01,0.016361,0.059102,-0.044388,-0.098137,1.0,1.0,0.0,0.0
2022-11-02,^DJI,32147.759766,32147.759766,33071.929688,32139.769531,32576.279297,3.984300e+08,26408.405,4.04,-0.0267,-0.0087,0.0161,...,2022-12-02,2023-02-02,2023-05-02,2023-11-02,0.070988,0.059294,0.047803,0.052611,1.0,1.0,1.0,1.0
2022-11-02,^GSPC,3759.689941,3759.689941,3894.439941,3758.679932,3852.899902,4.899000e+09,26408.405,4.04,-0.0267,-0.0087,0.0161,...,2022-12-02,2023-02-02,2023-05-02,2023-11-02,0.082988,0.111730,0.095723,0.148440,1.0,1.0,1.0,1.0
2022-11-02,^IXIC,10524.799805,10524.799805,10993.240234,10522.900391,10885.009766,5.436420e+09,26408.405,4.04,-0.0267,-0.0087,0.0161,...,2022-12-02,2023-02-02,2023-05-02,2023-11-02,0.088999,0.159245,0.147814,0.263130,1.0,1.0,1.0,1.0


In [168]:
# Write the final frame to disk.
# NOTE(review): absolute, machine-specific path; adjust it before running elsewhere.
output_csv_path = "/Users/samalainabayeva/Desktop/Capstone Project/INDICES_FILLED.csv"
df.to_csv(output_csv_path)

# Other file names used for earlier exports (same folder):
#   INDECES_w_technical_indicators.csv
#   Indices_back_to_1964_with_indicators.csv