# SPDR/Index ETF PreProcessing

In [1]:
from statsmodels.regression.rolling import RollingOLS
import matplotlib.pyplot as plt
import pandas_datareader.data as web
import pandas as pd
import numpy as np
import datetime as dt
import warnings
import pandas_ta
import statsmodels.api as sm
warnings.filterwarnings('ignore')
pd.set_option("display.max_columns", 25)
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [48]:
# Creating different df's for different original data

df = pd.read_csv('/Users/samalainabayeva/Desktop/Capstone Project/SPDRs.csv')

# df = pd.read_csv('/Users/samalainabayeva/Desktop/Capstone Project/Major_Indeces.csv')

In [49]:
df.head()

Unnamed: 0,Date,Unnamed: 1,Adj Close,Close,High,Low,Open,Volume
0,1998-12-22,XLB,12.011767,20.828125,20.828125,20.75,20.78125,1900.0
1,1998-12-22,XLE,12.448157,23.265625,23.390625,23.1875,23.3125,15200.0
2,1998-12-22,XLF,11.612926,18.937855,19.052092,18.849005,19.052092,55887.0
3,1998-12-22,XLI,14.97139,23.28125,23.28125,23.203125,23.203125,600.0
4,1998-12-22,XLK,24.243126,32.046875,32.5,31.78125,32.40625,300500.0


In [50]:
# Not neccessary for BOTH of the imported csvs, only one of them

df.rename(columns={"Unnamed: 1": "Ticker"}, inplace=True)

In [51]:
df.set_index(["Date", 'Ticker'], inplace=True)
# df.set_index(["Date", 'Index'], inplace=True)

In [52]:
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Adj Close,Close,High,Low,Open,Volume
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1998-12-22,XLB,12.011767,20.828125,20.828125,20.75,20.78125,1900.0
1998-12-22,XLE,12.448157,23.265625,23.390625,23.1875,23.3125,15200.0
1998-12-22,XLF,11.612926,18.937855,19.052092,18.849005,19.052092,55887.0
1998-12-22,XLI,14.97139,23.28125,23.28125,23.203125,23.203125,600.0
1998-12-22,XLK,24.243126,32.046875,32.5,31.78125,32.40625,300500.0


## The following will be the same technical indicator additions as the Quant Notebook

In [53]:
# Volatility Measure

df["garman_klass_vol"] = ((np.log(df['High']) - np.log(df["Low"])) ** 2)/2 - (2*np.log(2)- 1) * (np.log(df["Adj Close"])- np.log(df['Open']))**2

In [54]:
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Adj Close,Close,High,Low,Open,Volume,garman_klass_vol
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1998-12-22,XLB,12.011767,20.828125,20.828125,20.75,20.78125,1900.0,-0.116068
1998-12-22,XLE,12.448157,23.265625,23.390625,23.1875,23.3125,15200.0,-0.152028
1998-12-22,XLF,11.612926,18.937855,19.052092,18.849005,19.052092,55887.0,-0.094617
1998-12-22,XLI,14.97139,23.28125,23.28125,23.203125,23.203125,600.0,-0.074152
1998-12-22,XLK,24.243126,32.046875,32.5,31.78125,32.40625,300500.0,-0.032286


In [55]:
# Adding a feature for relative strength indicator -> takes 20 sessions to update

df["RSI"] = df.groupby(level=1)['Adj Close'].transform(lambda x: pandas_ta.rsi(close=x, length=20))

In [56]:
df.head(1)

Unnamed: 0_level_0,Unnamed: 1_level_0,Adj Close,Close,High,Low,Open,Volume,garman_klass_vol,RSI
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1998-12-22,XLB,12.011767,20.828125,20.828125,20.75,20.78125,1900.0,-0.116068,


In [57]:
# These also require the length of 20 sessions to show up

# 2 STD DEVs below 20-day SMA
df['lowest_bollinger_band'] = df.groupby(level=1)["Adj Close"].transform(lambda x: pandas_ta.bbands(\
                            close=x, length=20).iloc[:,0] if pandas_ta.bbands(close=x, length=20) \
                            is not None else np.nan)

# 1 STD DEV below 20-day SMA
df['lower_bollinger_band'] = df.groupby(level=1)["Adj Close"].transform(lambda x: pandas_ta.bbands(\
                            close=x, length=20, std=1).iloc[:,0] if pandas_ta.bbands(close=x, length=20) \
                            is not None else np.nan)

# 20 - Day Simple moving average

df['20_day_SMA'] = df.groupby(level=1)["Adj Close"].transform(lambda x: pandas_ta.bbands(\
                            close=x, length=20).iloc[:,1] if pandas_ta.bbands(close=x, length=20) \
                            is not None else np.nan)

# 1 STD DEV above SMA

df['one_up_bollinger_band'] = df.groupby(level=1)["Adj Close"].transform(lambda x: pandas_ta.bbands(\
                        close=x, length=20, std=1).iloc[:,2] if pandas_ta.bbands(close=x, length=20) \
                        is not None else np.nan)

# 2 STD DEVs above SMA

df['upper_bollinger_band'] = df.groupby(level=1)["Adj Close"].transform(lambda x: pandas_ta.bbands(\
                            close=x, length=20).iloc[:,2] if pandas_ta.bbands(close=x, length=20) \
                            is not None else np.nan)

In [58]:
df.head(1)

Unnamed: 0_level_0,Unnamed: 1_level_0,Adj Close,Close,High,Low,Open,Volume,garman_klass_vol,RSI,lowest_bollinger_band,lower_bollinger_band,20_day_SMA,one_up_bollinger_band,upper_bollinger_band
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1998-12-22,XLB,12.011767,20.828125,20.828125,20.75,20.78125,1900.0,-0.116068,,,,,,


In [59]:
# Function for calculating the atr, because multiple columns needed for tranformation, .transform() will not work
# because .transform can only take 1 column as an input, we  will use a groupby().apply() function

def compute_atr(stock_data):
    atr = pandas_ta.atr(high=stock_data["High"],
                 low=stock_data["Low"],
                 close=stock_data["Close"],
                       length=14)
    return atr

In [60]:
# Need to add group_keys = False, otherwise, it will double the date index, giving us a triple index

df["ATR"] = df.groupby(level=1, group_keys=False).apply(compute_atr)

In [61]:
df.head(1)

Unnamed: 0_level_0,Unnamed: 1_level_0,Adj Close,Close,High,Low,Open,Volume,garman_klass_vol,RSI,lowest_bollinger_band,lower_bollinger_band,20_day_SMA,one_up_bollinger_band,upper_bollinger_band,ATR
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1998-12-22,XLB,12.011767,20.828125,20.828125,20.75,20.78125,1900.0,-0.116068,,,,,,,


In [62]:
# Custom function for adding a new feature, utilizing pandas_ta

def compute_MACD(close):
    macd_df = pandas_ta.macd(close=close, length=20)
    
    if macd_df is not None and not macd_df.empty:
        macd = macd_df.iloc[:, 0]
        return macd
    else:
        # This else clause was very tricky to figure out, required a series of nans in the proper length,
        # which was on occassion less than 20, therefore causing errors
        return pd.Series([np.nan] * len(close), index=close.index)

In [63]:
# Moving Average Convergence Divergence

df["MACD"] = df.groupby(level=1, group_keys=False)["Adj Close"].apply(compute_MACD)

In [64]:
df.head(1)

Unnamed: 0_level_0,Unnamed: 1_level_0,Adj Close,Close,High,Low,Open,Volume,garman_klass_vol,RSI,lowest_bollinger_band,lower_bollinger_band,20_day_SMA,one_up_bollinger_band,upper_bollinger_band,ATR,MACD
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1998-12-22,XLB,12.011767,20.828125,20.828125,20.75,20.78125,1900.0,-0.116068,,,,,,,,


In [65]:
# Dollar volume addition

df["dollar_volume(M)"] = ((df['Adj Close']*df["Volume"])/1000000).round(4)

In [66]:
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Adj Close,Close,High,Low,Open,Volume,garman_klass_vol,RSI,lowest_bollinger_band,lower_bollinger_band,20_day_SMA,one_up_bollinger_band,upper_bollinger_band,ATR,MACD,dollar_volume(M)
Date,Ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1998-12-22,XLB,12.011767,20.828125,20.828125,20.75,20.78125,1900.0,-0.116068,,,,,,,,,0.0228
1998-12-22,XLE,12.448157,23.265625,23.390625,23.1875,23.3125,15200.0,-0.152028,,,,,,,,,0.1892
1998-12-22,XLF,11.612926,18.937855,19.052092,18.849005,19.052092,55887.0,-0.094617,,,,,,,,,0.649
1998-12-22,XLI,14.97139,23.28125,23.28125,23.203125,23.203125,600.0,-0.074152,,,,,,,,,0.009
1998-12-22,XLK,24.243126,32.046875,32.5,31.78125,32.40625,300500.0,-0.032286,,,,,,,,,7.2851


In [67]:
# SPDRS

df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 59633 entries, ('1998-12-22', 'XLB') to ('2023-10-25', 'XLY')
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Adj Close              59633 non-null  float64
 1   Close                  59633 non-null  float64
 2   High                   59633 non-null  float64
 3   Low                    59633 non-null  float64
 4   Open                   59633 non-null  float64
 5   Volume                 59633 non-null  float64
 6   garman_klass_vol       59633 non-null  float64
 7   RSI                    59413 non-null  float64
 8   lowest_bollinger_band  59424 non-null  float64
 9   lower_bollinger_band   59424 non-null  float64
 10  20_day_SMA             59424 non-null  float64
 11  one_up_bollinger_band  59424 non-null  float64
 12  upper_bollinger_band   59424 non-null  float64
 13  ATR                    59479 non-null  float64
 14  MACD              

# DONT RUN NEXT CELL!!!

In [45]:
# INDEX LEVEL

df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 31068 entries, ('1992-12-22', '^DJI') to ('2023-10-25', '^RUT')
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Adj Close              31068 non-null  float64
 1   Close                  31068 non-null  float64
 2   High                   31068 non-null  float64
 3   Low                    31068 non-null  float64
 4   Open                   31068 non-null  float64
 5   Volume                 31068 non-null  int64  
 6   garman_klass_vol       31068 non-null  float64
 7   RSI                    30988 non-null  float64
 8   lowest_bollinger_band  30992 non-null  float64
 9   lower_bollinger_band   30992 non-null  float64
 10  20_day_SMA             30992 non-null  float64
 11  one_up_bollinger_band  30992 non-null  float64
 12  upper_bollinger_band   30992 non-null  float64
 13  ATR                    31012 non-null  float64
 14  MACD            

In [68]:
# Cell for creating different files for use

df.to_csv("/Users/samalainabayeva/Desktop/Capstone Project/SPDRs_w_technical_indicators.csv")
# df.to_csv("/Users/samalainabayeva/Desktop/Capstone Project/INDECES_w_technical_indicators.csv")