# 0. Data Collection for Financial Data
Collecting OHLC (open, high, low, close) and volume data using the Yahoo Finance API to curate a dataset for time series modeling, based on a short-term 5-day buy/sell classification approach (rolling window testing intended use).


# 1. Required Libraries

In [8]:
!pip install yfinance
!pip install pandas
!pip install pandas_ta
!pip install numpy==1.23.5



In [9]:
import yfinance as yf
import pandas as pd
import numpy as np

# 2. Raw Data Collection
Collect OHLC & volume daily data for a number of assets.

In [10]:
assets = ['AAPL', 'MSFT', 'NVDA', 'WMT', 'HD', 'JPM', 'AMZN', 'BA', 'SBUX', 'UNH']

In [11]:
ticker = yf.Ticker(assets[0])

In [12]:
aapl = yf.download(assets[0], start='2023-12-26', end='2025-01-07')  # add a bit of date padding for a full year

  aapl = yf.download(assets[0], start='2023-12-26', end='2025-01-07')  # add a bit of date padding for a full year
[*********************100%***********************]  1 of 1 completed


In [13]:
aapl

Price,Close,High,Low,Open,Volume
Ticker,AAPL,AAPL,AAPL,AAPL,AAPL
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2023-12-26,191.646545,192.480435,191.428144,192.202472,28919300
2023-12-27,191.745804,192.093265,189.700782,191.090614,48087700
2023-12-28,192.172684,193.244834,191.765661,192.728610,34049900
2023-12-29,191.130341,192.986741,190.336153,192.490376,42628800
2024-01-02,184.290421,187.070068,182.553143,185.789438,82488700
...,...,...,...,...,...
2024-12-30,251.593079,252.889953,250.146571,251.623005,35557500
2024-12-31,249.817368,252.670486,248.829744,251.832511,39480700
2025-01-02,243.263199,248.500565,241.238085,248.330961,55740700
2025-01-03,242.774368,243.592387,241.307905,242.774368,40244100


# 3. Trend, Momentum, and Volume Indicators
Derive technical indicators as new features (columns) for the dataset, keeping in mind which ones are relevant to a short-term 5-day prediction

In [14]:
import pandas_ta as ta  # use the pandas technical analysis extension for faster feature engineering

## 3.1 Trend Indicators

In [15]:
def encode_ti(df):  # takes in a df whose columns include 'Open', 'High', 'Low', 'Close', 'Volume'
  df['sma'] =  ta.sma(df['Close'].squeeze(), length=5)  # simple moving average for 5-day period
  df['ema'] =  ta.ema(df['Close'].squeeze(), length=5)  # exponential moving average for 5-day period, weights closer datapoints more highly
  df['wma'] = ta.wma(df['Close'].squeeze(), length=5)  # weighted moving average for 5-day period, same effect as above but linear
  return df

## 3.2 Momentum Indicators

In [16]:
def encode_mi(df):  # takes in a df whose columns include 'Open', 'High', 'Low', 'Close', 'Volume'
  df['rsi_5'] = ta.rsi(df['Close'].squeeze(), length=5)  # a short-term version of the relative strength index
  df['mtm_5'] = ta.mom(df['Close'].squeeze(), length=5)  # price difference from 5 days ago
  df['cci_5'] = ta.cci(df['High'].squeeze(), df['Low'].squeeze(), df['Close'].squeeze(), length=5)  # measure deviation from avg price
  df['wr_5']  = ta.willr(df['High'].squeeze(), df['Low'].squeeze(), df['Close'].squeeze(), length=5)  # track overbought/oversold conditions

## 3.3 Volatility Indicators

In [17]:
def encode_vi(df):  # takes in a df whose columns include 'Open', 'High', 'Low', 'Close', 'Volume'
  df['atr_5'] = ta.atr(df['High'].squeeze(), df['Low'].squeeze(), df['Close'].squeeze(), length=5)  # a short-term version of average true range

# 4. Target Variable Engineering

In [18]:
def encode_all(df):  # takes in a df whose columns include 'Open', 'High', 'Low', 'Close', 'Volume'
  encode_ti(df)
  encode_mi(df)
  encode_vi(df)

In [19]:
def encode_target(df):  # takes in a df with a 'Close' column, classifies as a "buy" if >= 1% change, sell if <= -1% change, 0 (neutral) o/w
  future_price = df['Close'].shift(-5)
  pct_change = (future_price - df['Close']) / df['Close']
  df['target'] = pct_change.squeeze().apply(lambda x : 1 if x >= 0.01 else -1 if x <= 0.01 else 0)

In [20]:
encode_all(aapl)

In [21]:
encode_target(aapl)

In [22]:
aapl.columns = [col[0] if col[1] == '' else col[0] for col in aapl.columns]

In [23]:
aapl.columns

Index(['Close', 'High', 'Low', 'Open', 'Volume', 'sma', 'ema', 'wma', 'rsi_5',
       'mtm_5', 'cci_5', 'wr_5', 'atr_5', 'target'],
      dtype='object')

In [24]:
aapl.head()

Unnamed: 0_level_0,Close,High,Low,Open,Volume,sma,ema,wma,rsi_5,mtm_5,cci_5,wr_5,atr_5,target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2023-12-26,191.646545,192.480435,191.428144,192.202472,28919300,,,,,,,,,-1
2023-12-27,191.745804,192.093265,189.700782,191.090614,48087700,,,,,,,,,-1
2023-12-28,192.172684,193.244834,191.765661,192.72861,34049900,,,,,,,,,-1
2023-12-29,191.130341,192.986741,190.336153,192.490376,42628800,,,,,,,,,-1
2024-01-02,184.290421,187.070068,182.553143,185.789438,82488700,190.197159,190.197159,189.175311,,,-166.666667,-83.751142,,-1


In [25]:
aapl.iloc[5:-5].to_csv('2024_2025_appl_5d.csv')

***Note*** - have to get rid of the first and last 5 datapoints because of forward and backward looking indicators have a max reach of -5 and +5 days

In [26]:
aapl['target'].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
-1,141
1,113
0,5


# 5. Combine into a single Ticker --> CSV function

In [27]:
def get_yr_5d_data(ticker):
  df = yf.download(ticker, start='2023-12-26', end='2025-01-07')
  encode_all(df)
  encode_target(df)
  df.iloc[5:-5].to_csv(f'2024_2025_{ticker}_5d.csv')

In [28]:
for asset in assets: get_yr_5d_data(asset)

  df = yf.download(ticker, start='2023-12-26', end='2025-01-07')
[*********************100%***********************]  1 of 1 completed
  df = yf.download(ticker, start='2023-12-26', end='2025-01-07')
[*********************100%***********************]  1 of 1 completed
  df = yf.download(ticker, start='2023-12-26', end='2025-01-07')
[*********************100%***********************]  1 of 1 completed
  df = yf.download(ticker, start='2023-12-26', end='2025-01-07')
[*********************100%***********************]  1 of 1 completed
  df = yf.download(ticker, start='2023-12-26', end='2025-01-07')
[*********************100%***********************]  1 of 1 completed
  df = yf.download(ticker, start='2023-12-26', end='2025-01-07')
[*********************100%***********************]  1 of 1 completed
  df = yf.download(ticker, start='2023-12-26', end='2025-01-07')
[*********************100%***********************]  1 of 1 completed
  df = yf.download(ticker, start='2023-12-26', end='2025-01-0