## Libraries & necessary pre-code

In [6]:
import yfinance as yf
import pandas as pd
import numpy as np
import warnings

# Silence every Python warning for the rest of the session (keeps the cell output clean)
warnings.filterwarnings('ignore')

## Data Collection

In [7]:
# Ticker symbols whose daily prices are downloaded
tickers = ['TSM', 'NVDA', 'AMD', 'INTC', 'ORCL']
# Date window in 'YYYY-MM-DD' format, passed as start/end to yf.download.
# NOTE(review): the downloaded data ends on 2025-10-22, i.e. `end_date` itself is
# not included - yfinance appears to treat `end` as exclusive; confirm in its docs.
start_date = '2024-10-23'
end_date = '2025-10-23'

def get_ticker_data(ticker, start_date, end_date):
    '''
    Downloads the daily Open and Close prices of a single ticker and returns
    them as a dataframe with flat (single-level) columns.

    :param ticker: String of a ticker (ex: '^GSPC')
    :param start_date: start date String in format 'YYYY-MM-DD'
    :param end_date: end date String in format 'YYYY-MM-DD'

    :return: df (pd.DataFrame) with the 'Open' and 'Close' columns for the
        requested date range. If the download or the column handling raises,
        an error message String is returned instead of a DataFrame, so callers
        have to check the type of the result before using it.
    '''
    try:
        # Only really need Open & Close to get daily_returns
        data = yf.download(ticker, start=start_date, end=end_date, progress=False)[['Open', 'Close']]

        # The ticker level is not needed for a single ticker, so drop it. Only do
        # this when the columns really are multilevel: calling droplevel on flat
        # columns raises and would turn a successful download into a failure.
        if isinstance(data.columns, pd.MultiIndex):
            data = data.droplevel(1, axis=1)
    except Exception:
        # Failures are reported through the return value rather than raised
        return f"Data collection failed for {ticker} from {start_date} to {end_date}"

    return data

# Test
# print(get_ticker_data('^GSPC', '2025-01-06', '2025-10-22'))  # columns are already flattened inside get_ticker_data

def get_tickers_data(tickers, start_date, end_date):
    '''
    Collects the data of every ticker in a list via get_ticker_data and maps
    each ticker to its result.

    :param tickers: list of ticker Strings (ex: ['^GSPC', 'TSM'])
    :param start_date: start date String in format 'YYYY-MM-DD'
    :param end_date: end date String in format 'YYYY-MM-DD'

    :return: dictionary {ticker: result of get_ticker_data for that ticker},
        or an error message String if building the dictionary raised
    '''
    try:
        # One lookup per ticker; a repeated ticker simply overwrites its earlier entry
        collected = {
            symbol: get_ticker_data(symbol, start_date, end_date)
            for symbol in tickers
        }
    except Exception:
        return f"Data collection failed for {tickers} from {start_date} to {end_date}"

    return collected

# Download every ticker in `tickers`; the result maps each ticker to what get_ticker_data returned for it
data_dict = get_tickers_data(tickers, start_date, end_date)

# Save to csv: one '<ticker>_data.csv' file per ticker (relative path, so it lands in the working directory)
for ticker in data_dict:
    data_dict[ticker].to_csv(f'{ticker}_data.csv')

# Test: display the collected data as the cell output
data_dict

{'TSM': Price             Open       Close
 Date                              
 2024-10-23  194.584991  198.720474
 2024-10-24  196.840728  195.821686
 2024-10-25  198.116987  201.273010
 2024-10-28  197.137525  192.606308
 2024-10-29  192.319398  194.842239
 ...                ...         ...
 2025-10-16  310.929993  299.839996
 2025-10-17  298.399994  295.079987
 2025-10-20  300.000000  297.700012
 2025-10-21  299.000000  294.510010
 2025-10-22  293.000000  288.880005
 
 [250 rows x 2 columns],
 'NVDA': Price             Open       Close
 Date                              
 2024-10-23  141.989282  139.519989
 2024-10-24  140.779637  140.369751
 2024-10-25  140.889594  141.499420
 2024-10-28  142.959021  140.479736
 2024-10-29  140.249772  141.209503
 ...                ...         ...
 2025-10-16  182.229996  181.809998
 2025-10-17  180.179993  183.220001
 2025-10-20  183.130005  182.639999
 2025-10-21  182.789993  181.160004
 2025-10-22  181.139999  180.279999
 
 [250 rows x 2 colum

## Data Cleaning

In [50]:
# Here we want to clean the data: drop the multilevel columns, verify the data, and ensure nothing is missing

# First, check the data (see if any values are missing)
# I think this is the only check we need, as yfinance has consistent dates, naming, and pricing (no need to worry about type mismatches or other trivial issues)
def handle_nan(ticker_df):
    '''
    Checks whether the dataframe has any missing values and, if so, reports
    how many there are and removes the affected rows with dropna()

    :param ticker_df: the ticker's Data in a pd.DataFrame
    :return: the dataframe without the rows that contained missing values;
        the same, unmodified dataframe object when nothing is missing
    '''
    # Total number of missing cells over all columns
    missing_count = int(ticker_df.isnull().sum().sum())

    if missing_count > 0:
        # Report the actual count (not just a True/False flag)
        print(f"Missing vals: {missing_count}")
        ticker_df = ticker_df.dropna()

    return ticker_df

# Run it for all our stocks, replacing each entry of data_dict with its cleaned dataframe
for ticker in data_dict:
    data_dict[ticker] = handle_nan(data_dict[ticker])
    # Nothing was printed on this run, which means there are no NaNs

## Feature Engineering


In [None]:
# Daily returns
