In [29]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import yfinance as yf

In [30]:
# Ticker symbols that make up the universe analysed in this notebook.
tickers = [
    "AAPL", "NVDA", "AVGO", "CRM", "AMD", "ADBE", "QCOM",
    "IBM", "NOW", "GOOG", "NFLX", "DIS", "F",
    "VZ", "T", "GME", "SCHW", "DAL", "TXN", "WDAY", "TGT",
    "GM", "LLY", "UNH", "JNJ", "ABBV",
    "MRK", "TMO", "ABT", "PFE", "AMGN", "ISRG", "SYK",
    "GILD", "VRTX", "AMZN",
    "TSLA", "WMT", "HD", "CMG", "LOW", "TJX",
    "KO", "PEP", "PM", "CL", "CAT", "BA",
    "UPS", "XOM", "CVX", "COP", "NEE", "EBAY",
]

# One yfinance Ticker object per symbol, in the same order as `tickers`.
yf_tickers = [yf.Ticker(symbol) for symbol in tickers]

In [31]:
# Quick hygiene check function
def run_hygiene_checks(df, spike_threshold=0.5):
    """Run three basic sanity checks on an OHLC price frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Open', 'High', 'Low', 'Close' and 'Stock Splits'
        columns. The frame is left unmodified.
    spike_threshold : float, default 0.5
        Smallest absolute row-over-row fractional change in 'Close' that is
        reported as a spike.

    Returns
    -------
    dict
        'logic_errors': rows of ``df`` whose prices are mutually inconsistent
            (High below Low, or Open/Close outside the [Low, High] range).
        'nulls': per-column count of missing values in ``df``.
        'spikes': rows whose 'Close' moved by at least ``spike_threshold``
            while 'Stock Splits' is 0; this frame carries an extra
            'pct_change' column with the computed change.
    """
    # 1. Check for impossible prices: Open and Close must both lie inside
    #    [Low, High], and High can never be below Low.
    high, low = df['High'], df['Low']
    impossible = (
        (high < low)
        | (df['Open'] < low) | (df['Open'] > high)
        | (df['Close'] < low) | (df['Close'] > high)
    )
    logic_errors = df[impossible]

    # 2. Check for missing values
    null_counts = df.isnull().sum()

    # 3. Check for outliers (price jumps >= threshold without a split).
    #    `assign` returns a new frame, so the caller's frame does not gain a
    #    'pct_change' column and repeated calls give identical results.
    with_change = df.assign(pct_change=df['Close'].pct_change())
    is_spike = with_change['pct_change'].abs() >= spike_threshold
    no_split = with_change['Stock Splits'] == 0
    spikes = with_change[is_spike & no_split]

    return {"logic_errors": logic_errors, "nulls": null_counts, "spikes": spikes}

How I would handle the following:

1. Logic Errors

2. Outliers

3. Missing Data

In [32]:
# Per-ticker findings, appended in `yf_tickers` order.
logic_e, spikes_e = [], []

for ticker in yf_tickers:
    # Download the price history and discard the 'Dividends' column.
    raw = ticker.history(start='2015-01-01', end='2025-12-31').drop(columns=['Dividends'])
    # The checks run on values rounded to 6 decimal places.
    raw_r = raw.round(6)
    stats = run_hygiene_checks(raw_r)
    spikes_e.append(stats['spikes'])
    logic_e.append(stats['logic_errors'])

In [33]:
# Display the rows flagged by the impossible-price check, one frame per ticker.
logic_e

[Empty DataFrame
 Columns: [Open, High, Low, Close, Volume, Stock Splits]
 Index: [],
 Empty DataFrame
 Columns: [Open, High, Low, Close, Volume, Stock Splits]
 Index: [],
 Empty DataFrame
 Columns: [Open, High, Low, Close, Volume, Stock Splits]
 Index: [],
 Empty DataFrame
 Columns: [Open, High, Low, Close, Volume, Stock Splits]
 Index: [],
 Empty DataFrame
 Columns: [Open, High, Low, Close, Volume, Stock Splits]
 Index: [],
 Empty DataFrame
 Columns: [Open, High, Low, Close, Volume, Stock Splits]
 Index: [],
 Empty DataFrame
 Columns: [Open, High, Low, Close, Volume, Stock Splits]
 Index: [],
 Empty DataFrame
 Columns: [Open, High, Low, Close, Volume, Stock Splits]
 Index: [],
 Empty DataFrame
 Columns: [Open, High, Low, Close, Volume, Stock Splits]
 Index: [],
 Empty DataFrame
 Columns: [Open, High, Low, Close, Volume, Stock Splits]
 Index: [],
 Empty DataFrame
 Columns: [Open, High, Low, Close, Volume, Stock Splits]
 Index: [],
 Empty DataFrame
 Columns: [Open, High, Low, Close, Vo

In [34]:
# Display the rows flagged as price spikes, one frame per ticker.
spikes_e

[Empty DataFrame
 Columns: [Open, High, Low, Close, Volume, Stock Splits, pct_change]
 Index: [],
 Empty DataFrame
 Columns: [Open, High, Low, Close, Volume, Stock Splits, pct_change]
 Index: [],
 Empty DataFrame
 Columns: [Open, High, Low, Close, Volume, Stock Splits, pct_change]
 Index: [],
 Empty DataFrame
 Columns: [Open, High, Low, Close, Volume, Stock Splits, pct_change]
 Index: [],
                            Open  High   Low  Close     Volume  Stock Splits  \
 Date                                                                          
 2016-04-22 00:00:00-04:00  3.19  3.99  3.18   3.99  143265300           0.0   
 
                            pct_change  
 Date                                   
 2016-04-22 00:00:00-04:00    0.522901  ,
 Empty DataFrame
 Columns: [Open, High, Low, Close, Volume, Stock Splits, pct_change]
 Index: [],
 Empty DataFrame
 Columns: [Open, High, Low, Close, Volume, Stock Splits, pct_change]
 Index: [],
 Empty DataFrame
 Columns: [Open, High, Low, C

In [35]:
# Maps ticker symbol -> (train, test) pair of price frames.
simple_splits = {}

def create_train_test_splits(df, train_start='2015', train_end='2021', test_start='2022'):
    """Split a date-indexed frame into chronological train and test parts.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index supports label slicing with date strings
        (e.g. a sorted DatetimeIndex).
    train_start, train_end : str, default '2015' and '2021'
        Inclusive label bounds of the training slice.
    test_start : str, default '2022'
        Inclusive lower label bound of the test slice, which runs to the
        end of ``df``.

    Returns
    -------
    tuple
        ``(train, test)`` where ``train`` is ``df.loc[train_start:train_end]``
        and ``test`` is ``df.loc[test_start:]``.
    """
    train = df.loc[train_start:train_end]
    test = df.loc[test_start:]
    return (train, test)

In [36]:
# Fill `simple_splits` with one (train, test) pair per ticker symbol.
for ticker in yf_tickers:
    # Download the price history and discard the 'Dividends' column.
    raw = ticker.history(start='2015-01-01', end='2025-12-31').drop(columns=['Dividends'])
    simple_splits[ticker.ticker] = create_train_test_splits(raw)

In [37]:
# Display the full mapping of ticker symbol -> (train, test) frames.
simple_splits

{'AAPL': (                                 Open        High         Low       Close  \
  Date                                                                        
  2015-01-02 00:00:00-05:00   24.694243   24.705328   23.798608   24.237558   
  2015-01-05 00:00:00-05:00   24.006990   24.086799   23.368519   23.554739   
  2015-01-06 00:00:00-05:00   23.619034   23.816340   23.195602   23.556961   
  2015-01-07 00:00:00-05:00   23.765345   23.987036   23.654499   23.887276   
  2015-01-08 00:00:00-05:00   24.215378   24.862717   24.097880   24.805077   
  ...                               ...         ...         ...         ...   
  2021-12-27 00:00:00-05:00  173.451428  176.713010  173.431849  176.624863   
  2021-12-28 00:00:00-05:00  176.458336  177.604295  174.861822  175.606201   
  2021-12-29 00:00:00-05:00  175.645406  176.918699  174.479854  175.694382   
  2021-12-30 00:00:00-05:00  175.782469  176.859873  174.430819  174.538559   
  2021-12-31 00:00:00-05:00  174.430829  175