In [124]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import yfinance as yf

In [125]:
tickers = [
    "AAPL", "NVDA", "AVGO", "CRM", "AMD", "ADBE", "QCOM", 
    "IBM", "NOW", "GOOG", "NFLX", "DIS", "F",
    "VZ", "T", "GME", "SCHW", "DAL", "TXN", "WDAY", "TGT", 
    "GM", "LLY", "UNH", "JNJ", "ABBV",
    "MRK", "TMO", "ABT", "PFE", "AMGN", "ISRG", "SYK", 
    "GILD", "VRTX", "AMZN",
    "TSLA", "WMT", "HD", "CMG", "LOW", "TJX",
    "KO", "PEP", "PM", "CL", "CAT", "BA",
    "UPS", "XOM", "CVX", "COP", "NEE", "EBAY"
]

yf_tickers = []

for ticker in tickers:
    yf_tickers.append(yf.Ticker(ticker))

In [126]:
# Quick hygiene check function
def run_hygiene_checks(df):
    # 1. Check for impossible prices
    logic_errors = df[(df['High'] < df['Low']) | (df['Open'] < df['Low']) | (df['Close'] > df['High'])]
    
    # 2. Check for missing values
    null_counts = df.isnull().sum()
    
    # 3. Check for outliers (Price jumps > 50% without a split)
    df['pct_change'] = df['Close'].pct_change()
    spikes = df[(df['pct_change'].abs() > 0.5) & (df['Stock Splits'] == 0)]
    
    return {"logic_errors": len(logic_errors), "nulls": null_counts, "spikes": len(spikes)}

How I would handle the following:

1. Logic Errors

2. Outliers

3. Missing Data

In [127]:
logic_e = []
spikes_e = []

for ticker in yf_tickers:
    raw = ticker.history(start='2015-01-01', end='2025-12-31')
    raw.drop(inplace=True, columns=['Dividends'])
    stats = run_hygiene_checks(raw)
    logic_e.append(stats['logic_errors'])
    spikes_e.append(stats['logic_errors'])

In [128]:
logic_e

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 5,
 0,
 7,
 0,
 2,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0]

In [129]:
spikes_e

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 5,
 0,
 7,
 0,
 2,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0]

In [117]:
simple_splits = {}

def create_train_test_splits(df):
    return (df.loc['2015': '2021'], df.loc['2022':])

In [118]:
for ticker in yf_tickers:
    raw = ticker.history(start='2015-01-01', end='2025-12-31')
    raw.drop(inplace=True, columns=['Dividends'])
    simple_splits[ticker.ticker] = create_train_test_splits(raw)

In [None]:
simple_splits

54