In [4]:
# Read stock symbols from the file
with open('sp500_symbols.txt', 'r') as file:
    stock_symbols = [line.strip() for line in file.readlines()]
print(f"Loaded {len(stock_symbols)} stock symbols.")

Loaded 503 stock symbols.


In [None]:
# --------------------------------------------------------------
#  sp500_renew_data.py – Daily Update Script
# --------------------------------------------------------------
import pandas as pd
import yfinance as yf
import pickle, os, time
from datetime import datetime, timedelta

# ---------- SETTINGS ----------
CSV_FILE    = 'sp500_historical_data.csv'
SYMBOL_FILE = 'sp500_symbols.txt'
CHECKPOINT  = 'sp500_renew_progress.pkl'
END         = datetime.now()
START       = END - timedelta(year=2)  # Last two years days (covers weekends)
FAILED_TXT  = 'renew_failed.txt'

# ---------- 1. Load symbols ----------
with open(SYMBOL_FILE) as f:
    ALL_SYMBOLS = [s.strip() for s in f if s.strip()]

# ---------- 2. Load existing CSV ----------
existing_df = pd.read_csv(CSV_FILE, header=[0,1], index_col=0, parse_dates=True)
saved_symbols = set(existing_df.columns.levels[0])
print(f"Existing CSV: {len(saved_symbols)} symbols")

# ---------- 3. Resume checkpoint ----------
if os.path.exists(CHECKPOINT):
    with open(CHECKPOINT, 'rb') as f:
        prog = pickle.load(f)
    tried = set(prog['tried'])
    failed = prog['failed']
else:
    tried, failed = set(), []

to_fetch = [s for s in ALL_SYMBOLS if s not in tried]
print(f"Renewing {len(to_fetch)} symbols...")

# ---------- 4. Fetch function ----------
def fetch(sym):
    for attempt in range(3):
        try:
            df = yf.Ticker(sym).history(start=START, end=END)
            return df[['Open','High','Low','Close','Volume']] if not df.empty else None
        except:
            time.sleep(2)
    return None

# ---------- 5. Loop ----------
new_data = {}
for i, sym in enumerate(to_fetch, 1):
    print(f"[{i}/{len(to_fetch)}] {sym}", end=' ')
    df = fetch(sym)
    if df is not None:
        new_data[sym] = df
        print("OK")
    else:
        failed.append(sym)
        print("FAILED")
    
    tried.add(sym)
    with open(CHECKPOINT, 'wb') as f:  # Save after EVERY symbol
        pickle.dump({'tried': list(tried), 'failed': failed}, f)
    time.sleep(1)  # Gentle delay

# ---------- 6. Merge & Save ----------
if new_data:
    new_df = pd.concat(new_data, axis=1, keys=new_data.keys())
    # Align dates and merge (only new rows)
    combined = pd.concat([existing_df, new_df]).groupby(level=0).last()
    combined.to_csv(CSV_FILE)
    print(f"Renewed! CSV now up to {END.date()}")

if failed:
    with open(FAILED_TXT, 'w') as f:
        f.write('\n'.join(failed))

os.remove(CHECKPOINT)  # Clean up when done
print("Done! Run anytime to renew.")

TypeError: __new__() got an unexpected keyword argument 'years'

In [12]:
# sp500_upload_simple.py
import getpass
import pandas as pd
from sqlalchemy import create_engine, text
from sqlalchemy.exc import OperationalError

# ====================== CONFIG ======================
CSV_PATH   = "sp500_historical_data.csv"
DB_USER    = "postgres"
DB_PASSWORD = getpass.getpass("Password: ")
DB_HOST    = "localhost"
DB_PORT    = "5432"
DB_NAME    = "bootcamp_2508_final_project"
TABLE_NAME = "sp500_historical_data"

# ====================== 1. READ & MELT ======================
print("Loading and melting CSV...")
df_wide = pd.read_csv(CSV_PATH, header=[0, 1], index_col=0, parse_dates=True, low_memory=False)
df_wide.columns = ['_'.join(col).strip() for col in df_wide.columns.values]
df_wide = df_wide.reset_index()

df_long = pd.melt(
    df_wide,
    id_vars='Date',
    value_vars=[c for c in df_wide.columns if c != 'Date'],
    var_name='symbol_metric',
    value_name='value'
)

df_long[['symbol', 'metric']] = df_long['symbol_metric'].str.split('_', expand=True)
df_long = df_long.drop(columns='symbol_metric').rename(columns={'Date': 'date'})

metric_map = {'Open': 'open', 'High': 'high', 'Low': 'low', 'Close': 'close', 'Volume': 'volume'}
df_long['metric'] = df_long['metric'].map(metric_map)

df_long = df_long[['symbol', 'date', 'metric', 'value']]
df_long = df_long.sort_values(['symbol', 'date', 'metric'])

print(f"Ready: {len(df_long):,} rows")

# ====================== 2. CONNECT ======================
engine = create_engine(f"postgresql://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}")

# Ensure DB exists
try:
    with engine.connect(): pass
except OperationalError:
    tmp = create_engine(f"postgresql://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/postgres")
    with tmp.connect() as conn:
        conn.execute("COMMIT")
        conn.execute(text(f"CREATE DATABASE {DB_NAME}"))

# ====================== 3. CREATE TABLE ======================
with engine.begin() as conn:
    conn.execute(text(f"""
        DROP TABLE IF EXISTS {TABLE_NAME};
        CREATE TABLE {TABLE_NAME} (
            symbol  VARCHAR(10)   NOT NULL,
            date    DATE         NOT NULL,
            metric  VARCHAR(6)    NOT NULL CHECK (metric IN ('open','high','low','close','volume')),
            value   DOUBLE PRECISION,
            PRIMARY KEY (symbol, date, metric)
        );
        CREATE INDEX idx_symbol ON {TABLE_NAME}(symbol);
        CREATE INDEX idx_date   ON {TABLE_NAME}(date);
    """))
print(f"Table `{TABLE_NAME}` created.")

# ====================== 4. LOAD (FAST + REPLACE) ======================
print("Loading data...")
df_long.to_sql(
    TABLE_NAME,
    engine,
    if_exists='append',   # or 'replace' if you want to wipe first
    index=False,
    method='multi',
    chunksize=10_000
)
print(f"Done! Loaded {len(df_long):,} rows.")

Loading and melting CSV...
Ready: 1,255,005 rows
Table `sp500_historical_data` created.
Loading data...
Done! Loaded 1,255,005 rows.
