In [1]:
import duckdb,os
import pandas as pd
import yfinance as yf
import time
from tqdm import tqdm

# Read the two raw input tables from the repository root:
# the S&P 500 constituent list and the GICS reference sheet.
sp500_csv_path = "../../S&P500.csv"
gics_csv_path = "../../GICS - Industry Standard-2023-kaggle.csv"

sp500 = pd.read_csv(sp500_csv_path)
gics = pd.read_csv(gics_csv_path)

In [2]:
# Normalize the header row to lower case, then tidy the ticker column
# (surrounding whitespace removed, letters upper-cased).
lowered_columns = sp500.columns.str.lower()
sp500.columns = lowered_columns

stripped_symbols = sp500["symbol"].str.strip()
sp500["symbol"] = stripped_symbols.str.upper()

# Preview the first rows of the cleaned frame.
sp500.head()

Unnamed: 0,no,company,symbol,weight,price,chg,% chg
0,1,Nvidia,NVDA,7.4%,186.26,4.1,-2.2%
1,2,Apple Inc.,AAPL,6.3%,262.82,3.24,-1.2%
2,3,Microsoft,MSFT,6.3%,523.61,3.05,-0.6%
3,4,Amazon,AMZN,3.9%,224.21,3.12,-1.4%
4,5,Meta Platforms,META,3.0%,738.36,4.36,-0.6%


In [3]:
# Fetch GICS industry info via yahoo
# One record is appended per (symbol, company) pair, whether or not the
# lookup succeeds, so `df` ends up with one row per row of `sp500`.
# NOTE(review): symbols are passed to yahoo exactly as cleaned above; tickers
# written with a dot (share classes) may need a different form there — TODO confirm.
records = []
for symbol, name in tqdm(zip(sp500["symbol"], sp500["company"]), total=len(sp500)):
    # Reset per symbol. Without this, a failed lookup would either raise
    # NameError (failure on the very first symbol) or silently record the
    # previous symbol's sector/industry against this one.
    sector = None
    industry = None
    try:
        info = yf.Ticker(symbol).info
        sector = info.get("sector", None)
        industry = info.get("industry", None)
    except Exception as e:
        # Best effort: report the failure and keep going; the record below
        # is still written, with missing values for whatever was not fetched.
        print(f"Error fetching data for {symbol}: {e}")
    records.append({
        "symbol": symbol,
        "name": name,
        "sector": sector,
        "industry": industry
    })
    time.sleep(0.1)  # Rate limit

df = pd.DataFrame(records)
df.head()

100%|██████████| 503/503 [02:39<00:00,  3.15it/s]


Unnamed: 0,symbol,name,sector,industry
0,NVDA,Nvidia,Technology,Semiconductors
1,AAPL,Apple Inc.,Technology,Consumer Electronics
2,MSFT,Microsoft,Technology,Software - Infrastructure
3,AMZN,Amazon,Consumer Cyclical,Internet Retail
4,META,Meta Platforms,Communication Services,Internet Content & Information


In [4]:
# Persist the fetched table as CSV (without the index column) and report its row count.
saved_rows = len(df)
df.to_csv("../../securities_with_gics.csv", index=False)
print(f"Saved securities with GICS to ../../securities_with_gics.csv | rows={saved_rows:,}")

Saved securities with GICS to ../../securities_with_gics.csv | rows=503


In [2]:
# Confirm address in duckdb
# Opens the warehouse database, prints where the relative path resolves to,
# and lists the tables it contains.
db_path = "../../data/warehouse/data.duckdb"  # single source for both the connect and the abspath print
con = duckdb.connect(db_path)
try:
    print("Absolute path:", os.path.abspath(db_path))
    print("Tables:", con.execute("SHOW TABLES;").fetchdf())
finally:
    # Always close, even if the query raises, so this kernel does not keep
    # the database open for the cells that follow.
    con.close()

Absolute path: /home/clsx6609/ds5110/data/warehouse/data.duckdb
Tables:                  name
0   corporate_actions
1  factor_definitions
2       factor_values
3        fundamentals
4              prices
5          securities


In [3]:
# Load the saved securities CSV into the `securities` table.
# Only symbols not already present are inserted (anti-join on symbol), so
# re-running this cell does not insert the same symbol twice.
# The database path is relative, like the CSV path inside the query and the
# previous cell, instead of a machine-specific absolute home-directory path.
con = duckdb.connect("../../data/warehouse/data.duckdb")
try:
    # Sequence that supplies security_id values for newly inserted rows.
    # NOTE(review): it starts at 1; if `securities` already holds rows whose
    # ids were assigned some other way, new ids could collide — TODO confirm.
    con.execute("CREATE SEQUENCE IF NOT EXISTS seq_security_id START 1;")
    con.execute("""
INSERT INTO securities (security_id, symbol, name, sector, industry)
SELECT 
    nextval('seq_security_id') AS security_id,
    i.symbol,
    i.name,
    i.sector,
    i.industry
FROM read_csv_auto('../../securities_with_gics.csv') AS i
LEFT JOIN securities s ON s.symbol = i.symbol
WHERE s.symbol IS NULL;
""")
    # Report the total row count after the insert.
    print(con.execute("SELECT COUNT(*) FROM securities;").fetchdf())
finally:
    # Close even when a statement fails so the connection is not leaked.
    con.close()


   count_star()
0           503
