In [None]:
import pandas as pd
import os

# Base folders
filings_folder = "Data/10k_fillings_links"
sp500_folder = "Data/sp500"
output_folder = "Data/sp500_10k_links"
os.makedirs(output_folder, exist_ok=True)

years = range(2018, 2024)


def read_filings(path):
    """Read a filings CSV, trying ',' first and falling back to ';'.

    The fallback is used when the comma parse fails with a parser error or
    yields a single column (the usual symptom of a wrong delimiter). Other
    errors, such as a missing file, propagate to the caller unchanged.
    """
    try:
        frame = pd.read_csv(path, sep=',', engine='python', on_bad_lines='skip')
    except pd.errors.ParserError:
        frame = None

    if frame is None or len(frame.columns) == 1:
        frame = pd.read_csv(path, sep=';', engine='python', on_bad_lines='skip')
    return frame


def normalize_cik(values):
    """Return the CIKs in *values* as comparable strings.

    Whitespace, a trailing ".0" (left behind when the column was parsed as
    float) and leading zeros (EDGAR zero-pads CIKs to 10 digits) are removed.
    Missing or blank entries stay missing instead of becoming the text "nan",
    so they can never match each other.
    """
    text = values.astype(str).str.strip().str.replace(r'\.0$', '', regex=True)
    text = text.str.lstrip('0')
    return text.where(values.notna() & (text != ''))


def filter_to_sp500(filings_df, sp500_df):
    """Return a copy of the rows of *filings_df* whose CIK is in *sp500_df*.

    Both frames must have a 'cik' column. Neither input frame is modified.
    """
    # dropna() matters: pandas' isin() treats NaN as equal to NaN.
    members = set(normalize_cik(sp500_df['cik']).dropna())
    keep = normalize_cik(filings_df['cik']).isin(members)
    return filings_df[keep].copy()


for year in years:
    try:
        # NOTE(review): the filings file for year+1 is paired with the S&P 500
        # list for `year` -- presumably because 10-Ks are filed in the following
        # calendar year; confirm this offset is intended.
        filings_path = os.path.join(filings_folder, f"10K_filings_{year+1}_with_links.csv")
        sp500_path = os.path.join(sp500_folder, f"sp500_full_{year}.csv")
        output_path = os.path.join(output_folder, f"filtered_10K_filings_{year}.csv")

        # Load filings (comma or semicolon separated) and clean up column names
        filings_df = read_filings(filings_path)
        filings_df.columns = filings_df.columns.str.strip().str.lower()

        # Load and clean S&P 500 CSV
        sp500_df = pd.read_csv(sp500_path)
        sp500_df.columns = sp500_df.columns.str.strip().str.lower()

        print(f"\n[{year}] Filings columns: {filings_df.columns.tolist()}")
        print(f"[{year}] S&P columns: {sp500_df.columns.tolist()}")

        # Ensure CIK column is present
        if 'cik' not in filings_df.columns or 'cik' not in sp500_df.columns:
            print(f"[{year}] Skipping: 'cik' column not found.")
            continue

        # Keep only filings whose normalized CIK appears in the S&P 500 list
        filtered_df = filter_to_sp500(filings_df, sp500_df)

        filtered_df.to_csv(output_path, index=False)
        print(f"[{year}] ✅ Saved: {output_path} with {len(filtered_df)} rows")

    except Exception as e:
        # Best effort per year: report the failure and move on to the next one.
        print(f"[{year}] ❌ Error: {e}")


[2018] Filings columns: ['year', 'company', 'cik', 'form_type', 'date_filed', 'url', 'document_url']
[2018] S&P columns: ['company', 'ticker', 'cik', 'sector']
[2018] ✅ Saved: Data/sp500_10k_links/filtered_10K_filings_2018.csv with 444 rows

[2019] Filings columns: ['year', 'company', 'cik', 'form_type', 'date_filed', 'document_url']
[2019] S&P columns: ['company', 'ticker', 'cik', 'sector']
[2019] ✅ Saved: Data/sp500_10k_links/filtered_10K_filings_2019.csv with 445 rows

[2020] Filings columns: ['year', 'company', 'cik', 'form_type', 'date_filed', 'url', 'document_url']
[2020] S&P columns: ['company', 'ticker', 'cik', 'sector']
[2020] ✅ Saved: Data/sp500_10k_links/filtered_10K_filings_2020.csv with 445 rows

[2021] Filings columns: ['year', 'company', 'cik', 'form_type', 'date_filed', 'url', 'document_url']
[2021] S&P columns: ['company', 'ticker', 'cik', 'sector']
[2021] ✅ Saved: Data/sp500_10k_links/filtered_10K_filings_2021.csv with 440 rows

[2022] Filings columns: ['year', 'comp