In [None]:
from pathlib import Path

import pandas as pd

# Filing years to scan (2018 through 2023 inclusive) and the two accumulators
# that the per-year loop below appends to.
years = range(2018, 2024)
summary_data, detailed_data = [], []

# For each item column, the exact "[NOT FOUND] <start heading> → <end heading>"
# text that stands in for a section that was not found. Cells are compared
# against these strings verbatim, so they must stay character-for-character
# identical to what the items files contain (including the curly apostrophe
# in Item_7).
not_found_map = dict(
    [
        (
            "Item_1",
            "[NOT FOUND] ITEM 1. BUSINESS → ITEM 1A. RISK FACTORS",
        ),
        (
            "Item_1A",
            "[NOT FOUND] ITEM 1A. RISK FACTORS → ITEM 1B. UNRESOLVED STAFF COMMENTS",
        ),
        (
            "Item_7",
            "[NOT FOUND] ITEM 7. MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIAL CONDITION AND RESULTS OF OPERATIONS → ITEM 7A. QUANTITATIVE AND QUALITATIVE DISCLOSURES ABOUT MARKET RISK",
        ),
        (
            "Item_8",
            "[NOT FOUND] ITEM 8. FINANCIAL STATEMENTS AND SUPPLEMENTARY DATA → ITEM 9. CHANGES IN AND DISAGREEMENTS WITH ACCOUNTANTS ON ACCOUNTING AND FINANCIAL DISCLOSURE",
        ),
    ]
)

# Build one "missing sections" report per year. A section counts as missing
# when its cell is exactly the "[NOT FOUND] ..." text registered for that
# column in not_found_map.
item_cols = list(not_found_map)

# Create the report folder up front; otherwise every year fails on its first
# to_csv call when the folder does not exist yet.
report_dir = Path("Data/Missing_items")
report_dir.mkdir(parents=True, exist_ok=True)

for year in years:
    # Deliberately broad, best-effort handling: a year whose input file is
    # absent or malformed is reported and skipped so the other years still run.
    try:
        # Load the extracted items file for the year
        file_path = f"Data/sp500_10k_items/items_filtered_10K_filings_{year}.csv"
        df = pd.read_csv(file_path)

        # Vectorised comparison: True where the cell equals the placeholder.
        # index=df.index keeps the mask row-aligned even if item_cols is empty.
        is_missing = pd.DataFrame(
            {col: df[col].eq(not_found_map[col]) for col in item_cols},
            index=df.index,
        )

        # Human-readable flags for the CSV: "✓" for missing, "" otherwise.
        missing_flags = df[['company']].copy()
        for col in item_cols:
            missing_flags[col] = is_missing[col].map({True: "✓", False: ""})

        # Count missing items. The keys are derived from not_found_map so the
        # summary columns cannot drift from the set of items being checked.
        counts = {'Year': year}
        for col in item_cols:
            counts[f"Missing_{col}"] = is_missing[col].sum()

        # Keep only the companies with at least one missing section.
        missing_year_df = missing_flags[is_missing.any(axis=1)].copy()
        missing_year_df.insert(0, 'Year', year)
        report_name = f"missing_sections_{year}.csv"
        missing_year_df.to_csv(report_dir / report_name, index=False)

        # Record the year only once its report is on disk, so summary_data and
        # detailed_data never disagree about which years succeeded.
        summary_data.append(counts)
        detailed_data.append(missing_year_df)

        print(f"[{year}] ✅ Saved missing item report: {report_name}")

    except Exception as e:
        print(f"[{year}] ❌ Error: {e}")

# Combine the per-year results into one summary table (one row per year) and
# one detailed table (one row per company with at least one missing section).
summary_df = pd.DataFrame(summary_data)

if summary_data:
    # Make sure the report folder exists before writing into it.
    out_dir = Path("Data/Missing_items")
    out_dir.mkdir(parents=True, exist_ok=True)

    # Derive the filename suffix from the configured years instead of
    # hard-coding it; for range(2018, 2024) this is still "2018_2023".
    span = f"{min(years)}_{max(years)}"

    summary_name = f"missing_sections_summary_{span}.csv"
    summary_df.to_csv(out_dir / summary_name, index=False)
    print(f"\n✅ Saved yearly summary report: {summary_name}")
    print(summary_df)

    if detailed_data:
        all_detailed_df = pd.concat(detailed_data, ignore_index=True)
        detailed_name = f"missing_sections_detailed_{span}.csv"
        all_detailed_df.to_csv(out_dir / detailed_name, index=False)
        print(f"✅ Saved detailed missing report: {detailed_name}")
else:
    # Nothing was processed successfully: leave any existing summary file
    # untouched rather than overwriting it with an empty one.
    print("\n❌ No yearly data was processed; summary reports were not written.")

[2018] ✅ Saved missing item report: missing_sections_2018.csv
[2019] ✅ Saved missing item report: missing_sections_2019.csv
[2020] ✅ Saved missing item report: missing_sections_2020.csv
[2021] ✅ Saved missing item report: missing_sections_2021.csv
[2022] ✅ Saved missing item report: missing_sections_2022.csv
[2023] ✅ Saved missing item report: missing_sections_2023.csv

✅ Saved yearly summary report: missing_sections_summary_2018_2023.csv
   Year  Missing_Item_1  Missing_Item_1A  Missing_Item_7  Missing_Item_8
0  2018             322              152              66              71
1  2019              17               12              15              31
2  2020              14               10              14              30
3  2021              14                9              14              23
4  2022              17               12              16              26
5  2023              14               11              15              26
✅ Saved detailed missing report: missing_secti