## Loading the reports

In [297]:
import os
import pandas as pd
from pathlib import Path

In [298]:
# Set the root directory where the files are located
root_dir = Path("10k_2")

# Collect all .txt files in subdirectories
records = []
for path in root_dir.rglob("*.txt"):
    folder_name = path.parent.name
    file_name = path.name
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        text = f.read()
    records.append({"folder": folder_name, "file": file_name, "text": text})

In [299]:
# Create a DataFrame
df = pd.DataFrame(records)

In [300]:
df["year"] = df["file"].str.extract(r'_(\d{4})-\d{2}-\d{2}\.txt')

In [301]:
df

Unnamed: 0,folder,file,text,year
0,RCL,RCL_2007-02-28.txt,"SECURITIES AND EXCHANGE COMMISSION Washington,...",2007
1,RCL,RCL_2010-02-23.txt,"SECURITIES AND EXCHANGE COMMISSION Washington,...",2010
2,RCL,RCL_2005-03-14.txt,"SECURITIES AND EXCHANGE COMMISSION Washington,...",2005
3,RCL,RCL_2009-02-24.txt,"SECURITIES AND EXCHANGE COMMISSION Washington,...",2009
4,RCL,RCL_2022-03-01.txt,"SECURITIES AND EXCHANGE COMMISSION Washington,...",2022
...,...,...,...,...
1292,GLP,GLP_2017-03-10.txt,"SECURITIES AND EXCHANGE COMMISSION WASHINGTON,...",2017
1293,GLP,GLP_2023-02-27.txt,"SECURITIES AND EXCHANGE COMMISSION WASHINGTON,...",2023
1294,GLP,GLP_2018-03-09.txt,"SECURITIES AND EXCHANGE COMMISSION WASHINGTON,...",2018
1295,GLP,GLP_2009-03-13.txt,"SECURITIES AND EXCHANGE COMMISSION WASHINGTON,...",2009


In [311]:
#Get rid of anything before 2014 because we're not using it 
df = df[df["year"].astype(int) >= 2014]

# Combining those texts into a single string per year because they are largely amendments
df = df.groupby(['folder', 'year']).agg(
    text=('text', lambda x: ' '.join(x)),
    folderfiletext=('file', lambda x: x.iloc[0] + '_COM')
).reset_index()

## Assessing Data Coverage

In [317]:
#transpose the df so it has one column per year and just the company names as the rows; then there is an 1 for each year the company has a record 

df_transposed = df.pivot_table(index="folder", columns="year", aggfunc='size', fill_value=0)

# remove the folder row/multiindex
df_transposed.reset_index(inplace=True)

# remove the year column from the index
df_transposed.columns.name = None

#replace 0s with NaN
df_transposed.replace(0, pd.NA, inplace=True)

In [318]:
df_transposed

Unnamed: 0,folder,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024,2025
0,AEP,1,1,1,1,1,1,1,1,1,1,1,1
1,AES,1,1,1,1,1,1,1,1,1,1,1,1
2,AIG,1,1,1,1,1,1,1,1,1,1,1,1
3,APA,,,,,,,,,1,1,1,1
4,APC,1,1,1,1,1,1,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
58,UAL,1,1,1,1,1,1,1,1,1,1,1,1
59,WMB,1,1,1,1,1,1,1,1,1,1,1,1
60,WMT,1,1,1,1,1,1,1,1,1,1,1,1
61,X,1,1,1,1,1,1,1,1,1,1,1,1


In [319]:
df_transposed.describe()

Unnamed: 0,folder,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024,2025
count,63,53,53,57,56,57,58,58,59,61,61,60,51
unique,63,1,1,1,1,1,1,1,1,1,1,1,1
top,AEP,1,1,1,1,1,1,1,1,1,1,1,1
freq,1,53,53,57,56,57,58,58,59,61,61,60,51


In [320]:
#list the companies that have a record for each and every year
companies_with_records = df_transposed.dropna(how='any')

In [322]:
#save df_transposed to a csv file
output_file = root_dir / "10k_coverage.csv"
df_transposed.to_csv(output_file, index=False)

## Starting with just these companies; we may go back and include the others later