## Loading the reports

In [329]:
import os
import pandas as pd
from pathlib import Path

In [330]:
# Set the root directory where the files are located
root_dir = Path("10ks")

# Collect all .txt files in subdirectories
records = []
for path in root_dir.rglob("*.txt"):
    folder_name = path.parent.name
    file_name = path.name
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        text = f.read()
    records.append({"folder": folder_name, "file": file_name, "text": text})

In [331]:
# Create a DataFrame
df = pd.DataFrame(records)

In [332]:
df["year"] = df["file"].str.extract(r'_(\d{4})-\d{2}-\d{2}\.txt')

In [333]:
df

Unnamed: 0,folder,file,text,year
0,RCL,RCL_2007-02-28.txt,"SECURITIES AND EXCHANGE COMMISSION Washington,...",2007
1,RCL,RCL_2010-02-23.txt,"SECURITIES AND EXCHANGE COMMISSION Washington,...",2010
2,RCL,RCL_2005-03-14.txt,"SECURITIES AND EXCHANGE COMMISSION Washington,...",2005
3,RCL,RCL_2009-02-24.txt,"SECURITIES AND EXCHANGE COMMISSION Washington,...",2009
4,RCL,RCL_2022-03-01.txt,"SECURITIES AND EXCHANGE COMMISSION Washington,...",2022
...,...,...,...,...
1292,GLP,GLP_2017-03-10.txt,"SECURITIES AND EXCHANGE COMMISSION WASHINGTON,...",2017
1293,GLP,GLP_2023-02-27.txt,"SECURITIES AND EXCHANGE COMMISSION WASHINGTON,...",2023
1294,GLP,GLP_2018-03-09.txt,"SECURITIES AND EXCHANGE COMMISSION WASHINGTON,...",2018
1295,GLP,GLP_2009-03-13.txt,"SECURITIES AND EXCHANGE COMMISSION WASHINGTON,...",2009


In [334]:
#Get rid of anything before 2014 because we're not using it 
df = df[df["year"].astype(int) >= 2014]

# Combining those texts into a single string per year because they are largely amendments
df = df.groupby(['folder', 'year']).agg(
    text=('text', lambda x: ' '.join(x)),
    folderfiletext=('file', lambda x: x.iloc[0] + '_COM')
).reset_index()

## Assessing Data Coverage

In [335]:
#transpose the df so it has one column per year and just the company names as the rows; then there is an 1 for each year the company has a record 

df_transposed = df.pivot_table(index="folder", columns="year", aggfunc='size', fill_value=0)

# remove the folder row/multiindex
df_transposed.reset_index(inplace=True)

# remove the year column from the index
df_transposed.columns.name = None

#replace 0s with NaN
df_transposed.replace(0, pd.NA, inplace=True)

In [336]:
df_transposed

Unnamed: 0,folder,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024,2025
0,AEP,1,1,1,1,1,1,1,1,1,1,1,1
1,AES,1,1,1,1,1,1,1,1,1,1,1,1
2,AIG,1,1,1,1,1,1,1,1,1,1,1,1
3,APA,,,,,,,,,1,1,1,1
4,APC,1,1,1,1,1,1,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
58,UAL,1,1,1,1,1,1,1,1,1,1,1,1
59,WMB,1,1,1,1,1,1,1,1,1,1,1,1
60,WMT,1,1,1,1,1,1,1,1,1,1,1,1
61,X,1,1,1,1,1,1,1,1,1,1,1,1


In [337]:
df_transposed.describe()

Unnamed: 0,folder,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024,2025
count,63,53,53,57,56,57,58,58,59,61,61,60,51
unique,63,1,1,1,1,1,1,1,1,1,1,1,1
top,AEP,1,1,1,1,1,1,1,1,1,1,1,1
freq,1,53,53,57,56,57,58,58,59,61,61,60,51


In [338]:
#list the companies that have a record for each and every year
companies_with_records = df_transposed.dropna(how='any')

In [339]:
#save df_transposed to a csv file
output_file = root_dir / "10k_coverage.csv"
df_transposed.to_csv(output_file, index=False)

## Starting with just these companies; we may go back and include the others later

In [340]:
companies_with_records_list = companies_with_records['folder'].tolist()

In [341]:
data = df[df['folder'].isin(companies_with_records_list)]

In [342]:
data

Unnamed: 0,folder,year,text,folderfiletext
0,AEP,2014,EX-13 20 ye13aepar.htm ANNUAL REPORT ye13aepar...,AEP_2014-02-25.txt_COM
1,AEP,2015,XML 119 R17.htm IDEA: XBRL DOCUMENT v2.4.1.9 B...,AEP_2015-02-20.txt_COM
2,AEP,2016,EX-13 9 ye15aepar.htm ANNUAL REPORT Exhibit 20...,AEP_2016-02-24.txt_COM
3,AEP,2017,EX-13 10 aep10kfrex1320164q.htm ANNUAL REPORT ...,AEP_2017-02-28.txt_COM
4,AEP,2018,EX-13 16 aep10kfrex1320174q.htm ANNUAL REPORT ...,AEP_2018-02-23.txt_COM
...,...,...,...,...
679,XOM,2021,"SECURITIES AND EXCHANGE COMMISSION WASHINGTON,...",XOM_2021-02-24.txt_COM
680,XOM,2022,"SECURITIES AND EXCHANGE COMMISSION WASHINGTON,...",XOM_2022-02-23.txt_COM
681,XOM,2023,"SECURITIES AND EXCHANGE COMMISSION WASHINGTON,...",XOM_2023-02-22.txt_COM
682,XOM,2024,"SECURITIES AND EXCHANGE COMMISSION WASHINGTON,...",XOM_2024-02-28.txt_COM


In [344]:
#save the data for companies with records to a csv file
output_data_file = root_dir / "full_data_filtered.csv"
data.to_csv(output_data_file, index=False)

In [345]:
#open the classified_output file 
classified_output_file = "classified_output.csv"
classified_output = pd.read_csv(classified_output_file)

  classified_output = pd.read_csv(classified_output_file)


In [347]:
classified_output_1s = classified_output[classified_output['climate_litigation'] == 'climate_litigation: 1']

In [348]:
classified_output_1s

Unnamed: 0,company,year,text,climate_litigation
26,AEP,2014,"10\nNumerous affected entities, states and oth...",climate_litigation: 1
29,AEP,2014,"CO 2 Regulation\nIn March 2012, the Federal EP...",climate_litigation: 1
141,AEP,2014,and the Sierra Club. The modified settlement ...,climate_litigation: 1
150,AEP,2014,ENVIRONMENTAL CONTINGENCIES\nCarbon Dioxide Pu...,climate_litigation: 1
356,AEP,2014,267\nIndemnifications and Other Guarantees – A...,climate_litigation: 1
357,AEP,2014,Supreme Court asking the court to remand the c...,climate_litigation: 1
364,AEP,2014,District Court for the Southern District of Oh...,climate_litigation: 1
446,AEP,2014,Court of Appeals for the District of Columbia ...,climate_litigation: 1
607,AEP,2016,Since it is difficult to predict the outcome o...,climate_litigation: 1
966,AEP,2017,Court of Appeals for the District of Columbia ...,climate_litigation: 1


In [None]:
#save this classified output to a csv file
classified_output_1s_file = "classified_output_1s.csv"
classified_output_1s.to_csv(classified_output_1s_file, index=False)


: 