In [1]:
import os
import pandas as pd 
from tqdm import tqdm
import re
import codecs

In [2]:
def find_purchase_contract(file_path):
    """Count occurrences of the phrase "purchase agreement" in a text file.

    The match is a case-sensitive, non-overlapping substring count
    (``str.count``), so "Purchase Agreement" is not counted.

    Args:
        file_path: Path of the file to scan; it is decoded as UTF-8.

    Returns:
        int: Number of times "purchase agreement" appears in the file.

    Raises:
        OSError: If the file cannot be opened.
    """
    # errors='ignore' drops undecodable bytes instead of raising
    # UnicodeDecodeError, matching find_purchase_contract_context, so a single
    # bad byte does not abort a scan over a whole directory.
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
        text = file.read()
    return text.count("purchase agreement")

In [3]:
def generate_directories(start_year, end_year):
    """Build the 10-K directory path for every quarter in a range of years.

    Args:
        start_year: First year to include.
        end_year: Last year to include (inclusive).

    Returns:
        list[str]: Paths of the form ``sec-data/data/{year}_{quarter}/10-K``,
        ordered by year and then by quarter (1 through 4). The list is empty
        when ``end_year`` is smaller than ``start_year``.
    """
    return [
        f"sec-data/data/{yr}_{qtr}/10-K"
        for yr in range(start_year, end_year + 1)
        for qtr in (1, 2, 3, 4)
    ]


In [4]:
def find_all(a_str, sub):
    """Yield the start index of each non-overlapping occurrence of ``sub``.

    The search runs left to right and resumes immediately after each match,
    so overlapping occurrences are not reported (``'aa'`` in ``'aaaa'`` gives
    0 and 2).

    Args:
        a_str: String to search in.
        sub: Substring to look for.

    Yields:
        int: Index in ``a_str`` at which an occurrence of ``sub`` starts.
        Nothing is yielded when ``sub`` is empty or does not occur.
    """
    step = len(sub)
    if step == 0:
        # An empty needle matches at every position and would never advance
        # the cursor, turning the loop below into an infinite generator.
        return

    start = a_str.find(sub)
    while start != -1:
        yield start
        # Resume right after the match so occurrences cannot overlap.
        start = a_str.find(sub, start + step)

list(find_all('spam spam spam spam', 'spam'))

[0, 5, 10, 15]

In [7]:
def find_purchase_contract_context(file_path, window=500):
    """Collect the text surrounding each "purchase agreement" mention in a file.

    The file is decoded as UTF-8 with undecodable bytes dropped. Every
    occurrence reported by ``find_all`` (case-sensitive, non-overlapping)
    produces one excerpt that runs from ``window`` characters before the
    position where the phrase starts to ``window`` characters after that
    position, clipped to the bounds of the text.

    Args:
        file_path: Path of the file to scan.
        window: Number of characters kept on each side of the position where
            the phrase starts. Defaults to 500.

    Returns:
        list[str]: One excerpt per occurrence, in order of appearance; an
        empty list when the phrase does not occur.

    Raises:
        ValueError: If ``window`` is negative.
    """
    if window < 0:
        raise ValueError("window must be non-negative")

    with codecs.open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
        text = file.read()

    # The file is already closed here; only the in-memory text is sliced.
    text_length = len(text)
    return [
        text[max(0, position - window):min(text_length, position + window)]
        for position in find_all(text, 'purchase agreement')
    ]

In [8]:
# Spot-check the context extractor on a single hard-coded filing path.
file = "sec-data/data/1995_1/10-K/0000893220-95-000104.txt"
# Last expression of the cell, so the returned list of excerpts is what the
# notebook displays as this cell's output.
find_purchase_contract_context(file)

["rs are\nstayed pursuant to Section 362 of the Bankruptcy Code)\n\n       A.   Appalachian Producer Litigation\n\n          1.   Enterprise Energy Corp. et al. v. Columbia Gas Transmission\nCorp., C. A. No. C2-85-1209, (U. S. Dist. Ct., S. D. Ohio, filed July 26,\n1985).  See II.C.1. below\n\n          2.   Phillips Production Co. v. Columbia Gas Transmission Corp., C.A.\nNo. 89-0269, (U.S. Dist. Ct., W.D. Pa. filed February 7, 1989).  The complaint\nas filed contained six separate counts involving ten gas purchase contracts\nwith Columbia Transmission.  All claims except those relating to Columbia\nTransmission's invocation of the cost recovery clause were settled and\ndismissed December 18, 1989, pursuant to agreement of the parties.  Phillips\ncost recovery claim was stayed by Columbia Transmission's bankruptcy filing.\n\n          3.   Columbia Gas Transmission Corp. v. Alamco, Inc. et al., C.A. No.\n88-C-38-2 (Harrison (W.Va) Cir. Ct. filed January 15, 1988).  Under a 1983\nrelea

In [10]:

# Range of filing years to scan (both ends inclusive).
start_year = 2005
end_year = 2005
directories = generate_directories(start_year, end_year)

# Excerpts accumulate across runs in a single CSV: rows already on disk are
# loaded first and the rows found in this run are appended to them.
csv_path = "purchase_contract.csv"
if os.path.exists(csv_path):
    existing_df = pd.read_csv(csv_path)
    # Files written with the row index included come back from read_csv with
    # "Unnamed: N" columns, and every further round trip would add another
    # one. Drop them so the stored schema stays Directory/File/Paragraph.
    kept_columns = [
        column for column in existing_df.columns
        if not str(column).startswith("Unnamed")
    ]
    existing_df = existing_df[kept_columns]
else:
    existing_df = pd.DataFrame()

data = []
for directory in directories:
    print(directory)
    file_list = os.listdir(directory)

    for filename in tqdm(file_list):
        file_path = os.path.join(directory, filename)

        # Only .txt entries are scanned; anything else in the directory is skipped.
        if not file_path.endswith(".txt"):
            continue

        # One output row per excerpt; a file without matches contributes nothing.
        data.extend(
            {'Directory': directory, 'File': filename, 'Paragraph': paragraph}
            for paragraph in find_purchase_contract_context(file_path)
        )

# Explicit columns keep the CSV header in place even when nothing matched, so
# the file can still be read back on the next run.
df = pd.DataFrame(data, columns=['Directory', 'File', 'Paragraph'])

# Leave empty frames out of the concatenation so they cannot influence the
# columns or dtypes of the combined result.
non_empty_frames = [frame for frame in (existing_df, df) if not frame.empty]
combined_df = pd.concat(non_empty_frames, ignore_index=True) if non_empty_frames else df

# index=False: the positional index carries no information, and writing it is
# what made an extra unnamed column appear each time the file was re-read.
combined_df.to_csv(csv_path, index=False)


sec-data/data/2005_1/10-K


100%|██████████| 6531/6531 [03:06<00:00, 35.11it/s] 


sec-data/data/2005_2/10-K


100%|██████████| 855/855 [00:24<00:00, 35.31it/s]


sec-data/data/2005_3/10-K


100%|██████████| 670/670 [00:19<00:00, 35.18it/s] 


sec-data/data/2005_4/10-K


100%|██████████| 535/535 [00:20<00:00, 26.43it/s]


In [29]:

# One summary row per directory; the rows are written to
# count_purchase_contract.csv at the end of the cell.
data = []

for directory in directories:
    # Number of files in this directory with at least one occurrence.
    counter = 0

    # Only .txt entries are scanned, so only they are listed here; this keeps
    # 'Total Files' equal to the number of files actually examined.
    file_list = [name for name in os.listdir(directory) if name.endswith(".txt")]

    for filename in tqdm(file_list):
        file_path = os.path.join(directory, filename)

        # Count occurrences of the phrase searched by find_purchase_contract
        # ("purchase agreement").
        occurrences = find_purchase_contract(file_path)

        # Report and tally only the files in which the phrase occurs.
        if occurrences != 0:
            print(f"File: {filename}, Occurrences: {occurrences}")
            counter += 1

    data.append({'Directory': directory, 'Purchase Contract Mentioned': counter, 'Total Files': len(file_list)})

df = pd.DataFrame(data)
df.to_csv("count_purchase_contract.csv", index=False)


  2%|▏         | 20/1281 [00:00<00:07, 171.16it/s]

File: 0000950144-95-000704.txt, Occurrences: 3
File: 0000950129-95-000094.txt, Occurrences: 1
File: 0000102212-95-000012.txt, Occurrences: 1


  4%|▍         | 50/1281 [00:00<00:12, 96.98it/s] 

File: 0000916641-95-000050.txt, Occurrences: 3
File: 0000018540-95-000045.txt, Occurrences: 2
File: 0000072741-95-000017.txt, Occurrences: 4


  6%|▌         | 75/1281 [00:00<00:11, 103.47it/s]

File: 0000950134-95-000291.txt, Occurrences: 9
File: 0000012400-95-000005.txt, Occurrences: 3


  9%|▊         | 110/1281 [00:01<00:11, 104.22it/s]

File: 0000014407-95-000003.txt, Occurrences: 2
File: 0000017927-95-000007.txt, Occurrences: 3
File: 0000018540-95-000041.txt, Occurrences: 2
File: 0000018675-95-000007.txt, Occurrences: 1
File: 0000018540-95-000042.txt, Occurrences: 2


 10%|█         | 133/1281 [00:01<00:10, 106.20it/s]

File: 0000018808-95-000005.txt, Occurrences: 1
File: 0000950109-95-000991.txt, Occurrences: 1
File: 0000201533-95-000029.txt, Occurrences: 4
File: 0000065984-95-000008.txt, Occurrences: 2


 12%|█▏        | 155/1281 [00:01<00:12, 90.58it/s] 

File: 0000021271-95-000010.txt, Occurrences: 5
File: 0000893220-95-000104.txt, Occurrences: 8


 14%|█▍        | 177/1281 [00:01<00:12, 90.47it/s]

File: 0000016573-95-000003.txt, Occurrences: 1
File: 0000023304-95-000006.txt, Occurrences: 1
File: 0000023738-95-000029.txt, Occurrences: 4
File: 0000024545-95-000003.txt, Occurrences: 1


 14%|█▍        | 183/1281 [00:01<00:11, 99.39it/s]


KeyboardInterrupt: 