In [1]:
import os
import pandas as pd 
from tqdm import tqdm
import re
import codecs

In [10]:
def find_total_matches(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
        occurrences = 0
        occurrences += text.count("purchase agreement")
        occurrences += text.count("purchase contract")
        return occurrences

In [4]:
def generate_directories(start_year, end_year):
    directories = []
    quarters = [1,2,3,4]
    for year in range(start_year, end_year + 1):
        for quarter in quarters:
            directory = f"sec-data/data/{year}_{quarter}/10-K"
            directories.append(directory)
    return directories


In [5]:
def find_all(a_str, sub):
    start = 0
    while True:
        start = a_str.find(sub, start)
        if start == -1: return
        yield start
        start += len(sub)
        
list(find_all('spam spam spam spam', 'spam'))

[0, 5, 10, 15]

In [7]:
def find_purchase_contract_context(file_path):
    pattern = re.compile(r'([^\.]*?purchase agreement[^\.]*\.)', re.IGNORECASE)

    # Initialize list to store contexts
    contexts = []

    with codecs.open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
        text = file.read()

        instances = list(find_all(text, 'purchase agreement'))
        if len(instances) == 0:
            return contexts

        # Find matches in the text
        for instance in instances:
            start_index = max(0, instance - 500)
            end_index = min(len(text), instance + 500)
            context = text[start_index:end_index]
            contexts.append(context)

    return contexts

In [17]:
def find_purchase_all_context(file_path):

    # Initialize list to store contexts
    contexts = []

    with codecs.open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
        text = file.read()

        instances = list(find_all(text, 'purchase agreement'))
        if len(instances) == 0:
            return contexts

        instances += list(find_all(text, 'purchase contract'))

        if len(instances) < 5:
            return contexts

        # Find matches in the text
        for instance in instances:
            start_index = max(0, instance - 300)
            end_index = min(len(text), instance + 300)
            context = text[start_index:end_index]
            contexts.append(context)

    return contexts

In [16]:
file = "sec-data/data/1995_1/10-K/0000893220-95-000104.txt"
find_purchase_all_context(file)



[]

In [26]:

start_year = 1995
end_year = 1995
directories = generate_directories(start_year, end_year)


csv_path = "combined.csv"
if os.path.exists(csv_path):
    existing_df = pd.read_csv(csv_path)
else:
    existing_df = pd.DataFrame()

data = []
for directory in directories:
    print(directory)
    file_list = os.listdir(directory)

    for filename in tqdm(file_list):
        file_path = os.path.join(directory, filename)
        
        if file_path.endswith(".txt"):

            contexts = find_purchase_contract_context(file_path)
            
            if len(contexts) != 0:
                for paragraph in contexts:
                    data.append({'Directory': directory, 'File': filename, 'Paragraph': paragraph})

df = pd.DataFrame(data)

combined_df = pd.concat([existing_df, df], ignore_index=True)
combined_df.to_csv(csv_path, index=True)


sec-data/data/1995_1/10-K


  9%|▉         | 118/1281 [00:18<02:58,  6.52it/s]


KeyboardInterrupt: 

In [25]:

data = []

for directory in directories:
    counter =0

    file_list = os.listdir(directory)

    for filename in tqdm(file_list):
        file_path = os.path.join(directory, filename)
        
        # Check if the file is a text file
        if file_path.endswith(".txt"):
            # Find instances of "purchase contract" in the file
            occurrences = find_purchase_contract_context(file_path)
            
            # Print the filename and the number of occurrences
            if occurrences != 0:
                print(f"File: {filename}, Occurrences: {occurrences}")
                counter += 1
                
    data.append({'Directory': directory, 'Purchase Contract Mentioned': counter, 'Total Files': len(file_list)})

df = pd.DataFrame(data)
df.to_csv("count_purchase_contract.csv", index=False)


  0%|          | 1/1281 [00:01<33:34,  1.57s/it]

File: 0000912057-95-000305.txt, Occurrences: []


  0%|          | 2/1281 [00:02<28:37,  1.34s/it]

File: 0000100122-95-000007.txt, Occurrences: []


  0%|          | 3/1281 [00:03<27:22,  1.29s/it]

File: 0000100166-95-000031.txt, Occurrences: ['uch bank being an "Approved Bank"), in each case with\nmaturities of not more than 270 days from the date of\nacquisition, (iii) commercial paper and variable or fixed rate\nnotes issued by any Approved Bank (or by the parent company\nthereof) or any variable rate notes issued by, or guaranteed by\nany domestic corporation rated A-2 (or the equivalent thereof)\nor better by S&P or P-2 (or the equivalent thereof) or better by\nMoody\'s and maturing within six months of the date of\nacquisition and (iv) repurchase agreements with a bank or trust\ncompany (including the Bank) or recognized securities dealer\nhaving capital and surplus in excess of $500,000,000 for direct\nobligations issued by or fully guaranteed by the United States\nof America in which the Borrower shall have a perfected first\npriority security interest (subject to no other liens or\nencumbrances) and having, on the date of purchase thereof, a\nfair market value of at leas

  0%|          | 4/1281 [00:04<19:44,  1.08it/s]

File: 0000100331-95-000014.txt, Occurrences: []


  0%|          | 5/1281 [00:05<18:34,  1.14it/s]

File: 0000100517-95-000011.txt, Occurrences: []


  0%|          | 6/1281 [00:05<14:15,  1.49it/s]

File: 0000950117-95-000071.txt, Occurrences: []


  1%|          | 8/1281 [00:05<08:18,  2.55it/s]

File: 0000899652-95-000014.txt, Occurrences: []
File: 0000950131-95-000606.txt, Occurrences: []
File: 0000101001-95-000002.txt, Occurrences: []


  1%|          | 11/1281 [00:06<06:03,  3.49it/s]

File: 0000101063-95-000004.txt, Occurrences: []
File: 0000950144-95-000704.txt, Occurrences: []


  1%|          | 13/1281 [00:06<04:29,  4.71it/s]

File: 0000101265-95-000008.txt, Occurrences: []
File: 0000912057-95-000101.txt, Occurrences: []


  1%|          | 14/1281 [00:06<04:13,  4.99it/s]



  1%|          | 16/1281 [00:07<04:31,  4.67it/s]

File: 0000950129-95-000094.txt, Occurrences: []
File: 0000950149-95-000135.txt, Occurrences: ['t company, USL Capital Corporation, whose assets are available first\nand foremost to satisfy the claims of its creditors. Beginning in June 1994, the\nCompany discontinued securitizing additional leases under this arrange-ment to\nreplace the run-off of principal of the leases initially securitized. See Note 2\nof Notes to Consolidated Financial Statements on page 26.\n \n     On April 30, 1992, the Company securitized and sold approximately $94\nmillion in principal amount of receivables under lease-purchase agreements with\nvarious state and local governments. These lease-purchase agreements were\npurchased by the Company from Ford Credit, for whom they had been managed by the\n \n                                       12\n<PAGE>   13\n \nCompany. The Company then immediately transferred them to tax-exempt grantor\ntrusts, which issued tax-exempt, asset-backed certificates to investors in 

  1%|▏         | 18/1281 [00:07<04:07,  5.11it/s]

File: 0000101829-95-000012.txt, Occurrences: []
File: 0000950123-95-000826.txt, Occurrences: [' under limited\ncircumstances generally applicable to acquisitions of troubled institutions.\n \n     FIRREA gives the FDIC as conservator or receiver of a failed depository\ninstitution express authority to repudiate contracts with such institution which\nit determines to be burdensome or if such repudiation will promote the orderly\nadministration of the institution\'s affairs. Certain "qualified financial\ncontracts", defined to include securities contracts, commodity contracts,\nforward contracts, repurchase agreements, and swap agreements, are generally\nexcluded from the repudiation powers of the FDIC. The FDIC is also given\nauthority to enforce contracts made by a depository institution, notwithstanding\nany contractual provision providing for termination, default, acceleration, or\nexercise of rights upon, or solely by reason of, insolvency or the appointment\nof a conservator or rec

  2%|▏         | 20/1281 [00:07<03:32,  5.92it/s]

File: 0000101830-95-000013.txt, Occurrences: []
File: 0000102212-95-000012.txt, Occurrences: ["eir carrying amounts).  The carrying\namounts for variable-rate, fixed-term money market accounts and\ncertificates of deposit approximate their fair values at the\nreporting date.  Fair values for fixed-rate certificates of\ndeposit are estimated using a discounted cash flow calculation\nthat applies interest rates currently being offered on\ncertificates to a schedule of aggregated expected monthly\nmaturities on time deposits.\n\nShort-term borrowings:  The carrying amounts of securities sold\nunder repurchase agreements, and other short-term borrowings\napproximate their fair values.\n\nLong-term debt:  The fair values of the Corporation's long-term\nborrowings (other than deposits) are estimated using discounted\ncash flow analyses, based on the Corporation's current borrowing\nrates for similar types of borrowing arrangements.\n\n\n\nNote 15.  Parent Company Financial Information\n\nCon

  2%|▏         | 22/1281 [00:08<04:51,  4.32it/s]

File: 0000889810-95-000002.txt, Occurrences: []
File: 0000102729-95-000005.txt, Occurrences: []


  2%|▏         | 24/1281 [00:08<03:36,  5.82it/s]

File: 0000102420-95-000009.txt, Occurrences: []
File: 0000897101-95-000015.txt, Occurrences: []


  2%|▏         | 26/1281 [00:09<03:36,  5.78it/s]

File: 0000931302-95-000001.txt, Occurrences: []
File: 0000898430-95-000356.txt, Occurrences: []


  2%|▏         | 28/1281 [00:09<03:41,  5.65it/s]

File: 0000103575-95-000011.txt, Occurrences: []
File: 0000922423-95-000040.txt, Occurrences: ["cquisition, Vishay refinanced all of\nRoederstein's existing bank debt of DM 160,381,000 ($99,062,000).\nFunds to refinance Roederstein's debt were provided by a DM\n104,316,000 term loan with a group of banks, $20,000,000 borrowed\nunder an unsecured credit agreement, and borrowings under an\nexisting line of credit. \n\nEffective January 1, 1992, the Company acquired the worldwide\ntantalum capacitor and U.S. thick film resistor network\nbusinesses of Sprague Technologies, Inc.  Under the terms of the\npurchase agreement, Vishay paid $127,000,000 cash, transferred to\nSprague real property with a fair value of $4,771,000, and\nassumed certain liabilities relating to the businesses.  Vishay\nalso entered into certain ancillary agreements with the seller,\nincluding one-year sales and distribution agreements under which\nVishay received fees of $3,325,000 during 1992, which are\nincluded in o

  2%|▏         | 30/1281 [00:09<03:44,  5.58it/s]

File: 0000916641-95-000050.txt, Occurrences: []
File: 0000950123-95-000109.txt, Occurrences: []


  2%|▏         | 31/1281 [00:10<03:31,  5.90it/s]

File: 0000103973-95-000003.txt, Occurrences: []


  2%|▏         | 32/1281 [00:10<05:30,  3.78it/s]

File: 0000010427-95-000006.txt, Occurrences: []


  3%|▎         | 33/1281 [00:10<06:08,  3.38it/s]

File: 0000912057-95-001626.txt, Occurrences: ["      are, at the time of acquisition, rated at least A by\n                    Standard & Poor's Corporation or Moody's Investors Service,\n                    Inc.;\n\n               (2)  Repurchase Agreements with any domestic bank with debt rated\n                    'AA' or better by Standard & Poor's Corporation, or any\n                    foreign bank rated  at least 'AA' by Standard & Poor's\n                    Corporation and 'Aa' by Moody's Investors Service, Inc.; or\n                    repurchase agreements with such other Persons on such terms\n                    as the Company and the Agent shall agree in writing;\n                    provided the term of all such repurchase agreements is for\n                    one year or less;\n\n               (3)  Direct obligations of the United States of America, or\n                    Investments in any Person, which Investments are guaranteed\n                    by the full fa

  3%|▎         | 34/1281 [00:11<08:14,  2.52it/s]

File: 0000950131-95-000701.txt, Occurrences: []


  3%|▎         | 36/1281 [00:11<06:16,  3.30it/s]

File: 0000950133-95-000180.txt, Occurrences: []


  3%|▎         | 37/1281 [00:12<06:27,  3.21it/s]

File: 0000104867-95-000002.txt, Occurrences: []


  3%|▎         | 38/1281 [00:12<06:53,  3.00it/s]

File: 0000950117-95-000057.txt, Occurrences: ['5 had no impact on earnings since all\nnonequity securities are categorized as "held-to-maturity" and,\naccordingly, continue to be carried at amortized cost.  At December\n31, 1994 and 1993, respectively, gross unrealized gains were $.4 and\n$4.3.  Gross unrealized losses were $4.0 at December 31, 1994.  The\ninvestment securities portfolio was comprised of negotiable\ncertificates of deposit, Puerto Rico government bonds, guaranteed\ncollateralized mortgage obligations and Ginnie Mae certificates,\nrepurchase agreements and short-term U.S. dollar-linked Mexican\ngovernment bonds.  Equity securities, categorized as "available-for-\nsale," were immaterial.  \n\nThe investment securities (mentioned above) were reported in the\nfollowing balance sheet categories:\n\n<TABLE>\n<CAPTION>\n- -------------------------------------------------------------------\nDecember 31,                             1994                  1993\n- ----------------

  3%|▎         | 39/1281 [00:13<08:21,  2.48it/s]

File: 0000912057-95-001533.txt, Occurrences: ['7)        (46)        (129)         (56)     (185)\n     Certificates of deposit                                      (1)         (1)         (2)          (8)          (1)       (9)\n     Other time deposits                                          (1)          1          --           (1)          --        (1)\n     Deposits in foreign offices                                  44          --          44           (1)          (2)       (3)\n  Federal funds purchased and securities\n     sold under repurchase agreements                             45          25          70           (8)          (4)      (12)\n  Commercial paper and other short-term borrowings                 1           3           4           (1)          (2)       (3)\n  Senior debt                                                    (12)         11          (1)          --          (23)      (23)\n  Subordinated debt                                              (26)    

  3%|▎         | 41/1281 [00:13<06:16,  3.30it/s]

File: 0000950115-95-000063.txt, Occurrences: []
File: 0000950150-95-000229.txt, Occurrences: []


  3%|▎         | 42/1281 [00:14<07:39,  2.70it/s]

File: 0000105982-95-000052.txt, Occurrences: ['fset by strong growth  in  higher-yielding\nloans, interest bearing time deposits due from banks\ndeclined  $962 million (71.7%) to an average of $380\nmillion in 1994. This follows a decline of $886 million\n(39.8%) in 1993 and an increase of $471 million (26.8%) in\n1992. The level of the  interbank placement  of funds\ntypically  expands  and contracts in conjunction  with the\nliquidity  and  yields  available in the  market  and\nalternative investments  for  such funds. Federal funds\nsold and   repurchase agreements averaged $471 million in\n1994, a decline of $811 million (63.3%). This follows a\ndecline of 24.9% in 1993 and an increase of 20.0%  in\n1992.  It  is expected that these  earning assets  will\nessentially remain flat during 1995.\n\nAt December 31, 1994, interest bearing time deposits due\nfrom banks totaled  $26 million, a substantial decline of\n$1.1 billion (97.8%) from the level a year earlier. At the\nsame time, f

  3%|▎         | 43/1281 [00:14<07:06,  2.90it/s]

File: 0000018540-95-000045.txt, Occurrences: ['ext  two  years  as contractual standards are met.   Additionally,\n  CSW Development-I, Inc. has entered into a fixed price contract  to\n  construct   the  Mulberry  thermal  host  facility.   The   maximum\n  potential  liability  under  this  fixed  price  contract  is   $14\n  million.  The thermal host facility is expected to be completed  by\n  the  first quarter of 1995.  CSW has provided additional guarantees\n  to the project totaling approximately $57 million.\n\n  CSWE  has  entered  into a purchase agreement  on  the  Ft.  Lupton\n  project  to provide $79.5 million of equity upon the occurrence  of\n  certain events.  As of January 9, 1995, $43 million has been  paid.\n  CSWE  has  provided three letters of credit to the project totaling\n  $14.3  million.   During March 1995, CSWE closed permanent  project\n  financing  on  the  Ft.  Lupton facility  in  the  amount  of  $208\n  million.\n\n  CSWE  has  committed to provide 




KeyboardInterrupt: 

In [14]:
import os
from pathlib import Path


path = Path("sec-data\data/2015_1/10-K/html") 
file = "edgar/data/1315257/0001193125-15-073376.txt"
a = file.split('/')

b = path / f"{a[-2]}_{a[-1]}"

b

WindowsPath('sec-data/data/2015_1/10-K/html/1315257_0001193125-15-073376.txt')