In [7]:
import os
import pandas as pd 
from tqdm import tqdm
import re
import codecs

In [1]:
def find_total_matches(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
        occurrences = 0
        occurrences += text.count("purchase agreement")
        #occurrences += text.count("purchase contract")
        return occurrences

In [13]:
def generate_directories(start_year, end_year):
    directories = []
    quarters = [1,2,3,4]
    for year in range(start_year, end_year + 1):
        for quarter in quarters:
            directory = f"sec-data/data/{year}_{quarter}/10-K"
            directories.append(directory)
    return directories


In [2]:
def find_all(a_str, sub):
    start = 0
    while True:
        start = a_str.find(sub, start)
        if start == -1: return
        yield start
        start += len(sub)
        
list(find_all('spam spam spam     spam', 'spam'))

[0, 5, 10, 19]

In [12]:
def find_purchase_contract_context(file_path):
    pattern = re.compile(r'([^\.]*?purchase agreement[^\.]*\.)', re.IGNORECASE)

    # Initialize list to store contexts
    contexts = []

    with codecs.open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
        text = file.read()

        instances = list(find_all(text, 'purchase agreement'))
        if len(instances) == 0:
            return contexts

        # Find matches in the text
        for instance in instances:
            start_index = max(0, instance - 500)
            end_index = min(len(text), instance + 500)
            context = text[start_index:end_index]
            contexts.append(context)

    return contexts

In [9]:
def find_purchase_all_context(file_path):

    # Initialize list to store contexts
    contexts = []

    with codecs.open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
        text = file.read()

        instances = list(find_all(text, 'purchase contract'))
        if len(instances) == 0:
            return contexts

        '''
        instances += list(find_all(text, 'purchase agreement'))

        if len(instances) < 5:
            return contexts
        '''
        
        # Find matches in the text
        for instance in instances:
            start_index = max(0, instance - 300)
            end_index = min(len(text), instance + 300)
            context = text[start_index:end_index]
            contexts.append(context)

    return contexts

In [10]:
file = "sec-data/data/1995_1/10-K/0000893220-95-000104.txt"
find_purchase_all_context(file)

["C2-85-1209, (U. S. Dist. Ct., S. D. Ohio, filed July 26,\n1985).  See II.C.1. below\n\n          2.   Phillips Production Co. v. Columbia Gas Transmission Corp., C.A.\nNo. 89-0269, (U.S. Dist. Ct., W.D. Pa. filed February 7, 1989).  The complaint\nas filed contained six separate counts involving ten gas purchase contracts\nwith Columbia Transmission.  All claims except those relating to Columbia\nTransmission's invocation of the cost recovery clause were settled and\ndismissed December 18, 1989, pursuant to agreement of the parties.  Phillips\ncost recovery claim was stayed by Columbia Transmission's b",
 'are stayed as to Columbia Transmission; indemnification agreements will be\neffective if the contract providing indemnification is not rejected)\n\n          1.  Royalty Owners Litigation: The agreements between Columbia\nTransmission and certain southwest producers effective in 1985 which reformed\ngas purchase contracts have resulted in a number of lawsuits against the\nproducers

In [14]:

start_year = 1995
end_year = 1995
directories = generate_directories(start_year, end_year)


csv_path = "combined.csv"
if os.path.exists(csv_path):
    existing_df = pd.read_csv(csv_path)
else:
    existing_df = pd.DataFrame()

data = []
for directory in directories:
    print(directory)
    file_list = os.listdir(directory)

    for filename in tqdm(file_list):
        file_path = os.path.join(directory, filename)
        
        if file_path.endswith(".txt"):

            contexts = find_purchase_contract_context(file_path)
            
            if len(contexts) != 0:
                for paragraph in contexts:
                    data.append({'Directory': directory, 'File': filename, 'Paragraph': paragraph})

df = pd.DataFrame(data)

combined_df = pd.concat([existing_df, df], ignore_index=True)
combined_df.to_csv(csv_path, index=True)


sec-data/data/1995_1/10-K


100%|██████████| 1281/1281 [00:14<00:00, 87.11it/s]


sec-data/data/1995_2/10-K


100%|██████████| 253/253 [00:02<00:00, 104.05it/s]


sec-data/data/1995_3/10-K


100%|██████████| 305/305 [00:03<00:00, 84.85it/s] 


sec-data/data/1995_4/10-K


100%|██████████| 339/339 [00:04<00:00, 77.05it/s] 


In [15]:

data = []

for directory in directories:
    counter =0

    file_list = os.listdir(directory)

    for filename in tqdm(file_list):
        file_path = os.path.join(directory, filename)
        
        # Check if the file is a text file
        if file_path.endswith(".txt"):
            # Find instances of "purchase contract" in the file
            occurrences = find_purchase_contract_context(file_path)
            
            # Print the filename and the number of occurrences
            if occurrences != 0:
                print(f"File: {filename}, Occurrences: {occurrences}")
                counter += 1
                
    data.append({'Directory': directory, 'Purchase Contract Mentioned': counter, 'Total Files': len(file_list)})

df = pd.DataFrame(data)
df.to_csv("count_purchase_contract.csv", index=False)


  0%|          | 0/1281 [00:00<?, ?it/s]

 17%|█▋        | 215/1281 [00:00<00:00, 1085.00it/s]

File: 0000912057-95-000305.txt, Occurrences: []
File: 0000100122-95-000007.txt, Occurrences: []
File: 0000100166-95-000031.txt, Occurrences: ['uch bank being an "Approved Bank"), in each case with\nmaturities of not more than 270 days from the date of\nacquisition, (iii) commercial paper and variable or fixed rate\nnotes issued by any Approved Bank (or by the parent company\nthereof) or any variable rate notes issued by, or guaranteed by\nany domestic corporation rated A-2 (or the equivalent thereof)\nor better by S&P or P-2 (or the equivalent thereof) or better by\nMoody\'s and maturing within six months of the date of\nacquisition and (iv) repurchase agreements with a bank or trust\ncompany (including the Bank) or recognized securities dealer\nhaving capital and surplus in excess of $500,000,000 for direct\nobligations issued by or fully guaranteed by the United States\nof America in which the Borrower shall have a perfected first\npriority security interest (subject to no other lien

 25%|██▌       | 324/1281 [00:00<00:00, 1046.86it/s]

File: 0000029917-95-000005.txt, Occurrences: []
File: 0000029924-95-000002.txt, Occurrences: []
File: 0000891092-95-000025.txt, Occurrences: []
File: 0000003000-95-000003.txt, Occurrences: ["pany has available $30,000,000 of financing under uncommitted\nmoney market lines of credit with several banks.  These facilities bear\ninterest at rates that vary with the banks' cost of funds and are typically\nless than the prevailing bank prime rate.  These credit lines are used in\nconjunction with the revolving credit agreement to facilitate settlement\nand accommodate short-term borrowing fluctuations.\n     Note payable to vendor consists of a non-interest bearing financing\nprovided through a purchase agreement with a vendor for the acquisition of\nan aircraft.\n     The Company has classified the borrowings outstanding under the money\nmarket lines of credit and note payable to vendor as long-term.  These\namounts will be refinanced under the revolving credit agreement.\n     The Company'

 51%|█████     | 651/1281 [00:00<00:00, 1386.74it/s]

File: 0000950124-95-000944.txt, Occurrences: []
File: 0000059558-95-000020.txt, Occurrences: []
File: 0000950150-95-000182.txt, Occurrences: []
File: 0000914039-95-000029.txt, Occurrences: []
File: 0000060195-95-000001.txt, Occurrences: []
File: 0000060086-95-000007.txt, Occurrences: []
File: 0000060302-95-000013.txt, Occurrences: []
File: 0000060512-95-000006.txt, Occurrences: []
File: 0000892917-95-000074.txt, Occurrences: []
File: 0000060653-95-000004.txt, Occurrences: []
File: 0000912057-95-001802.txt, Occurrences: []
File: 0000950152-95-000480.txt, Occurrences: []
File: 0000950109-95-000969.txt, Occurrences: []
File: 0000899243-95-000116.txt, Occurrences: []
File: 0000061425-95-000002.txt, Occurrences: []
File: 0000061611-95-000005.txt, Occurrences: [" to reflect\nan expectation that simple fluctuations in actual avoided costs would not\nrender a QF contract out of compliance with PURPA, but that a continuous and\nsubstantial discrepancy between actual and estimated avoided costs 

 71%|███████▏  | 915/1281 [00:00<00:00, 1091.05it/s]

File: 0000074931-95-000006.txt, Occurrences: []
File: 0000749502-95-000003.txt, Occurrences: ["on Center Magnetic Imaging, an investor group comprised of\nlocal physicians, the general partner of which is Magnetic\nImaging of Santa Ana, Inc.  The joint venture agreement provides\nfor the establishment and operation of a diagnostic imaging\ncenter located in Costa Mesa, California.  Concurrent with the\nsigning of the joint venture agreement, Registrant entered into a\nPurchase and Assumption agreement with CMR, whereby Registrant\nassumed CMR's obligations under facility contracts and equipment\npurchase agreements and reimbursed CMR for certain costs incurred\nduring the development phase of the venture. Registrant also\nentered into an Assignment of Lease, as amended, for the premises\nunder which Registrant assumed all obligations as lessee from\nCMR.  The lease was scheduled to expire on May 23, 1995 and\nrequired a monthly base rental of $8,890 (see below).\n\nRegistrant also ente

 80%|████████  | 1029/1281 [00:00<00:00, 1063.55it/s]

File: 0000908737-95-000026.txt, Occurrences: ['ht-line basis over the estimated useful lives ranging up to 40\n   years.  If the estimated net realizable value of an investment is less\n   than the carrying value, an allowance for possible investment loss is\n   established.  The determination of net realizable value includes\n   consideration of many factors including income to be earned from the\n   investment, holding costs, estimated selling prices, and prevailing\n   economic conditions.\n\n       Cash and cash equivalents.  Cash, over-night repurchase agreements\n   and short-term investments with maturities of three months or less at\n   date of purchase are carried at cost plus accrued interest.\n\n       Deferred interest and finance costs.  Costs incurred to secure\n   certain borrowings are capitalized and amortized over the terms of their\n   respective loans.  \n\n       Interest rate hedging arrangements.  The Company enters into\n   interest rate hedging arrangements to 

100%|██████████| 1281/1281 [00:01<00:00, 1090.58it/s]


File: 0000897069-95-000017.txt, Occurrences: []
File: 0000928385-95-000075.txt, Occurrences: []
File: 0000950129-95-000275.txt, Occurrences: []
File: 0000854727-95-000002.txt, Occurrences: []
File: 0000950152-95-000432.txt, Occurrences: []
File: 0000854094-95-000002.txt, Occurrences: []
File: 0000854884-95-000001.txt, Occurrences: ['ng company subsidiaries, Chicago and North\nWestern Acquisition Corp. and CNW Corporation, were eliminated by merger.\n\n\nRecent Developments - Transaction with Union Pacific Corporation\n\n     On March 10, 1995, the Company and Union Pacific Corporation ("Union\nPacific") announced that they had agreed that Union Pacific will acquire 100%\nof the Company\'s common stock at a price of $35 per share in cash, subject,\namong other things, to negotiation and execution of a mutually satisfactory\ndefinitive purchase agreement and approvals by the respective boards of\ndirectors of the Company and Union Pacific.  On March 16, 1995, the respective\nboards of di

  0%|          | 0/253 [00:00<?, ?it/s]

File: 0000891554-95-000083.txt, Occurrences: ['   -0-              (2)           -0-        -0-        -0-        -0-\nTreas.\n</TABLE>\n\n\n                                      -31-\n<PAGE>\n\n\n- ----------\n(1)   Includes,  for the years to which this footnote  applies,  life  insurance\n      premiums ($59,584 for Mr. Wolk in fiscal 1995) and amounts credited to Mr.\n      Wolk  and Mr.  Jackowitz  under  their  respective  employment  agreements\n      against  monthly  payments owed to URT under their  respective  promissory\n      notes and stock purchase agreements, all as described below.\n\n(2)   Pursuant to applicable rules,  information is not included with respect to\n      other annual  compensation  which does not exceed the lesser of $50,000 or\n      10% of the salary and bonus reported for the named executive officer.\n\n\nEmployment Contracts\n\n         On April  3,  1994,  when the 1995  fiscal  year  began,  Mr.  Wolk was\nemployed  by URT  under an  employment  

 51%|█████▏    | 130/253 [00:00<00:00, 1289.80it/s]

File: 0000950116-95-000272.txt, Occurrences: []
File: 0000070415-95-000010.txt, Occurrences: []
File: 0000950109-95-002567.txt, Occurrences: []
File: 0000912057-95-005004.txt, Occurrences: []
File: 0000891554-95-000084.txt, Occurrences: ["ld have received if he had survived and not become disabled would be\naccellerated to the date of death or disability. Such credits are required to be\npaid both during his  employment  and  consulting  periods until the  promissory\nnotes against which they are to be applied have been paid in full.  Such credits\nrepresent  monthly  amounts  which are payable by him under  certain  promissory\nnotes.  The credits under the 1992  Agreement had also been applied  against his\nobligations under a stock purchase agreement described below.\n\n         Mr.  Jackowitz' 1994 Agreement  retained  provisions  which were in his\n1992 Agreement  requiring PEC,  during his period of employment,  to furnish him\nwith an  automobile,  reimburse  him for business  e

100%|██████████| 253/253 [00:00<00:00, 1019.37it/s]


File: 0000814580-95-000003.txt, Occurrences: []
File: 0000950109-95-002477.txt, Occurrences: []
File: 0000950131-95-001777.txt, Occurrences: []
File: 0000817632-95-000004.txt, Occurrences: ["        of the Company's Form 10-K/A Amendment No. 1\n                              for December 31, 1992.\n\n            10.5              Form of Assent to Plan for a Common Law\n                              Composition of all Non-Bank Creditors of\n                              Registrant.  Incorporated by reference from\n                              Exhibit 10.7 of the Company's Form 10-K/A\n                              Amendment No. 1 for December 31, 1992.\n\n            10.6              Asset purchase agreement by and between Road\n                              and Show East, Inc. and Shared Technologies\n                              Cellular, Inc. Incorporated by reference from\n                              Exhibit 10.8 of the Company's Form 10-K/A\n                              Amend

  0%|          | 0/305 [00:00<?, ?it/s]

File: 0000100378-95-000013.txt, Occurrences: []
File: 0000950112-95-002435.txt, Occurrences: []
File: 0000102588-95-000022.txt, Occurrences: []
File: 0000906601-95-000012.txt, Occurrences: []
File: 0000950109-95-003953.txt, Occurrences: []
File: 0000950109-95-003715.txt, Occurrences: []
File: 0000950152-95-002236.txt, Occurrences: []
File: 0000892569-95-000540.txt, Occurrences: []
File: 0000107140-95-000015.txt, Occurrences: []
File: 0000896463-95-000121.txt, Occurrences: []
File: 0000107681-95-000016.txt, Occurrences: []
File: 0000012779-95-000029.txt, Occurrences: []
File: 0000108703-95-000005.txt, Occurrences: []
File: 0000950152-95-001893.txt, Occurrences: []
File: 0000950124-95-003099.txt, Occurrences: []
File: 0000950131-95-001905.txt, Occurrences: []
File: 0000017843-95-000018.txt, Occurrences: []
File: 0000912057-95-006316.txt, Occurrences: []
File: 0000950116-95-000422.txt, Occurrences: []
File: 0000950124-95-003067.txt, Occurrences: ['I\nto the Company to secure the TSI Note 

 38%|███▊      | 116/305 [00:00<00:00, 1157.93it/s]

File: 0000891618-95-000564.txt, Occurrences: [" finished products. In June 1995, the Company entered into a lease\nagreement for an additional manufacturing facility being constructed at its San\nJose campus site. In August 1995, the Company leased two additional facilities\nadjacent to its San Jose campus site. The Company is also cross-training\npersonnel, so that it can respond to changes in product mix by reallocating\npersonnel in addition to hiring.\n\n     The Company has been working with key vendors to improve inventory\nmanagement. Volume purchase agreements and just-in-time delivery schedules have\nreduced both inventory levels and costs. The Company's manufacturing engineers,\nin conjunction with key vendors, are improving the manufacturability and\nreliability of the new wafer and reticle inspection systems.\n\n     Many of the components and subassemblies are standard products, although\ncertain items are made to Company specifications. Certain of the components and\nsuba

 76%|███████▌  | 232/305 [00:00<00:00, 1034.57it/s]

File: 0000950152-95-001886.txt, Occurrences: []
File: 0000800459-95-000010.txt, Occurrences: []
File: 0000801558-95-000002.txt, Occurrences: []
File: 0000898430-95-001338.txt, Occurrences: []
File: 0000912057-95-007997.txt, Occurrences: ["------------   ------------\n                                           ------------   ------------\n</TABLE>\n\nManagement has determined, based on the Company's history of operating earnings\nand its expectations for the future, that operating income of the Company will\nmore likely than not be sufficient to realize fully these deferred tax assets.\n\n(7) REPURCHASE AGREEMENTS AND CREDIT RISK\nAs is customary in the manufactured housing industry, the Company is\ncontingently liable under the terms of repurchase agreements with financial\ninstitutions providing inventory financing for dealers of the Company's homes.\nAlthough the total contingent liability approximates $41.0 million at July 1,\n1995, the risk of loss is spread over numerous dealers a

100%|██████████| 305/305 [00:00<00:00, 973.77it/s] 


File: 0000950134-95-002110.txt, Occurrences: []
File: 0000912057-95-007823.txt, Occurrences: []
File: 0000916002-95-000015.txt, Occurrences: []
File: 0000083573-95-000006.txt, Occurrences: []
File: 0000840826-95-000021.txt, Occurrences: []
File: 0000950131-95-002330.txt, Occurrences: ['basis.  The\nRecapitalization was completed on November 2, 1993.\n\nThe Recapitalization allowed the Company to make significant improvements to its\noperating cost structure.  Until November 1993, the Company operated its own\nphosphate ore mining operation through the Conda Partnership, which has the\nrights to approximately 60-70 million tons of proven ore reserves, sufficient\nfor nearly 40 years of operations at present ore consumption rates.  In fiscal\n1994, the Company entered into a seven year purchase agreement with Rhone-\nPoulenc Basic Chemicals Company, a division of Rhone-Poulenc, Inc. ("RP") to\npurchase phosphate ore from a 20-year deposit owned by RP near the Conda Plant\n(the "RP Agreem

  0%|          | 0/339 [00:00<?, ?it/s]

File: 0000100712-95-000016.txt, Occurrences: []
File: 0000102710-95-000022.txt, Occurrences: []
File: 0000103595-95-000006.txt, Occurrences: []
File: 0000104207-95-000004.txt, Occurrences: []
File: 0000101357-95-000006.txt, Occurrences: ['\ndirectors. No such change has been made as of June 30, 1995.  For \nfinancial statement purposes, preferred dividends which accrued during the \nperiod are deducted from the results of operations in determining loss \napplicable to common shares whether or not such dividends are paid or \ndeclared. Liquidation value of this series of preferred stock during the \nfirst year after issuance was $35.26 a share and will increase over a 10 \nyear period to $100 a share.\n\nPursuant to the terms of the stock purchase agreement dated August 18, \n1982, as amended, between the Company and a corporation wholly-owned by \nthe President of the Company, the Company agreed to sell 1,000,000 shares \nof its Common Stock in four annual installments of 250,000 share

 28%|██▊       | 95/339 [00:00<00:00, 946.16it/s]

File: 0000903893-95-000061.txt, Occurrences: []
File: 0000950135-95-002535.txt, Occurrences: ['D SHORT-TERM INVESTMENTS:\n\nThe Company applies Statement of Financial Accounting Standards No. 115\n"Accounting for Certain Investments in Debt and Equity Securities" ("SFAS No.\n115"). The Company considers all highly liquid investments with original\nmaturities of three months or less at the time of acquisition to be cash\nequivalents . Cash equivalents consist of money market funds, repurchase\nagreements and debt securities at August 31, 1995. Cash equivalents consist of\nmoney market funds and repurchase agreements at August 31, 1994. The Company has\n$784,471 and $94,674 in restricted cash at August 31, 1995 and 1994,\nrespectively, in connection with certain capital lease obligations. (See Note\n7).\n\nThe Company\'s portfolio of investments are marketable securities classified as\nheld-to-maturity (recorded at amortized cost). Marketable securities consisted\nof commercial paper and

 56%|█████▌    | 190/339 [00:00<00:00, 865.30it/s]

File: 0000912057-95-011125.txt, Occurrences: []
File: 0000074046-95-000013.txt, Occurrences: []
File: 0000744106-95-000012.txt, Occurrences: []
File: 0000950134-95-002574.txt, Occurrences: []
File: 0000745026-95-000026.txt, Occurrences: []
File: 0000889812-95-000784.txt, Occurrences: ["roperties as its primary source of\nliquidity. During the year ended September 30, 1995, the Radisson South\ngenerated positive cash flow while the Somerset Marriott Hotel generated\nnegative cash flow due to significant property improvements. The Holiday Inn\nCrowne Plaza, owned by the unconsolidated joint venture, experienced positive\ncash flow during the year ended September 30, 1995. Working capital reserves are\nusually invested in United States Treasury obligations, money market accounts\nand repurchase agreements secured by United States Treasury obligations.\n\n     Registrant distributed $32,334,000 to the limited partners ($354.99 per\nlimited partnership unit) and $660,000 to the General Part

 82%|████████▏ | 277/339 [00:00<00:00, 768.67it/s]

File: 0000008154-95-000035.txt, Occurrences: []
File: 0000914233-95-000037.txt, Occurrences: []
File: 0000950115-95-000456.txt, Occurrences: []
File: 0000081870-95-000075.txt, Occurrences: []
File: 0000892569-95-000741.txt, Occurrences: []
File: 0000930661-95-000501.txt, Occurrences: []
File: 0000819539-95-000017.txt, Occurrences: []
File: 0000909654-95-000102.txt, Occurrences: ["empts\n         to  manage  this  risk  and  utilizes   off-balance   sheet   financial\n         instruments to a limited extent to manage its risks.\n\n         Home Federal obtains advances from the FHLB-NY upon the security of its\n         residential  mortgage  loans  and  mortgage-backed   securities.   Such\n         advances are made pursuant to several  different credit programs,  each\n         of which has its own interest rate and range of maturities.\n\n         Home Federal also employs repurchase agreements as a means of borrowing\n         funds.  It is the Savings Bank's policy to enter into 

100%|██████████| 339/339 [00:00<00:00, 813.64it/s]
