In [35]:
import os
import spacy
from spacy.matcher import Matcher
from spacy import displacy
import csv

# spacy.cli.download("en_core_web_sm")
nlp = spacy.load("en_core_web_sm")

input_directory = 'C:/Users/dipasha/Bank Guarantees Text'
annotations_output_directory = 'C:/Users/dipasha/Bank Guarantees Annotations'
csv_output_directory = 'C:/Users/dipasha/Bank Guarantees CSV'

if not os.path.exists(annotations_output_directory):
    os.makedirs(annotations_output_directory)

if not os.path.exists(csv_output_directory):
    os.makedirs(csv_output_directory)

# List all text files in the input directory
text_files = [f for f in os.listdir(input_directory) if f.lower().endswith(('.txt'))]

for text_file in text_files:
    input_path = os.path.join(input_directory, text_file)
    annotations_output_path = os.path.join(annotations_output_directory, os.path.splitext(text_file)[0] + '_annotations.html')
    csv_output_path = os.path.join(csv_output_directory, os.path.splitext(text_file)[0] + '.csv')

    with open(input_path, 'r', encoding='utf-8', errors='ignore') as file:
        text = file.read()

    patterns = {
        "BankName": [{"LOWER": {"in": ["bank", "branch"]}}, {"POS": "PROPN"}],
        "BranchName": [{"LOWER": "branch"}, {"POS": "PROPN"}],
        "Date": [{"LIKE_NUM": True}, {"LOWER": {"in": ["date", "issued"]}}],
        "BorrowerName": [{"LOWER": {"in": ["name", "borrower"]}}, {"POS": "PROPN"}],
        "AccountNumber": [{"TEXT": {"REGEX": r'\b\d{9,12}\b'}}, {"LOWER": {"in": ["account", "number"]}}],
        "GuaranteeType": [{"LOWER": {"in": ["type", "guarantee"]}}],
        "BeneficiaryName": [{"LOWER": {"in": ["name", "beneficiary"]}}, {"POS": "PROPN"}],
        "BeneficiaryAddress": [{"LOWER": {"in": ["address", "beneficiary"]}}],
        "CurrencyAmount": [{"IS_CURRENCY": True}, {"LIKE_NUM": True}],
        "AmountBankGuarantee": [{"LIKE_NUM": True}, {"LOWER": {"in": ["amount", "bank guarantee"]}}],
        "Purpose": [{"LOWER": {"in": ["purpose", "use"]}}],
        "ExpiryDate": [{"LIKE_NUM": True}, {"LOWER": {"in": ["expiry", "date"]}}],
        "ClaimPeriod": [{"LOWER": {"in": ["claim", "period"]}}],
        "BeneficiaryBankIFSC": [{"LOWER": {"in": ["ifsc", "code"]}}],
        "BankGuaranteeTenor": [{"LIKE_NUM": True}, {"LOWER": {"in": ["tenor", "bank guarantee"]}}],}

    doc = nlp(text)

    # Add the patterns to the Matcher
    matcher = Matcher(nlp.vocab)

    # Optional on_match callback function
    def on_match_callback(matcher, doc, id, matches):
        pass  # You can define a callback function here if needed

    for entity, pattern in patterns.items():
        matcher.add(entity, patterns=[pattern], on_match=on_match_callback)

    # Use the Matcher to find entities
    matches = matcher(doc)

    # Extract entities
    entities = [[(doc[start:end].text, doc[start:end].label_) for match_id, start, end in matches]]
    for match_id, start, end in matches:
        entity_text = doc[start:end].text
        entity_label = nlp.vocab.strings[match_id]
        entities.append((entity_text, entity_label))

    # Save annotations
    displacy_html = displacy.render(doc, style='ent', page=True)
    if displacy_html:
        with open(annotations_output_path, 'w', encoding='utf-8') as annotations_output_file:
             annotations_output_file.write(displacy_html)
        print(f"Annotations saved to {annotations_output_path}")
    else:
        print("No entities found for visualization.")

    # Save CSV
    with open(csv_output_path, 'w', encoding='utf-8', newline='') as csv_file:
       csv_writer = csv.writer(csv_file)
       csv_writer.writerow(["Entity", "Label"])  # Write header
       csv_writer.writerows(entities)

    print(f"CSV saved to {csv_output_path}")



No entities found for visualization.
CSV saved to C:/Users/dipasha/Bank Guarantees CSV\1694678403.csv


No entities found for visualization.
CSV saved to C:/Users/dipasha/Bank Guarantees CSV\21115843--quotFinding-of-Violation-and-Order-for-Compliance-EPA-alleges-that-the-City-of-Holbrook-has-violated-its-NPDES-permit-for-the-Painted-Mesa-Water-Reclamation-Facility-epa--x-01-Scanned.csv


No entities found for visualization.
CSV saved to C:/Users/dipasha/Bank Guarantees CSV\21115843--quotFinding-of-Violation-and-Order-for-Compliance-EPA-alleges-that-the-City-of-Holbrook-has-violated-its-NPDES-permit-for-the-Painted-Mesa-Water-Reclamation-Facility-epa--x-01.csv


No entities found for visualization.
CSV saved to C:/Users/dipasha/Bank Guarantees CSV\283909228--Attachment-1132-Form-of-Bank-Guarantee-MIN-ppp-projetopontal-codevasf-gov--x-01-Scanned.csv


No entities found for visualization.
CSV saved to C:/Users/dipasha/Bank Guarantees CSV\283909228--Attachment-1132-Form-of-Bank-Guarantee-MIN-ppp-projetopontal-codevasf-gov--x-01.csv


No entities found for visualization.
CSV saved to C:/Users/dipasha/Bank Guarantees CSV\36776449-Scanned.csv


No entities found for visualization.
CSV saved to C:/Users/dipasha/Bank Guarantees CSV\36776449.csv


No entities found for visualization.
CSV saved to C:/Users/dipasha/Bank Guarantees CSV\advance bank guarentee-page-002-Scanned.csv


No entities found for visualization.
CSV saved to C:/Users/dipasha/Bank Guarantees CSV\advance bank guarentee-page-002.csv


No entities found for visualization.
CSV saved to C:/Users/dipasha/Bank Guarantees CSV\BANK GURANTEE.csv


No entities found for visualization.
CSV saved to C:/Users/dipasha/Bank Guarantees CSV\Bank-Guarantee-Document-1-Scanned.csv


No entities found for visualization.
CSV saved to C:/Users/dipasha/Bank Guarantees CSV\Bank-Guarantee-Document-1.csv


No entities found for visualization.
CSV saved to C:/Users/dipasha/Bank Guarantees CSV\Bank-Guarantee-for-Performence-of-Equipment-Installation-744x1024-Scanned.csv


No entities found for visualization.
CSV saved to C:/Users/dipasha/Bank Guarantees CSV\Bank-Guarantee-for-Performence-of-Equipment-Installation-744x1024.csv


No entities found for visualization.
CSV saved to C:/Users/dipasha/Bank Guarantees CSV\bank-guarantee-Scanned.csv


No entities found for visualization.
CSV saved to C:/Users/dipasha/Bank Guarantees CSV\bank-guarantee.csv


No entities found for visualization.
CSV saved to C:/Users/dipasha/Bank Guarantees CSV\BCB-€1B-Cash-Funds-File-RWA-Letter-MT760-Verbiage-Fraud2-Example1.csv


No entities found for visualization.
CSV saved to C:/Users/dipasha/Bank Guarantees CSV\Fake-Mt799-Swift-Barclays-Scanned.csv


No entities found for visualization.
CSV saved to C:/Users/dipasha/Bank Guarantees CSV\Fake-Mt799-Swift-Barclays.csv


No entities found for visualization.
CSV saved to C:/Users/dipasha/Bank Guarantees CSV\Irrevocable-Corporate-Refund-Undertaking-Bank-Endorsed-ICRU-1_001-Scanned.csv


No entities found for visualization.
CSV saved to C:/Users/dipasha/Bank Guarantees CSV\Irrevocable-Corporate-Refund-Undertaking-Bank-Endorsed-ICRU-1_001.csv


No entities found for visualization.
CSV saved to C:/Users/dipasha/Bank Guarantees CSV\Irrevocable-Corporate-Refund-Undertaking-Updated-Bank-Endorsed-ICRU_001 (1)-Scanned.csv


No entities found for visualization.
CSV saved to C:/Users/dipasha/Bank Guarantees CSV\Irrevocable-Corporate-Refund-Undertaking-Updated-Bank-Endorsed-ICRU_001 (1).csv


No entities found for visualization.
CSV saved to C:/Users/dipasha/Bank Guarantees CSV\Irrevocable-Corporate-Refund-Undertaking-Updated-Bank-Endorsed-ICRU_001-Scanned.csv


No entities found for visualization.
CSV saved to C:/Users/dipasha/Bank Guarantees CSV\Irrevocable-Corporate-Refund-Undertaking-Updated-Bank-Endorsed-ICRU_001.csv


No entities found for visualization.
CSV saved to C:/Users/dipasha/Bank Guarantees CSV\sample-of-letter-of-guarantee1-Scanned.csv


No entities found for visualization.
CSV saved to C:/Users/dipasha/Bank Guarantees CSV\sample-of-letter-of-guarantee1.csv


No entities found for visualization.
CSV saved to C:/Users/dipasha/Bank Guarantees CSV\standard-formate-bank-guarantee-Scanned.csv


No entities found for visualization.
CSV saved to C:/Users/dipasha/Bank Guarantees CSV\standard-formate-bank-guarantee.csv
