In [2]:
!pip uninstall nltk -y
!pip install nltk==3.8.1

Found existing installation: nltk 3.8.1
Uninstalling nltk-3.8.1:
  Successfully uninstalled nltk-3.8.1
Collecting nltk==3.8.1
  Using cached nltk-3.8.1-py3-none-any.whl.metadata (2.8 kB)
Using cached nltk-3.8.1-py3-none-any.whl (1.5 MB)
Installing collected packages: nltk
Successfully installed nltk-3.8.1


In [3]:
!pip install PyPDF2 pandas



In [4]:
import re
import nltk
import pandas as pd
from PyPDF2 import PdfReader
from nltk.tokenize import sent_tokenize

# Fetch the Punkt tokenizer data used by sent_tokenize (per NLTK docs);
# the call reports "already up-to-date" when the package is present locally.
nltk.download('punkt')


[nltk_data] Downloading package punkt to /Users/marsanto/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
# Helper functions
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path`` as one string.

    Pages whose ``extract_text()`` result is falsy (``None`` or ``""``) are
    skipped. Each remaining page's text is followed by a single space, so a
    non-empty result always ends with a space.
    """
    page_texts = (page.extract_text() for page in PdfReader(pdf_path).pages)
    return "".join(chunk + " " for chunk in page_texts if chunk)

def clean_text(text):
    """Collapse every run of whitespace in ``text`` to one space and trim the ends."""
    return re.sub(r"\s+", " ", text).strip()

def extract_sentences(text):
    """Split ``text`` into sentences using NLTK's ``sent_tokenize``."""
    sentence_list = sent_tokenize(text)
    return sentence_list

def is_high_risk(sentence):
    """Report whether ``sentence`` contains any high-risk trigger phrase.

    Matching is a case-insensitive substring test against a fixed phrase
    list. Returns ``True`` as soon as one phrase is found, else ``False``.
    """
    trigger_phrases = (
        "loss of customer",
        "significant customer",
        "limited number of customers",
        "limited number of suppliers",
        "depend on",
        "rely on",
        "material adverse effect",
        "our largest customer",
        "one or more customers",
        "critical supplier",
        "top customer",
        "major customer",
        "accounted for more than",
    )

    lowered = sentence.lower()
    for phrase in trigger_phrases:
        if phrase in lowered:
            return True
    return False

def extract_type(sentence):
    """Label ``sentence`` as "supplier" or "customer" by keyword.

    The check is a case-insensitive substring test. "supplier" takes
    precedence when both words occur; "" is returned when neither does.
    """
    lowered = sentence.lower()
    # Order matters: "supplier" is tested before "customer".
    for label in ("supplier", "customer"):
        if label in lowered:
            return label
    return ""

def extract_percentage(sentence):
    """Return the first percentage figure in ``sentence``, or "" if none is found.

    A figure is one to three integer digits, optionally followed by a decimal
    point and one or two fractional digits, then ``%`` (e.g. "10%", "13.3%").
    Whole-number percentages are accepted; previously a decimal point was
    mandatory, so phrases such as "more than 10% of revenue" yielded "".

    The lookbehinds prevent a match from starting in the middle of a longer
    number, so a figure that does not fit the shape above (e.g. "13.345%" or
    "1234.5%") is ignored rather than reported as a truncated value.
    """
    match = re.search(r'(?<!\d)(?<!\d\.)(\d{1,3}(?:\.\d{1,2})?%)', sentence)
    return match.group(1) if match else ""


In [6]:
# Run the extraction pipeline on one PDF and collect the flagged sentences
pdf_path = "/Users/marsanto/DevProjects/python/risk-extraction/data/Comfort Systems.pdf"  # using 1 pdf for testing
raw_text = extract_text_from_pdf(pdf_path)
cleaned_text = clean_text(raw_text)
sentences = extract_sentences(cleaned_text)

results = []

for sentence in sentences:
    # Only sentences that mention a customer or a supplier are candidates.
    if not any(keyword in sentence.lower() for keyword in ("customer", "supplier")):
        continue

    # Keep the sentence only when a high-risk trigger phrase is present.
    risk_flag = is_high_risk(sentence)
    if not risk_flag:
        continue

    results.append(dict(
        sentence=sentence.strip(),
        percentage=extract_percentage(sentence),
        type=extract_type(sentence),
        is_high_risk=risk_flag,
    ))


In [None]:
import os
from datetime import datetime

# Create a timestamped filename so repeated runs never overwrite each other
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
output_folder = "/Users/marsanto/DevProjects/python/risk-extraction/output"
output_filename = f"risk_sentences_{timestamp}.csv"
output_path = os.path.join(output_folder, output_filename)

# to_csv does not create missing directories, so make sure the folder exists
os.makedirs(output_folder, exist_ok=True)

# Save results to CSV. Columns are named explicitly so that an empty result
# list still produces a file with the expected header row.
df = pd.DataFrame(results, columns=["sentence", "percentage", "type", "is_high_risk"])
df.to_csv(output_path, index=False)

print(f"Saved to: {output_path}")
df.head()


✅ Saved to: /Users/marsanto/DevProjects/python/risk-extraction/output/risk_sentences_2025-07-01_22-57-48.csv


Unnamed: 0,sentence,percentage,type,is_high_risk
0,Sales and Marketing We have a diverse customer...,13.3%,customer,True
1,Our largest customer can change from year to y...,,customer,True
2,"Although we have7/1/25, 9:07 PM Comfort System...",,customer,True
3,A loss of business from a significant customer...,,customer,True
4,We hire third-party subcontractors to perform ...,,supplier,True
