<a href="https://colab.research.google.com/github/cherypallysaisurya/ResuVerse/blob/main/dataextraction%26matchingscore.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install PyMuPDF pytesseract opencv-python-headless Pillow transformers torch

Collecting PyMuPDF
  Downloading pymupdf-1.25.4-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu

### Code for extraction ###

In [None]:
import os
import re
import fitz  # PyMuPDF
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import logging
import nltk
from nltk.tokenize import sent_tokenize
from concurrent.futures import ThreadPoolExecutor
import time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import nltk
# sent_tokenize needs the Punkt sentence models. Recent NLTK releases look them up
# as 'punkt_tab', older ones as 'punkt', so make sure both are present and only
# download what is actually missing.
for _nltk_resource in ('punkt_tab', 'punkt'):
    try:
        nltk.data.find(f'tokenizers/{_nltk_resource}')
    except LookupError:
        nltk.download(_nltk_resource, quiet=True)

# Emit INFO-and-above records with a "timestamp - level - message" prefix and
# expose a module-level logger for the functions in this cell.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Precompiled patterns applied by clean_text().
REGEX_NEWLINES = re.compile(r'\n{3,}')  # runs of three or more newlines
REGEX_PAGENUM = re.compile(r'Page \d+ of \d+')  # "Page X of Y" text
REGEX_MULTISPACE = re.compile(r' {2,}')  # runs of two or more spaces
REGEX_BULLET = re.compile(r'•\s*')  # a bullet glyph plus any whitespace after it

# Canonical section name -> alternative heading names to try.
# extract_section_by_synonyms() runs extract_section() once per entry and keeps
# the highest-confidence result.
section_synonyms = {
    "scope of work": [
        "scope of work", "services required", "project scope", "exhibit a",
        "description and standards", "scope", "statement of work",
        "contract scope", "work requirements", "contractor responsibilities",
        "technical requirements", "deliverables", "performance requirements"
    ],
    "background": [
        "background", "project background", "general information",
        "project overview", "project description", "introduction",
        "program background", "overview", "agency background",
        "purpose", "executive summary", "agency needs", "situation overview"
    ]
}

# Canonical section name -> terms expected in that section's body.
# extract_section() uses these for its keyword confidence bonus and for its
# paragraph-level fallback; both lower-case the term and the text before
# comparing, and both only apply when the searched name is a key of this dict.
section_keywords = {
    "scope of work": [
        "provider", "contractor", "responsibilities", "services", "requirements",
        "shall", "must", "deliverable", "perform", "duty", "obligation",
        "sourcing", "screening", "reimbursement", "insurance", "licensure",
        "travel", "lodging", "documentation", "EHR"
    ],
    "background": [
        "mission", "purpose", "organization", "department", "agency",
        "established", "serves", "location", "region", "operation",
        "history", "overview", "goal", "objective", "population", "need"
    ]
}

def extract_text_from_pdf(file_path):
    """Extract the text of every page of a PDF as one string.

    Each page contributes a ``\\n--- PAGE <n> ---\\n`` marker (1-based) followed by
    the page's non-empty text blocks joined with blank lines.

    Args:
        file_path: Path of the PDF to open with PyMuPDF.

    Returns:
        The concatenated text, or ``""`` if anything goes wrong (the error is
        logged rather than raised).
    """
    start_time = time.time()
    try:
        page_texts = []
        # The context manager closes the document even if a page fails to parse;
        # the handle was previously left open.
        with fitz.open(file_path) as doc:
            for page_num, page in enumerate(doc, start=1):
                blocks = page.get_text("blocks")
                # Index 4 of each block tuple holds the block's text.
                page_text = "\n\n".join(block[4].strip() for block in blocks if block[4].strip())
                page_texts.append(f"\n--- PAGE {page_num} ---\n{page_text}")
        full_text = "".join(page_texts)
        logger.info(f"PDF extraction for {file_path} completed in {time.time() - start_time:.2f} sec")
        return full_text
    except Exception as e:
        logger.error(f"Error extracting text from {file_path}: {e}")
        return ""

def clean_text(text):
    """Normalise raw PDF text before section extraction.

    Removes the ``--- PAGE n ---`` markers added by extract_text_from_pdf() and
    "Page X of Y" text, collapses runs of 3+ newlines and 2+ spaces, and
    normalises bullets to "• ".

    Args:
        text: Raw text as returned by extract_text_from_pdf().

    Returns:
        The cleaned text with leading/trailing whitespace stripped.
    """
    # Remove page markers and page-number text *before* collapsing newlines:
    # each removed marker leaves a blank gap behind, and doing the collapse
    # first let runs of 3+ newlines reappear (e.g. around empty pages).
    text = re.sub(r'\n--- PAGE \d+ ---\n', '\n\n', text)
    text = REGEX_PAGENUM.sub("", text)
    text = REGEX_NEWLINES.sub("\n\n", text)
    text = REGEX_MULTISPACE.sub(" ", text)
    text = REGEX_BULLET.sub("• ", text)
    return text.strip()

def identify_section_markers(text):
    """Collect candidate section headings found in ``text``.

    Four heading shapes are searched: "SECTION <id> Title", ALL-CAPS lines,
    numbered headings ("1.2 Title") and short Title Case lines.

    Args:
        text: Document text to scan.

    Returns:
        A set of the captured headings, stripped and lower-cased; captures of
        three characters or fewer are ignored.
    """
    heading_shapes = (
        r'(?:(?:^|\n)(?:SECTION|Section)[\s:.]*[0-9A-Z.-]+\s+([A-Z][A-Za-z\s]+))',
        r'(?:(?:^|\n)([A-Z][A-Z\s]{2,})(?:$|\n))',
        r'(?:(?:^|\n)(\d+(?:\.\d+)*[\s.]+[A-Z][A-Za-z\s]+)(?:$|\n))',
        r'(?:(?:^|\n)([A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,3})(?:$|\n))',
    )

    # Every captured heading across all shapes, in pattern order.
    captured = (
        hit.group(1)
        for shape in heading_shapes
        for hit in re.finditer(shape, text)
        if hit.groups()
    )

    return {
        heading.strip().lower()
        for heading in captured
        if heading and len(heading.strip()) > 3
    }

def extract_section(text, section_name, identified_markers=None):
    base_name = section_name.lower()
    confidence = 0.0
    patterns = [
        re.compile(rf"(?:^|\n)(?:SECTION|Section)[\s:.]*[0-9A-Z.-]+\s+.*{re.escape(base_name)}[^\n]*\n+(.*?)(?=\n(?:SECTION|Section)[\s:.]*[0-9A-Z.-]+\s+|$)", re.IGNORECASE | re.DOTALL),
        re.compile(rf"(?:^|\n)\d+(?:\.\d+)*\s+{re.escape(base_name)}[^\n]*\n+(.*?)(?=\n\d+(?:\.\d+)*\s+|$)", re.IGNORECASE | re.DOTALL),
        re.compile(rf"(?:^|\n){re.escape(base_name.upper())}[^\n]*\n+(.*?)(?=\n[A-Z][A-Z\s]+(?:$|\n)|$)", re.IGNORECASE | re.DOTALL),
        re.compile(rf"(?:^|\n){re.escape(base_name.title())}[^\n]*\n+(.*?)(?=\n[A-Z][a-z]+\s+[A-Z][a-z]+(?:$|\n)|$)", re.IGNORECASE | re.DOTALL)
    ]

    if ' of ' in base_name:
        parts = base_name.split(' of ')
        if len(parts) == 2:
            of_pattern = re.compile(rf"(?:^|\n){re.escape(parts[0])}\s+of\s+{re.escape(parts[1])}[^\n]*\n+(.*?)(?=\n[A-Z][\w\s]+|$)", re.IGNORECASE | re.DOTALL)
            patterns.insert(0, of_pattern)

    best_match = None

    for pattern in patterns:
        match = pattern.search(text)
        if match:
            current_match = match.group(1).strip()
            current_confidence = min(1.0, len(current_match.split()) / 500) * 0.5

            context_start = max(0, match.start() - 100)
            context = text[context_start:match.start()]
            if re.search(rf"\b{re.escape(base_name)}\b", context, re.IGNORECASE):
                current_confidence += 0.3

            if base_name in section_keywords:
                key_terms = section_keywords[base_name]
                term_matches = sum(1 for term in key_terms if term.lower() in current_match.lower())
                current_confidence += min(0.2, term_matches * 0.02)

            word_count = len(current_match.split())
            if word_count > 1500:
                possible_endpoints = re.finditer(r'\n\s*\n[A-Z]', current_match)
                positions = [m.start() for m in possible_endpoints]
                if positions and positions[0] > 300:
                    current_match = current_match[:positions[0]]
                    current_confidence = min(current_confidence + 0.1, 1.0)

            if current_confidence > confidence:
                confidence = current_confidence
                best_match = current_match

    if best_match is None and identified_markers:
        section_words = set(base_name.split())
        best_marker = None
        best_overlap = 0

        for marker in identified_markers:
            marker_words = set(marker.split())
            overlap = len(section_words.intersection(marker_words))
            if overlap > best_overlap:
                best_overlap = overlap
                best_marker = marker

        if best_marker and best_overlap >= 1:
            marker_pattern = re.compile(rf"(?:^|\n){re.escape(best_marker)}[^\n]*\n+(.*?)(?=\n[A-Z][\w\s]+(?:$|\n)|$)", re.IGNORECASE | re.DOTALL)
            match = marker_pattern.search(text)
            if match:
                best_match = match.group(1).strip()
                confidence = 0.3 + (best_overlap / len(section_words)) * 0.3

    if best_match is None:
        paragraphs = re.split(r'\n\s*\n', text)
        relevant_paragraphs = []

        for para in paragraphs:
            if base_name in section_keywords:
                keyword_count = sum(1 for kw in section_keywords[base_name] if kw.lower() in para.lower())
                if keyword_count >= 3 or re.search(rf"\b{re.escape(base_name)}\b", para, re.IGNORECASE):
                    relevant_paragraphs.append(para)

        if relevant_paragraphs:
            best_match = "\n\n".join(relevant_paragraphs[:5])
            confidence = 0.3

    if best_match:
        return best_match, confidence
    else:
        return f"No '{section_name.title()}' content found.", 0.0

def extract_section_by_synonyms(text, canonical_section, markers):
    """Try every known heading synonym for a section and keep the best extraction.

    Args:
        text: Cleaned document text.
        canonical_section: Key into ``section_synonyms``; if absent, only the
            name itself is tried.
        markers: Heading markers forwarded to extract_section().

    Returns:
        ``(section_text, confidence)`` of the highest-confidence synonym, or
        ``("No '<Section>' content found.", 0.0)`` when no synonym matched.
    """
    winner, winner_conf = None, 0.0

    for alias in section_synonyms.get(canonical_section, [canonical_section]):
        candidate_text, candidate_conf = extract_section(text, alias, markers)
        # Skip the "not found" sentinel returned by extract_section().
        if candidate_text.startswith("No '"):
            continue
        if candidate_conf > winner_conf:
            winner, winner_conf = candidate_text, candidate_conf

    if not winner:
        return f"No '{canonical_section.title()}' content found.", 0.0
    return winner, winner_conf

def preprocess_for_summarization(text):
    """Tidy section text before it is chunked and summarised.

    Applies, in order: paragraph-break normalisation, bullet normalisation,
    spacing normalisation after list numbers, removal of lines holding only a
    number, and removal of lines mentioning "confidential".

    Args:
        text: Extracted section text.

    Returns:
        The rewritten text.
    """
    # (pattern, replacement, flags) triples; order matters.
    substitutions = (
        (r'\s*\n\s*\n\s*', '\n\n', 0),
        (r'(?<=\n)[•*-]\s*', '• ', 0),
        (r'(?<=\n)\s*\d+[\.)]\s+', lambda hit: f"{hit.group().strip()} ", 0),
        (r'(?<=\n)\s*\d+\s*(?=\n)', '', 0),
        (r'(?<=\n).*confidential.*(?=\n)', '', re.IGNORECASE),
    )
    for pattern, replacement, flags in substitutions:
        text = re.sub(pattern, replacement, text, flags=flags)
    return text

def chunk_text_intelligently(text, max_words=300):
    """Split text into chunks, preferring subsection boundaries.

    The text is first split where a blank line is followed by a subsection
    label ("1.2", "A.1", "a)" or "(a)"). If that yields more than one piece,
    each non-empty piece becomes a chunk, and pieces longer than ``max_words``
    are further split by split_by_sentences(). Otherwise the whole text is
    handed to split_by_sentences().

    Args:
        text: Text to split.
        max_words: Word budget per chunk.

    Returns:
        List of chunk strings.
    """
    pieces = re.split(r'\n\s*\n(?:\d+\.\d+|[A-Z]\.\d+|[a-z]\)|\([a-z]\))\s+', text)

    # No subsection boundary found: fall back to sentence packing.
    if len(pieces) <= 1:
        return split_by_sentences(text, max_words)

    result = []
    for piece in (raw.strip() for raw in pieces):
        if not piece:
            continue
        if len(piece.split()) > max_words:
            result.extend(split_by_sentences(piece, max_words))
        else:
            result.append(piece)
    return result

def split_by_sentences(text, max_words=300):
    """Pack sentences into chunks of roughly ``max_words`` words, in document order.

    Sentences are accumulated until adding the next one would exceed the
    budget. A single sentence longer than ``max_words`` is split at "; ", ": "
    or ", " (when followed by a letter; the delimiter itself is dropped) and
    its fragments are packed the same way. A fragment longer than half the
    budget becomes a chunk of its own.

    Args:
        text: Text to split.
        max_words: Word budget per chunk.

    Returns:
        List of chunk strings; empty if the tokenizer yields no sentences.
    """
    chunks = []
    current_chunk = []
    current_word_count = 0

    def flush():
        """Emit the buffered pieces (if any) as one chunk and reset the buffer."""
        nonlocal current_chunk, current_word_count
        if current_chunk:
            chunks.append(" ".join(current_chunk))
        current_chunk = []
        current_word_count = 0

    for sentence in sent_tokenize(text):
        sentence_words = len(sentence.split())

        if sentence_words > max_words:
            # Oversized sentence: close the running chunk, then pack its fragments.
            flush()
            for part in re.split(r'[;:,] (?=[A-Za-z])', sentence):
                part_words = len(part.split())
                if part_words > max_words / 2:
                    # Flush first so earlier, smaller fragments are emitted
                    # *before* this one; they used to be emitted after it,
                    # which scrambled the text order.
                    flush()
                    chunks.append(part)
                    continue
                if current_word_count + part_words > max_words:
                    flush()
                current_chunk.append(part)
                current_word_count += part_words
            continue

        if current_word_count + sentence_words > max_words:
            flush()
        current_chunk.append(sentence)
        current_word_count += sentence_words
        if current_word_count >= max_words:
            flush()

    flush()
    return chunks

def generate_section_specific_prompt(section_name, chunk, chunk_info):
    """Build the per-chunk summarisation prompt for a section.

    Args:
        section_name: Section being summarised; "scope of work" and
            "background" (case-insensitive) get tailored instructions, anything
            else a generic one.
        chunk: The text to summarise.
        chunk_info: Position label inserted after "Text to summarize".

    Returns:
        The prompt string.
    """
    tailored_instructions = {
        "scope of work": (
            "Summarize the following text extracted from the 'Scope of Work' section of an RFP. "
            "Provide a clear and concise summary that outlines the contractor's responsibilities, including provider sourcing, screening, reimbursement, "
            "insurance, licensure verification, travel/lodging management, and EHR documentation. "
            "Do not include extra commentary. "
        ),
        "background": (
            "Summarize the following text extracted from the 'Project Background' section of an RFP. "
            "Provide a clear and concise summary that highlights the department's mission, purpose, geographic context, and operational details. "
            "Do not include extra commentary. "
        ),
    }
    generic_instruction = "Summarize the following text from an RFP. Provide a concise summary of key points. "

    instruction = tailored_instructions.get(section_name.lower(), generic_instruction)
    # All variants end with the same "Text to summarize ..." tail.
    return f"{instruction}Text to summarize {chunk_info}:\n\n{chunk}"

def generate_final_summary_prompt(section_name, combined_summaries, confidence):
    """Build the prompt that consolidates per-chunk summaries into one summary.

    Args:
        section_name: Section being summarised; "scope of work" and
            "background" (case-insensitive) get tailored instructions.
        combined_summaries: The chunk summaries joined into one string.
        confidence: Extraction confidence; below 0.5 a caveat sentence is
            inserted before the combined text.

    Returns:
        The prompt string.
    """
    caveat = " Note: The extracted content may be incomplete." if confidence < 0.5 else ""

    tailored_instructions = {
        "scope of work": (
            "Refine and consolidate the following texts into a final, clear, and detailed summary of the 'Scope of Work' section. "
            "Focus exclusively on contractor responsibilities including provider sourcing, screening, reimbursement, insurance, licensure verification, "
            "travel/lodging management, and EHR documentation. Output only the final summary, formatted as a coherent paragraph followed by 4-7 bullet points of key responsibilities. "
        ),
        "background": (
            "Refine and consolidate the following texts into a final, clear, and comprehensive summary of the 'Project Background' section. "
            "Focus exclusively on the department's mission, purpose, geographic context, and operational details. Output only the final summary, formatted as a coherent paragraph followed by 3-5 bullet points of key organizational details. "
        ),
    }
    generic_instruction = (
        "Refine and consolidate the following texts into a final summary. "
        "Output only the final summary. "
    )

    instruction = tailored_instructions.get(section_name.lower(), generic_instruction)
    return f"{instruction}{caveat}\n\nCombined Text:\n{combined_summaries}"

def remove_redundancy(text):
    """Drop sentences that closely repeat an earlier sentence.

    Sentences are compared by TF-IDF cosine similarity; a sentence whose
    similarity to an earlier kept sentence exceeds 0.7 is removed. Texts of
    five sentences or fewer are returned unchanged.

    Args:
        text: Summary text to de-duplicate.

    Returns:
        The kept sentences joined with single spaces, or the original text if
        the similarity computation fails (best effort; the failure is logged).
    """
    sentences = sent_tokenize(text)

    if len(sentences) <= 5:
        return text

    try:
        tfidf_matrix = TfidfVectorizer(stop_words='english').fit_transform(sentences)
        similarity_matrix = cosine_similarity(tfidf_matrix)

        redundant_indices = set()
        for i in range(len(sentences)):
            # A sentence already marked redundant does not suppress later ones.
            if i in redundant_indices:
                continue
            for j in range(i + 1, len(sentences)):
                if similarity_matrix[i, j] > 0.7:
                    redundant_indices.add(j)

        unique_sentences = [s for idx, s in enumerate(sentences) if idx not in redundant_indices]
        return " ".join(unique_sentences)
    except Exception as e:
        # Was a bare `except:`, which also swallowed KeyboardInterrupt/SystemExit
        # and hid the reason for the fallback.
        logger.warning(f"Redundancy removal skipped: {e}")
        return text

def verify_section_content(summary, section_name):
    """Append a note listing expected topics the summary does not mention.

    A topic counts as covered when any of its cue words occurs in the summary
    as a case-insensitive substring. Only "scope of work" and "background"
    (case-insensitive) have topic lists; other sections are returned as-is.

    Args:
        summary: Generated summary text.
        section_name: Section the summary belongs to.

    Returns:
        The summary, with "\\n\\nNote: This summary may not cover: ..." appended
        when at least one topic is missing.
    """
    topics_by_section = {
        "scope of work": {
            "provider sourcing": ["provider", "sourcing", "recruit", "acquire"],
            "screening": ["screen", "evaluate", "assess", "review"],
            "reimbursement": ["reimbursement", "payment", "compensation", "fee"],
            "insurance": ["insurance", "coverage", "liability"],
            "licensure": ["license", "certification", "credential"],
            "travel/lodging": ["travel", "lodging", "accommodation", "housing"],
            "EHR documentation": ["EHR", "documentation", "record", "chart"],
        },
        "background": {
            "mission": ["mission", "purpose", "goal", "objective"],
            "organizational purpose": ["organization", "department", "agency", "authority"],
            "geographic context": ["location", "region", "area", "geographic", "jurisdiction"],
            "operational details": ["operation", "process", "activity", "service", "statistic"],
        },
    }

    expected_topics = topics_by_section.get(section_name.lower())
    if expected_topics is None:
        return summary

    haystack = summary.lower()
    uncovered = [
        topic
        for topic, cues in expected_topics.items()
        if all(cue.lower() not in haystack for cue in cues)
    ]

    if not uncovered:
        return summary
    return summary + f"\n\nNote: This summary may not cover: {', '.join(uncovered)}."

def safe_generate(prompt, max_length, min_length, tokenizer, model, max_retries=3):
    """Run seq2seq generation on ``prompt``, retrying on failure.

    The prompt is truncated to 1024 tokens by the tokenizer call. Generation is
    deterministic beam search (4 beams, no sampling).

    Args:
        prompt: Text fed to the model.
        max_length: ``max_length`` passed to ``model.generate``.
        min_length: ``min_length`` passed to ``model.generate``.
        tokenizer: Tokenizer callable matching ``model``.
        model: Model exposing ``to`` and ``generate``.
        max_retries: Number of attempts before giving up.

    Returns:
        The decoded output of the first successful attempt, or the string
        ``"Error generating summary after <max_retries> retries."`` if every
        attempt raised (errors are logged, not raised).
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    for attempt in range(1, max_retries + 1):
        try:
            model.to(device)

            # truncation=True / max_length=1024 already bounds the input, so the
            # former tokenize -> slice -> detokenize pre-pass was redundant work.
            inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024).to(device)
            outputs = model.generate(
                **inputs,
                max_length=max_length,
                min_length=min_length,
                do_sample=False,
                num_beams=4,
                early_stopping=True
            )
            return tokenizer.decode(outputs[0], skip_special_tokens=True)
        except Exception as e:
            logger.error(f"Error in text generation: {e}")
            # Pause only when another attempt will follow.
            if attempt < max_retries:
                time.sleep(1)

    return f"Error generating summary after {max_retries} retries."

def summarize_section(section_text, canonical_section, confidence, tokenizer, model):
    """Summarise an extracted section with the given seq2seq model.

    The "No '...' content found." sentinel and sections shorter than 150 words
    (after preprocessing) are returned without calling the model. Longer
    sections are chunked, each chunk is summarised via safe_generate() on a
    small thread pool, and multiple chunk summaries are consolidated with a
    second generation pass. The result is de-duplicated and annotated with any
    missing expected topics.

    Args:
        section_text: Text returned by the extraction step.
        canonical_section: Canonical section name; selects prompts and checks.
        confidence: Extraction confidence, forwarded to the final prompt.
        tokenizer: Tokenizer passed through to safe_generate().
        model: Model passed through to safe_generate().

    Returns:
        The final summary string.
    """
    if section_text.startswith("No '") and section_text.endswith("' content found."):
        return section_text

    cleaned_section = preprocess_for_summarization(section_text)

    # Decide on the short-text shortcut before paying for chunking.
    if len(cleaned_section.split()) < 150:
        return cleaned_section

    chunks = chunk_text_intelligently(cleaned_section, max_words=300)
    if not chunks:
        # ThreadPoolExecutor rejects max_workers=0; nothing to summarise anyway.
        return cleaned_section

    with ThreadPoolExecutor(max_workers=min(4, len(chunks))) as executor:
        futures = []
        for i, chunk in enumerate(chunks):
            chunk_info = f"(chunk {i+1}/{len(chunks)})"
            prompt = generate_section_specific_prompt(canonical_section, chunk, chunk_info)
            # Length bounds scale with the chunk size, clamped to 150-300 / 75-150.
            chunk_word_count = len(chunk.split())
            max_summary_length = max(150, min(300, chunk_word_count // 2))
            min_summary_length = max(75, min(150, chunk_word_count // 4))

            futures.append(executor.submit(
                safe_generate,
                prompt,
                max_length=max_summary_length,
                min_length=min_summary_length,
                tokenizer=tokenizer,
                model=model
            ))

        # Collect in submission order so summaries stay in document order.
        chunk_summaries = [future.result() for future in futures]

    if len(chunk_summaries) == 1:
        final_summary = chunk_summaries[0]
    else:
        combined_summary = " ".join(chunk_summaries)
        final_prompt = generate_final_summary_prompt(canonical_section, combined_summary, confidence)
        final_summary = safe_generate(final_prompt, max_length=800, min_length=300, tokenizer=tokenizer, model=model)

    final_summary = remove_redundancy(final_summary)
    final_summary = verify_section_content(final_summary, canonical_section)

    return final_summary

def analyze_document_structure(text):
    """Count heading styles in ``text`` and collect its section markers.

    Args:
        text: Cleaned document text.

    Returns:
        Dict with ``'patterns'`` (heading style -> match count),
        ``'dominant_pattern'`` (style with the highest count; the first listed
        style wins ties) and ``'section_markers'`` (result of
        identify_section_markers()).
    """
    heading_regexes = {
        'numbered': r'(?:^|\n)\d+(?:\.\d+)*\s+[A-Z]',
        'all_caps': r'(?:^|\n)[A-Z][A-Z\s]{3,}(?:$|\n)',
        'section_word': r'(?:^|\n)(?:SECTION|Section)\s+[0-9A-Z]',
        'title_case': r'(?:^|\n)(?:[A-Z][a-z]+\s+){1,3}(?:$|\n)',
    }
    counts = {
        style: len(re.findall(regex, text))
        for style, regex in heading_regexes.items()
    }
    most_common_style = max(counts, key=counts.get)
    markers = identify_section_markers(text)
    return {
        'patterns': counts,
        'dominant_pattern': most_common_style,
        'section_markers': markers,
    }

def extract_additional_metadata(text):
    """Pull RFP id, due date and issuing organisation out of the text, if present.

    Args:
        text: Cleaned document text.

    Returns:
        Dict with any of the keys ``'rfp_id'``, ``'due_date'`` and
        ``'issuing_organization'``; keys with no match are omitted.
    """
    metadata = {}

    # The identifier must contain at least one digit. The patterns are
    # case-insensitive, so a bare [A-Z0-9-]+ captured whatever word followed
    # "RFP" (e.g. "RFP is due" produced the id "is").
    rfp_id_match = re.search(
        r'(?:RFP|Request for Proposal)[\s#:]*([A-Z0-9-]*\d[A-Z0-9-]*)',
        text,
        re.IGNORECASE,
    )
    if rfp_id_match:
        metadata['rfp_id'] = rfp_id_match.group(1).strip()

    # Accepts "Month D, YYYY" or numeric D/M/Y-style dates after due/submission/deadline.
    due_date_match = re.search(r'(?:due|submission|deadline)(?:\s+date)?(?:\s*[-:]\s*|\s+is\s+)([A-Za-z]+\s+\d{1,2},?\s+\d{4}|\d{1,2}[-/]\d{1,2}[-/]\d{2,4})', text, re.IGNORECASE)
    if due_date_match:
        metadata['due_date'] = due_date_match.group(1).strip()

    # First pattern that matches wins: "issued/published/released by ...",
    # then a line ending in an organisation-type word.
    org_patterns = [
        r'(?:issued|published|released)\s+by(?:\s+the)?\s+([A-Z][A-Za-z\s]+)(?:,|\.|\n)',
        r'(?:^|\n)([A-Z][A-Za-z\s]+(?:Department|Agency|Corporation|Authority|Office|Bureau))'
    ]
    for pattern in org_patterns:
        org_match = re.search(pattern, text, re.IGNORECASE)
        if org_match:
            metadata['issuing_organization'] = org_match.group(1).strip()
            break

    return metadata

def process_section(canonical_section, full_text, tokenizer, model, markers):
    """Extract one canonical section from the document and summarise it.

    Args:
        canonical_section: Canonical section name to look for.
        full_text: Cleaned document text.
        tokenizer: Tokenizer forwarded to summarize_section().
        model: Model forwarded to summarize_section().
        markers: Heading markers forwarded to the extraction step.

    Returns:
        ``(canonical_section, section_text, summary, confidence)``. When the
        section is not found, the text is ``""``, the summary slot carries the
        "not found" message and the confidence is 0.0.
    """
    logger.info(f"Processing section: {canonical_section}")
    extracted, score = extract_section_by_synonyms(full_text, canonical_section, markers)

    # Not found: surface the sentinel message in the summary position.
    if extracted.startswith("No '"):
        return canonical_section, "", extracted, 0.0

    digest = summarize_section(extracted, canonical_section, score, tokenizer, model)
    return canonical_section, extracted, digest, score

def process_pdf(pdf_file, output_dir, tokenizer, model):
    """Summarise the key sections of one PDF and write them to a text report.

    The report lists extracted metadata followed by a summary block for each of
    "scope of work" and "background" that produced a non-empty summary; blocks
    with confidence below 0.5 are labelled as low confidence.

    Args:
        pdf_file: Path of the PDF to process.
        output_dir: Existing directory that receives
            ``<pdf name>_key_section_summaries.txt``.
        tokenizer: Tokenizer forwarded to the summarisation steps.
        model: Model forwarded to the summarisation steps.

    Returns:
        Path of the written report, or ``None`` if no text could be extracted.
    """
    logger.info(f"Processing PDF: {pdf_file}")
    raw_text = extract_text_from_pdf(pdf_file)
    if not raw_text:
        logger.error(f"No text extracted from {pdf_file}")
        return None

    cleaned_text = clean_text(raw_text)
    markers = analyze_document_structure(cleaned_text)['section_markers']
    metadata = extract_additional_metadata(cleaned_text)

    target_sections = ["scope of work", "background"]
    section_summaries, section_confidences = {}, {}

    for requested in target_sections:
        name, body, digest, score = process_section(requested, cleaned_text, tokenizer, model, markers)
        section_summaries[name] = {"text": body, "summary": digest}
        section_confidences[name] = score
        logger.info(f"Section '{name}' in {pdf_file} processed with confidence: {score:.2f}")

    # Assemble the report; every entry carries its own trailing newline.
    divider = "-" * 50 + "\n"
    report = ["--- DOCUMENT METADATA ---\n"]
    report.extend(
        f"{key.replace('_', ' ').title()}: {value}\n"
        for key, value in metadata.items()
    )
    report.append("\n" + divider)

    for name in target_sections:
        entry = section_summaries.get(name)
        if not entry or not entry["summary"]:
            continue
        flag = " (Low confidence extraction)" if section_confidences[name] < 0.5 else ""
        report.append(f"--- {name.upper()} SUMMARY{flag} ---\n")
        report.append(entry["summary"] + "\n")
        report.append(divider)

    stem = os.path.splitext(os.path.basename(pdf_file))[0]
    output_file = os.path.join(output_dir, f"{stem}_key_section_summaries.txt")
    with open(output_file, "w", encoding="utf-8") as handle:
        handle.write("".join(report))

    logger.info(f"Processing complete for {pdf_file}. Key section summaries saved to '{output_file}'")
    return output_file

def process_document(file_path, target_sections=None, tokenizer=None, model=None, model_name="t5-base"):
    """Extract and summarise sections of a PDF, returning the results as a dict.

    Args:
        file_path: Path of the PDF to process.
        target_sections: Canonical section names to process; defaults to
            ``["scope of work", "background"]``.
        tokenizer: Optional preloaded tokenizer. Loaded from ``model_name`` when
            omitted.
        model: Optional preloaded seq2seq model. Loaded from ``model_name`` when
            omitted. Passing both lets callers process many documents without
            reloading the weights each time.
        model_name: Hugging Face model id used for whichever of
            ``tokenizer``/``model`` was not supplied.

    Returns:
        On success, a dict with one entry per section (``section_text``,
        ``confidence``, ``summary``) plus ``"metadata"``, ``"structure"`` and
        ``"processing_time"`` (seconds). On failure, ``{"error": <message>}``.
    """
    if target_sections is None:
        target_sections = ["scope of work", "background"]

    start_time = time.time()

    try:
        raw_text = extract_text_from_pdf(file_path)
        if not raw_text:
            return {"error": "Failed to extract text from PDF"}

        cleaned_text = clean_text(raw_text)

        # Load only what the caller did not provide.
        if tokenizer is None:
            tokenizer = AutoTokenizer.from_pretrained(model_name)
        if model is None:
            model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

        structure_info = analyze_document_structure(cleaned_text)
        metadata = extract_additional_metadata(cleaned_text)

        results = {}
        for section in target_sections:
            section_text, confidence = extract_section_by_synonyms(
                cleaned_text, section, structure_info["section_markers"]
            )

            # summarize_section() passes the "not found" sentinel through unchanged.
            summary = summarize_section(section_text, section, confidence, tokenizer, model)

            results[section] = {
                "section_text": section_text,
                "confidence": confidence,
                "summary": summary
            }

        results["metadata"] = metadata
        results["structure"] = structure_info
        results["processing_time"] = time.time() - start_time

        return results

    except Exception as e:
        logger.error(f"Error processing document {file_path}: {e}")
        return {"error": str(e)}

def main(pdf_file="/content/STAFF-8601 (2).pdf", model_name="facebook/bart-large-cnn"):
    """Summarise one RFP PDF and write the report next to it.

    The report goes into an ``rfp_analysis_output`` directory beside the PDF.

    Args:
        pdf_file: Path of the PDF to process. The default is the sample file
            this notebook was written against.
        model_name: Hugging Face seq2seq model id used for summarisation.

    Returns:
        Path of the written report, or ``None`` if the PDF is missing or no
        text could be extracted.
    """
    # Fail fast before downloading/loading a large model.
    if not os.path.isfile(pdf_file):
        logger.error(f"PDF not found: {pdf_file}")
        return None

    output_dir = os.path.join(os.path.dirname(pdf_file), "rfp_analysis_output")
    os.makedirs(output_dir, exist_ok=True)

    logger.info("Loading NLP model for summarization")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)

    return process_pdf(pdf_file, output_dir, tokenizer, model)

if __name__ == "__main__":
    main()


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

### Matching score ###

In [None]:
import os
import re
from sentence_transformers import SentenceTransformer, util
import torch
import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords

# Download required NLTK resources if not already downloaded
# Check each resource on its own so only the missing ones are fetched.
# 'punkt_tab' is included because recent NLTK releases load the sentence
# tokenizer from it rather than from 'punkt'.
for _resource_path, _package in (
    ('tokenizers/punkt', 'punkt'),
    ('tokenizers/punkt_tab', 'punkt_tab'),
    ('corpora/stopwords', 'stopwords'),
):
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        nltk.download(_package)

# Function to extract the "Scope of Work" summary from the output file
def extract_scope_summary(file_path):
    """Read the "Scope of Work" summary block out of a generated report file.

    The block starts at the ``--- SCOPE OF WORK SUMMARY ---`` header and ends
    at the next line of ten or more dashes (or end of file).

    Args:
        file_path: Path of the ``*_key_section_summaries.txt`` report.

    Returns:
        The summary text, or ``""`` if the file cannot be read or contains no
        such block (a message is printed in both cases).
    """
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            content = f.read()

        # The report writer appends " (Low confidence extraction)" to the header
        # when confidence is below 0.5; accept both header variants.
        pattern = r"--- SCOPE OF WORK SUMMARY(?: \(Low confidence extraction\))? ---\s*(.*?)(?:\n[-]{10,}|\Z)"
        match = re.search(pattern, content, re.DOTALL)
        if match:
            return match.group(1).strip()
        print("Scope of Work summary not found.")
        return ""
    except Exception as e:
        print(f"Error reading file: {e}")
        return ""

# Function to preprocess text
def preprocess_text(text):
    """Normalise text for similarity scoring.

    Lower-cases the text, removes URLs / domain-like tokens / ``.pdf`` file
    names, replaces every remaining non-alphanumeric character with a space,
    collapses whitespace and drops English stop words.

    Args:
        text: Raw text.

    Returns:
        The remaining words joined by single spaces (possibly ``""``).
    """
    text = text.lower()
    # Strip URLs and file names first: these patterns key on '.', '/' and ':',
    # which no longer exist once punctuation has been replaced by spaces.
    text = re.sub(r'http\S+|www\S+|https\S+|\S+\.com\S*|\S+\.org\S*|\S*\.pdf', ' ', text)
    # Replace anything that is not a letter, digit or whitespace.
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
    # Replace multiple spaces with a single space
    text = re.sub(r'\s+', ' ', text).strip()

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    return ' '.join(word for word in text.split() if word not in stop_words)

# Function to split text into manageable chunks
def split_into_chunks(text, max_length=512):
    """Split text into chunks shorter than ``max_length`` characters.

    Text already shorter than ``max_length`` is returned as a single chunk.
    Otherwise sentences are packed greedily; a sentence that is itself too long
    is broken up at word boundaries first. (That word-level fallback previously
    ran only when no chunk at all had been produced, so an over-long sentence
    was emitted as one oversized chunk.)

    Args:
        text: Text to split.
        max_length: Exclusive upper bound on chunk length, in characters. A
            single word of ``max_length`` characters or more is still emitted
            whole.

    Returns:
        Non-empty list of chunk strings.
    """
    if len(text) < max_length:
        return [text]

    def pack_words(words):
        """Greedily join words into pieces shorter than max_length characters."""
        pieces = []
        current = ""
        for word in words:
            if not current:
                current = word
            elif len(current) + 1 + len(word) < max_length:
                current += " " + word
            else:
                pieces.append(current)
                current = word
        if current:
            pieces.append(current)
        return pieces

    chunks = []
    current_chunk = ""

    for sentence in sent_tokenize(text):
        # Over-long sentences are pre-split so no piece exceeds the budget.
        pieces = [sentence] if len(sentence) < max_length else pack_words(sentence.split())
        for piece in pieces:
            if current_chunk and len(current_chunk) + 1 + len(piece) < max_length:
                current_chunk += " " + piece
            else:
                if current_chunk:
                    chunks.append(current_chunk)
                current_chunk = piece

    if current_chunk:
        chunks.append(current_chunk)

    # E.g. whitespace-only input yields no sentences; fall back to a hard cut.
    return chunks or [text[:max_length]]

# Function to compute semantic similarity using sentence transformers
def compute_similarity(text1, text2, model_name='all-MiniLM-L6-v2'):
    """Score how well ``text2`` covers ``text1`` using sentence embeddings.

    Both texts are preprocessed and chunked. For every chunk of ``text1`` the
    best cosine similarity against any chunk of ``text2`` is taken, and those
    best scores are averaged. The average is then rescaled with
    ``(similarity - 0.3) * 1.4`` and clamped to ``[0, 1]``.

    Args:
        text1: Reference text (e.g. the scope summary).
        text2: Text compared against it (e.g. the user's experience).
        model_name: SentenceTransformer model id.

    Returns:
        Calibrated similarity in ``[0.0, 1.0]``; ``0.0`` if either text is empty
        after preprocessing or if any error occurs (the error is printed).
    """
    try:
        # Clean and preprocess texts
        clean_text1 = preprocess_text(text1)
        clean_text2 = preprocess_text(text2)

        if not clean_text1 or not clean_text2:
            print("Warning: One or both texts are empty after preprocessing")
            return 0.0

        # Load model and encode the chunks of each text
        model = SentenceTransformer(model_name)
        embeddings1 = model.encode(split_into_chunks(clean_text1), convert_to_tensor=True)
        embeddings2 = model.encode(split_into_chunks(clean_text2), convert_to_tensor=True)

        # One similarity matrix (rows: text1 chunks, columns: text2 chunks)
        # instead of a Python-level call per chunk pair.
        score_matrix = util.pytorch_cos_sim(embeddings1, embeddings2)
        if score_matrix.numel() == 0:
            return 0.0

        # Best match per text1 chunk, averaged over chunks.
        similarity = score_matrix.max(dim=1).values.mean().item()

        # Calibrate the score to provide better differentiation
        calibrated_similarity = (similarity - 0.3) * 1.4
        return max(0.0, min(1.0, calibrated_similarity))

    except Exception as e:
        print(f"Error computing similarity: {e}")
        return 0.0

# Function to provide feedback based on similarity score
def provide_feedback(score):
    """Translate a similarity score into a one-line verdict for the user.

    Parameters:
    - score: Numeric similarity score (the messages assume a 0-1 scale).

    Returns:
    - The message of the highest band whose cutoff the score reaches, or the
      "Limited match" message when it reaches none of them.
    """
    # Ordered from the highest cutoff down; the first cutoff reached wins.
    bands = (
        (0.75, "Strong match: Your experience closely aligns with the scope requirements."),
        (0.5, "Good match: Your experience has significant overlap with the requirements."),
        (0.25, "Partial match: Your experience has some relevant elements."),
    )
    for cutoff, message in bands:
        if score >= cutoff:
            return message
    return "Limited match: Consider highlighting relevant transferable skills."

# Main execution
if __name__ == "__main__":
    # Location of the previously generated key-section summaries file.
    output_file_path = os.path.join("rfp_analysis_output", "STAFF-8601 (2)_key_section_summaries.txt")

    # Pull the Scope of Work summary out of that file; stop if it is missing.
    scope_summary = extract_scope_summary(output_file_path)
    if not scope_summary:
        print("Could not extract the Scope of Work summary.")
        # `exit` is the interactive-shell helper (and is replaced by an IPython
        # object inside a notebook kernel); raising SystemExit is the
        # program-safe way to stop with a non-zero status.
        raise SystemExit(1)

    print("Extracted Scope of Work Summary:")
    print(scope_summary)
    print("\n" + "="*80 + "\n")

    # Ask the user for the experience text that will be compared to the scope.
    user_experience = input("Enter your experience description: ").strip()
    if not user_experience:
        print("No experience input provided.")
        raise SystemExit(1)

    # Score the scope summary against the experience text and report the verdict.
    similarity_score = compute_similarity(scope_summary, user_experience)
    print(f"\nSimilarity Score: {similarity_score:.2f}")
    print(provide_feedback(similarity_score))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Extracted Scope of Work Summary:
Refine and consolidate the following texts into a final, clear, and detailed summary of the 'Scope of Work' section. Focus exclusively on contractor responsibilities including provider sourcing, screening, reimbursement, insurance, licensure verification, travel/lodging management, and EHR documentation. The Kern Behavioral Health and Recovery Services (KernBHRS) administration office is located in Bakersfield, the county sear, in the San Joaquin Valley. KernBH RS is seeking a locum tenens agency capable of providing board certified or board eligible psychiatrists, psychiatric certified nurse practitioners and registered nurses with mental health experience. The Department expects to spend approximately $2,100,000 per fiscal year for these services among all providers. Services shall begin on July 1, 2023. Three Agreements will be negotiated between Kern BH RS and the prospective service provider. The final summary, formatted as a coherent paragraph fol

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


Similarity Score: 0.30
Partial match: Your experience has some relevant elements.


### requirements.txt

In [None]:
# Package list for this notebook, kept as a plain string literal.
# Running the cell only echoes the string back as its output; it does not
# install anything and does not write a requirements.txt file.
'''PyMuPDF
torch
transformers
nltk
scikit-learn
numpy
sentence-transformers'''

'PyMuPDF\ntorch\ntransformers\nnltk\nscikit-learn\nnumpy\nsentence-transformers'

In [None]:
# Continuation Code: Generative Answer Integration

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

def generate_answer(summarized_text, user_question, tokenizer, model):
    """
    Answer a user question with the document summary supplied as context.

    Parameters:
    - summarized_text: Summary text placed in the prompt as context.
    - user_question: Question text appended to the prompt.
    - tokenizer: Tokenizer forwarded unchanged to safe_generate.
    - model: Model forwarded unchanged to safe_generate.

    Returns:
    - Whatever safe_generate returns for the assembled prompt.
    """
    # The prompt has two parts: the summary as context, then the question
    # followed by an "Answer:" cue for the model to continue from.
    context_part = f"Based on the following document summary:\n\n{summarized_text}\n\n"
    question_part = f"Please answer the following question:\n{user_question}\n\nAnswer:"

    # Generation itself is delegated to safe_generate (defined in an earlier cell).
    return safe_generate(
        context_part + question_part,
        max_length=300,
        min_length=100,
        tokenizer=tokenizer,
        model=model,
    )

def run_generative_qa(file_path="/content/STAFF-8601 (2).pdf", model_name="t5-base"):
    """
    Interactive question answering over a document's "Scope of Work" summary.

    Processes the document with process_document, prints its "scope of work"
    summary, asks the user for a question via input(), loads a seq2seq model
    and prints the answer produced by generate_answer.

    Parameters:
    - file_path: Path of the PDF handed to process_document. Defaults to the
      path that was previously hard-coded.
    - model_name: Name passed to AutoTokenizer / AutoModelForSeq2SeqLM
      from_pretrained. Defaults to "t5-base".

    Returns:
    - None. Results and diagnostics are printed.
    """
    # Process the document using the existing function to obtain section summaries.
    # `or {}` guards against process_document returning None (defensive; its
    # failure behavior is not visible from this cell).
    results = process_document(file_path) or {}

    # Use the "Scope of Work" summary as context; tolerate a missing or None entry.
    scope_entry = results.get("scope of work") or {}
    summary_text = scope_entry.get("summary", "")
    if not summary_text:
        print("No 'Scope of Work' summary available. Please check your document processing.")
        return

    print("Document Summary (Scope of Work):\n")
    print(summary_text)
    print("\n" + "="*80 + "\n")

    # Get the user's question before paying the cost of loading the model.
    user_question = input("Enter your question regarding the document: ").strip()
    if not user_question:
        print("No question provided. Exiting.")
        return

    # Load the generative model and move it to the GPU when one is available.
    tokenizer_gen = AutoTokenizer.from_pretrained(model_name)
    model_gen = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model_gen.to(device)

    # Generate an answer using the summarized text and user query.
    answer = generate_answer(summary_text, user_question, tokenizer_gen, model_gen)
    print("\nGenerated Answer:\n")
    print(answer)

# Start the generative QA workflow as soon as this cell executes.
# NOTE: run_generative_qa() may prompt for a question via input(), in which
# case the cell blocks until something is typed.
run_generative_qa()


Enter or paste the document summary text (press Ctrl+D or Ctrl+Z on a new line when finished):

Choose model quality (better quality = slower):
1. Fast (lower quality)
2. Balanced (medium quality)
3. High quality (slower)

Generating answer using google/flan-t5-xxl...


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

(…)a5b18a05535c9e14c7a355904270e15b0945ea86:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/674 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/53.0k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/5 [00:00<?, ?it/s]

model-00001-of-00005.safetensors:   0%|          | 0.00/9.45G [00:00<?, ?B/s]

model-00002-of-00005.safetensors:   0%|          | 0.00/9.60G [00:00<?, ?B/s]

model-00003-of-00005.safetensors:   0%|          | 0.00/9.96G [00:00<?, ?B/s]

model-00004-of-00005.safetensors:   0%|          | 0.00/10.0G [00:00<?, ?B/s]

model-00005-of-00005.safetensors:   0%|          | 0.00/6.06G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-11-bb762891b197>", line 138, in <cell line: 0>
    run_scope_analyzer()
  File "<ipython-input-11-bb762891b197>", line 122, in run_scope_analyzer
    answer, time_taken = generate_answer_with_llm(summary_text, question, model_name)
                         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "<ipython-input-11-bb762891b197>", line 60, in generate_answer_with_llm
    outputs = model.generate(
              ^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/transformers/generation/utils.py", line 2254, in generate
    result = self._beam_search(
             ^

TypeError: object of type 'NoneType' has no len()