<a href="https://colab.research.google.com/github/cherypallysaisurya/ResuVerse/blob/main/Updated_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
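# ✅ Step 1: Install necessary packages (run once per runtime if any are missing):
#   pip install pdfplumber transformers sentence-transformers scikit-learn torch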

# ✅ Step 2: Define file paths directly (replace with yours)
# Both PDFs are opened by extract_text_from_pdf() in Step 6.
# NOTE(review): the /content/ prefix is presumably the Colab working directory — adjust when running elsewhere.
jd_path = "/content/STAFF-8601.pdf"        # Path to your Job Description PDF
resume_path = "/content/experience.pdf"    # Path to your Resume/Experience PDF

# ✅ Step 3: Import required libraries
import pdfplumber
from transformers import pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re

# ✅ Step 4: Utility functions
def extract_text_from_pdf(path):
    """Return the text of every page of the PDF at *path*, joined by newlines.

    Pages for which ``extract_text()`` yields nothing (``None`` or an empty
    string) are skipped.

    Args:
        path: Location of the PDF; passed straight to ``pdfplumber.open``.

    Returns:
        One string containing the text of each non-empty page separated by
        ``"\\n"``; an empty string when no page yields any text.
    """
    with pdfplumber.open(path) as pdf:
        # Run extraction exactly once per page: the same call used to be made
        # both for the emptiness check and for the value.
        page_texts = (page.extract_text() for page in pdf.pages)
        return "\n".join(page_text for page_text in page_texts if page_text)

def extract_scope_sections(full_text):
    """Pull the "scope of work" style sections out of a job-description text.

    The text is scanned line by line. A line containing one of the start
    markers opens a section (and is kept as its header); following lines are
    kept until a line containing one of the stop markers closes it. Matching
    is a case-insensitive substring test.

    If no section is found, every line that is longer than 30 characters
    (after stripping) or contains a work-related verb/noun is kept instead.
    If that also yields nothing, the first 150 lines are returned.

    Args:
        full_text: The complete document text with ``"\\n"`` line breaks.

    Returns:
        The selected lines joined with ``"\\n"``.
    """
    start_markers = [
        marker.lower()
        for marker in ('Job Description', 'Roles', 'Responsibilities', 'Scope of Work', 'Duties', 'Position Summary')
    ]
    stop_markers = [
        marker.lower()
        for marker in ('Qualifications', 'Requirements', 'Skills', 'Education', 'Benefits', 'Compensation')
    ]

    def mentions(markers, haystack):
        # True when any marker occurs as a substring of the normalised line.
        return any(marker in haystack for marker in markers)

    all_lines = full_text.split('\n')
    kept = []
    in_section = False

    # Pass 1: structured sections delimited by header keywords.
    for raw_line in all_lines:
        normalized = raw_line.lower().strip()
        if mentions(start_markers, normalized):
            in_section = True
            kept.append(raw_line)  # the header itself is part of the output
            continue
        if not in_section:
            continue
        if mentions(stop_markers, normalized):
            in_section = False
        else:
            kept.append(raw_line)

    # Pass 2: no headers found, so fall back to lines that look descriptive.
    if not kept:
        descriptive = re.compile(r'(provide|perform|responsible|service|work|duty|task)')
        kept = [
            raw_line
            for raw_line in all_lines
            if len(raw_line.strip()) > 30 or descriptive.search(raw_line.lower())
        ]

    if kept:
        return "\n".join(kept)
    # Last resort: the head of the document.
    return "\n".join(all_lines[:150])

def summarize_text(text, model, max_chunk_words=500):
    """Summarise *text* chunk by chunk with a summarisation callable.

    Texts of 100 whitespace-separated words or fewer are returned unchanged.
    Longer texts are cut into consecutive chunks of ``max_chunk_words`` words
    (the last chunk may be shorter); each chunk is summarised separately and
    the summaries are joined with newlines.

    Args:
        text: The text to summarise.
        model: Callable invoked as ``model(chunk, max_length=150,
            min_length=50, do_sample=False, truncation=True)`` that returns a
            sequence whose first item is a mapping with a ``'summary_text'``
            key (the shape produced by a Hugging Face summarisation pipeline).
        max_chunk_words: Number of words per chunk. Values below 1 behave
            like 1.

    Returns:
        The original text when it is short, otherwise the newline-joined
        chunk summaries.
    """
    words = text.split()
    if len(words) <= 100:
        return text

    # A non-positive size would make range() fail; one word per chunk is what
    # the word-by-word accumulation produced for such values.
    step = max(1, max_chunk_words)

    summaries = []
    for start in range(0, len(words), step):
        chunk_text = " ".join(words[start:start + step])
        # truncation=True keeps a token-dense chunk (tables, IDs, numbers)
        # from exceeding the model's maximum input length and crashing.
        result = model(
            chunk_text,
            max_length=150,
            min_length=50,
            do_sample=False,
            truncation=True,
        )
        summaries.append(result[0]['summary_text'])
    return "\n".join(summaries)

def compute_similarity(text1, text2):
    """Return the TF-IDF cosine similarity of two texts as a percentage.

    Both texts are vectorised together with a fresh ``TfidfVectorizer`` and
    the cosine similarity of the two resulting rows is scaled to 0-100.

    Args:
        text1: First document.
        text2: Second document.

    Returns:
        The similarity as a plain ``float`` rounded to two decimals; ``0.0``
        when the vectoriser cannot build a vocabulary from the inputs.
    """
    vectorizer = TfidfVectorizer()
    try:
        vectors = vectorizer.fit_transform([text1, text2])
    except ValueError:
        # scikit-learn raises ValueError ("empty vocabulary") when neither
        # text contains a usable token, e.g. a scanned PDF with no text layer.
        # Two texts with nothing in common score zero.
        return 0.0
    similarity = cosine_similarity(vectors[0:1], vectors[1:2])[0][0]
    # float() turns the NumPy scalar into a builtin float for callers.
    return round(float(similarity) * 100, 2)

def extract_skills_from_text(text):
    """Return the known skill keywords that occur in *text*.

    Matching is case-insensitive and anchored on word boundaries, so
    "JavaScript" does not count as "java".

    Args:
        text: Any free text (job description, resume, ...).

    Returns:
        The matching entries of the built-in skill list, in list order.
    """
    common_skills = [
        "python", "java", "javascript", "html", "css", "sql", "nosql", "aws", "azure",
        "cloud", "docker", "kubernetes", "agile", "scrum", "devops", "ci/cd", "git",
        "machine learning", "ai", "data analysis", "statistics", "communication",
        "leadership", "project management", "problem solving", "analytics", "microsoft",
        "development", "design", "testing", "security", "database", "programming", "software",
        "hardware", "network", "system", "maintenance", "support", "administration"
    ]

    # Lower-case once instead of once per skill.
    lowered = text.lower()
    return [
        skill
        for skill in common_skills
        if re.search(r'\b' + re.escape(skill) + r'\b', lowered)
    ]

def extract_specific_information(text):
    """Extract specific information like facilities, clinics, and locations.

    Each item is found with an independent regular expression; only the first
    match of each is reported and absent items are left out of the result.

    Args:
        text: The document text to scan.

    Returns:
        A dict with any of these string-valued keys:

        * ``'facilities'``    - phrase following "at"/"in" that names a
          clinic, unit, facility, center or hospital.
        * ``'clinics_count'`` - digits directly preceding "Clinics" /
          "Outpatient Clinics".
        * ``'location'``      - capitalised place name(s) following a
          location keyword, e.g. ``"Fresno, California"``.
        * ``'service_areas'`` - digits directly preceding "Service Areas" /
          "Geographic (Service) Areas".
    """
    info = {}

    # Extract clinics and facilities
    facilities_match = re.search(r'(?:at|in)\s+(?:the\s+)?([^,.]*(?:Clinics?|Units?|Facilities|Centers?|Hospitals?)[^,.]*)', text, re.IGNORECASE)
    if facilities_match:
        info['facilities'] = facilities_match.group(1).strip()

    # Look for specific mentions of clinics with numbers. The optional ')'
    # accepts the legal-style "four (4) Clinics" spelling.
    clinics_match = re.search(r'(\d+)\)?\s*(?:Clinics|Outpatient\s+Clinics)', text, re.IGNORECASE)
    if clinics_match:
        info['clinics_count'] = clinics_match.group(1)

    # Look for locations. Only the keywords are case-insensitive (scoped
    # (?i:...) groups): the place name must really start with a capital,
    # otherwise "located in Bakersfield, the county" would capture
    # "Bakersfield, the" and "located in the ..." would capture "the".
    location_match = re.search(
        r'(?i:located|location|address|based in|office in)\s+(?i:in\s+)?'
        r'([A-Z][a-zA-Z]+(?:,\s*[A-Z][a-zA-Z]+)*)',
        text,
    )
    if location_match:
        info['location'] = location_match.group(1)

    # Look for service areas; again tolerate "eleven (11) Geographic Service Areas".
    areas_match = re.search(r'(\d+)\)?\s*(?:Geographic\s+Service\s+Areas|Service\s+Areas|Geographic\s+Areas)', text, re.IGNORECASE)
    if areas_match:
        info['service_areas'] = areas_match.group(1)

    return info

# ✅ Step 5: Smart Hybrid Q&A Class
class SmartJDChatbot:
    """Hybrid question answering over a job-description text.

    ``ask_question`` first tries a set of regular-expression rules for
    recurring question types (facilities, location, geographic areas, counts,
    agreements, authority). When no rule answers, it retrieves the sentences
    most similar to the question with a sentence-embedding model and lets a
    text2text generation model answer from them; a weak generated answer
    triggers a final keyword search over the document.
    """

    # Sentence boundary: whitespace preceded by '.', '?' or '!', unless the
    # lookbehinds recognise an abbreviation-like pattern ("e.g.", "Dr.").
    _SENTENCE_SPLIT_RE = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s')

    def __init__(self):
        """Load the generation pipeline and the sentence-embedding model.

        The generator runs on GPU 0 when CUDA is available, otherwise on CPU.
        """
        from transformers import pipeline
        import torch
        device = 0 if torch.cuda.is_available() else -1
        self.generator = pipeline("text2text-generation", model="google/flan-t5-large", device=device)
        from sentence_transformers import SentenceTransformer
        self.sentence_model = SentenceTransformer('all-MiniLM-L6-v2')

    def find_relevant_sentences(self, context, question, top_k=5):
        """Return the sentences of *context* most similar to *question*.

        Args:
            context: Document text to search.
            question: The user's question.
            top_k: Maximum number of sentences to return.

        Returns:
            The best-scoring sentences (highest cosine similarity first)
            joined with ``". "``. If the text contains no sentence longer
            than 15 characters, the first 500 characters of *context*.
        """
        from sentence_transformers import util

        # Better sentence splitting that handles abbreviations and special cases
        sentences = self._SENTENCE_SPLIT_RE.split(context)
        sentences = [s.strip() for s in sentences if len(s.strip()) > 15]

        if not sentences:
            # Fallback to simpler splitting if no sentences were extracted
            sentences = [s.strip() for s in context.split('.') if len(s.strip()) > 15]

        if not sentences:
            return context[:500]  # Last resort fallback

        sentence_embeddings = self.sentence_model.encode(sentences, convert_to_tensor=True)
        question_embedding = self.sentence_model.encode(question, convert_to_tensor=True)

        similarities = util.pytorch_cos_sim(question_embedding, sentence_embeddings)[0]
        top_results = similarities.argsort(descending=True)[:top_k]

        return ". ".join([sentences[i] for i in top_results])

    @staticmethod
    def _rule_based_answer(context, q):
        """Return a regex-derived answer for known question types, else ``None``.

        *q* is the stripped, lower-cased question. Rules are tried in a fixed
        order and the first one that finds something in *context* wins.
        """
        # Specially handle facilities and clinics questions
        if any(keyword in q for keyword in ['clinic', 'facilit', 'hospital', 'center', 'unit']):
            # Look specifically for mentions of clinics and other facilities
            clinic_pattern = r'(?:KernBHRS|Kern\s+Behavioral\s+Health).{0,50}(?:Outpatient\s+Clinics|Clinics)'
            jail_pattern = r'(?:Kern\s+County\s+Jail|County\s+Jail)'
            csu_pattern = r'(?:Crisis\s+Stabilization\s+Unit|CSU)'

            facility_list = []
            if re.search(clinic_pattern, context, re.IGNORECASE):
                facility_list.append("KernBHRS Outpatient Clinics")
            if re.search(jail_pattern, context, re.IGNORECASE):
                facility_list.append("Kern County Jail")
            if re.search(csu_pattern, context, re.IGNORECASE):
                facility_list.append("KernBHRS Crisis Stabilization Unit")

            if facility_list:
                return "The facilities mentioned include: " + ", ".join(facility_list)

        # Location questions
        if any(word in q for word in ['where', 'location', 'address', 'based']):
            location_pattern = r'(?:located|location|address|based in|office in).*?((?:[A-Z][a-z]+,?\s?)+)'
            matches = re.search(location_pattern, context, re.IGNORECASE)
            if matches:
                return matches.group(0)

            # Look for any city names that might indicate location
            cities = ['Bakersfield', 'Los Angeles', 'San Francisco', 'Sacramento', 'Fresno']
            for city in cities:
                if city in context:
                    surrounding = re.search(r'[^.]*' + city + r'[^.]*\.?', context)
                    if surrounding:
                        return surrounding.group(0)

        # Geographic organization
        if 'geographic' in q or 'areas' in q or 'regions' in q:
            geo_pattern = r'(?:county|region|area).{0,50}divided into.{0,50}(?:geographic|service areas|regions)'
            matches = re.search(geo_pattern, context, re.IGNORECASE)
            if matches:
                return matches.group(0)

            # Look for numbers that might indicate geographic divisions
            number_areas = re.search(r'(\d+).{0,20}(?:geographic|service).{0,20}(?:areas|regions)', context, re.IGNORECASE)
            if number_areas:
                return number_areas.group(0)

        # Facilities questions (backup approach)
        if any(word in q for word in ['facilities', 'units', 'centers', 'clinics']) and 'how many' in q:
            # Capture the noun as well so the answer names what was actually
            # counted instead of always saying "clinics".
            number_pattern = r'(\d+)\s*(clinics|units|facilities|centers|hospitals)'
            matches = re.search(number_pattern, context, re.IGNORECASE)
            if matches:
                return f"There are {matches.group(1)} {matches.group(2).lower()} mentioned."

            # If no specific number is found, look for lists of facilities
            facilities_list = re.findall(r'(?:Outpatient|KernBHRS|Department|County)\s+(?:Clinics?|Units?|Centers?)', context, re.IGNORECASE)
            if facilities_list:
                return f"The facilities mentioned include: {', '.join(facilities_list)}"

        # Agreement questions
        if 'agreement' in q or 'contract' in q:
            agreement_pattern = r'(\d+).{0,30}(?:agreements|contracts).{0,100}(?:negotiated|executed)'
            matches = re.search(agreement_pattern, context, re.IGNORECASE)
            if matches:
                return matches.group(0)

        # Authority questions
        if 'authority' in q or 'bind' in q or 'binding' in q:
            authority_pattern = r'(?:authority|authorized).{0,100}(?:bind|binding).{0,100}(?:County|contract)'
            matches = re.search(authority_pattern, context, re.IGNORECASE)
            if matches:
                return matches.group(0)

        return None

    @classmethod
    def _fallback_answer(cls, context, q):
        """Search *context* directly when the generated answer is unusable.

        Returns the first sentence mentioning a facility keyword that also
        occurs in the lower-cased question *q*; failing that, the first
        facility phrase found within 500 characters of a "Request for
        Proposal" title line; otherwise ``None``.
        """
        # Split lazily: only needed if the question names a facility type.
        sentences = None
        for facility_type in ["clinic", "outpatient", "jail", "crisis", "stabilization", "unit"]:
            if facility_type not in q:
                continue
            if sentences is None:
                sentences = cls._SENTENCE_SPLIT_RE.split(context)
            # Look for sentences containing this facility type
            for sentence in sentences:
                if facility_type in sentence.lower():
                    return sentence.strip()

        # If still no good answer, look for RFP title or header which often contains facility info
        rfp_title = re.search(r'Request for Proposal.*?(?:provide|services).*?\n', context, re.IGNORECASE)
        if rfp_title:
            services_section = context[rfp_title.start():rfp_title.start()+500]  # Get a chunk after the title
            facility_mentions = re.findall(r'(?:at|in)\s+[^.]*?(?:Clinics?|Units?|Facilities|Jail)[^.]*\.', services_section, re.IGNORECASE)
            if facility_mentions:
                return facility_mentions[0]

        return None

    def ask_question(self, context, question):
        """Answer *question* about *context*.

        Args:
            context: Full document text.
            question: The user's question in natural language.

        Returns:
            The answer string. Any exception raised while answering is caught
            and reported as ``"⚠️ Error: <message>"`` instead of propagating,
            so the interactive loop calling this method keeps running.
        """
        try:
            q = question.strip().lower()

            # 1️⃣ Rule-based direct answers for specific question types
            rule_answer = self._rule_based_answer(context, q)
            if rule_answer is not None:
                return rule_answer

            # 2️⃣ Semantic retrieval
            relevant_context = self.find_relevant_sentences(context, question)

            # 3️⃣ Generative answer
            prompt = f"""Based on the following job description excerpt:

{relevant_context}

Answer the question clearly and professionally:
{question}"""
            answer = self.generator(prompt, max_length=200, do_sample=False)[0]['generated_text']

            # 4️⃣ Post-processing to enhance the answer
            if len(answer.strip()) < 10 or "don't know" in answer.lower() or "no information" in answer.lower():
                fallback = self._fallback_answer(context, q)
                if fallback is not None:
                    return fallback

            return answer.strip()

        except Exception as e:
            return f"⚠️ Error: {e}"

# ✅ Step 6: Run the analysis

# Load and summarize
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

jd_text_full = extract_text_from_pdf(jd_path)
resume_text = extract_text_from_pdf(resume_path)

jd_scope_text = extract_scope_sections(jd_text_full)
jd_summary = summarize_text(jd_scope_text, summarizer)
resume_summary = summarize_text(resume_text, summarizer)

# Extract specific information for better context
specific_info = extract_specific_information(jd_text_full)

# Match score: similarity between the JD's extracted scope sections (not the
# full JD) and the full resume text
score = compute_similarity(jd_scope_text, resume_text)

# Extract skills for recommendation
jd_skills = extract_skills_from_text(jd_text_full)
resume_skills = extract_skills_from_text(resume_text)
matching_skills = set(jd_skills).intersection(set(resume_skills))
# Sorted so the recommendation prints in the same order on every run
# (plain set iteration order is arbitrary).
missing_skills = sorted(set(jd_skills) - set(resume_skills))

# Output
print("\n📄 Job Description Summary:\n", jd_summary)
print("\n👤 Resume Summary:\n", resume_summary)
print(f"\n📊 Match Score: {score}%")
if score >= 75:
    print("✅ Strong match!")
elif score >= 50:
    print("⚠️ Moderate match.")
else:
    print("❌ Low match. Consider highlighting these skills in your resume:", ", ".join(missing_skills))

# Display extracted specific information if available
if specific_info:
    print("\n📋 Key Information Extracted:")
    for key, value in specific_info.items():
        print(f"- {key.replace('_', ' ').title()}: {value}")

# Q&A
chatbot = SmartJDChatbot()

# The document does not change between questions, so derive these once
# instead of on every loop iteration.
jd_text_lower = jd_text_full.lower()
first_500_chars = jd_text_full[:500]
# Direct facility mentions in the title or first few paragraphs
facilities_in_title = re.search(r'(?:at|in)\s+(?:the\s+)?([^,.]*(?:Clinics?|Units?|Facilities|Centers?|Hospitals?|Jail)[^,.]*)', first_500_chars, re.IGNORECASE)

print("\n💬 Ask questions about the job description (type 'quit' to stop):")
while True:
    question = input("Your question: ").strip()
    if question.lower() == "quit":
        break

    # For facilities/clinics questions, use a specialized approach
    is_facility_question = any(
        keyword in question.lower()
        for keyword in ['clinic', 'facilit', 'hospital', 'center', 'unit', 'how many']
    )

    if is_facility_question and facilities_in_title:
        answer = "The facilities mentioned include: " + facilities_in_title.group(1)
        # Look for more specific mentions later in the text.
        # NOTE(review): these display names are specific to the KernBHRS
        # document and are added whenever the generic keyword occurs anywhere
        # in the JD — confirm this is intended for other documents.
        additional = []
        if "clinic" in jd_text_lower:
            additional.append("KernBHRS Outpatient Clinics")
        if "jail" in jd_text_lower:
            additional.append("Kern County Jail")
        if "crisis" in jd_text_lower and "unit" in jd_text_lower:
            additional.append("Crisis Stabilization Unit")

        if additional:
            answer += "\nSpecifically: " + ", ".join(additional)
    else:
        # Every other question, and facility questions with no title match
        answer = chatbot.ask_question(jd_text_full, question)

    print("\nAnswer:", answer)

Device set to use cpu



📄 Job Description Summary:
 The Kern Behavioral Health and Recovery Services (KernBHRS) administration office is located in Bakersfield, the county sear, in the southern region of the San Joaquin Valley. The Department’s goal is to ensure the citizens of Kern County who are afflicted with mental and behavioral health disorders are provided with services and resources necessary for their treatment and recovery. The County is divided into eleven (11) Geographic Service Areas for serving individuals needing mental health care.
4 clinics, Kern County Jail or Crisis Stabilization Unit located in the greater Bakersfield area. The Department expects to spend approximately $2,100,000 per fiscal year for these services among all providers. Three Agreements will be negotiated between KernBHRS and the prospective service provider.
The parties agree that the venue of any action relating to this agreement shall be in the County of Kern. It is understood that Contractor, in Contractor’s performance

Device set to use cpu



💬 Ask questions about the job description (type 'quit' to stop):

Answer: The facilities mentioned include: KernBHRS Outpatient Clinics, Kern County Jail, KernBHRS Crisis Stabilization Unit
Your question: quit
