<a href="https://colab.research.google.com/github/dhwanishhh/smart-resume-analyzer/blob/main/RESUME_ANALYZER2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Install required packages
!pip install gradio spacy PyPDF2 scikit-learn nltk python-docx pyngrok
!python -m spacy download en_core_web_sm
!python -m nltk.downloader stopwords

# Import libraries
import gradio as gr
import spacy
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import PyPDF2
import docx
import re
import nltk
from pyngrok import ngrok
from typing import List, Dict, Any

# Initialize NLP
nlp = spacy.load("en_core_web_sm")
stop_words = set(stopwords.words('english'))

# Text extraction functions
def extract_text_from_pdf(file_obj):
    """Extract text from PDF file"""
    reader = PyPDF2.PdfReader(file_obj)
    text = ""
    for page in reader.pages:
        text += page.extract_text()
    return text

def extract_text_from_docx(file_obj):
    """Extract text from DOCX file"""
    doc = docx.Document(file_obj)
    return "\n".join([para.text for para in doc.paragraphs])

# Text processing functions
def preprocess_text(text: str) -> str:
    """Clean and preprocess text"""
    text = re.sub(r'\s+', ' ', text)  # Remove extra whitespace
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text

# Information extraction functions
def extract_skills(text: str) -> List[str]:
    """Extract skills from text using NLP"""
    doc = nlp(text)
    skills = []

    # Extract noun phrases
    for chunk in doc.noun_chunks:
        if chunk.text.lower() not in stop_words:
            skills.append(chunk.text)

    # Hardcoded skills list
    skill_keywords = ['python', 'java', 'sql', 'machine learning', 'nlp',
                     'tensorflow', 'pytorch', 'scikit-learn', 'data analysis',
                     'pandas', 'numpy', 'flask', 'django', 'react', 'aws',
                     'docker', 'kubernetes', 'git', 'linux', 'statistics']

    for token in doc:
        if token.text.lower() in skill_keywords:
            skills.append(token.text)

    return list(set(skills))

def extract_experience(text: str) -> str:
    """Extract experience duration from text"""
    experience_pattern = r'(\d+\+?\s*(years?|yrs?)\s*.*?experience)'
    matches = re.findall(experience_pattern, text, re.IGNORECASE)
    return matches[0][0] if matches else "Experience not specified"

def extract_education(text: str) -> List[str]:
    """Extract education information from text"""
    education = []
    education_keywords = ['bachelor', 'master', 'phd', 'degree', 'diploma',
                         'school', 'university', 'college', 'institute']

    for sent in text.split('.'):
        for word in education_keywords:
            if word in sent.lower():
                education.append(sent.strip())
                break

    return education if education else ["Education not specified"]

# Resume processing functions
def process_resume(file_obj) -> Dict[str, Any]:
    """Process a resume file and extract information"""
    filename = file_obj.name
    if filename.endswith('.pdf'):
        raw_text = extract_text_from_pdf(file_obj)
    elif filename.endswith('.docx'):
        raw_text = extract_text_from_docx(file_obj)
    else:
        return {"error": "Unsupported file format"}

    processed_text = preprocess_text(raw_text)

    return {
        "filename": filename,
        "raw_text": raw_text,
        "skills": extract_skills(processed_text),
        "experience": extract_experience(raw_text),
        "education": extract_education(raw_text),
        "processed_text": processed_text
    }

def compare_with_jd(resumes: List[Dict[str, Any]], jd_text: str) -> List[Dict[str, Any]]:
    """Compare resumes with job description and rank them"""
    if not jd_text.strip():
        return sorted(resumes, key=lambda x: x['filename'])

    processed_jd = preprocess_text(jd_text)
    corpus = [processed_jd] + [resume['processed_text'] for resume in resumes]

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(corpus)

    similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()

    for i, resume in enumerate(resumes):
        resume['similarity_score'] = float(similarities[i])

    return sorted(resumes, key=lambda x: x.get('similarity_score', 0), reverse=True)

# Main analysis function
def analyze_resumes(resume_files, jd_text):
    """Main function to process resumes and compare with JD"""
    processed_resumes = []
    for file in resume_files:
        try:
            processed = process_resume(file)
            if "error" not in processed:
                processed_resumes.append(processed)
        except Exception as e:
            print(f"Error processing {file.name}: {str(e)}")

    ranked_resumes = compare_with_jd(processed_resumes, jd_text)

    # Prepare output for Gradio
    output = []
    for i, resume in enumerate(ranked_resumes):
        output.append([
            i+1,
            resume['filename'],
            f"{resume.get('similarity_score', 0)*100:.1f}%" if 'similarity_score' in resume else "N/A",
            ", ".join(resume['skills'][:5]) + ("..." if len(resume['skills']) > 5 else ""),
            resume['experience'],
            resume['education'][0] if resume['education'] else ""
        ])

    return output

# Create Gradio interface
with gr.Blocks(title="Smart Resume Analyzer") as interface:
    gr.Markdown("# Smart Resume Analyzer")
    gr.Markdown("Upload multiple resumes and a job description to rank candidates based on relevance")

    with gr.Row():
        with gr.Column():
            resume_input = gr.File(file_count="multiple", label="Upload Resumes (PDF/DOCX)")
            jd_input = gr.Textbox(label="Job Description", lines=5)
            submit_btn = gr.Button("Analyze Resumes")

        with gr.Column():
            results_table = gr.Dataframe(
                headers=["Rank", "Filename", "Match Score", "Top Skills", "Experience", "Education"],
                datatype=["number", "str", "str", "str", "str", "str"],
                col_count=(6, "fixed")
            )

    submit_btn.click(
        fn=analyze_resumes,
        inputs=[resume_input, jd_input],
        outputs=results_table
    )

# Launch with ngrok for public access
try:
    # Get your ngrok authtoken from https://dashboard.ngrok.com/auth
    NGROK_AUTH_TOKEN = "2vowxugOFKbLM3V1m4SbBsvPl0w_6ghXTMGQ6qfbrG3iBfmta"  # Replace with your actual ngrok token
    ngrok.set_auth_token(NGROK_AUTH_TOKEN)
    public_url = ngrok.connect(7860)
    print(" * Public URL:", public_url)
    interface.launch()
except Exception as e:
    print(f"Error with ngrok: {e}")
    print("Launching without ngrok")
    interface.launch(share=True)  # Fallback to Gradio share

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m66.8 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


ERROR:pyngrok.process.ngrok:t=2025-04-21T06:09:13+0000 lvl=eror msg="failed to reconnect session" obj=tunnels.session err="authentication failed: Your account is limited to 1 simultaneous ngrok agent sessions.\nYou can run multiple simultaneous tunnels from a single agent session by defining the tunnels in your agent configuration file and starting them with the command `ngrok start --all`.\nRead more about the agent configuration file: https://ngrok.com/docs/secure-tunnels/ngrok-agent/reference/config\nYou can view your current agent sessions in the dashboard:\nhttps://dashboard.ngrok.com/agents\r\n\r\nERR_NGROK_108\r\n"


Error with ngrok: The ngrok process errored on start: authentication failed: Your account is limited to 1 simultaneous ngrok agent sessions.\nYou can run multiple simultaneous tunnels from a single agent session by defining the tunnels in your agent configuration file and starting them with the command `ngrok start --all`.\nRead more about the agent configuration file: https://ngrok.com/docs/secure-tunnels/ngrok-agent/reference/config\nYou can view your current agent sessions in the dashboard:\nhttps://dashboard.ngrok.com/agents\r\n\r\nERR_NGROK_108\r\n.
Launching without ngrok
Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://e2400d9855c652cd26.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
