In [None]:
# Using Google Colab
# Necessary package installation
!pip install pdfplumber
!pip install spacy
!pip install stanza
!pip install nltk

In [None]:
# When facing issues with spacy
!pip install --force-reinstall spacy

In [None]:
import pdfplumber
import spacy
import nltk
from stanza.server import CoreNLPClient
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords

# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Extract Text from PDF
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF as a single string.

    Args:
        pdf_file: Path (or file-like object) handed to ``pdfplumber.open``.

    Returns:
        The page texts joined with newlines. Pages for which pdfplumber
        yields no text (e.g. scanned/image-only pages) contribute an empty
        string instead of raising.
    """
    with pdfplumber.open(pdf_file) as pdf:
        # extract_text() may give None/"" for a page without a text layer;
        # `or ""` keeps such pages from breaking the concatenation.
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    # Join with a newline so the last line of one page is not fused with the
    # first line of the next (downstream code splits the text on "\n").
    return "\n".join(page_texts)

# NLP Analysis with spaCy
# Load spaCy's pre-trained model
nlp = spacy.load("en_core_web_sm")
def analyze_with_spacy(text):
    """Run the module-level spaCy pipeline ``nlp`` over ``text``.

    Returns:
        A ``(named_entities, tokens)`` tuple where ``named_entities`` is a
        list of ``(entity_text, entity_label)`` pairs and ``tokens`` is the
        list of token texts that are neither stop words nor punctuation.
    """
    parsed = nlp(text)

    # Named entities as (text, label) pairs
    named_entities = []
    for span in parsed.ents:
        named_entities.append((span.text, span.label_))

    # Keep only content tokens: drop stop words and punctuation
    tokens = []
    for tok in parsed:
        if tok.is_stop or tok.is_punct:
            continue
        tokens.append(tok.text)

    return named_entities, tokens

# Additional Processing with NLTK
def analyze_with_nltk(text):
    """Tokenize ``text`` with NLTK.

    Returns:
        A ``(sentences, filtered_words)`` tuple: the sentence list from
        ``sent_tokenize`` and the word tokens that are purely alphabetic and
        are not English stop words (compared in lower case).
    """
    # Sentence split first, then word split
    sentences = sent_tokenize(text)
    word_tokens = word_tokenize(text)

    english_stops = set(stopwords.words('english'))
    filtered_words = []
    for candidate in word_tokens:
        if candidate.lower() in english_stops:
            continue
        if candidate.isalpha():
            filtered_words.append(candidate)

    return sentences, filtered_words

# Advanced Analysis with Stanford CoreNLP
def analyze_with_corenlp(text):
    """Annotate ``text`` through a temporary Stanford CoreNLP client.

    A ``CoreNLPClient`` is opened as a context manager for the duration of
    the call and the result of ``client.annotate(text)`` is returned.
    """
    client_options = {
        'annotators': ['tokenize', 'ssplit', 'ner', 'depparse', 'coref'],
        'timeout': 30000,
        # NOTE(review): 16G may exceed the RAM of a default Colab runtime - confirm
        'memory': '16G',
    }
    with CoreNLPClient(**client_options) as corenlp:
        return corenlp.annotate(text)

# Extract and Filter Relevant Information
def extract_information(text, entities, tokens):
    """Build a summary dict of key resume fields.

    Args:
        text: Full resume text. Searched for an e-mail address, a phone
            number and skill phrases.
        entities: Iterable of ``(entity_text, label)`` pairs, e.g. the first
            value returned by ``analyze_with_spacy``.
        tokens: Iterable of token strings, used for skill matching.

    Returns:
        dict with string values under 'Name', 'Email', 'Phone' and lists of
        strings under 'Skills' and 'Education'.
    """
    import re  # local import: this cell's import list does not include re

    personal_info = {
        'Name': '',
        'Email': '',
        'Phone': '',
        'Skills': [],
        'Education': []
    }

    for entity, label in entities:
        if label == "PERSON":
            # Only the first PERSON entity is taken as the candidate's name
            if not personal_info['Name']:
                personal_info['Name'] = entity
        elif label in ("ORG", "GPE"):
            # Avoid listing the same organisation/place repeatedly
            if entity not in personal_info['Education']:
                personal_info['Education'].append(entity)
        elif label == "EMAIL":
            personal_info['Email'] = entity
        elif label == "PHONE":
            personal_info['Phone'] = entity

    # EMAIL / PHONE labels only appear with a custom NER pipeline; the label
    # set of spaCy's stock English models does not include them (see the
    # spaCy model documentation). Fall back to pattern matching on the raw
    # text so these fields are not left empty.
    if not personal_info['Email']:
        email_match = re.search(r'[\w.+-]+@[\w-]+(?:\.[\w-]+)+', text)
        if email_match:
            personal_info['Email'] = email_match.group(0)
    if not personal_info['Phone']:
        phone_match = re.search(
            r'(?:\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}', text)
        if phone_match:
            personal_info['Phone'] = phone_match.group(0).strip()

    # Example for filtering skills (custom keyword matching).
    # Matching is case-insensitive; a keyword counts as present when it is
    # one of the tokens or occurs as a whole phrase in the text, so
    # multi-word skills such as "Problem Solving" can be found too.
    skill_keywords = ['Programming', 'Management', 'Communication', 'Problem Solving', 'Leadership']
    lowered_tokens = {token.lower() for token in tokens}
    for keyword in skill_keywords:
        phrase = r'\b' + r'\s+'.join(re.escape(part) for part in keyword.split()) + r'\b'
        if keyword.lower() in lowered_tokens or re.search(phrase, text, re.IGNORECASE):
            personal_info['Skills'].append(keyword)

    return personal_info

# Display Extracted Information
def display_extracted_info(info):
    """Print the resume summary held in ``info`` to standard output.

    ``info`` must provide 'Name', 'Email' and 'Phone' values plus 'Skills'
    and 'Education' iterables of strings (printed comma-separated).
    """
    print("----- Resume Information -----")
    # Scalar fields are printed as-is, in a fixed order
    for field in ('Name', 'Email', 'Phone'):
        print(f"{field}: {info[field]}")
    # List fields are flattened into one comma-separated line each
    for field in ('Skills', 'Education'):
        print(f"{field}: {', '.join(info[field])}")

# Main Workflow
def main(pdf_file):
    """Run the resume pipeline on ``pdf_file`` and print every stage.

    Steps: extract the PDF text, analyse it with spaCy and NLTK (printing the
    intermediate results), then build and display the summary fields.
    """
    # Stage 1: raw text
    resume_text = extract_text_from_pdf(pdf_file)
    print("Extracted Resume Text:", resume_text, sep="\n")

    # Stage 2: spaCy entities and filtered tokens
    entities, tokens = analyze_with_spacy(resume_text)
    print("\nNamed Entities:", entities, sep="\n")
    print("\nFiltered Tokens (spaCy):", tokens, sep="\n")

    # Stage 3: NLTK sentences and filtered words
    sentences, filtered_words = analyze_with_nltk(resume_text)
    print("\nSentences (NLTK):", sentences, sep="\n")
    print("\nFiltered Words (NLTK):", filtered_words, sep="\n")

    # Stage 4: summary fields (Name, Email, Skills, etc.) and final report
    display_extracted_info(extract_information(resume_text, entities, tokens))

# Script entry point: run the pipeline on a single resume.
# Replace the path below with the location of the resume PDF to analyse.
if __name__ == "__main__":
    pdf_file = '/content/Resume.pdf'
    main(pdf_file)

In [None]:
# Download English model if not downloaded
import stanza
stanza.download('en')

In [None]:
import re
import pdfplumber
import spacy
import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from stanza.server import CoreNLPClient

# Download NLTK data if not already downloaded
nltk.download('punkt')
nltk.download('stopwords')

# Initialize Stanza
nlp_stanza = stanza.Pipeline('en')  # Initialize the Stanza Pipeline

# Initialize spaCy
nlp_spacy = spacy.load("en_core_web_sm")

# Extract Text from PDF
def extract_text_from_pdf(pdf_file):
    """Read a PDF with pdfplumber and return its text as one string.

    Args:
        pdf_file: Path (or file-like object) passed to ``pdfplumber.open``.

    Returns:
        Text of all pages, one page after another, separated by a newline.
        A page that yields no text (``None`` or an empty string, e.g. an
        image-only page) is treated as empty rather than causing an error.
    """
    pieces = []
    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            # Guard against pages without extractable text
            pieces.append(page.extract_text() or "")
    # A newline between pages keeps line-based parsing from merging the end
    # of one page with the start of the next.
    return "\n".join(pieces)

# NLP Analysis with spaCy
def analyze_with_spacy(text):
    """Process ``text`` with the module-level ``nlp_spacy`` pipeline.

    Returns:
        ``(named_entities, tokens)``: a list of ``(text, label)`` pairs for
        the recognised entities, and the texts of all tokens that are not
        stop words and not punctuation.
    """
    analysed = nlp_spacy(text)

    def is_content(tok):
        # A content token is anything that is neither a stop word nor punctuation
        return not (tok.is_stop or tok.is_punct)

    named_entities = [(found.text, found.label_) for found in analysed.ents]
    tokens = [tok.text for tok in analysed if is_content(tok)]
    return named_entities, tokens

# Regex-Based Extraction for Contact Info and Duration
def extract_contact_info(text):
    """Extract a candidate's name, phone number and e-mail using regexes.

    Args:
        text: Full resume text.

    Returns:
        ``(name, contact_number, email)``; each element is an empty string
        when nothing suitable was found.
    """
    name, contact_number, email = '', '', ''

    # Extract email. Case-insensitive and without an upper bound on the TLD
    # length, so "JOHN@EXAMPLE.COM" and "x@y.technology" are captured whole.
    email_match = re.search(r'[\w.-]+@[\w.-]+\.[a-z]{2,}', text, re.IGNORECASE)
    if email_match:
        email = email_match.group(0).strip()

    # Extract phone number. The country code is optional so that plain
    # "555-123-4567" style numbers are recognised as well.
    phone_match = re.search(
        r'(?:\+?\d{1,2}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}|\d{10}', text)
    if phone_match:
        contact_number = phone_match.group(0).strip()

    # Extract name: two or three capitalised words near the top of the
    # resume. Only spaces/tabs may separate the words, otherwise the first
    # word of the next line (e.g. a job title) gets glued onto the name.
    name_match = re.search(
        r'\b[A-Z][a-z]+[ \t][A-Z][a-z]+(?:[ \t][A-Z][a-z]+)?\b', text[:200])
    if name_match:
        name = name_match.group(0).strip()
    if any(word in name.lower() for word in ["apartment", "road", "street", "complex"]):
        # Reset name if it's mistakenly captured as an address
        name = ""

    return name, contact_number, email

def extract_duration(text):
    """Find date ranges (durations) such as "2019 - 2021" in ``text``.

    Recognised shapes (case-insensitive); every range may also end with
    "present", "now" or "ongoing" instead of a closing date:
      * year to year:                 ``2018-2020``, ``2019 to present``
      * month [day] year ranges:      ``Jan 2020 - March 2021``
      * abbreviated month-year:       ``jan/2020-dec/2021``
      * numeric dates:                ``01/02/2020-03/04/2021``
      * "from <month year> to ...":   ``from March 2019 to June 2021``

    Args:
        text: Text to scan.

    Returns:
        List of the matched duration strings in order of appearance.
    """
    month = (r'(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|jul(?:y)?|'
             r'aug(?:ust)?|sep(?:tember)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)')
    short_month = r'(?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)'
    # Hyphen, en dash, em dash or the word "to", optionally padded with
    # spaces/tabs. Newlines are excluded so a match never spans two lines.
    separator = r'[ \t]*(?:[-\u2013\u2014]+|to)[ \t]*'
    open_end = r'(?:present|now|ongoing)'

    month_date = month + r'(?:\s+\d{1,2})?\s+\d{4}'
    short_date = short_month + r'[-/]\d{4}'
    numeric_date = r'\d{1,2}[-/]\d{1,2}[-/]\d{2,4}'

    def date_range(start, end):
        # "<start><separator>(<end> | present/now/ongoing)"
        return start + separator + r'(?:' + end + r'|' + open_end + r')'

    alternatives = [
        date_range(r'\d{4}', r'\d{4}'),
        date_range(month_date, month_date),
        date_range(short_date, short_date),
        date_range(numeric_date, numeric_date),
        r'from\s+' + month_date + r'\s+to\s+(?:' + month_date + r'|' + open_end + r')',
    ]
    # All groups are non-capturing so findall returns the whole match.
    # Word boundaries on both sides stop matches inside longer words/numbers,
    # and the open-ended keywords are only valid as the end of a range, so
    # words like "presentation" or "now" on their own are not durations.
    duration_regex = r'\b(?:' + '|'.join(alternatives) + r')\b'
    duration_matches = re.findall(duration_regex, text, re.IGNORECASE)
    return duration_matches

# NLP Analysis with Stanza for Section Analysis
def analyze_with_stanza(text):
    """Split ``text`` into sentences with the module-level Stanza pipeline.

    Args:
        text: Text to segment. May be empty.

    Returns:
        List of sentence strings; an empty list for empty or
        whitespace-only input.
    """
    # Callers pass '' when a resume section is missing. There is nothing to
    # segment in that case, so skip the (expensive) pipeline call entirely.
    # NOTE(review): also avoids relying on how the installed Stanza version
    # handles an empty document - confirm if that behaviour matters.
    if not text or not text.strip():
        return []
    doc = nlp_stanza(text)
    return [sentence.text for sentence in doc.sentences]

# Extract and Filter Relevant Information by Sections
def _split_resume_sections(text, section_names):
    """Slice ``text`` into the named resume sections.

    A section starts at the first heading-like line (at most four words)
    that contains the section name as a word. If no such line exists, it
    starts at the first case-insensitive occurrence of the name anywhere in
    the text. A section ends where the next detected heading begins, or at
    the end of the text. Missing sections map to ''.
    """
    heading_starts = {}
    offset = 0
    for line in text.split("\n"):
        words = re.findall(r'[a-z]+', line.lower())
        if 0 < len(words) <= 4:
            for section in section_names:
                if section in words:
                    # Keep the first heading found for each section
                    heading_starts.setdefault(section, offset)
        # Offsets are tracked on the original text (not a lower-cased copy),
        # so slicing stays correct even if lower-casing changes lengths.
        offset += len(line) + 1

    boundaries = sorted(set(heading_starts.values()))
    sections = {}
    for section in section_names:
        begin = heading_starts.get(section)
        if begin is None:
            fallback = re.search(re.escape(section), text, re.IGNORECASE)
            if fallback is None:
                sections[section] = ''
                continue
            begin = fallback.start()
        end = next((mark for mark in boundaries if mark > begin), len(text))
        sections[section] = text[begin:end]
    return sections


def extract_information(text, entities, tokens, name, contact_number, email, duration_matches):
    """Assemble the structured resume record.

    Args:
        text: Full resume text.
        entities: Named entities from spaCy (currently not used here).
        tokens: Token strings, used for skill keyword matching.
        name: Candidate name, copied to 'Name'.
        contact_number: Phone number, copied to 'Phone'.
        email: E-mail address, copied to 'Email'.
        duration_matches: Duration strings found in the text; a line that
            contains one of them is recorded as a 'Duration'.

    Returns:
        dict with 'Name', 'Email', 'Phone' (strings), 'Skills' and
        'Languages' (lists of strings), and 'Education' / 'Experience'
        (lists of dicts with optional keys 'Degree', 'Institution',
        'Duration', 'Score' and 'Company', 'Role', 'Duration' respectively).
    """
    personal_info = {
        'Name': name,
        'Email': email,
        'Phone': contact_number,
        'Skills': [],
        'Education': [],
        'Experience': [],
        'Languages': []
    }

    # Splitting the resume into sections for specific data extraction.
    # Each section is bounded by the next heading so that, for example,
    # experience dates do not overwrite education details.
    sections = _split_resume_sections(
        text, ('education', 'experience', 'skills', 'languages'))

    # Extracting education details from the 'education' section
    education_info = []
    current_edu = {}
    for line in sections['education'].split("\n"):
        lowered = line.lower()
        if 'interests' in lowered:
            continue
        if 'degree' in lowered or 'bachelor' in lowered or 'master' in lowered:
            # A degree line starts a new education record
            if current_edu:
                education_info.append(current_edu)
                current_edu = {}
            current_edu['Degree'] = line.strip()
        elif 'university' in lowered or 'college' in lowered or 'institute' in lowered:
            current_edu['Institution'] = line.strip()
        elif any(dur in line for dur in duration_matches):
            # Checking for duration using regex matches
            current_edu['Duration'] = line.strip()
        elif 'gpa' in lowered or 'score' in lowered:
            current_edu['Score'] = line.strip()
    if current_edu:
        education_info.append(current_edu)
    personal_info['Education'] = education_info

    # Extracting experience details from the 'experience' section
    experience_info = []
    current_exp = {}
    for line in sections['experience'].split("\n"):
        lowered = line.lower()
        if 'interests' in lowered:
            continue
        if 'company' in lowered or 'firm' in lowered or 'organization' in lowered:
            # A company line starts a new experience record
            if current_exp:
                experience_info.append(current_exp)
                current_exp = {}
            current_exp['Company'] = line.strip()
        elif 'role' in lowered or 'manager' in lowered or 'intern' in lowered:
            current_exp['Role'] = line.strip()
        elif any(dur in line for dur in duration_matches):
            # Checking for duration using regex matches
            current_exp['Duration'] = line.strip()
    if current_exp:
        experience_info.append(current_exp)
    personal_info['Experience'] = experience_info

    # Extracting skills (using keywords). Matching is case-insensitive; a
    # keyword counts when it is one of the tokens or occurs as a whole phrase
    # in the text, so multi-word skills are found too. Results follow the
    # keyword order, which keeps the output deterministic and duplicate-free.
    skill_keywords = ['Programming', 'Management', 'Communication', 'Problem Solving', 'Leadership']
    lowered_tokens = {token.lower() for token in tokens}
    for keyword in skill_keywords:
        phrase = r'\b' + r'\s+'.join(re.escape(part) for part in keyword.split()) + r'\b'
        if keyword.lower() in lowered_tokens or re.search(phrase, text, re.IGNORECASE):
            personal_info['Skills'].append(keyword)

    # Extracting languages from the 'languages' section
    language_keywords = ['English', 'Spanish', 'French', 'German', 'Hindi', 'Tamil']
    languages_text = sections['languages']
    if languages_text.strip():  # no section -> nothing to analyse
        for sentence in analyze_with_stanza(languages_text):
            lowered_sentence = sentence.lower()
            for language in language_keywords:
                if language.lower() in lowered_sentence and language not in personal_info['Languages']:
                    personal_info['Languages'].append(language)

    return personal_info

# Display Extracted Information in Specified Format
def display_extracted_info(info):
    """Print the structured resume record ``info`` section by section.

    Empty personal fields and empty sections are skipped. Education and
    experience records are dicts; a missing key is printed as an empty
    value.
    """
    print("----- Extracted Resume Information -----")

    # Personal Information (only the fields that have a value)
    personal_fields = (("Name", 'Name'), ("Contact Number", 'Phone'), ("Email", 'Email'))
    for caption, key in personal_fields:
        if info[key]:
            print(f"{caption}: {info[key]}")

    # Education Section
    education = info['Education']
    if education:
        print("\nEducation:")
        for record in education:
            print(f"Institution Name: {record.get('Institution', '')}")
            print(f"Degree: {record.get('Degree', '')}")
            print(f"Duration: {record.get('Duration', '')}")
            # Trailing newline leaves a blank line after each record
            print(f"Score: {record.get('Score', '')}\n")

    # Experience Section
    experience = info['Experience']
    if experience:
        print("Experience:")
        for record in experience:
            print(f"Company Name: {record.get('Company', '')}")
            print(f"Role: {record.get('Role', '')}")
            print(f"Duration: {record.get('Duration', '')}\n")

    # Skills Section
    skills = info['Skills']
    if skills:
        print("Skills:")
        for item in skills:
            print(f"- {item}")

    # Languages Section
    languages = info['Languages']
    if languages:
        print("\nLanguages Known:")
        for item in languages:
            print(f"- {item}")

def main(pdf_file):
    """Parse the resume at ``pdf_file`` and print the extracted details.

    Pipeline: PDF text extraction, spaCy analysis, regex-based contact and
    duration extraction, section-wise information extraction, then display.
    """
    text = extract_text_from_pdf(pdf_file)

    # spaCy entities and filtered tokens
    named_entities, spacy_tokens = analyze_with_spacy(text)

    # (name, contact_number, email) found by regex
    contact = extract_contact_info(text)

    # Date ranges used to recognise duration lines
    durations = extract_duration(text)

    # Build the structured record (Name, Email, Skills, etc.) and show it
    info = extract_information(text, named_entities, spacy_tokens, *contact, durations)
    display_extracted_info(info)

# Script entry point: parse one resume and print the structured result.
# Replace the path below with the location of the resume PDF to analyse.
if __name__ == "__main__":
    pdf_file = '/content/Resume.pdf'
    main(pdf_file)