In [415]:
import fitz  # PyMuPDF
from PIL import Image
import pytesseract

import docx
import re
import spacy
import os

pytesseract.pytesseract.tesseract_cmd = "C:\Program Files\Tesseract-OCR\/tesseract.exe"

In [416]:
nlp = spacy.load("en_core_web_sm")

In [417]:
def extract_text_with_fitz(pdf_path):
    # Open the PDF file
    doc = fitz.open(pdf_path)
    extracted_text = ""

    # Iterate through each page
    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        # Render page to a high-resolution image (adjust zoom for DPI)
        zoom = 2  # Higher zoom improves OCR accuracy
        mat = fitz.Matrix(zoom, zoom)
        pix = page.get_pixmap(matrix=mat, alpha=False)  # Remove alpha channel
        
        # Convert to PIL Image
        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
        # Use Tesseract to extract text
        text = pytesseract.image_to_string(img)
        
        extracted_text += text + "\n"

    return extracted_text

In [418]:
pdf_path = "data/sample_cvs/File.pdf"
text = extract_text_with_fitz(pdf_path)
print(text)

GIULIA GONZALEZ

CONTACT

‘ggonzalez@emailcom
(123) 456-7890
Detroit, MI

inkedIn

Github

EDUCATION

MS.
Computer Science
University of Chicago
2014-2016

Chicago, IL

BS.
Computer Science
University of Pittsburgh
2010-2014

Pittsburgh, PA

SKILLS

HTMU CSS
SQL (PostgreSQL, Oracle)
JavaScript (Angular)
Python (Django)

REST APIs (GraphQU)
‘AWS (Redshift, $3)

Git

F

Y THON DEVELOPER

WORK EXPERIENCE

Python Developer
DoorDash
September 2017 -current / Detroit, MI

‘Worked on building new Angular components for the customer-facing
‘web app, which improved the time on page for the average user by 2
minutes

Collaborated with an agile team of 6, and helped prioritize and scope
feature requests to ensure that the biggest impact features were
worked on first

Built extensive test coverage for all new features, which reduced the
number of customer complaints by 23%

‘Acquired and ingested data to build and maintain data pipelines that
led to discovering an opportunity for a new site featur

In [419]:
import re
from collections import defaultdict

In [420]:
def extract_personal_info(header):
    """Extract personal information"""

    header_text = ' '.join(header)
    
    result = {
        "name": header[0].strip() if header else "",
        "email": "",
        "phone": "",
        "location": ""
    }

    email_match = re.search(r'\S+@\S+\.\S+', header_text)
    if email_match:
        result["email"] = email_match.group(0)

    phone_match = re.search(r'\(\d{3}\)\s*\d{3}-\d{4}', header_text)
    if phone_match:
        result["phone"] = phone_match.group(0)

    location_match = re.search(r'([A-Za-z]+,\s*[A-Z]{2})', header_text)
    if location_match:
        result["location"] = location_match.group(0)

    return result

In [421]:
def extract_education(edu_lines):
    """Extract education history from text."""

    education = []
    current_entry = {}

    # Patterns for identification
    year_pattern = re.compile(r"\b\d{4}\s*-\s*\d{4}\b")
    location_pattern = re.compile(r".*,\s*[A-Z]{2}")
    degree_keywords = {"b.s.", "m.s.","bs.","ms.", "ph.d.", "bs", "ms", "bachelor", "master", "doctorate"}

    for item in edu_lines:
        if item in degree_keywords:
            if current_entry:  # Store the previous entry before starting a new one
                education.append(current_entry)
            current_entry = {"degree": item}
        elif year_pattern.match(item):
            current_entry["years"] = item
        elif location_pattern.match(item):
            current_entry["location"] = item
        elif "degree" in current_entry and "field" not in current_entry:
            current_entry["field"] = item
        elif "field" in current_entry and "university" not in current_entry:
            current_entry["university"] = item

    # Ensure the last entry is added correctly
    if current_entry:
        education.append(current_entry)

    return education

In [422]:
def extract_skills(sections):
    """Extract skills from text."""
    return [line.strip() for line in sections[4].split('\n') if line.strip()]
    

In [423]:
def extract_experience(data):
    """Extract structured work experience using NLP and regex."""
    experiences = []
    current_experience = {}

    for line in data:
        # Check for position (assumes position titles are capitalized and not dates)
        if 'Developer' in line or 'Intern' in line:
            if current_experience:  # Save the previous experience before starting a new one
                experiences.append(current_experience)
            current_experience = {'position': line, 'company': '', 'duration': '', 'location': '', 'description': []}
        
        # Check for company name (assumes company name follows the position)
        elif current_experience and not current_experience['company']:
            current_experience['company'] = line
        
        # Check for duration and location (assumes it contains a date or location format)
        elif '/' in line or '-' in line:
            parts = line.split('/')
            if len(parts) == 2:
                current_experience['duration'], current_experience['location'] = map(str.strip, parts)
            else:
                current_experience['description'].append(line)
        # Otherwise, consider it part of the description
        else:
            if current_experience:
                current_experience['description'].append(line)

    # Append the last experience
    if current_experience:
        experiences.append(current_experience)

    # Format descriptions into a single string
    for exp in experiences:
        exp['description'] = ' '.join(exp['description']).strip()

    return experiences

In [426]:
def parse_resume_text(text):
    # Clean up page header and split sections
    text = re.sub(r'^Page \d+:\s*', '', text, flags=re.MULTILINE)
    sections = re.split(r'\n(EDUCATION|SKILLS|WORK EXPERIENCE)\n', text)

    raw_header = sections[0].strip().split('\n')

    raw_education = [line.strip() for line in sections[sections.index('EDUCATION') + 1].split('\n') if line.strip()] if 'EDUCATION' in [s for s in sections] and sections.index('EDUCATION') < len(sections) - 1 else []

    raw_experience = [line.strip() for line in sections[sections.index('WORK EXPERIENCE') + 1].split('\n') if line.strip()] if 'WORK EXPERIENCE' in [s for s in sections] and sections.index('WORK EXPERIENCE') < len(sections) - 1 else []
    
    data = defaultdict(dict)
    data["projects"] = []
    data["certifications"] = []
    data['persnal_info'] = extract_personal_info(raw_header)
    data["education"] = extract_education(raw_education)
    data["skills"] = extract_skills(sections)
    data["experience"] = extract_experience(raw_experience)

    return dict(data)

In [427]:
print(parse_resume_text(text))

{'projects': [], 'certifications': [], 'persnal_info': {'name': 'GIULIA GONZALEZ', 'email': '', 'phone': '(123) 456-7890', 'location': 'Detroit, MI'}, 'education': [{'years': '2010-2014', 'location': 'Pittsburgh, PA'}], 'skills': ['HTMU CSS', 'SQL (PostgreSQL, Oracle)', 'JavaScript (Angular)', 'Python (Django)', 'REST APIs (GraphQU)', '‘AWS (Redshift, $3)', 'Git', 'F', 'Y THON DEVELOPER'], 'experience': [{'position': 'Python Developer', 'company': 'DoorDash', 'duration': 'September 2017 -current', 'location': 'Detroit, MI', 'description': '‘Worked on building new Angular components for the customer-facing ‘web app, which improved the time on page for the average user by 2 minutes Collaborated with an agile team of 6, and helped prioritize and scope feature requests to ensure that the biggest impact features were worked on first Built extensive test coverage for all new features, which reduced the number of customer complaints by 23% ‘Acquired and ingested data to build and maintain dat