# Resume Data Extraction
Authered By : Ashish Dubey

# extract raw text

In [5]:
import fitz  # PyMuPDF

def extract_text_from_pdf(pdf_path):
    with fitz.open(pdf_path) as doc:
        text = ""
        for page in doc:
            text += page.get_text()
    return text

raw_text = extract_text_from_pdf("data/Sample Resume 0.pdf")

In [6]:
raw_text

' \n \n \n \n \n \n \nDhiraj Sharma \n+91 8105000549 \n     Dhiraj.Sharma@wipro.com \n \n \nSYNOPSIS:  \uf0b7\n \nThirteen years of work experience, wherein last eleven years has been in healthcare \ndomain in Consultancy, Client Relationship, Product, Program, and Requirements \nManagement area. \uf0b7\n \nWell-versed in Healthcare domain i.e. Healthcare Administration and Claims, HIPAA, HL7, \nHospital Information System (EMR for Acute care), Revenue Cycle Management, Provider \nContract Management etc. with exposure to US, Europe and Asian markets. \n \nEDUCATION: \uf0b7\n \nDiploma in Advanced Computing (C-DAC), Ahmedabad (2000) \uf0b7\n \nMaster of Business Administration (MBA) Marketing, Pune University, Pune (1999) \uf0b7\n \nBachelor of Science (B. Sc), Gujarat University, Ahmedabad (1995) \n \n \nCERTIFICATIONS: \uf0b7\n \nPMP from Project Management Institute \uf0b7\n \nFellow, Academy for Healthcare Management (FAHM).   \uf0b7\n \nProfessional, Academy for Healthcare Managem

In [1]:
## Normalize the extracted text

In [14]:
import re

def normalize_text(text):
    # Remove multiple spaces and line breaks
    text = re.sub(r'[\uf0b7•·▪◦●■□♦➤▶►\s]+', ' ', text)
    return text.strip()

clean_text = normalize_text(raw_text)

In [8]:
## locate and extract fields
clean_text

'Dhiraj Sharma +91 8105000549 Dhiraj.Sharma@wipro.com SYNOPSIS: Thirteen years of work experience, wherein last eleven years has been in healthcare domain in Consultancy, Client Relationship, Product, Program, and Requirements Management area. Well-versed in Healthcare domain i.e. Healthcare Administration and Claims, HIPAA, HL7, Hospital Information System (EMR for Acute care), Revenue Cycle Management, Provider Contract Management etc. with exposure to US, Europe and Asian markets. EDUCATION: Diploma in Advanced Computing (C-DAC), Ahmedabad (2000) Master of Business Administration (MBA) Marketing, Pune University, Pune (1999) Bachelor of Science (B. Sc), Gujarat University, Ahmedabad (1995) CERTIFICATIONS: PMP from Project Management Institute Fellow, Academy for Healthcare Management (FAHM). Professional, Academy for Healthcare Management (PAHM). Requirements Management Certified from Rational Corporation (Bangalore) PUBLICATION AND RECOGNIZATION: The Future of RCM – Preparing for n

In [19]:
def extract_name(text):
    # Split into clean lines
    lines = [l.strip() for l in text.splitlines() if l.strip()]
    
    # --- Heuristic 1: Look for first line that looks like a name ---
    for line in lines:
        # Skip lines that look like emails, addresses, or contain digits
        if "@" in line or re.search(r"\d", line):
            continue
        # Capture up to comma, allow middle initials
        m = re.match(r'^([A-Z][a-z]+(?: [A-Z]\.?)?(?: [A-Z][a-z]+){1,2})(?:,.*)?$', line)
        if m:
            return m.group(1)   # return only the name part before comma
    
    # --- Heuristic 2: Line before email ---
    for i, line in enumerate(lines):
        if "@" in line and i > 0:
            candidate = re.sub(r',.*', '', lines[i-1]).strip()
            # Validate candidate looks like a name
            if re.match(r'^[A-Z][a-z]+(?: [A-Z]\.?)?(?: [A-Z][a-z]+){1,2}$', candidate):
                return candidate
    
    return "Name not found"

def extract_fields(text):
    result = {}

    # Name
    result['Name'] = extract_name(text)

    # Email
    email_match = re.search(r'[\w\.-]+@[\w\.-]+', normalize_text(text))
    result['Email'] = email_match.group(0).strip() if email_match else None

    # Phone (common patterns)
    phone_match = re.search(r'(\+?\d[\d\s\-\(\)]{7,}\d)', normalize_text(text))
    if not phone_match:
        result['Phone'] = "Phone # not found"
    else:
        # Keep only digits
        digits = re.sub(r'\D', '', phone_match.group())
        
        # Standardize to XXX-XXX-XXXX (US style) if 10 digits
        if len(digits) == 10:
            result['Phone'] = f"{digits[0:3]}-{digits[3:6]}-{digits[6:]}"
        elif len(digits) == 11 and digits.startswith("1"):  # e.g., 1XXXXXXXXXX
            result['Phone'] =  f"{digits[1:4]}-{digits[4:7]}-{digits[7:]}"
        else:
            result['Phone'] =  digits  # fallback (just return digits if not standard length)

    # Education

    pattern = re.compile(
    r'education\b.*?(?=(skills|employment|experience|credentials|tools|community|$))',
    re.IGNORECASE | re.DOTALL
)

    edu_match = pattern.search(normalize_text(text))

    if edu_match:
        edu_text = edu_match.group(0)
        # Clean header (remove "education" and trailing symbols like +, /, -)
        edu_text_clean = re.sub(r'^education\b[\s\+\-/]*', '', edu_text.strip(), flags=re.IGNORECASE)
    
    result['Education'] = edu_text_clean.strip() if edu_match else None

    # Skills
    skills_matches = re.findall(
    r'(skills(?: & certifications)?|expertise|strengths)\s*[:\-]?\s*(.*?)(?=\s*(education|employment|experience|work experience|$))',
    normalize_text(text), re.IGNORECASE | re.DOTALL)
    
    # Combine all matched sections into one string
    if skills_matches:
        combined_skills = " ".join([m[1].strip() for m in skills_matches])
        result['Skills'] = combined_skills
    else:
        result['Skills'] = "Skills/Expertise Not Found"
    
    return result
    
fields = extract_fields(raw_text)
print(fields)

{'Name': 'Dhiraj Sharma', 'Email': 'Dhiraj.Sharma@wipro.com', 'Phone': '918105000549', 'Education': ': Diploma in Advanced Computing (C-DAC), Ahmedabad (2000) Master of Business Administration (MBA) Marketing, Pune University, Pune (1999) Bachelor of Science (B. Sc), Gujarat University, Ahmedabad (1995) CERTIFICATIONS: PMP from Project Management Institute Fellow, Academy for Healthcare Management (FAHM). Professional, Academy for Healthcare Management (PAHM). Requirements Management Certified from Rational Corporation (Bangalore) PUBLICATION AND RECOGNIZATION: The Future of RCM – Preparing for near term changes http://www.himss.org/content/files/FutureofRevenueCycleWhitePaper-EDITED5-24NV.pdf ‘Just a dose of healthcare statistics’ article, where the state of Indian healthcare vis-à-vis its economy and other markets is compared. Path: http://www.expresshealthcaremgmt.com/20040715/analysis01.shtml White Paper on HIPAA 5010', 'Skills': 'Skills/Expertise Not Found'}


In [41]:
## Convert into structured format

In [113]:
import json

with open("out/resume_data.json", "w") as f:
    json.dump(fields, f, indent=2)

In [4]:
## Apply to multiple resumes

In [20]:
import glob
import pandas as pd

def process_resumes(folder_path):
    data = []
    for file in glob.glob(folder_path + "/*.pdf"):
        text = extract_text_from_pdf(file)
        clean = normalize_text(text)
        fields = extract_fields(text)
        fields['File'] = file
        data.append(fields)
    return data

results = process_resumes("Data")

In [5]:
## save the results to a .csv file

In [21]:
df = pd.DataFrame(results)
df.to_csv("Out/resumes_extracted.csv", index=False)