# Resume Data Extraction
Authored by: Ashish Dubey

## Extract raw text

In [4]:
import fitz  # PyMuPDF

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at *pdf_path*, concatenated.

    The document is opened with ``fitz.open`` used as a context manager.
    Each page contributes the result of ``page.get_text()`` in iteration
    order; no separator is inserted between pages.
    """
    with fitz.open(pdf_path) as pdf:
        page_texts = [page.get_text() for page in pdf]
    return "".join(page_texts)

## Normalize the extracted text

In [5]:
import re

def normalize_text(text):
    """Collapse whitespace and bullet glyphs in *text* into single spaces.

    Every run made of whitespace characters (line breaks included) and/or
    the listed bullet symbols is replaced by one space; leading and
    trailing whitespace is then stripped from the result.
    """
    separators = re.compile(r'[\uf0b7•·▪◦●■□♦➤▶►\s]+')
    collapsed = separators.sub(' ', text)
    return collapsed.strip()

## Extract Fields

In [6]:
def extract_name(text):
    """Guess the candidate's name from raw (line-structured) resume text.

    Two heuristics are tried in order over the non-blank, stripped lines:

    1. The first line that contains neither "@" nor a digit and has the
       shape ``First [M.] Last [Last2]``, optionally followed by a comma
       and trailing text. Only the part before the comma is returned.
    2. Otherwise, the line directly above a line containing "@", with
       everything from its first comma onward removed, provided what
       remains has that same name shape.

    Returns the matched name, or the string "Name not found".
    """
    stripped = (raw.strip() for raw in text.splitlines())
    rows = [row for row in stripped if row]

    # Heuristic 1: the first line that looks like a name.
    for row in rows:
        # Emails, street addresses and anything else with digits are skipped.
        if "@" in row or re.search(r"\d", row) is not None:
            continue
        found = re.match(r'^([A-Z][a-z]+(?: [A-Z]\.?)?(?: [A-Z][a-z]+){1,2})(?:,.*)?$', row)
        if found is not None:
            return found.group(1)

    # Heuristic 2: the line just above an email address.
    for idx in range(1, len(rows)):
        if "@" not in rows[idx]:
            continue
        above = re.sub(r',.*', '', rows[idx - 1]).strip()
        if re.match(r'^[A-Z][a-z]+(?: [A-Z]\.?)?(?: [A-Z][a-z]+){1,2}$', above):
            return above

    return "Name not found"

def extract_fields(text):
    """Extract contact details and resume sections from raw resume text.

    Args:
        text: Raw resume text with its line breaks intact. The name is
            looked up on the raw text (``extract_name`` works line by
            line); every other field is searched in the output of
            ``normalize_text(text)``.

    Returns:
        A dict with these keys:

        * ``'Name'``: the result of ``extract_name(text)``.
        * ``'Email'``: the first email-like token, without trailing
          ``.``/``-`` punctuation, or ``None``.
        * ``'Phone'``: ``"XXX-XXX-XXXX"`` for a 10-digit number (or an
          11-digit number starting with ``1``), the bare digits for any
          other length, or ``"Phone # not found"``.
        * ``'Education'``: the text following the word "education" up to
          the next section keyword (or end of text), or ``None``.
        * ``'Skills'``: the text of every skills/expertise/strengths
          section joined by spaces, or ``"Skills/Expertise Not Found"``.
    """
    result = {}

    # Name (needs the original line structure, so use the raw text)
    result['Name'] = extract_name(text)

    # Normalize once and reuse it for every remaining field.
    flat = normalize_text(text)

    # Email: the match must end on a word character so that sentence
    # punctuation right after the address ("john@x.com.") is not captured.
    email_match = re.search(r'[\w\.-]+@[\w\.-]*\w', flat)
    result['Email'] = email_match.group(0) if email_match else None

    # Phone (common patterns)
    phone_match = re.search(r'(\+?\d[\d\s\-\(\)]{7,}\d)', flat)
    if not phone_match:
        result['Phone'] = "Phone # not found"
    else:
        # Keep only digits
        digits = re.sub(r'\D', '', phone_match.group())

        # Standardize to XXX-XXX-XXXX (US style) if 10 digits
        if len(digits) == 10:
            result['Phone'] = f"{digits[0:3]}-{digits[3:6]}-{digits[6:]}"
        elif len(digits) == 11 and digits.startswith("1"):  # e.g., 1XXXXXXXXXX
            result['Phone'] = f"{digits[1:4]}-{digits[4:7]}-{digits[7:]}"
        else:
            # Fallback: not a standard length, return the digits as-is
            result['Phone'] = digits

    # Education: from the word "education" up to the next section keyword
    edu_pattern = re.compile(
        r'education\b.*?(?=(skills|employment|experience|credentials|tools|community|$))',
        re.IGNORECASE | re.DOTALL)
    edu_match = edu_pattern.search(flat)
    if edu_match:
        # Clean header (remove "education" and trailing symbols like +, /, -)
        edu_text = re.sub(r'^education\b[\s\+\-/]*', '', edu_match.group(0).strip(),
                          flags=re.IGNORECASE)
        result['Education'] = edu_text.strip()
    else:
        result['Education'] = None

    # Skills: each match is a (heading, body, terminator) tuple
    skills_matches = re.findall(
        r'(skills(?: & certifications)?|expertise|strengths)\s*[:\-]?\s*(.*?)(?=\s*(education|employment|experience|work experience|$))',
        flat, re.IGNORECASE | re.DOTALL)

    # Combine all matched sections into one string. Headings immediately
    # followed by another section keyword capture nothing; drop those so
    # they neither add stray spaces nor mask the "not found" case.
    sections = [body.strip() for _, body, _ in skills_matches]
    sections = [section for section in sections if section]
    if sections:
        result['Skills'] = " ".join(sections)
    else:
        result['Skills'] = "Skills/Expertise Not Found"

    return result

In [4]:
## Apply to multiple resumes

In [7]:
import glob
import os

import pandas as pd

def process_resumes(folder_path):
    """Extract fields from every ``*.pdf`` file directly inside *folder_path*.

    Args:
        folder_path: Directory to scan. Only files matching
            ``folder_path + "/*.pdf"`` are read; subdirectories are not
            searched.

    Returns:
        A list with one dict per PDF, in sorted path order: the output of
        ``extract_fields`` for that file's text plus a ``'File'`` key
        holding the matched path. The list is empty when nothing matches.
    """
    records = []
    # glob returns matches in arbitrary, filesystem-dependent order; sort
    # them so the resulting rows are reproducible between runs.
    for pdf_path in sorted(glob.glob(folder_path + "/*.pdf")):
        fields = extract_fields(extract_text_from_pdf(pdf_path))
        fields['File'] = pdf_path
        records.append(fields)
    return records

# Run the extraction over every PDF in the "Data" folder.
results = process_resumes("Data")

## save the results to a .csv file

# DataFrame.to_csv does not create missing parent directories, so make
# sure the output folder exists before writing.
os.makedirs("Out", exist_ok=True)
df = pd.DataFrame(results)
df.to_csv("Out/resumes_extracted.csv", index=False)