## **Structured Data Extraction from Resume Files**

**Installations**

In [1]:
!pip install python-docx
!pip install PyMuPDF

Collecting python-docx
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Downloading python_docx-1.2.0-py3-none-any.whl (252 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/253.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.0/253.0 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-docx
Successfully installed python-docx-1.2.0
Collecting PyMuPDF
  Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m62.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.26.3


In [2]:
!pip install pip==23.2.1

!apt-get install -y antiword unrtf poppler-utils tesseract-ocr \
    flac ffmpeg lame libmad0 libsox-fmt-mp3 sox

!pip install textract

Collecting pip==23.2.1
  Downloading pip-23.2.1-py3-none-any.whl.metadata (4.2 kB)
Downloading pip-23.2.1-py3-none-any.whl (2.1 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━[0m [32m1.3/2.1 MB[0m [31m45.2 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m2.1/2.1 MB[0m [31m41.2 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m24.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-23.2.1
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
f

**Imports**

In [2]:
import os
import pandas as pd
from docx import Document
import fitz
import textract

Upload a 'resumes' folder containing subfolders — Peoplesoft, React Developer, SQL Developer, and Workday — so that resumes can be extracted and categorized based on their respective subfolder names.



In [14]:
# Root folder containing subfolders for each role
resume_root_folder = "./resumes"

# Function to extract text from .doc files
def extract_text_from_doc(file_path):
    try:
        return textract.process(file_path).decode("utf-8")
    except Exception as e:
        print(f"Could not read .doc file {file_path}: {e}")
        return None

# Function to extract text from .docx files
def extract_text_from_docx(file_path):
    try:
        doc = Document(file_path)
        return ' '.join([para.text for para in doc.paragraphs])
    except Exception as e:
        print(f"Could not read .docx file {file_path}: {e}")
        return None

# Function to extract text from PDF files
def extract_text_from_pdf(file_path):
    try:
        doc = fitz.open(file_path)
        return ' '.join([page.get_text() for page in doc]).strip()
    except Exception as e:
        print(f"Could not read PDF {file_path}: {e}")
        return None

# Resume-related keywords for validation
def is_resume_content(text):
    if not text or len(text.split()) < 100:
        return False

    text_lower = text.lower()

    # Flexible section keywords — roots and common variations
    section_keywords = [
        "experience", "education", "skill", "project", "summary",
        "certification", "objective", "responsibilit", "training",
        "internship", "career objective", "technical", "professional", "achievement"
    ]
    # Count how many keywords appear (even partial matches like "skillset", "responsibilities")
    match_count = sum(keyword in text_lower for keyword in section_keywords)

    return match_count >= 2

# Go through each role folder and process resumes
results = []

for role_folder in os.listdir(resume_root_folder):
    full_path = os.path.join(resume_root_folder, role_folder)
    if not os.path.isdir(full_path):
        continue  # Skip if not a folder

    for filename in os.listdir(full_path):
        file_path = os.path.join(full_path, filename)
        ext = filename.lower().split('.')[-1]
        text = None

        if ext == "docx":
            text = extract_text_from_docx(file_path)
        elif ext == "pdf":
            text = extract_text_from_pdf(file_path)
        elif ext == "doc":
            text = extract_text_from_doc(file_path)
        else:
            text = None  # Unsupported format

        # Validate resume content
        if not text or not is_resume_content(text):
            text = "Invalid Resume Content"

        results.append({
            "filename": filename,
            "content": text.strip(),
            "ext": ext,
            "role": role_folder
        })

# Save results to CSV
df = pd.DataFrame(results)
df.to_csv("resume_dataset_labeled.csv", index=False)

print("CSV 'resume_dataset_labeled.csv' created with role labels and content checks.")

CSV 'resume_dataset_labeled.csv' created with role labels and content checks.


In [15]:
df=pd.read_csv("resume_dataset_labeled.csv")
df.sample(5)

Unnamed: 0,filename,content,ext,role
63,React Dev_Krishna Kanth_Musquare Technologies....,Ui-Developer/ React JS Developer NAME: KRISHN...,docx,React Developer
18,Harikrishna Akula_Hexaware.doc,Harikrishna Akula\n\n\n\n Summary:\n ❖ 5.2 ...,doc,Workday Consultant
77,kamballapradeep.docx,KAMBALLA PRADEEP ...,docx,SQL Developer
64,Reactjs Developer_Prabakaran_Musquare Technolo...,Page | 1 \n \nName: M. Prabakaran \nTitle: UI...,pdf,React Developer
53,Internship_Ravali_Musquare Technologies (1).docx,Name: Ravali P ...,docx,React Developer


In [16]:
df.shape

(80, 4)

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80 entries, 0 to 79
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  80 non-null     object
 1   content   80 non-null     object
 2   ext       80 non-null     object
 3   role      80 non-null     object
dtypes: object(4)
memory usage: 2.6+ KB


In [18]:
df["role"].value_counts()

Unnamed: 0_level_0,count
role,Unnamed: 1_level_1
React Developer,24
Workday Consultant,21
Peoplesoft Consultant,21
SQL Developer,14


Only PDF, DOCX, and DOC file formats are accepted; all other formats are removed from the dataset.

In [19]:
df["ext"].value_counts()

Unnamed: 0_level_0,count
ext,Unnamed: 1_level_1
docx,52
doc,26
txt,1
pdf,1


In [20]:
df[df["content"]=="Invalid Resume Content"]

Unnamed: 0,filename,content,ext,role
28,apple.txt,Invalid Resume Content,txt,Peoplesoft Consultant


In [21]:
df=df[df["content"]!="Invalid Resume Content"]

In [22]:
df[df["content"]=="Invalid Resume Content"]

Unnamed: 0,filename,content,ext,role


In [23]:
df["role"].value_counts()

Unnamed: 0_level_0,count
role,Unnamed: 1_level_1
React Developer,24
Workday Consultant,21
Peoplesoft Consultant,20
SQL Developer,14


In [24]:
df.to_csv("resume_cleandataset_labeled.csv", index=False)
print("CSV 'resume_cleandataset_labeled.csv' cleaned with invalid resume content.")

CSV 'resume_cleandataset_labeled.csv' cleaned with invalid resume content.
