**RESUME PARSER**  

In [8]:
!pip install python-docx
!pip install pdfplumber



Collecting pdfplumber
  Downloading pdfplumber-0.11.4-py3-none-any.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.4-py3-none-any.whl (59 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.2/59.2 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m95.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pypdfium2-4.30.0-p

imports

In [10]:
import os
import pandas as pd
import spacy
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.pipeline import make_pipeline
from docx import Document
import pdfplumber

In [11]:
# Load the pre-trained spaCy model for Named Entity Recognition (NER)
nlp = spacy.load("en_core_web_sm")

Loading the dataset

In [12]:
# Load the RSVP Resume Dataset from Kaggle
df = pd.read_csv('/content/Resume.csv')

Preprocess

In [13]:
# Preprocess the text
def preprocess_text(text):
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = re.sub(r'[^a-zA-Z0-9\s,.\-@]', '', text)  # Keep only alphanumeric characters, spaces, etc.
    return text

df['processed_text'] = df['Resume_str'].apply(preprocess_text)

In [14]:
# Extract entities using spaCy
def extract_entities(text):
    doc = nlp(text)
    entities = {
        'NAME': [],
        'PHONE': [],
        'EMAIL': [],
        'SKILLS': [],
        'EDUCATION': [],
        'EXPERIENCE': []
    }
    for ent in doc.ents:
        if ent.label_ == 'PERSON':
            entities['NAME'].append(ent.text)
        elif ent.label_ == 'PHONE':
            entities['PHONE'].append(ent.text)
        elif ent.label_ == 'EMAIL':
            entities['EMAIL'].append(ent.text)
        elif ent.label_ == 'ORG':
            entities['EXPERIENCE'].append(ent.text)
        elif ent.label_ == 'GPE':
            entities['EDUCATION'].append(ent.text)
    return entities


Train-test split

In [15]:
X = df['processed_text']
y = df['Category']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
# Create a text classification pipeline
pipeline = make_pipeline(TfidfVectorizer(), RandomForestClassifier(n_estimators=100))
pipeline.fit(X_train, y_train)

Evaluate the model

In [17]:
# Evaluate the model
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred, zero_division=1))

                        precision    recall  f1-score   support

            ACCOUNTANT       0.67      0.90      0.76        29
              ADVOCATE       0.80      0.67      0.73        30
           AGRICULTURE       0.33      0.12      0.18         8
               APPAREL       0.69      0.45      0.55        20
                  ARTS       0.30      0.17      0.21        18
            AUTOMOBILE       1.00      0.00      0.00         6
              AVIATION       0.67      0.86      0.75        21
               BANKING       0.60      0.65      0.62        23
                   BPO       1.00      0.00      0.00         2
  BUSINESS-DEVELOPMENT       0.62      0.56      0.59        27
                  CHEF       0.79      0.79      0.79        24
          CONSTRUCTION       0.83      0.71      0.76        34
            CONSULTANT       0.67      0.20      0.31        20
              DESIGNER       0.69      0.95      0.80        19
         DIGITAL-MEDIA       0.86      

Extract text from files

In [18]:
def extract_text_from_docx(filename):
    doc = Document(filename)
    text = ''
    for para in doc.paragraphs:
        text += para.text
    return text


def extract_text_from_pdf(filename):
    text = ''
    with pdfplumber.open(filename) as pdf:
        for page in pdf.pages:
            text += page.extract_text()
    return text

Parse Resume

In [19]:
# Parse resume
def parse_resume(filename):
    ext = os.path.splitext(filename)[1].lower()
    if ext == ".docx":
        resume_text = extract_text_from_docx(filename)
    elif ext == ".pdf":
        resume_text = extract_text_from_pdf(filename)
    else:
        print(f"Unsupported file format: {ext}")
        return

    entities = extract_entities(resume_text)
    print("Extracted Entities:", entities)

    predicted_label = pipeline.predict([resume_text])
    print(f"Predicted Label: {predicted_label[0]}")



Test the parser with a file

In [20]:

filename = "/content/46258701.pdf"
parse_resume(filename)


Extracted Entities: {'NAME': ['Hr Coordinator', 'Suite', 'HRMS', 'Wellness Fairs', 'Recruiter Lead', 'Groupwise', 'Incoming Calls', 'Ms Outlook', 'Outlook', 'Recruiter'], 'PHONE': [], 'EMAIL': [], 'SKILLS': [], 'EDUCATION': ['Filing', 'Leads'], 'EXPERIENCE': ['Core Qualifications\nExceptional', 'Applicant Tracking System', 'Microsoft Office', 'HR Coordinator', 'State', 'Kenexa BrassRing', 'HR Coordinator', 'State', 'Edward Don & Company', 'HR Coordinator', '08/2008 - 09/2011 Company Name - City', 'State', 'WOTC', 'SOX', 'the HR Department', 'SAP Administration', "the HR Department's", 'SharePoint', 'CPR', 'Receptionist', 'Hiring Managers', 'State', 'Federal', 'Education', 'DePaul University - City', 'State BA Human Resources\nProfessional Affiliations\nMembership:', 'Society for Human Resources', 'Technical Skills', 'Microsoft Sharepoint', 'SAP,', 'Ms Office', 'Phone System', 'Sterling Inforsystems', 'Orange Tree Employment Services', 'Kenexa - IBM Products', 'Ultipro', 'New Hires', 'S