### Import relevant libraries

In [1]:
import fitz
import re
import spacy
import sys

In [2]:
# Print the path of the Python interpreter executable running this kernel.
# NOTE(review): the recorded output of this cell contains a local
# user/environment path; consider clearing it before sharing the notebook.
print(sys.executable)

C:\Users\SPESSE ENVIROMENT\.conda\envs\mlopspycaret\python.exe


### Step 1: User Input

In [3]:
# Path of the resume PDF to process. It is a relative path, so it is resolved
# against the kernel's current working directory when the file is opened.
file_path = "sample_cv.pdf"

In [4]:
# Read the whole PDF into memory as raw bytes (binary mode "rb"); the `with`
# block closes the file handle once the read has finished.
with open(file_path, "rb") as file:
    pdf_content = file.read()

### Step 2: Preprocess Data

In [5]:
# Define a function to extract text from the pdf using PYMUPDF
def extract_text_from_pdf(pdf_content):
    """Extract the plain text of every page of an in-memory PDF.

    Args:
        pdf_content: Raw bytes of the PDF file.

    Returns:
        str: The text of all pages, concatenated in page order.
    """
    # Pass the bytes and the file type by keyword so the call does not depend
    # on argument position, and use the document as a context manager so it is
    # closed even if text extraction fails part-way through.
    with fitz.open(stream=pdf_content, filetype="pdf") as doc:
        # Join once at the end instead of growing a string page by page
        return "".join(page.get_text() for page in doc)

In [6]:
# Extract the text from the PDF bytes read earlier and keep it in `resume_text`
# for the following cells.
resume_text = extract_text_from_pdf(pdf_content)

In [7]:
# Display the extracted text. As a bare last expression the notebook shows the
# string's repr, so the line breaks in the text appear as "\n" escapes.
resume_text

'NAME\nTel: 0123456789\nE-mail: 123@york.ac.uk\nEDUCATION\n2017 - 2020\nB.Acc. (Hons) Accountancy, Business and Finance, University of York, United Kingdom\n●\nAverage grades: Year 1 - 72% Year 2 - 69%, Expected result: 1st\n●\nModules completed: Financial accounting, Taxation, Audit, Business statistics, Finance,\nFinancial markets, Business law and Management\n●\nDeveloped strong numeracy skills, business and commercial awareness, and project\nmanagement skills\n●\nAttained the highest mark in the year (77%) for delivering the presentation on Public\nEvaluation of Corporations to a panel of lecturers\n●\nImproved leadership skills by leading a team in the inter-university Business Plan Game,\nachieving the 3rd place out of forty\n2015 - 2017\nDiploma in Spanish and English Languages and English Literature, University of Barcelona,\nSpain\nRELEVANT WORK EXPERIENCE\nSummer 2019\nInternship, Business Growth Potential Project, Financial Services Authority, London and\nEdinburgh\n●\nAs a 

In [8]:
# Define a function to preprocess the extracted text
def preprocess_resume_text(resume_text):
    """Normalise the whitespace of the extracted resume text.

    Every run of whitespace characters (spaces, tabs, newlines, ...) is
    replaced by a single space, then leading and trailing whitespace is
    stripped.

    Args:
        resume_text: Text to clean.

    Returns:
        str: The single-spaced, trimmed text.
    """
    single_spaced = re.sub(r'\s+', ' ', resume_text)
    return single_spaced.strip()

In [9]:
# Normalise the whitespace of the extracted text; the result is kept under a
# new name so the raw `resume_text` stays available.
cleaned_resume_text = preprocess_resume_text(resume_text)

In [10]:
# Print the preprocessed text (the whole string is written to the cell output)
print(cleaned_resume_text)

NAME Tel: 0123456789 E-mail: 123@york.ac.uk EDUCATION 2017 - 2020 B.Acc. (Hons) Accountancy, Business and Finance, University of York, United Kingdom ● Average grades: Year 1 - 72% Year 2 - 69%, Expected result: 1st ● Modules completed: Financial accounting, Taxation, Audit, Business statistics, Finance, Financial markets, Business law and Management ● Developed strong numeracy skills, business and commercial awareness, and project management skills ● Attained the highest mark in the year (77%) for delivering the presentation on Public Evaluation of Corporations to a panel of lecturers ● Improved leadership skills by leading a team in the inter-university Business Plan Game, achieving the 3rd place out of forty 2015 - 2017 Diploma in Spanish and English Languages and English Literature, University of Barcelona, Spain RELEVANT WORK EXPERIENCE Summer 2019 Internship, Business Growth Potential Project, Financial Services Authority, London and Edinburgh ● As a part of the FSA’s Small Firms

### Step 3: Extract Field of Study and University using spaCy (pre-trained NER Model)

In [11]:
# Load the spaCy English language model "en_core_web_sm" and keep the pipeline
# in `nlp`, the name extract_information uses to process text.
nlp = spacy.load("en_core_web_sm")

In [12]:
# Define a function to extract information from the resume text
def extract_information(resume_text, nlp_pipeline=None):
    """Collect named entities from resume text as education, experience and skills candidates.

    Args:
        resume_text: Text to analyse.
        nlp_pipeline: Optional callable that takes the text and returns an
            object whose ``ents`` items expose ``text`` and ``label_``. When
            omitted, the notebook-level ``nlp`` pipeline is used, so existing
            one-argument calls keep working.

    Returns:
        tuple: ``(education, experience, skills)``, three lists of entity
        texts, each in the order the entities occur in the document.

    Note:
        Education and experience are selected with the same labels
        ("DATE"/"ORG"), so the two lists always have identical contents.
        NOTE(review): "SKILL" does not look like a label the stock
        "en_core_web_sm" NER emits, in which case skills only ever contains
        ORG entities; confirm against the model's label set.
    """
    # Entity labels accepted for each category (kept exactly as before)
    education_labels = ("DATE", "ORG")
    experience_labels = ("DATE", "ORG")
    skill_labels = ("SKILL", "ORG")

    # Fall back to the notebook-global pipeline only when none was supplied
    if nlp_pipeline is None:
        nlp_pipeline = nlp

    # Process the resume text with the NER pipeline
    doc = nlp_pipeline(resume_text)

    # Read each entity once, then filter that snapshot per category
    labelled_texts = [(ent.text, ent.label_) for ent in doc.ents]

    def pick(labels):
        # Entity texts whose label is one of `labels`, in document order
        return [text for text, label in labelled_texts if label in labels]

    # Return the extracted information
    return pick(education_labels), pick(experience_labels), pick(skill_labels)

In [13]:
# Run the entity extraction on the whitespace-normalised resume text and unpack
# the three returned lists into separate variables.
education_info, experience_info, skills_info = extract_information(cleaned_resume_text)

In [14]:
# Print each extracted list on its own line, prefixed with its section label
for section_label, section_items in (
    ("Education:", education_info),
    ("Experience:", experience_info),
    ("Skills:", skills_info),
):
    print(section_label, section_items)

Education: ['2017 - 2020', 'Business and Finance', 'University of York', 'Year 1 - 72%', 'Year 2 - 69%', 'Taxation, Audit, Business statistics', 'Finance', 'the year', 'forty 2015 - 2017', 'English Literature', 'University of Barcelona', 'Summer 2019', 'Internship, Business Growth Potential Project', 'Financial Services Authority', 'FSA', 'Small Firms Division', 'day', 'monthly', 'FSA', 'Morgan Stanley', 'MS Excel', '2018 - 2020', 'York', 'ACHIEVEMENTS & AWARDS Academic ● Co-Winner', 'Business Plan Game', 'inter-university', '2019', '2015', 'the first 5 months', 'Community Involvement ● Coordinated', 'the Sailing Club’s', 'the Universities League', '2018', 'MS Office', 'CAE - Grade A']
Experience: ['2017 - 2020', 'Business and Finance', 'University of York', 'Year 1 - 72%', 'Year 2 - 69%', 'Taxation, Audit, Business statistics', 'Finance', 'the year', 'forty 2015 - 2017', 'English Literature', 'University of Barcelona', 'Summer 2019', 'Internship, Business Growth Potential Project', 'F