In [15]:
from typing import List, Optional
from pydantic import Field, BaseModel
from pprint import pprint

class Person(BaseModel):
    """Contains personal information about an individual extracted from a resume."""

    # Every field is declared with `Field(...)` and an Optional type, i.e. the
    # key has no default but its value may be None.
    # Each `examples` entry is written as a (source snippet, expected value) pair.
    name: Optional[str] = Field(
        ...,
        description="The individual's name",
        examples=[("my name is Nguyen Van An", "Nguyen Van An")],
    )
    phone: Optional[str] = Field(
        ...,
        description="The individual's phone number.",
        examples=[("Contact information: 0123456789", "0123456789")],
    )
    birthday: Optional[str] = Field(
        ...,
        description="The individual's date of birth",
        examples=[("I was born on 01/01/1990", "01/01/1990")],
    )
    summary: Optional[str] = Field(
        ...,
        description="A brief summary of the individual's qualifications and experiences.",
        examples=[
            (
                "I am a programmer with 5 years of experience in information technology.",
                "Programmer with 5 years of experience",
            )
        ],
    )
    location: Optional[str] = Field(
        ...,
        description="The individual's location.",
        examples=[
            (
                "Contact information: Nguyễn Oanh, Ward 17, Go Vap District, Ho Chi Minh City",
                "Nguyễn Oanh, Ward 17, Go Vap District, Ho Chi Minh City",
            )
        ],
    )
    links: Optional[str] = Field(
        ...,
        description="URL to the individual's portfolio or professional website.",
        examples=[("See my portfolio at: www.example.com", "www.example.com")],
    )

class Education(BaseModel):
    """Holds information about the individual's educational background."""

    # One instance describes one educational institution entry. Dates are kept
    # as free-form strings; the examples use a "MM/YYYY" shape.
    name: Optional[str] = Field(
        ...,
        description="The name of the educational institution",
        examples=[
            (
                "I studied at Hanoi University of Science and Technology",
                "Hanoi University of Science and Technology",
            )
        ],
    )
    field: Optional[str] = Field(
        ...,
        description="The field of study",
        examples=[("I studied Computer Science", "Computer Science")],
    )
    degree: Optional[str] = Field(
        ...,
        description="The degree attained.",
        examples=[("I received a Bachelor's degree in 2020", "Bachelor's degree")],
    )
    start_date: Optional[str] = Field(
        ...,
        description="The date when the education commenced.",
        examples=[("I started studying in September 2016", "09/2016")],
    )
    end_date: Optional[str] = Field(
        ...,
        description="The date when the education concluded.",
        examples=[("I graduated in June 2020", "06/2020")],
    )
    achievements: Optional[List[str]] = Field(
        ...,
        description="Notable achievements during the course of education.",
        examples=[
            (
                "I received a full scholarship",
                ["Full scholarship", "First prize in the science competition"],
            )
        ],
    )

class Skills(BaseModel):
    """Represents a specific skill acquired by the individual."""

    # Despite the plural class name, one instance holds a single skill and the
    # category it is filed under.
    name: Optional[str] = Field(
        ...,
        description="The name of the skill.",
        examples=[("My skill is programming in Python", "Python")],
    )
    category: Optional[str] = Field(
        ...,
        description="The category under which the skill falls.",
        examples=[("My skill is programming in Python", "Programming")],
    )

class Certifications(BaseModel):
    """Details about certifications earned by the individual."""

    # One instance describes a single certification; all values are plain
    # strings that may be None.
    name: Optional[str] = Field(
        ...,
        description="The title of the certification.",
        examples=[
            (
                "Certificate: AWS Certified Solutions Architect",
                "AWS Certified Solutions Architect",
            )
        ],
    )
    institute: Optional[str] = Field(
        ...,
        description="The organization that issued the certification",
        examples=[("Issuing organization: Amazon Web Services", "Amazon Web Services")],
    )
    date: Optional[str] = Field(
        ...,
        description="The date when the certification was awarded.",
        examples=[("Certificate issued on: 15/05/2021", "15/05/2021")],
    )
    url: Optional[str] = Field(
        ...,
        description="Link to the certification, if available.",
        examples=[
            (
                "See certificate at: www.example.com/certificate",
                "www.example.com/certificate",
            )
        ],
    )

class WorkPlace(BaseModel):
    """Information regarding the individual's past workplaces."""

    # One instance describes a single employer. Most example pairs here repeat
    # the same string for the snippet and the expected value.
    name: Optional[str] = Field(
        ...,
        description="The name of the company or organization.",
        examples=[("Experience: Amazing Tech", "Amazing Tech")],
    )
    domain_expertise: Optional[str] = Field(
        ...,
        description="The main area of expertise of the workplace.",
        examples=[("AI Development", "AI Development")],
    )
    size: Optional[str] = Field(
        ...,
        description="The size of the company, typically indicated by the number of employees.",
        examples=[("200 employees", "200 employees")],
    )
    start_date: Optional[str] = Field(
        ...,
        description="The date when the individual started working there.",
        examples=[("01/2020", "01/2020")],
    )
    end_date: Optional[str] = Field(
        ...,
        description="The date when the individual stopped working there, if applicable.",
        examples=[("12/2021", "12/2021")],
    )

class Publication(BaseModel):
    """Information about academic or professional publications authored by the individual."""

    # One instance describes a single publication. `url` is the only
    # list-valued field; no examples are attached to these fields.
    name: Optional[str] = Field(
        ...,
        description="The title of the publication.",
    )
    category: Optional[str] = Field(
        ...,
        description="The type or category of the publication.",
    )
    conference: Optional[str] = Field(
        ...,
        description="The conference where the publication was presented.",
    )
    url: Optional[List[str]] = Field(
        ...,
        description="Links to the publication, if available.",
    )
    date: Optional[str] = Field(
        ...,
        description="The date of publication.",
    )

class PublishRelationship(BaseModel):
    """Defines the relationship between a person and their publications."""

    # Properties attached to the person -> publication link; the field set is
    # the same as Publication's minus `category`.
    name: Optional[str] = Field(
        ...,
        description="The title of the publication.",
    )
    conference: Optional[str] = Field(
        ...,
        description="The conference where the publication was presented.",
    )
    url: Optional[List[str]] = Field(
        ...,
        description="Links to the publication.",
    )
    date: Optional[str] = Field(
        ...,
        description="The publication date.",
    )

class AchieveRelationship(BaseModel):
    """Details regarding certifications or achievements associated with the individual."""

    # Unlike Certifications.url (a single string), `url` here is a list of
    # strings.
    name: Optional[str] = Field(
        ...,
        description="The name of the certification or achievement.",
    )
    institute: Optional[str] = Field(
        ...,
        description="The organization that awarded the certification.",
    )
    url: Optional[List[str]] = Field(
        ...,
        description="Links to the certification or achievement.",
    )
    date: Optional[str] = Field(
        ...,
        description="The date the certification or achievement was received.",
    )

class StudiedAtRelationship(BaseModel):
    """Defines the relationship between a person and their educational background."""

    # Properties attached to the person -> institution link. Note the field
    # names differ from Education (`start`/`end`/`major` here versus
    # `start_date`/`end_date`/`field` there).
    degree: Optional[str] = Field(
        ...,
        description="The degree obtained during the education.",
    )
    start: Optional[str] = Field(
        ...,
        description="The start date of the educational program.",
    )
    end: Optional[str] = Field(
        ...,
        description="The end date of the educational program.",
    )
    major: Optional[str] = Field(
        ...,
        description="The major or primary focus of study.",
    )
    achievements: Optional[List[str]] = Field(
        ...,
        description="Significant achievements during the educational experience.",
    )

class WorkedAtRelationship(BaseModel):
    """Details regarding a person's work history and specific roles."""

    # Properties attached to the person -> workplace link: the role held, its
    # date range, and two list-valued fields.
    title: Optional[str] = Field(
        ...,
        description="The job title held at the workplace.",
    )
    start: Optional[str] = Field(
        ...,
        description="The start date of the job.",
    )
    end: Optional[str] = Field(
        ...,
        description="The end date of the job.",
    )
    responsibilities: Optional[List[str]] = Field(
        ...,
        description="Key responsibilities held during the job.",
    )
    achievements: Optional[List[str]] = Field(
        ...,
        description="Notable achievements during the job.",
    )

class WorkedAsRelationship(BaseModel):
    """Information regarding specific roles held within a position."""

    # `duration` is the only integer-typed field among these models; its
    # description states the unit is months.
    duration: Optional[int] = Field(
        ...,
        description="The duration of time worked in the position (in months).",
    )
    responsibilities: Optional[List[str]] = Field(
        ...,
        description="A list of responsibilities held during the job.",
    )
    achievements: Optional[List[str]] = Field(
        ...,
        description="A list of achievements during the job.",
    )

class HaveRelationship(BaseModel):
    """Defines a relationship representing ownership of certain skills or certifications."""

    # Intentionally field-less: the HAVE relationship carries no properties.
    # The docstring alone is a complete class body, so no `pass` is needed.

In [16]:
# workplace: List[WorkPlace]
# have_skill: HaveRelationship
# work_as: WorkedAsRelationship
# work_at: WorkedAtRelationship
# study_at: StudiedAtRelationship
# achieve: AchieveRelationship
# publish: PublishRelationship
# public: Publication
# workplace: WorkPlace

In [17]:
# NOTE: do this after the extraction step is finished
# position: List[Position] = Field(..., description="Details about specific job positions held by the individual.")

# class Position(BaseModel):
#     """Details about specific job positions held by the individual."""
#     name: Optional[str] = Field(..., description="The job title held.", examples=[("AI Engineer", "AI Engineer")])
#     description: Optional[str] = Field(..., description="A brief overview of the responsibilities associated with the position.", examples=[("AI Engineer", "Developed machine learning models for various applications.")])
#     start_date: Optional[str] = Field(..., description="The date when the individual began this position.", examples=[("01/2020", "01/2020")])
#     end_date: Optional[str] = Field(..., description="The date when the individual left this position, if applicable.", examples=[("12/2021", "12/2021")])
#     responsibilities: Optional[List[str]] = Field(..., description="A list of key responsibilities undertaken in this role.", examples=[["Developing algorithms", "Collaborating with cross-functional teams", "Presenting findings to stakeholders"]])
#     achievements: Optional[List[str]] = Field(..., description="A list of significant achievements in this position.", examples=[["Improved model accuracy by 20%", "Led a successful project that increased revenue by 15%"]])



In [18]:
class ExtractionPersonData(BaseModel):
    """Overall structure for the extracted data from a resume."""

    # Top-level schema for the person-centric extraction pass: one Person plus
    # lists of education entries, skills and workplaces. All four keys are
    # required (no defaults).
    person: Person = Field(
        ...,
        description="Contains personal information about an individual extracted from a resume.",
    )
    education: List[Education] = Field(
        ...,
        description="Holds information about the individual's educational background.",
    )
    skill: List[Skills] = Field(
        ...,
        description="Represents a specific skill acquired by the individual.",
    )
    workplace: List[WorkPlace] = Field(
        ...,
        description="Information regarding the individual's past workplaces.",
    )
    
class ExtractionAchievementData(BaseModel):
    """Overall structure for the extracted data from a resume."""

    # Top-level schema for the achievement-centric extraction pass.
    certification: List[Certifications] = Field(
        ...,
        description="Details about certifications earned by the individual.",
    )
    # A list, like `certification`: the description speaks of "publications"
    # and the model's tool call returns `public` as a JSON array, which cannot
    # be parsed into a single Publication object.
    public: List[Publication] = Field(
        ...,
        description="Information about academic or professional publications authored by the individual.",
    )

In [19]:
from langchain_core.prompts import ChatPromptTemplate

# Chat prompt for the extraction chain: a fixed system instruction followed by
# the human message, whose only template variable is `input`.
#
# Python joins adjacent string literals with no separator, so every sentence
# except the last carries its own trailing space; without it the sentences
# fuse together ("...in text.Translate to English...").
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are an expert at identifying key-value resume in text. "
            "Translate to English if the content is in Vietnamese. "
            "Some data might be spelled incorrectly, you should correct it semantically, such as NGUYENVAN BA to NGUYENVANBA, etc. "
            "Only extract important key-value resume. Extract nothing if no important information can be found in the text."
        ),
        ("human", "{input}"),
    ]
)

In [20]:
import os

import getpass

# Never hard-code credentials in the notebook: anything committed here is
# exposed to everyone who can read the file, and any key that was ever stored
# in this cell should be revoked and reissued.
# Use the key already present in the environment, and only prompt for it
# (without echoing) when it is missing.
if not os.environ.get("TOGETHER_API_KEY"):
    os.environ["TOGETHER_API_KEY"] = getpass.getpass("Together API key: ")

In [21]:
from langchain_together import ChatTogether

# Chat model shared by the extraction chain. Temperature is pinned to 0.0;
# `max_tokens` and `timeout` are explicitly passed as None, and `max_retries`
# is set to 5.
llm = ChatTogether(
    max_retries=5,
    timeout=None,
    max_tokens=None,
    temperature=0.0,
    model="meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
)

In [22]:
import pytesseract
from pdf2image import convert_from_path


# Path to your Tesseract executable
# Update this path according to your installation
# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

def pdf_to_text(pdf_path, lang: Optional[str] = None):
    """Extract the text of a PDF by rendering each page to an image and running OCR.

    Args:
        pdf_path: Path to the PDF file; passed straight to ``convert_from_path``.
        lang: Tesseract language code(s) forwarded to
            ``pytesseract.image_to_string`` (for example ``"vie"`` or
            ``"vie+eng"``; the matching language data must be installed).
            ``None`` keeps pytesseract's default language, which is what this
            function always used before the parameter existed.

    Returns:
        The OCR text of all pages in page order, each page's text followed by
        a newline. A PDF that yields no pages gives an empty string.
    """
    # Convert PDF to one image per page
    pages = convert_from_path(pdf_path)

    # OCR each page and join once instead of growing a string with `+=`
    return "".join(
        pytesseract.image_to_string(page, lang=lang) + "\n" for page in pages
    )

# Example usage
# Alternative sample (noted by the author as one that has achievements):
#   "dataset/Resume/0d5bb3d4-1875-4ca4-b746-03c4df407fca.pdf"
# It used to be assigned here and then immediately overwritten, so it never
# took effect; swap the path below to run against it instead.
pdf_path = "dataset/Resume/0aa2fb6f-66d3-4fcc-b960-2bd2f7721ffe.pdf"  # Path to your PDF file
text = pdf_to_text(pdf_path)
print(text)

MUC TIEU NGHE NGHIEP
Déi net vé tu duy va cach lam viéc

+ Bat nhip va chuyén déi cac cong
viéc khac nhau nhanh.

+ Ludn tu nghién cuu céng nghé
va ap dung vao san pham.

+ Thuong viét framework dung
chung cho nhiéu san pham.
Kinh nghiém lam viéc

+ C66 nam kinh nghiém trong
xay dung va trién khai phan mém hé
théng ERP.

+ Tao tool test tu déng, test API.

+ Lam Full-Stack trong backend va
frontend.

+ Co kinh nghiém lap trinh gui va
nhan dif liéu vdi hé théng OT.

+ Phan tich va xay dung MES cho
quan ly va hién thi dt liéu thoi gian
thuc cho cac nha may san xuat.

THONG TIN CA NHAN
Ngay sinh 16/11/1994
Quéc tich Viét Nam

Tinh trang hén nhan Dédc than

Gidi tinh Nam
NGOAI NGU

Tiéng Anh So Cap
KY NANG

fs mM ts SCcSS3

.NET
ReactUS
JavaScript
React Native
.Net Core

Entity Framework
jQuery AJAX

jQuery
AngularS

Python 3.9 HTML

Ha Bui

Nhan vién lap trinh phan mém - 7 Nam Kinh Nghiém

$

KINH NGHIEM LAM VIEC

Truéng phong R&D
EASTERN SUN
01/2018 - 01/2024 (6 nam)

Thuc hién lén ké ho

In [23]:
# ExtractionAchievementData ExtractionPersonData

In [24]:
# def extract(DataExtraction):
#     chain = {"input": format_docs} | prompt | llm.with_structured_output(DataExtraction, include_raw=True)
#     result = chain.invoke(text)
#     return result

def extract(DataExtraction, resume_text: Optional[str] = None):
    """Run the extraction prompt against resume text with a structured-output schema.

    Args:
        DataExtraction: Schema class handed to ``llm.with_structured_output``.
        resume_text: Text to extract from. When ``None`` (the default), the
            module-level ``text`` variable is used, which is how every
            existing one-argument call behaves.

    Returns:
        Whatever the chain's ``invoke`` returns. ``include_raw=True`` is
        requested, and later cells read the ``"raw"`` entry of the result.
    """
    if resume_text is None:
        # Backward-compatible fallback to the notebook-global OCR output.
        resume_text = text
    chain = prompt | llm.with_structured_output(DataExtraction, include_raw=True)
    # `input` is the prompt's only template variable; name it explicitly.
    return chain.invoke({"input": resume_text})

In [25]:
# Person-centric pass: extract using the ExtractionPersonData schema.
a = extract(ExtractionPersonData)

In [26]:
# Achievement-centric pass: extract using the ExtractionAchievementData schema.
b = extract(ExtractionAchievementData)

In [27]:
import json

# Display the first tool call recorded on the raw model message of the
# person-data extraction (its `args` hold the extracted fields).
a["raw"].tool_calls[0]

{'name': 'ExtractionPersonData',
 'args': {'person': {'name': 'Ha Bui',
   'phone': None,
   'birthday': '16/11/1994',
   'summary': '7 Nam Kinh Nghiém',
   'location': 'Minh Khai, Bac Tu Liem, Ha Noi, Viet Nam',
   'links': None},
  'education': [{'name': 'Dai hoc Cong Nghiep Ha Noi',
    'field': 'Khoa hoc may tinh',
    'degree': None,
    'start_date': '01/2012',
    'end_date': '12/2016',
    'achievements': None}],
  'skill': [{'name': 'C#', 'category': 'Programming'},
   {'name': 'React', 'category': 'Programming'},
   {'name': 'JavaScript', 'category': 'Programming'},
   {'name': 'React Native', 'category': 'Programming'},
   {'name': '.Net Core', 'category': 'Programming'},
   {'name': 'Entity Framework', 'category': 'Programming'},
   {'name': 'jQuery', 'category': 'Programming'},
   {'name': 'Angular', 'category': 'Programming'},
   {'name': 'Python', 'category': 'Programming'},
   {'name': 'HTML', 'category': 'Programming'},
   {'name': 'CSS', 'category': 'Programming'}],
 

In [28]:
# Display the first tool call recorded on the raw model message of the
# achievement-data extraction (its `args` hold the extracted fields).
b["raw"].tool_calls[0]

{'name': 'ExtractionAchievementData',
 'args': {'certification': [{'name': 'Certificate: AWS Certified Solutions Architect',
    'institute': 'Amazon Web Services',
    'date': '15/05/2021',
    'url': 'www.example.com/certificate'}],
  'public': [{'name': 'Publication: API Design Patterns',
    'category': 'API Design',
    'conference': 'API Conference',
    'url': ['www.example.com/publication'],
    'date': '01/01/2020'}]},
 'id': 'call_d1jqyclbf9hthliiif5k1vux',
 'type': 'tool_call'}