In [5]:
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain_community.document_loaders import PDFMinerLoader, TextLoader, Docx2txtLoader
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import CohereRerank
from langchain_community.llms import Cohere
from langchain_community.chat_models import ChatOllama
from langchain_openai import OpenAIEmbeddings
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import FAISS
import datetime, json, os, tiktoken
from IPython.display import Markdown
from dotenv import load_dotenv, find_dotenv
from PIL import Image

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [67]:
ASSISTANT_LANGUAGE = "english"
cohere_api_key = 'v7yOHn61izIiMkGllF2gzbM5e0PHnDEwa7fjh38E'

In [7]:
def langchain_document_loader(file_path):
    """Load and split a PDF file in Langchain.
    Parameters:
        - file_path (str): path of the file.
    Output:
        - documents: list of Langchain Documents."""

    if file_path.endswith(".pdf"):
        loader = PDFMinerLoader(file_path=file_path)
    else:
        print("You can only upload .pdf files!")

    # 1. Load and split documents
    documents = loader.load_and_split()

    # 2. Update the metadata: add document number to metadata
    for i in range(len(documents)):
        documents[i].metadata = {
            "source": documents[i].metadata["source"],
            "doc_number": i,
        }

    return documents

documents = langchain_document_loader("./data/resume/Khushal_ml.pdf")
print("number of documents:",len(documents))
documents

number of documents: 1


[Document(page_content='Khushal Goyal\nB.Tech\n+91 7610513661\nMail: khushalgoyal77@gmail.com\nlinkedin:https://www.linkedin.com/in/khushal-goyal-a27b54227/\nGithub: https://github.com/darkengross\nPortfolio: https://portfolio-v2-nine-mauve.vercel.app/\nEDUCATION\n\nThe LNM Institute of Information Technology\nAaradhan public school\nCarmel convent Sr. Sec. School\n\nINTERNSHIPS\n\nAgent Oriented Development | LNMIIT\n\nB.Tech in Computer Science | GPA : 7.63/10\n\nclass XII | Percentage : 93/100\n\nclass X | Percentage : 93/100\n\nNovember 2021 – Present\n\nJune 2023 – July 2023\n\n– Developed Algorithms for multiagent systems.\n– Documented more than 22 research papers on heuristic, reinforcement learning based drone navigation algorithms\n\nTECHNICAL SKILLS\n\nLanguages :Python, C/C++, SQL, HTML, CSS, Java\nFrameworks :PyTorch, TensorFlow, Keras, HuggingFace, Transformers, NLTK, Spacy, Power BI\nDevOps and API Tools : Git, Docker,Postman\nOthers : Data Structures, SOLID Principles, 

In [9]:
def select_embeddings_model(LLM_service="llama3.1"):
    """Connect to the embeddings API endpoint by specifying the name of the embedding model."""
    if LLM_service == "OpenAI":
        embeddings = OpenAIEmbeddings(
            model='gpt-3.5-turbo',
            api_key=openai_api_key)

    if LLM_service == "llama3.1":
        embeddings = OllamaEmbeddings(
            model="llama3.1"
        )
    return embeddings
   
# embeddings_OpenAI = select_embeddings_model(LLM_service="OpenAI")
# embeddings_google = select_embeddings_model(LLM_service="Google")
embeddings_ollama = select_embeddings_model(LLM_service='llama3.1')

In [12]:
def create_vectorstore(embeddings, documents):
    vector_store = FAISS.from_documents(documents=documents, embedding=embeddings)
    
    return vector_store

vector_store_ollama = create_vectorstore(embeddings_ollama, documents)

In [17]:
def vectorstore_backed_retriever(vectorstore, search_type = 'similarity', k = 5, score_threshold = None):
    search_kwargs = {}
    if k is not None:
        search_kwargs['k'] = k
    if score_threshold is not None:
        search_kwargs['score_threshold'] = score_threshold
        
    retriever = vectorstore.as_retriever(
        search_type = search_type,
        search_kwargs = search_kwargs
    )
    
    return retriever

base_retriever_ollama = vectorstore_backed_retriever(vector_store_ollama, "similarity", k = min(4, len(documents)))

In [36]:
def CohereRerank_retriever(base_retriever, cohere_api_key, cohere_model = 'rerank-multilingual-v2.0', top_n=2):
    compressor = CohereRerank(
        cohere_api_key=cohere_api_key, 
        model=cohere_model, 
        top_n=top_n
    )

    retriever_Cohere = ContextualCompressionRetriever(
        base_compressor=compressor,
        base_retriever=base_retriever
    )
    
    return retriever_Cohere

In [37]:
retriever_Cohere_ollama = CohereRerank_retriever(
    base_retriever=base_retriever_ollama, 
    cohere_api_key=cohere_api_key,
    cohere_model="rerank-multilingual-v2.0",  
    top_n=2
)

In [40]:

query = "Extract the job title and company name of the first work experience."

most_relevant_docs = retriever_Cohere_ollama.get_relevant_documents(query)

for i in range(len(most_relevant_docs)):
    print(f"""Most similar doc id : {most_relevant_docs[i].metadata['doc_number']} """)

unknown field: parameter model is not a valid field


Most similar doc id : 0 


In [47]:
def instantiate_LLM(LLM_provider,api_key,temperature=0.5,top_p=0.95,model_name=None):
    """Instantiate LLM in Langchain.
    Parameters:
        LLM_provider (str): the LLM provider; in ["OpenAI","Google","HuggingFace"]
        model_name (str): in ["gpt-3.5-turbo", "gpt-3.5-turbo-0125", "gpt-4-turbo-preview", 
            "gemini-pro", "mistralai/Mistral-7B-Instruct-v0.2"].            
        api_key (str): google_api_key or openai_api_key or huggingfacehub_api_token 
        temperature (float): Range: 0.0 - 1.0; default = 0.5
        top_p (float): : Range: 0.0 - 1.0; default = 1.
    """
    if LLM_provider == "OpenAI":
        llm = ChatOpenAI(
            api_key=api_key,
            model=model_name,
            temperature=temperature,
            model_kwargs={
                "top_p": top_p
            }
        )
    if LLM_provider == "ollama":
        llm = ChatOllama(
            model=model_name,
            temperature=temperature,
            top_p=top_p,
            convert_system_message_to_human=True
        )
    return llm

In [56]:
def set_LLM_and_retriever(provider="ollama",model_name="llama3.1",temperature=0.0,top_p=0.95):
    if provider=="OpenAI":
        llm = instantiate_LLM(
            "OpenAI",
            api_key=openai_api_key,
            temperature=temperature,
            top_p=top_p,
            model_name=model_name
        )
        retriever = retriever_Cohere_openAI
    else: # "Google"
        llm = instantiate_LLM(
            LLM_provider="ollama",
            temperature=temperature,
            api_key = "",
            top_p=top_p, 
            model_name=model_name
        )
        retriever = retriever_Cohere_ollama
        
    return llm,retriever 

In [57]:
llm,retriever = set_LLM_and_retriever(provider="ollama",temperature=0.0)

In [58]:
#####################################################
#                 Prompt Templates
#####################################################

templates = {}

# 2.1 Contact information Section
templates[
    "Contact__information"
] = """Extract and evaluate the contact information. \
Output a dictionary with the following keys:
- candidate__name 
- candidate__title
- candidate__location
- candidate__email
- candidate__phone
- candidate__social_media: Extract a list of all social media profiles, blogs or websites.
- evaluation__ContactInfo: Evaluate in {language} the contact information.
- score__ContactInfo: Rate the contact information by giving a score (integer) from 0 to 100.
"""

# 2.2. Summary Section
templates[
    "CV__summary"
] = """Extract the summary and/or objective section. This is a separate section of the resume. \
If the resume doed not contain a summary and/or objective section, then simply write "unknown"."""

# 2.3. WORK Experience Section

templates[
    "Work__experience"
] = """Extract all work experiences. For each work experience: 
1. Extract the job title.
2. Extract the company.  
3. Extract the start date and output it in the following format: \
YYYY/MM/DD or YYYY/MM or YYYY (depending on the availability of the day and month).
4. Extract the end date and output it in the following format: \
YYYY/MM/DD or YYYY/MM or YYYY (depending on the availability of the day and month).
5. Output a dictionary with the following keys: job__title, job__company, job__start_date, job__end_date.

Format your response as a list of dictionaries.
"""

# 2.4. Projects Section
templates[
    "CV__Projects"
] = """Include any side projects outside the work experience. 
For each project:
1. Extract the title of the project. 
2. Extract the start date and output it in the following format: \
YYYY/MM/DD or YYYY/MM or YYYY (depending on the availability of the day and month).
3. Extract the end date and output it in the following format: \
YYYY/MM/DD or YYYY/MM or YYYY (depending on the availability of the day and month).
4. Output a dictionary with the following keys: project__title, project__start_date, project__end_date.

Format your response as a list of dictionaries.
"""

# 2.5. Education Section
templates[
    "CV__Education"
] = """Extract all educational background and academic achievements.
For each education achievement:
1. Extract the name of the college or the high school. 
2. Extract the earned degree. Honors and achievements are included.
3. Extract the start date and output it in the following format: \
YYYY/MM/DD or YYYY/MM or YYYY (depending on the availability of the day and month).
4. Extract the end date and output it in the following format: \
YYYY/MM/DD or YYYY/MM or YYYY (depending on the availability of the day and month).
5. Output a dictionary with the following keys: edu__college, edu__degree, edu__start_date, edu__end_date.

Format your response as a list of dictionaries.
"""

templates[
    "Education__evaluation"
] = """Your task is to perform the following actions:  
1. Rate the quality of the Education section by giving an integer score from 0 to 100. 
2. Evaluate (in three sentences and in {language}) the quality of the Education section.
3. Output a dictionary with the following keys: score__edu, evaluation__edu.
"""

# 2.6. Skills
templates[
    "candidate__skills"
] = """Extract the list of soft and hard skills from the skill section. Output a list.
The skill section is a separate section.
"""

templates[
    "Skills__evaluation"
] = """Your task is to perform the following actions: 
1. Rate the quality of the Skills section by giving an integer score from 0 to 100.
2. Evaluate (in three sentences and in {language}) the quality of the Skills section.
3. Output a dictionary with the following keys: score__skills, evaluation__skills.
"""

# 2.7. Languages
templates[
    "CV__Languages"
] = """Extract all the languages that the candidate can speak. For each language:
1. Extract the language.
2. Extract the fluency. If the fluency is not available, then simply write "unknown".
3. Output a dictionary with the following keys: spoken__language, language__fluency.

Format your response as a list of dictionaries.
"""

templates[
    "Languages__evaluation"
] = """ Your task is to perform the following actions: 
1. Rate the quality of the language section by giving an integer score from 0 to 100.
2. Evaluate (in three sentences and in {language}) the quality of the language section.
3. Output a dictionary with the following keys: score__language,evaluation__language.
"""

# 2.8. Certifications
templates[
    "CV__Certifications"
] = """Extraction of all certificates other than education background and academic achievements. \
For each certificate: 
1. Extract the title of the certification. 
2. Extract the name of the organization or institution that issues the certification.
3. Extract the date of certification and output it in the following format: \
YYYY/MM/DD or YYYY/MM or YYYY (depending on the availability of the day and month).
4. Extract the certification expiry date and output it in the following format: \
YYYY/MM/DD or YYYY/MM or YYYY (depending on the availability of the day and month).
5. Extract any other information listed about the certification. if not found, then simply write "unknown".
6. Output a dictionary with the following keys: certif__title, certif__organization, certif__date, certif__expiry_date, certif__details.

Format your response as a list of dictionaries.
"""

templates[
    "Certif__evaluation"
] = """Your task is to perform the following actions: 
1. Rate the certifications by giving an integer score from 0 to 100.
2. Evaluate (in three sentences and in {language}) the certifications and the quality of the text.
3. Format your response as a dictionary with the following keys: score__certif,evaluation__certif.
"""


# 3. PROMPTS

PROMPT_IMPROVE_SUMMARY = """Your are given a resume (delimited by <resume></resume>) \
and a summary (delimited by <summary></summary>).
1. In {language}, evaluate the summary (format and content) .
2. Rate the summary by giving an integer score from 0 to 100. \
If the summary is "unknown", the score is 0.
3. In {language}, strengthen the summary. The summary should not exceed 5 sentences. \
If the summary is "unknown", generate a strong summary in {language} with no more than 5 sentences. \
Please include: years of experience, top skills and experiences, some of the biggest achievements, and finally an attractive objective.
4. Format your response as a dictionary with the following keys: evaluation__summary, score__summary, CV__summary_enhanced.

<summary>
{summary}
</summary>
------
<resume>
{resume}
</resume>
"""

PROMPT_IMPROVE_WORK_EXPERIENCE = """you are given a work experience text delimited by triple backticks.
1. Rate the quality of the work experience text by giving an integer score from 0 to 100. 
2. Suggest in {language} how to make the work experience text better and stronger.
3. Strengthen the work experience text to make it more appealing to a recruiter in {language}. \
Provide additional details on responsibilities and quantify results for each bullet point. \
Format your text as a string in {language}.
4. Format your response as a dictionary with the following keys: "Score__WorkExperience", "Comments__WorkExperience" and "Improvement__WorkExperience".

Work experience text: ```{text}```
"""

PROMPT_IMPROVE_PROJECT = """you are given a project text delimited by triple backticks.
1. Rate the quality of the project text by giving an integer score from 0 to 100. 
2. Suggest in {language} how to make the project text better and stronger.
3. Strengthen the project text to make it more appealing to a recruiter in {language}, \
including the problem, the approach taken, the tools used and quantifiable results. \
Format your text as a string in {language}.
4. Format your response as a dictionary with the following keys: Score__project, Comments__project, Improvement__project.

project text: ```{text}```
"""

PROMPT_EVALUATE_RESUME = """You are given a resume delimited by triple backticks. 
1. Provide an overview of the resume in {language}.
2. Provide a comprehensive analysis of the three main strengths of the resume in {language}. \
Format the top 3 strengths as string containg three bullet points.
3. Provide a comprehensive analysis of the three main weaknesses of the resume in {language}. \
Format the top 3 weaknesses as string containg three bullet points.
4. Format your response as a dictionary with the following keys: resume_cv_overview, top_3_strengths, top_3_weaknesses.

The strengths and weaknesses lie in the format, style and content of the resume.

Resume: ```{text}```
"""

In [64]:
def create_prompt_template(resume_sections, language=ASSISTANt_LANGUAGE):
    """Create the promptTemplate for selected resume sections.
    Parameters:
       resume_sections (list): List of resume sections from which information will be extracted.
       language (str): the language of the AI assistant.
    """

    # Create the final template: Add the templates from the 'templates' dictionary where keys = resume_sections
    template = f"""For the following resume delimited by triple backticks, output in {language} the following information:\n\n"""

    for key in resume_sections:
        template += key + ": " + templates[key] + "\n---------\n\n"

    template += "For any requested information, if it is not found, output 'unknown'.\n\n"
    template += (
        """Format the final output as a json dictionary with the following keys: ("""
    )

    for key in resume_sections:
        template += "" + key + ", "
    template = template[:-2] + ")"  # remove the last ", "

    template += """\n\nResume: ```{text}```"""

    # Create the PromptTemplate
    prompt_template = PromptTemplate.from_template(template)

    return prompt_template

In [114]:
# Here is a example:

prompt_template = create_prompt_template(
    ['Contact__information','CV__summary','Work__experience'],
    language=ASSISTANT_LANGUAGE
)

print("\n**LLM prompt example:**\n")
# Format the PromptTemplate
prompt = prompt_template.format_prompt(text="...",language=ASSISTANt_LANGUAGE).text

print(prompt)


**LLM prompt example:**

For the following resume delimited by triple backticks, output in english the following information:

Contact__information: Extract and evaluate the contact information. Output a dictionary with the following keys:
- candidate__name 
- candidate__title
- candidate__location
- candidate__email
- candidate__phone
- candidate__social_media: Extract a list of all social media profiles, blogs or websites.
- evaluation__ContactInfo: Evaluate in english the contact information.
- score__ContactInfo: Rate the contact information by giving a score (integer) from 0 to 100.

---------

CV__summary: Extract the summary and/or objective section. This is a separate section of the resume. If the resume doed not contain a summary and/or objective section, then simply write "unknown".
---------

Work__experience: Extract all work experiences. For each work experience: 
1. Extract the job title.
2. Extract the company.  
3. Extract the start date and output it in the following 

In [115]:
def invoke_LLM(
    llm,
    documents,
    resume_sections: list,
    info_message="",
    language=ASSISTANT_LANGUAGE,
):
    """Invoke LLM and get a response.
    Parameters:
     - llm: the LLM to call
     - documents: the Langchain Documents. 
     - resume_sections (list): List of resume sections to be parsed.
     - info_message (str): display an informational message.
     - language (str): the assistant language. 

     Output:
     - response_content (str): the content of the LLM response.
     - response_tokens_count (int): count of response tokens.
    """

    now = (datetime.datetime.now()).strftime("%H:%M:%S")
    print(f"**{now}** \t{info_message}")  

    # 1. Create the promptTemplate.
    prompt_template = create_prompt_template(
        resume_sections,
        language=language,
    )

    # 2. Format the PromptTemplate 
    if language is not None:
        prompt = prompt_template.format_prompt(text=documents, language=language).text
    else:
        prompt = prompt_template.format_prompt(text=documents).text

    # 3. Invoke the LLM
    response = llm.invoke(prompt)
    
    response_content = response.content[
        response.content.find("{") : response.content.rfind("}") + 1
    ]


    return response_content

In [116]:
%%time

##########################################################################################################
#     CONTACT_INFORMATION: Name, Title, Location, Email, Phone number and Social media profiles.
##########################################################################################################

try:
    response_content = invoke_LLM(
        llm,
        documents,
        resume_sections=["Contact__information"],
        info_message="Extract and evaluate contact information...",
        language=ASSISTANT_LANGUAGE,
    )
    
    try:
        # Load response_content into json dictionary
        CONTACT_INFORMATION = json.loads(response_content, strict=False)
    except Exception as e:
        print("[ERROR] json.loads returns error:", e)
        CONTACT_INFORMATION = {}
        
except Exception as error:
    print("[ERROR]:", error)
    CONTACT_INFORMATION = {}
    
CONTACT_INFORMATION 

**19:32:04** 	Extract and evaluate contact information...
CPU times: total: 15.6 ms
Wall time: 2min 5s


{'candidate_name': 'Khushal Goyal',
 'candidate_title': 'B.Tech',
 'candidate_location': 'unknown',
 'candidate_email': 'khushalgoyal77@gmail.com',
 'candidate_phone': '+91 7610513661',
 'candidate_social_media': [{'platform': 'LinkedIn',
   'url': 'https://www.linkedin.com/in/khushal-goyal-a27b54227/'},
  {'platform': 'GitHub', 'url': 'https://github.com/darkengross'},
  {'platform': 'Portfolio',
   'url': 'https://portfolio-v2-nine-mauve.vercel.app/'}],
 'evaluation_ContactInfo': 'The contact information provided is sufficient for professional purposes.',
 'score_ContactInfo': 80}

In [117]:
def extract_from_text(text,start_tag,end_tag=None):
    """Use start and end tags to extract a substring from text."""
    start_index = text.find(start_tag)
    if end_tag is None:
        extacted_txt = text[start_index+len(start_tag):]
    else:
        end_index = text.find(end_tag) 
        extacted_txt = text[start_index+len(start_tag):end_index]
    
    return extacted_txt

In [126]:
def Extract_contact_information(llm, documents):
    """Extract Contact Information: Name, Title, Location, Email, Phone number and Social media profiles."""

    try:
        response_content= invoke_LLM(
            llm,
            documents,
            resume_sections=["Contact__information"],
            info_message="Extract and evaluate contact information...",
            language=ASSISTANT_LANGUAGE,
        )

        try:
            # Load response_content into json dictionary
            CONTACT_INFORMATION = json.loads(response_content, strict=False)
        except Exception as e:
            print("[ERROR] json.loads returns error:", e)
            print("\n['INFO'] Parse response content...\n")
            list_fields = [{'Contact__information':
                            ['candidate__name','candidate__title','candidate__location',
                             'candidate__email','candidate__phone','candidate__social_media',
                             'evaluation__ContactInfo','score__ContactInfo']
                           }]
            list_rfind = [",\n",",\n",",\n",",\n",",\n",",\n",",\n",",\n","}\n"]
            list_exclude_first_car = [True,True,True,True,True,True,False,True,False]
            CONTACT_INFORMATION = ResponseContent_Parser(response_content,list_fields,list_rfind,list_exclude_first_car)
            # Convert the score to int
            try:
                CONTACT_INFORMATION["Contact__information"]["score__ContactInfo"] = int(
                    CONTACT_INFORMATION["Contact__information"]["score__ContactInfo"]
                )
            except:
                CONTACT_INFORMATION["Contact__information"]["score__ContactInfo"] = -1

    except Exception as exception:
        print(f"[Error] {exception}")
        CONTACT_INFORMATION = {
            "Contact__information": {
                "candidate__name": "unknown",
                "candidate__title": "unknown",
                "candidate__location": "unknown",
                "candidate__email": "unknown",
                "candidate__phone": "unknown",
                "candidate__social_media": "unknown",
                "evaluation__ContactInfo": "unknown",
                "score__ContactInfo": -1,
            }
        }
        
    return CONTACT_INFORMATION

In [127]:
%%time

CONTACT_INFORMATION = Extract_contact_information(llm,documents)
CONTACT_INFORMATION

**20:08:46** 	Extract and evaluate contact information...
CPU times: total: 62.5 ms
Wall time: 2min 58s


{'candidate_name': 'Khushal Goyal',
 'candidate_title': 'B.Tech',
 'candidate_location': 'unknown',
 'candidate_email': 'khushalgoyal77@gmail.com',
 'candidate_phone': '+91 7610513661',
 'candidate_social_media': ['LinkedIn: https://www.linkedin.com/in/khushal-goyal-a27b54227/',
  'GitHub: https://github.com/darkengross',
  'Portfolio: https://portfolio-v2-nine-mauve.vercel.app/'],
 'evaluation_ContactInfo': 'The contact information is well-structured and includes a professional email address, phone number, and social media profiles.',
 'score_ContactInfo': 90}

In [124]:
def Extract_Evaluate_Summary(llm, documents):
    """Extract, evaluate and strengthen the summary."""

    # 1. Extract the summary
    ######################################
    try:
        response_content = invoke_LLM(
            llm,
            documents,
            resume_sections=["CV__summary" ],
            info_message="Extract and evaluate the Summary....",
            language=ASSISTANT_LANGUAGE,
        )
        print("summary response:\n", response_content)
        try:
            # Load response_content into json dictionary
            SUMMARY_SECTION = json.loads(response_content, strict=False)
        except Exception as e:
            print("[ERROR] json.loads returns error:", e)
            print("\n['INFO'] Parse response content...\n")
            
            list_fields = ["CV__summary"]
            list_rfind = ["}\n"]
            list_exclude_first_car = [True]
            SUMMARY_SECTION = ResponseContent_Parser(
                response_content, list_fields, list_rfind, list_exclude_first_car
            )

    except Exception as exception:
        print(f"[Error] {exception}")
        SUMMARY_SECTION = {"CV__summary": "unknown"}

    # 2. Evaluate and improve the summary
    #############################################

    try:
        prompt_template = PromptTemplate.from_template(PROMPT_IMPROVE_SUMMARY)

        prompt = prompt_template.format_prompt(
            resume=documents,
            language=ASSISTANT_LANGUAGE,
            summary=SUMMARY_SECTION["CV__summary"],
        ).text

        # Invoke LLM
        response = llm.invoke(prompt)
        response_content = response.content[
            response.content.find("{") : response.content.rfind("}") + 1
        ]

        try:
            SUMMARY_EVAL = {}
            SUMMARY_EVAL["Summary__evaluation"] = json.loads(response_content, strict=False)
        except Exception as e:
            print("[ERROR] json.loads returns error:", e)
            print("\n['INFO'] Parse response content...\n")
            list_fields = [
                "evaluation__summary",
                "score__summary",
                "CV__summary_enhanced",
            ]
            list_rfind = [",\n", ",\n", "}\n"]
            list_exclude_first_car = [True, False, True]
            SUMMARY_EVAL["Summary__evaluation"] = ResponseContent_Parser(
                response_content, list_fields, list_rfind, list_exclude_first_car
            )
            # Convert score to int
            try:
                SUMMARY_EVAL["Summary__evaluation"]["score__summary"] = int(
                    SUMMARY_EVAL["Summary__evaluation"]["score__summary"]
                )
            except:
                SUMMARY_EVAL["Summary__evaluation"]["score__summary"] = -1

    except Exception as e:
        print(e)
        SUMMARY_EVAL = {
            "Summary__evaluation": {
                "evaluation__summary": "unknown",
                "score__summary": -1,
                "CV__summary_enhanced": "unknown",
            }
        }

    SUMMARY_EVAL["CV__summary"] = SUMMARY_SECTION["CV__summary"]


    return SUMMARY_EVAL

In [125]:
%%time

SUMMARY_EVAL = Extract_Evaluate_Summary(llm,documents)
SUMMARY_EVAL

**19:41:12** 	Extract and evaluate the Summary....
[Error] too many values to unpack (expected 2)
CPU times: total: 31.2 ms
Wall time: 3min 16s


{'Summary__evaluation': {'evaluation__summary': "The summary section of this resume appears to be empty, as indicated by the value 'unknown'.",
  'score__summary': 0,
  'CV__summary_enhanced': {'summary': 'As a highly motivated and detail-oriented B.Tech in Computer Science graduate from LNMIIT, I bring 2+ years of experience in developing algorithms for multiagent systems, machine learning, and deep learning. My top skills include Python, C/C++, SQL, HTML, CSS, Java, PyTorch, TensorFlow, Keras, HuggingFace, Transformers, NLTK, Spacy, Power BI, Git, Docker, Postman, Data Structures, SOLID Principles, Design Patterns, Multiagent systems, machine learning, deep learning, and reinforcement learning. With a strong background in data science and analysis, I have achieved notable results in projects such as multi-class classification using image transformers, Loksabha 2024 data analysis, Anime Face Generator, ASL Convertor, and Automatic Birthday Generation App. As the Head of the Sociotech 

In [130]:
try:
    response_content = invoke_LLM(
        llm,
        documents,
        resume_sections=[
            "CV__Education"
        ],
        info_message="Extract and evaluate education section...",
        language=ASSISTANT_LANGUAGE,
    )

    try:
        # Load response_content into json dictionary
        Education_Language_sections = json.loads(response_content, strict=False)
    except Exception as e:
        print("[ERROR] json.loads returns error:", e)
        Education_Language_sections = {}
        
except Exception as error:
    print("[ERROR]:", error)
    Education_Language_sections = {}

# Education_Language_sections
response_content

**20:30:49** 	Extract and evaluate education section...
[ERROR] json.loads returns error: Extra data: line 6 column 6 (char 208)


'{\n        "edu__college": "The LNM Institute of Information Technology",\n        "edu__degree": "B.Tech in Computer Science",\n        "edu__start_date": "June 2023",\n        "edu__end_date": "July 2023"\n    },\n    {\n        "edu__college": "Aaradhan public school",\n        "edu__degree": "class XII",\n        "edu__start_date": "unknown",\n        "edu__end_date": "unknown"\n    },\n    {\n        "edu__college": "Carmel convent Sr. Sec. School",\n        "edu__degree": "class X",\n        "edu__start_date": "unknown",\n        "edu__end_date": "unknown"\n    }\n]\n```\n\nNote that the dates for Aaradhan public school and Carmel convent Sr. Sec. School are unknown, as they were not specified in the resume.\n\nAlso, here is the extracted information in JSON format:\n\n```\n{\n  "CV__Education": [\n    {\n      "edu__college": "The LNM Institute of Information Technology",\n      "edu__degree": "B.Tech in Computer Science",\n      "edu__start_date": "June 2023",\n      "edu__end

In [131]:
list_fields = ['CV__Education',
               {'Education__evaluation':['score__edu','evaluation__edu']},
               'CV__Languages',
               {'Languages__evaluation':['score__language','evaluation__language']},
              ]

list_rfind = [",\n",",\n",",\n",",\n",",\n",",\n",",\n","\n"] 
list_exclude_first_car = [True,True,False,True,True,True,False,True] 

Education_Language_sections = ResponseContent_Parser(response_content,list_fields,list_rfind,list_exclude_first_car)
Education_Language_sections

{'CV__Education': '{\n      "edu__college": "The LNM Institute of Information Technology",\n      "edu__degree": "B.Tech in Computer Science",\n      "edu__start_date": "June 2023",\n      "edu__end_date": "July 2023"\n    },\n    {\n      "edu__college": "Aaradhan public school",\n      "edu__degree": "class XII",\n      "edu__start_date": "unknown",\n      "edu__end_date": "unknown"\n    },\n    {\n      "edu__college": "Carmel convent Sr. Sec. School",\n      "edu__degree": "class X",\n      "edu__start_date": "unknown',
 'Education__evaluation': {'score__edu': 'u__college": "The LNM Institute of Information Technology",\n        "edu__degree": "B.Tech in Computer Science",\n        "edu__start_date": "June 2023",\n        "edu__end_date": "July 2023"\n    },\n    {\n        "edu__college": "Aaradhan public school",\n        "edu__degree": "class XII",\n        "edu__start_date": "unknown",\n        "edu__end_date": "unknown"\n    },\n    {\n        "edu__college": "Carmel convent S

In [132]:
def convert_text_to_list_of_dicts(text,dict_keys):
    """Convert text to a python list of dicts.
    Parameters:
     - text: string containing a list of dicts
     - dict_keys (list): the keys of the dictionary which will be returned.
    Output:
     - list_of_dicts (list): the list of dicts to return.
     """
    list_of_dicts = []

    if text!="": # if non-empty list
        text_splitted = text.split("},\n")
        dict_keys.append(None)
        
        for i in range(len(text_splitted)):
            dict_i = {}    
            
            for j in range(len(dict_keys)-1):
                key_value = extract_from_text(text_splitted[i],f"\"{dict_keys[j]}\": ",f"\"{dict_keys[j+1]}\": ")
                key_value = key_value[:key_value.rfind(",\n")].strip()[1:-1]      
                dict_i[dict_keys[j]] = key_value    
        
            list_of_dicts.append(dict_i) # add the dict to the list.
        
    return list_of_dicts

In [133]:
languages = Education_Language_sections['CV__Languages']
Education_Language_sections['CV__Languages'] = convert_text_to_list_of_dicts(
    text = languages[languages.find('[')+1:languages.rfind("]")].strip(),
    dict_keys = ['spoken__language','language__fluency']
)
Education_Language_sections['CV__Languages']

[]

In [134]:
education = Education_Language_sections['CV__Education']
Education_Language_sections['CV__Education'] = convert_text_to_list_of_dicts(
    text = education[education.find('[')+1:education.rfind("]")].strip(),
    dict_keys = ['edu__college','edu__degree','edu__start_date','edu__end_date']
)
Education_Language_sections['CV__Education']

[{'edu__college': 'The LNM Institute of Information Technology',
  'edu__degree': 'B.Tech in Computer Science',
  'edu__start_date': 'June 2023',
  'edu__end_date': 'July 2023'},
 {'edu__college': 'Aaradhan public school',
  'edu__degree': 'class XII',
  'edu__start_date': 'unknown',
  'edu__end_date': 'unknown'},
 {'edu__college': 'Carmel convent Sr. Sec. School',
  'edu__degree': 'class X',
  'edu__start_date': 'unk',
  'edu__end_date': '_college": "Carmel convent Sr. Sec. School",\n      "edu__degree": "class X'}]

In [139]:
def Extract_Education_Language(llm, documents):
    """Extract and evaluate education and language sections."""

    try:
        response_content = invoke_LLM(
            llm,
            documents,
            resume_sections=[
                "CV__Education",
                "Education__evaluation",
                "CV__Languages",
                "Languages__evaluation",
            ],
            info_message="Extract and evaluate education and language sections...",
            language=ASSISTANT_LANGUAGE,
        )

        try:
            # Load response_content into json dictionary
            Education_Language_sections = json.loads(response_content, strict=False)
        except Exception as e:
            print("[ERROR] json.loads returns error:", e)
            print("\n['INFO'] Parse response content...\n")

            list_fields = [
                "CV__Education",
                {"Education__evaluation": ["score__edu", "evaluation__edu"]},
                "CV__Languages",
                {"Languages__evaluation": ["score__language", "evaluation__language"]},
            ]

            list_rfind = [",\n", ",\n", ",\n", ",\n", ",\n", ",\n", ",\n", "\n"]
            list_exclude_first_car = [True, True, False, True, True, True, False, True]

            Education_Language_sections = ResponseContent_Parser(response_content, list_fields, list_rfind, list_exclude_first_car)

            # Convert scores to int
            try:
                Education_Language_sections["Education__evaluation"]["score__edu"] = int(
                    Education_Language_sections["Education__evaluation"]["score__edu"]
                )
            except:
                Education_Language_sections["Education__evaluation"]["score__edu"] = -1
                
            try:
                Education_Language_sections["Languages__evaluation"]["score__language"] = int(
                    Education_Language_sections["Languages__evaluation"]["score__language"]
                )
            except:
                Education_Language_sections["Languages__evaluation"]["score__language"] = -1

            # Split languages and educational texts into a Python list of dict
            languages = Education_Language_sections["CV__Languages"]
            Education_Language_sections["CV__Languages"] = (
                convert_text_to_list_of_dicts(
                    text=languages[
                        languages.find("[") + 1 : languages.rfind("]")
                    ].strip(),
                    dict_keys=["spoken__language", "language__fluency"],
                )
            )
            education = Education_Language_sections["CV__Education"]
            Education_Language_sections["CV__Education"] = (
                convert_text_to_list_of_dicts(
                    text=education[
                        education.find("[") + 1 : education.rfind("]")
                    ].strip(),
                    dict_keys=[
                        "edu__college",
                        "edu__degree",
                        "edu__start_date",
                        "edu__end_date",
                    ],
                )
            )
    except Exception as exception:
        print(f"[Error] {exception}")
        Education_Language_sections = {
            "CV__Education": [],
            "Education__evaluation": {
                "score__edu": -1, 
                "evaluation__edu": "unknown"
            },
            "CV__Languages": [],
            "Languages__evaluation": {
                "score__language": -1,
                "evaluation__language": "unknown",
            },
        }

    return Education_Language_sections

In [140]:
%%time

Education_Language_sections = Extract_Education_Language(llm,documents)
Education_Language_sections

**20:53:25** 	Extract and evaluate education and language sections...
CPU times: total: 0 ns
Wall time: 3min 16s


{'CV__Education': [{'edu__college': 'The LNM Institute of Information Technology',
   'edu__degree': 'B.Tech in Computer Science, GPA : 7.63/10',
   'edu__start_date': 'June 2023',
   'edu__end_date': 'July 2023'},
  {'edu__college': 'Aaradhan public school',
   'edu__degree': 'class XII, Percentage : 93/100',
   'edu__start_date': 'unknown',
   'edu__end_date': 'unknown'},
  {'edu__college': 'Carmel convent Sr. Sec. School',
   'edu__degree': 'class X, Percentage : 93/100',
   'edu__start_date': 'unknown',
   'edu__end_date': 'unknown'}],
 'Education__evaluation': {'score__edu': 80,
  'evaluation__edu': "The education section is well-structured and provides a clear overview of the candidate's academic background. However, it would be more effective if the achievements were highlighted in a separate section or with more emphasis. Overall, the section is satisfactory."},
 'CV__Languages': [{'spoken__language': 'Python',
   'language__fluency': 'unknown'},
  {'spoken__language': 'C/C++',

In [143]:
def Extract_Skills_and_Certifications(llm, documents):
    """Extract Skills and certifications and evaluate these sections."""

    try:
        response_content = invoke_LLM(
            llm,
            documents,
            resume_sections=["candidate__skills","Skills__evaluation","CV__Certifications","Certif__evaluation"],
            info_message="Extract and evaluate the skills and certifications...",
            language=ASSISTANT_LANGUAGE,
        )

        try:
            # Load response_content into json dictionary
            SKILLS_and_CERTIF = json.loads(response_content, strict=False)
        except Exception as e:
            print("[ERROR] json.loads returns error:", e)
            print("\n['INFO'] Parse response content...\n")

            skills = extract_from_text(response_content,"\"candidate__skills\": ", "\"Skills__evaluation\":")
            skills = skills.replace("\n  ","\n").replace("],\n","").replace("[\n","")
            score_skills = extract_from_text(response_content,"\"score__skills\": ", "\"evaluation__skills\":")
            evaluation_skills = extract_from_text(response_content,"\"evaluation__skills\": ", "\"CV__Certifications\":")
            
            certif_text = extract_from_text(response_content,"\"CV__Certifications\": ", "\"Certif__evaluation\":")
            certif_score = extract_from_text(response_content,"\"score__certif\": ", "\"evaluation__certif\":")
            certif_eval = extract_from_text(response_content,"\"evaluation__certif\": ", None)


            # Create the dictionary
            SKILLS_and_CERTIF = {}
            SKILLS_and_CERTIF["candidate__skills"] = [skill.strip()[1:-1] for skill in skills.split(",\n")]  
            
            # Convert the score to int
            try:
                score_skills_int = int(score_skills[0 : score_skills.rfind(",\n")])
            except:
                score_skills_int = -1
                
            SKILLS_and_CERTIF["Skills__evaluation"] = {
                "score__skills": score_skills_int,
                "evaluation__skills": evaluation_skills[: evaluation_skills.rfind("}\n")].strip()[1:-1],
            }

            # Convert text to list of dictionaries
            list_certifs = convert_text_to_list_of_dicts(
                text=certif_text[certif_text.find("[") + 1 : certif_text.rfind("]")].strip(),  
                dict_keys=[
                    "certif__title",
                    "certif__organization",
                    "certif__date",
                    "certif__expiry_date",
                    "certif__details",
                ],
            )
            SKILLS_and_CERTIF["CV__Certifications"] = list_certifs
            try:
                certif_score_int = int(certif_score[0 : certif_score.rfind(",\n")])
            except:
                certif_score_int = -1
            SKILLS_and_CERTIF["Certif__evaluation"] = {
                "score__certif": certif_score_int,
                "evaluation__certif": certif_eval[: certif_eval.rfind("}\n")].strip()[1:-1],
            }

    except Exception as exception:
        SKILLS_and_CERTIF = {
            "candidate__skills": [],
            "Skills__evaluation": {
                "score__skills": -1,
                "evaluation__skills": "unknown",
            },
            "CV__Certifications": [],
            "Certif__evaluation": {
                "score__certif": -1,
                "evaluation__certif": "unknown",
            },
        }
        print(f"[Error] {exception}")

    return SKILLS_and_CERTIF

In [144]:
%%time

SKILLS_and_CERTIF = Extract_Skills_and_Certifications(llm,documents)
SKILLS_and_CERTIF

**21:02:59** 	Extract and evaluate the skills and certifications...
[ERROR] json.loads returns error: Extra data: line 5 column 1 (char 234)

['INFO'] Parse response content...

CPU times: total: 46.9 ms
Wall time: 9min 46s


{'candidate__skills': [' 8',
  'evaluation_skills": "The skills section is well-structured and easy to read. However, it would be more effective if the candidate provided specific examples or projects that demonstrate their skills."\n}\n```\n\n**CV Certifications**\n\n* Certificate 1:\n\t+ Title: Volunteer Engagement Executive\n\t+ Organization: BloodConnect Jaipur\n\t+ Date: unknown\n\t+ Expiry date: unknown\n\t+ Details: LOR/Certificate for performing social work in making India blood sufficient.\n* Certificate 2:\n\t+ Title: IBM Data Analysis\n\t+ Organization: IBM, Coursera\n\t+ Date: unknown\n\t+ Expiry date: unknown\n\t+ Details: For completing IBM data Analyst Course\n* Certificate 3:\n\t+ Title: Automatic Birthday Generation App\n\t+ Organization: LNMIIT, Jaipur\n\t+ Date: unknown\n\t+ Expiry date: unknown\n\t+ Details: Received for my contributions in the Avaataran App that is deployed on LNMIIT server.\n\n```json\n  {\n      "certif_title": "Volunteer Engagement Executive',
 

In [145]:
def Extract_PROFESSIONAL_EXPERIENCE(llm, documents):
    """Extract the list of work experience and projects."""

    try:
        response_content  = invoke_LLM(
            llm,
            documents,
            resume_sections=["Work__experience", "CV__Projects"],
            info_message="Extract list of work experience and projects...",
            language=ASSISTANT_LANGUAGE,
        )

        try:
            # Load response_content into json dictionary
            PROFESSIONAL_EXPERIENCE = json.loads(response_content+"}", strict=False)
        except Exception as e:
            print("[ERROR] json.loads returns error:", e)
            print("\n['INFO'] Parse response content...\n")
            work_experiences = extract_from_text(response_content, '"Work__experience": ', '"CV__Projects":')
            projects = extract_from_text(response_content, '"CV__Projects": ', None)

            # Create the dictionary
            PROFESSIONAL_EXPERIENCE = {}
            PROFESSIONAL_EXPERIENCE["Work__experience"] = convert_text_to_list_of_dicts(
                text=work_experiences[work_experiences.find("[") + 1 : work_experiences.rfind("]")].strip()[1:-1],
                dict_keys=["job__title", "job__company", "job__start_date", "job__end_date"],
            )
            PROFESSIONAL_EXPERIENCE["CV__Projects"] = convert_text_to_list_of_dicts(
                text=projects[projects.find("[") + 1 : projects.rfind("]")].strip()[1:-1],
                dict_keys=["project__title", "project__start_date", "project__end_date"],
            )
        # delete 'unknown' projects and work experience
        try:
            for work_experience in PROFESSIONAL_EXPERIENCE["Work__experience"]:
                if work_experience["job__title"] == "unknown":
                    PROFESSIONAL_EXPERIENCE["Work__experience"].remove(work_experience)
        except Exception as e:
            print(e)
        try:
            for project in PROFESSIONAL_EXPERIENCE["CV__Projects"]:
                if project["project__title"] == "unknown":
                    PROFESSIONAL_EXPERIENCE["CV__Projects"].remove(project)
        except Exception as e:
            print(e)

    except Exception as exception:
        PROFESSIONAL_EXPERIENCE = {
            "Work__experience": [],
            "CV__Projects": []
        }
        print(exception)
        
    return PROFESSIONAL_EXPERIENCE

In [146]:
%%time

PROFESSIONAL_EXPERIENCE = Extract_PROFESSIONAL_EXPERIENCE(llm,documents)
PROFESSIONAL_EXPERIENCE

**21:12:45** 	Extract list of work experience and projects...
[ERROR] json.loads returns error: Extra data: line 6 column 6 (char 163)

['INFO'] Parse response content...

CPU times: total: 0 ns
Wall time: 4min 39s


{'Work__experience': [{'job__title': 'Intern',
   'job__company': 'Agent Oriented Development | LNMIIT',
   'job__start_date': '2023/06',
   'job__end_date': '2023/07'},
  {'job__title': 'B.Tech in Computer Science',
   'job__company': 'The LNM Institute of Information Technology',
   'job__start_date': '2021/11',
   'job__end_date': 'Present'}],
 'CV__Projects': [{'project__title': 'Multi class Classification Using Image transformers',
   'project__start_date': '2024/03',
   'project__end_date': 'unknown'},
  {'project__title': 'Loksabha 2024 data analysis: Instagram',
   'project__start_date': '2024/04',
   'project__end_date': 'unknown'},
  {'project__title': 'Anime Face Generator',
   'project__start_date': '2024/01',
   'project__end_date': 'unknown'},
  {'project__title': 'ASL Convertor',
   'project__start_date': '2023/06',
   'project__end_date': 'unknown'}]}

In [150]:
def get_relevant_documents(query,documents,retriever):
    """Retreieve most relevant documents from Langchain documents using the CoherRerank retriever."""

    # 1.1. Get relevant documents using the CohereRerank retriever

    retrieved_docs = retriever.get_relevant_documents(query)

    # 1.2. Keep only relevant documents where (relevance_score >= (max(relevance_scores) - 0.1))

    relevance_scores = [
        retrieved_docs[j].metadata["relevance_score"]
        for j in range(len(retrieved_docs))
    ]
    max_relevance_score = max(relevance_scores)
    threshold = max_relevance_score - 0.1

    relevant_doc_ids = []

    for j in range(len(retrieved_docs)):

        # Keep relevant documents with (relevance_score >= threshold)
        if retrieved_docs[j].metadata["relevance_score"] >= threshold:
            relevant_doc_ids.append(retrieved_docs[j].metadata["doc_number"])

    # Append the next document to the most relevant document, as relevant information may be split between two documents.
    relevant_doc_ids.append(min(relevant_doc_ids[0]+1, len(documents)-1))

    relevant_doc_ids = sorted(set(relevant_doc_ids))  # Sort doc ids
    
    # get the most relevant documents (+ next document)
    relevant_documents = [
        documents[k] for k in relevant_doc_ids
    ]  

    return relevant_documents

In [151]:
def Extract_Job_Responsibilities(llm, documents, retriever, PROFESSIONAL_EXPERIENCE):
    """Extract job responsibilities for each job in PROFESSIONAL_EXPERIENCE."""

    now = (datetime.datetime.now()).strftime("%H:%M:%S")
    print(f"**{now}** \tExtract work experience responsabilities...")

    for i in range(len(PROFESSIONAL_EXPERIENCE["Work__experience"])):
        try:
            Work_experience_i = PROFESSIONAL_EXPERIENCE["Work__experience"][i]
            print(f"\n\n{i}: {Work_experience_i['job__title']}", end=" | ")

            # 1. Query
            query = f"""Extract from the resume delimited by triple backticks \
all the duties and responsabilities of the following work experience: \
(title = '{Work_experience_i['job__title']}'"""
            if str(Work_experience_i["job__company"]) != "unknown":
                query += f" and company = '{Work_experience_i['job__company']}'"
            if str(Work_experience_i["job__start_date"]) != "unknown":
                query += f" and start date = '{Work_experience_i['job__start_date']}'"
            if str(Work_experience_i["job__end_date"]) != "unknown":
                query += f" and end date = '{Work_experience_i['job__end_date']}'"
            query += ")\n"

            # 2. For longer CVs (i.e. number of documents > 2), 
            # use the CohereRerank retriever to find the most relevant documents.
            if len(documents)>2:
                try:
                    relevant_documents = get_relevant_documents(query, documents, retriever)
                except Exception as err:
                    print(f"[ERROR] get_relevant_documents error: {err}")
                    relevant_documents = documents
            else:
                relevant_documents = documents
                
            print(f"relevant docs : {len(relevant_documents)}", end=" | ")

            # 3. Invoke the LLM            
            prompt = (
                query
                + f"""Output the duties in a json dictionary with the following keys (__duty_id__,__duty__). \
Use this format: "1":"duty","2":"another duty".
Resume:\n\n ```{relevant_documents}```"""
            )

            print(f"prompt tokens: {sum(tiktoken_tokens([prompt]))}", end=" | ")

            response = llm.invoke(prompt)
            response_content = response.content[response.content.find("{") : response.content.rfind("}") + 1]
            print(f"""response tokens: {sum(tiktoken_tokens([response_content]))}""")

            try:
                # 4. Convert the response content to json dict and update work_experience
                Work_experience_i["work__duties"] = json.loads(response_content, strict=False)  
            except Exception as e:
                print("\n[ERROR] json.loads returns error:", e, "\n\n")
                print("\n['INFO'] Parse response content...\n")                
                Work_experience_i["work__duties"] = {}
                list_duties = (
                    response_content[response_content.find("{") + 1 : response_content.rfind("}")].strip().split(",\n")
                )
                for j in range(len(list_duties)):
                    try:
                        Work_experience_i["work__duties"][f"{j+1}"] = (list_duties[j].split('":')[1].strip()[1:-1])
                    except:
                        Work_experience_i["work__duties"][f"{j+1}"] = "unknown"

        except Exception as exception:
            print(f"[ERROR] {exception}")
            Work_experience_i["work__duties"] = {}          

    return PROFESSIONAL_EXPERIENCE

In [152]:
%%time

PROFESSIONAL_EXPERIENCE = Extract_Job_Responsibilities(llm,documents,retriever,PROFESSIONAL_EXPERIENCE)
PROFESSIONAL_EXPERIENCE['Work__experience']

**21:21:06** 	Extract work experience responsabilities...


0: Intern | relevant docs : 1 | [ERROR] name 'tiktoken_tokens' is not defined


1: B.Tech in Computer Science | relevant docs : 1 | [ERROR] name 'tiktoken_tokens' is not defined
CPU times: total: 0 ns
Wall time: 13.5 ms


[{'job__title': 'Intern',
  'job__company': 'Agent Oriented Development | LNMIIT',
  'job__start_date': '2023/06',
  'job__end_date': '2023/07',
  'work__duties': {}},
 {'job__title': 'B.Tech in Computer Science',
  'job__company': 'The LNM Institute of Information Technology',
  'job__start_date': '2021/11',
  'job__end_date': 'Present',
  'work__duties': {}}]

In [155]:
def Extract_Project_Details(llm, documents, retriever, PROFESSIONAL_EXPERIENCE):
    """Extract project details for each project in PROFESSIONAL_EXPERIENCE."""

    now = (datetime.datetime.now()).strftime("%H:%M:%S")
    print(f"**{now}** \tExtract project details...")

    for i in range(len(PROFESSIONAL_EXPERIENCE["CV__Projects"])):
        try:
            project_i = PROFESSIONAL_EXPERIENCE["CV__Projects"][i]
            print(f"{i}: {project_i['project__title']}", end=" | ")

            # 1. Extract relevant documents
            query = f"""Extract from the resume (delimited by triple backticks) what is listed about the following project: \
(project title = '{project_i['project__title']}'"""
            if str(project_i["project__start_date"]) != "unknown":
                query += f" and start date = '{project_i['project__start_date']}'"
            if str(project_i["project__end_date"]) != "unknown":
                query += f" and end date = '{project_i['project__end_date']}'"
            query += ")"

            if len(documents)>2:
                try:
                    relevant_documents = get_relevant_documents(query, documents, retriever)
                except Exception as err:
                    print(f"[ERROR] get_relevant_documents error: {err}")
                    relevant_documents = documents
            else:
                relevant_documents = documents
                
            print(f"relevant docs : {len(relevant_documents)}", end=" | ")

            # 2. Invoke the LLM

            prompt = (query + f"""Format the extracted text into a string (with bullet points).
Resume:\n\n ```{relevant_documents}```""" )

            response = llm.invoke(prompt)

            response_content = response.content
            project_i["project__description"] = response_content

        except Exception as exception:
            project_i["project__description"] = "unknown"
            print(exception)

    return PROFESSIONAL_EXPERIENCE

In [156]:
%%time

PROFESSIONAL_EXPERIENCE = Extract_Project_Details(llm,documents,retriever,PROFESSIONAL_EXPERIENCE)
PROFESSIONAL_EXPERIENCE['CV__Projects']

**21:28:11** 	Extract project details...
0: Multi class Classification Using Image transformers | relevant docs : 1 | 1: Loksabha 2024 data analysis: Instagram | relevant docs : 1 | 2: Anime Face Generator | relevant docs : 1 | 3: ASL Convertor | relevant docs : 1 | CPU times: total: 93.8 ms
Wall time: 3min 18s


[{'project__title': 'Multi class Classification Using Image transformers',
  'project__start_date': '2024/03',
  'project__end_date': 'unknown',
  'project__description': 'Here is the extracted text about the project "Multi class Classification Using Image transformers" in a string with bullet points:\n\n* Project: Multi class Classification Using Image transformers\n* Start Date: March 2024\n* Details:\n\t+ An image classification model built with a transformer encoder only architecture.\n\t+ Developed a PyTorch model, achieving 75% accuracy; improved to 95% with transfer learning and fine-tuning a pre-trained vision transformer.'},
 {'project__title': 'Loksabha 2024 data analysis: Instagram',
  'project__start_date': '2024/04',
  'project__end_date': 'unknown',
  'project__description': 'Here is the extracted text about the project "Loksabha 2024 data analysis: Instagram" in a string with bullet points:\n\n* Project Title: Loksabha 2024 data analysis: Instagram\n* Start Date: April 2