In [65]:
import openai
import os
import pickle
import pandas as pd
from tqdm import tqdm
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())
import json
import hashlib
from typing import List
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance
import torch
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.output_parsers import ResponseSchema
from langchain.output_parsers import StructuredOutputParser
from langchain.globals import set_verbose, set_debug
openai.api_key  = os.getenv('OPENAI_API_KEY')
from IPython.display import display, Markdown

In [None]:
def get_device():
    if os.name == "nt":
        device = torch.device("cuda" if torch.cuda.is_available else "cpu")
    else:
        device = torch.device("mps" if torch.backends.mps.is_available else "cpu")
    return device

In [None]:
def map_skills_to_job(job,topk,model,write_output=True):

    model = SentenceTransformer(model_name,device=get_device())
    vector = model.encode(job, show_progress_bar=True)

    if model_name == "WhereIsAI/UAE-Large-V1":
        collection_name="skills_2024-02-16_UAE-Large-V1"
    if model_name == "thenlper/gte-large":
        collection_name="skills_2024-02-16_gte-large"
    

    qdrant_client = QdrantClient(os.getenv("QDRANT_URL"))
    hits = qdrant_client.search(
        collection_name=collection_name,
        query_vector=vector,
        limit=topk
        )
    top_skills = []
    for hit in hits:
        top_skills.append({"job":job} | {"skill":hit.payload['name']} | {"score":hit.score})
   
    top_skills = pd.DataFrame(top_skills)
    if write_output:
        path = os.path.join(os.getenv("DATA"),"model_outputs",f"""{topk}_skills_for_{job}_{model_name.split("/")[1]}_.csv""")
        print(path)
        top_skills.to_csv(path, index=None)

In [None]:
def map_skills(skill, top_k, model, model_name, write_output=True):

    vector = model.encode(skill, show_progress_bar=True)

    if model_name == "WhereIsAI/UAE-Large-V1":
        collection_name="skills_2024-02-16_UAE-Large-V1"
    if model_name == "thenlper/gte-large":
        collection_name="skills_2024-02-16_gte-large"
    

    qdrant_client = QdrantClient(os.getenv("QDRANT_URL"))
    hits = qdrant_client.search(
        collection_name=collection_name,
        query_vector=vector,
        limit=top_k
        )
    top_skills = []
    for hit in hits:
        #print(f"""{hit.payload['name']}, {hit.score}""")
        top_skills.append({"query_skill":skill} | {"skill":hit.payload['name']} | {"score":hit.score})
   
    return top_skills
    if write_output:
        top_skills = pd.DataFrame(top_skills)
        path = os.path.join(os.getenv("DATA"),"model_outputs",f"""{topk}_skills_for_{skill}_{model_name.split("/")[1]}_.csv""")
        print(path)
        top_skills.to_csv(path, index=None)


In [None]:
def map_jobs_to_skill(skill,topk,model,write_output=True):

    model_name = os.getenv("EMBEDDING_MODEL")
    model = SentenceTransformer(model_name,device=get_device())
    vector = model.encode(skill, show_progress_bar=True)

    if model_name == "WhereIsAI/UAE-Large-V1":
        collection_name="jobs_2024-02-16_UAE-Large-V1"
    if model_name == "thenlper/gte-large":
        collection_name="jobs_2024-02-16_gte-large"
    

    qdrant_client = QdrantClient(os.getenv("QDRANT_URL"))
    hits = qdrant_client.search(
        collection_name=collection_name,
        query_vector=vector,
        limit=topk
        )
    top_jobs = []
    for hit in hits:
        #print(f"""{hit.payload['name']}, {hit.score}""")
        top_jobs.append({"skill":skill} | {"job":hit.payload['name']} | {"score":hit.score})
   
    top_jobs = pd.DataFrame(top_jobs)
    if write_output:
        path = os.path.join(os.getenv("DATA"),"model_outputs",f"""{topk}_jobs_for_{skill}_{model_name.split("/")[1]}_.csv""")
        print(path)
        top_jobs.to_csv(path, index=None)


In [39]:
def generate_assesment(skill, scale, prompt_template_fname, llm):

    """
    Given a skill assess its importance to a job
    """

    #first fetch the top n most similar jobs for the skill
    model = SentenceTransformer(os.getenv("EMBEDDING_MODEL"),device=get_device())
    n = 100
    vector = model.encode(skill,show_progress_bar=True)
    qdrant_client = QdrantClient(os.getenv("QDRANT_URL"))
    hits = qdrant_client.search(
        collection_name="jobs_2024-02-16_UAE-Large-V1",
        query_vector=vector,
        limit=100
        )
    
    response_schemas = []
    
    question_schema =  ResponseSchema(name=f"""Assesment""",
                                          description=f"""Assesment of the importance of skill to the job
                                          on a scale of 1 to {scale}""",
                                          type=int
                                         )
    response_schemas.append(question_schema)
    answer_schema =  ResponseSchema(name=f"""Reasoning""",
                                          description=f"""Reasoning to support the assesment""",
                                          type=str
                                         )
    response_schemas.append(answer_schema)

    output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
    format_instructions = output_parser.get_format_instructions()


    prompt_path = os.path.join(os.getenv("PROMPTS"),prompt_template_fname)
    with open(prompt_path,"r") as file:
        template_string = file.read()
    prompt_template = ChatPromptTemplate.from_template(template_string)


    for hit in hits:
        
        job = hit.payload["name"]
        print(f"""assesing the importance of skill {skill} for job {job}""")

        message = prompt_template.format_messages(
                    skill=skill,
                    job=job,
                    scale=scale,        
                    format_instructions=format_instructions)

        response = llm(message)
        output_dict = output_parser.parse(response.content)
        output_dict["job"] = job
        output_dict["skill"] = skill
        output_dict["similarity"] = hit.score

        print(output_dict)

        #if "/" in job:
            #job.replace('/','_')
        try:
            path = os.path.join(os.getenv("ASSESMENTS"),f"""assesment_{skill}_{job}.json""")
            with open(path, "a",  encoding='utf-8') as file:
                json.dump(output_dict,   file, ensure_ascii=False)
        except Exception as e:
            pass
        

In [None]:
def read_csv_catalogues():
    file_path = os.path.join(os.getenv("DATA"),"Lightcast/Skills","skill_catalogue.csv")
    skill_catalogue = pd.read_csv(file_path)
    file_path = os.path.join(os.getenv("DATA"),"Lightcast/Jobs","skill_catalogue.csv")
    job_catalogue = pd.read_csv(file_path)
    return skill_catalogue, job_catalogue

In [None]:
def get_skill_to_assess():
    skill_to_assess = pd.DataFrame()
    skill_to_assess['job'] = job_catalogue
    skill_to_assess['skill'] = skill_catalogue.head(n=1).values[0][0]
    return skill_to_assess

In [40]:
def extract_skills(jobs, prompt_template_fname, llm):

    """
    Extract skills from job
    """
    #first fetch the top n most similar jobs for the skill
    
    response_schemas = []
    question_schema =  ResponseSchema(name=f"""Skills""",
                                          description=f"""The skills from the job description in a python list"""
                                          
                                         )
    response_schemas.append(question_schema)
    answer_schema =  ResponseSchema(name=f"""Benefits""",
                                          description=f"""The benefits from the job description in a python list"""
                                         )
    response_schemas.append(answer_schema)

    output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
    format_instructions = output_parser.get_format_instructions()


    prompt_path = os.path.join(os.getenv("PROMPTS"),prompt_template_fname)
    with open(prompt_path,"r") as file:
        template_string = file.read()
    prompt_template = ChatPromptTemplate.from_template(template_string)


    for job in jobs:
        
        job_description = job["description"]
        job_id = job["id"]
        print(job_id)

        print(f"""Extracting skills and benefits for job {job_id}""")

        message = prompt_template.format_messages(
                    job_description=job_description,
                    format_instructions=format_instructions)

        response = llm(message)
        output_dict = output_parser.parse(response.content)
        output_dict["id"] = job_id
        print(output_dict)
        
        try:
            path = os.path.join(os.getenv("EXTRACTED_SKILLS"),f"""extracted_skills_{job_id}.json""")
            with open(path, "a",  encoding='utf-8') as file:
                json.dump(output_dict,   file, ensure_ascii=False)
        except Exception as e:
            pass
        

In [58]:
def refine_skills(initial_skill, vector_matches, prompt_template_fname, jobid, llm):

    """
    Refine extracted skills
    """
    #first fetch the top n most similar jobs for the skill
    
    response_schemas = []
    schema =  ResponseSchema(name=f"""Refined Skills""",
                             description=f"""The refined skills from the top 10 after your review in a python list of strings"""
                             )

    response_schemas.append(schema)
    output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
    format_instructions = output_parser.get_format_instructions()
    
    

    prompt_path = os.path.join(os.getenv("PROMPTS"),prompt_template_fname)
    with open(prompt_path,"r") as file:
        template_string = file.read()
    prompt_template = ChatPromptTemplate.from_template(template_string)


    message = prompt_template.format_messages(
        client_skill=initial_skill,
        preliminary_matches=vector_matches,
        format_instructions=format_instructions
        )

    response = llm(message)
    output_dict = output_parser.parse(response.content)
    output_dict["id"] = jobid
    output_dict["query_skill"] = initial_skill
    
    return output_dict
'''
    
     try:
        path = os.path.join(os.getenv("EXTRACTED_SKILLS"),f"""extracted_skills_{job_id}.json""")
        with open(path, "a",  encoding='utf-8') as file:
           json.dump(output_dict,   file, ensure_ascii=False)
    except Exception as e:
    pass
'''

'\n    \n     try:\n        path = os.path.join(os.getenv("EXTRACTED_SKILLS"),f"""extracted_skills_{job_id}.json""")\n        with open(path, "a",  encoding=\'utf-8\') as file:\n           json.dump(output_dict,   file, ensure_ascii=False)\n    except Exception as e:\n    pass\n'

In [None]:
def load_job_descriptions():

    jd_path = os.getenv("JOB_DESCRIPTIONS")
    jd_files = os.listdir(jd_path)
    to_extract = []
    for jd_file in tqdm(jd_files):
        with open(os.path.join(jd_path,jd_file), 'r') as file:
            data = json.load(file)
            for item in data:
                to_extract.append({"id":item["id"],
                                   "positionName":item["positionName"],
                                   "description":item["description"]})
            
    
    return to_extract

## Generate skill assesments

In [None]:
model = os.getenv("LLM_URL")
#model = "gpt-3.5-turbo"
#model = "gpt-4-turbo-preview"
#llm = ChatOpenAI(temperature=temperature, model=accepted_answer_model)
scale = 10
temperature = 0.2
max_tokens=512
random_seed = 1
prompt_template_fname = "assess_skill_for_job.txt"

llm = ChatOpenAI(
        base_url=os.getenv("LLM_URL"),
        api_key="not-needed",
        temperature=temperature,
        max_tokens=max_tokens,
        model_kwargs={"seed": random_seed}
        )

generate_assesment("C++", scale, prompt_template_fname, llm)



## Extract skills

In [None]:
model = os.getenv("LLM_URL")

temperature = 0.2
max_tokens=1024
random_seed = 1
prompt_template_fname = "extract_skills_from_job_description.txt"

llm = ChatOpenAI(
        base_url=os.getenv("LLM_URL"),
        api_key="not-needed",
        temperature=temperature,
        max_tokens=max_tokens,
        model_kwargs={"seed": random_seed}
        )
jobs = load_job_descriptions()
extract_skills(jobs, prompt_template_fname, llm)

## Map Extracted skills to Lightcast with refinement

In [59]:
def map_extracted_skills_to_lc_refine():

    set_debug(False)
    
    model_name = os.getenv("EMBEDDING_MODEL")
    model = SentenceTransformer(model_name,device=get_device())
    top_k = 10
    temperature = 0.2
    max_tokens=1024
    random_seed=1
    
    prompt_template_fname = "refine_extracted_skills.txt"
    
    llm = ChatOpenAI(
        base_url=os.getenv("LLM_URL"),
        api_key="not-needed",
        temperature=temperature,
        max_tokens=max_tokens,
        model_kwargs={"seed": random_seed}
        )

    data_path = os.getenv("EXTRACTED_SKILLS")
    files = [f for f in os.listdir(data_path) if f.endswith(".json")]

    mapped_skills = []

    output_data = []
    for f in tqdm(files[:2]):
        # each file is a job with initially extracted skills
        with open(os.path.join(data_path,f), 'r') as data_file:

            refined_skills = []
            try:
                data = json.load(data_file)
                jobid = data["id"]
                print(f"""processing job {jobid}""")
                for skill in data["Skills"]:
                    # do a vector search for the top_k most similar skilles to this skill
                    top_skills = map_skills(skill, top_k, model, model_name, write_output=True)
                    top_skills = pd.DataFrame(top_skills)
                    top_skills["id"] = jobid
                    vector_matches = ','.join(list(top_skills["skill"]))
                    # refine the initial skill matches given the vector matches as context
                    refined_skills.append(refine_skills(skill, vector_matches, prompt_template_fname, jobid, llm))
                    mapped_skills.append(top_skills)
        
                #mapped_skills = pd.concat(mapped_skills)
                refined_skills = pd.DataFrame(refined_skills)
                matrix  = list(refined_skills['Refined Skills'])
                refined_skills = [item for row in matrix for item in row]
            except Exception as e:
                pass
    
        print(f"""refined skills for {jobid}""")
        output_data.append({"id":jobid,"refined_skills":refined_skills})

    output_data = pd.DataFrame(output_data)
    return output_data

In [60]:
output_data = map_extracted_skills_to_lc_refine()

No sentence-transformers model found with name /Users/vasilishatzopoulos/.cache/torch/sentence_transformers/WhereIsAI_UAE-Large-V1. Creating a new one with MEAN pooling.
  0%|          | 0/2 [00:00<?, ?it/s]

processing job 6d70e9dd538a4c45


Batches: 100%|██████████| 1/1 [00:00<00:00, 17.68it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  1.48it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  1.25it/s]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.04s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00,  1.45it/s]
 50%|█████     | 1/2 [00:32<00:32, 32.18s/it]

refined skills for 6d70e9dd538a4c45
processing job 0a07b81587d36976


Batches: 100%|██████████| 1/1 [00:00<00:00,  1.31it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  1.44it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  1.58it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  1.29it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  1.33it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  2.04it/s]
100%|██████████| 2/2 [01:04<00:00, 32.20s/it]

refined skills for 0a07b81587d36976





In [64]:
list(output_data[output_data['id']=='0a07b81587d36976']['refined_skills'])

[['Data Science',
  'Machine Learning',
  'Big Data',
  'Advanced Analytics',
  'Microsoft Office',
  'Telecommuting']]