In [1]:
import openai
import os
import pickle
import pandas as pd
from tqdm import tqdm
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file
import json

from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.output_parsers import ResponseSchema
from langchain.output_parsers import StructuredOutputParser
openai.api_key  = os.getenv('OPENAI_API_KEY')

In [2]:
def get_completion(prompt, model="gpt-3.5-turbo"):
    """
    Send ``prompt`` as a single user message and return the text of the reply.

    The request is issued with ``temperature=0`` and
    ``response_format={"type": "json_object"}``.

    Args:
        prompt: Text used as the ``content`` of the one user message.
        model: Model name forwarded to ``openai.ChatCompletion.create``.

    Returns:
        The ``"content"`` entry of the first choice's message.
    """
    request = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "response_format": {"type": "json_object"},
        "temperature": 0,  # degree of randomness of the model's output
    }
    reply = openai.ChatCompletion.create(**request)
    first_choice = reply.choices[0]
    return first_choice.message["content"]

In [3]:
def get_completion_from_messages(messages, model="gpt-3.5-turbo", temperature=0):
    """
    Send a prepared list of chat messages and return the text of the reply.

    Args:
        messages: Chat messages forwarded unchanged to ``openai.ChatCompletion.create``.
        model: Model name forwarded to the API call.
        temperature: Degree of randomness of the model's output.

    Returns:
        The ``"content"`` entry of the first choice's message.
    """
    request = dict(model=model, messages=messages, temperature=temperature)
    reply = openai.ChatCompletion.create(**request)
    first_message = reply.choices[0].message
    return first_message["content"]

In [4]:
def get_completion_and_token_count(messages,
                                   model="gpt-3.5-turbo",
                                   temperature=0,
                                   max_tokens=500):
    """
    Send chat messages and return the reply text together with its token usage.

    Args:
        messages: Chat messages forwarded unchanged to ``openai.ChatCompletion.create``.
        model: Model name forwarded to the API call.
        temperature: Sampling temperature forwarded to the API call.
        max_tokens: ``max_tokens`` value forwarded to the API call.

    Returns:
        A ``(content, token_dict)`` tuple: ``content`` is the ``"content"`` entry of
        the first choice's message, and ``token_dict`` holds the
        ``prompt_tokens``, ``completion_tokens`` and ``total_tokens`` values read
        from ``response['usage']``.
    """
    request = dict(
        model=model,
        messages=messages,
        temperature=temperature,
        max_tokens=max_tokens,
    )
    reply = openai.ChatCompletion.create(**request)

    content = reply.choices[0].message["content"]

    # Keep only the three usage counters, in a plain dict.
    usage = reply['usage']
    counter_names = ('prompt_tokens', 'completion_tokens', 'total_tokens')
    token_dict = {counter: usage[counter] for counter in counter_names}

    return content, token_dict

In [6]:
def job_descriptions_to_jsonl(directory="synthetic_job_descriptions"):
    """
    Reads in a pickle for each job description and writes them back as json lines in a single file.

    Args:
        directory: Folder that holds the ``*.pkl`` files; the JSON Lines file is
            written into the same folder.

    Returns:
        Path of the JSON Lines file. Its name starts with the number of pickle
        files that were found, which can exceed the number of records written
        when some pickles are unreadable.

    Empty or corrupted pickles are skipped and counted rather than aborting the run.
    """
    # Match on the extension only (so e.g. "x.pkl.bak" is ignored) and sort,
    # because os.listdir returns names in arbitrary order.
    available_job_descriptions = sorted(
        name for name in os.listdir(directory) if name.endswith(".pkl")
    )
    num_jds = len(available_job_descriptions)

    jds = []
    corrupt = 0
    for name in available_job_descriptions:
        pickle_path = os.path.join(directory, name)
        try:
            with open(pickle_path, 'rb') as file:
                # NOTE(security): pickle.load can execute arbitrary code; only
                # point this function at pickles produced by this notebook.
                jds.append(pickle.load(file))
        except (EOFError, pickle.UnpicklingError):
            # EOFError: empty/truncated file; UnpicklingError: not a valid pickle.
            print("Empty or corrupted pickle file")
            corrupt += 1

    print(corrupt)
    # Path to your output file
    # the name of the json lines object starts with the number of job descriptions available
    fname = f"""{num_jds}_job_descriptions.jsonl"""
    path = os.path.join(directory, fname)

    # Write to the file, one JSON document per line
    with open(path, 'w') as file:
        for jd in jds:
            json.dump(jd, file)
            file.write('\n')
    # Report what was actually written, which excludes skipped pickles.
    print(f"""saved {len(jds)} to path: {path}""")
    return path


In [28]:
def job_descriptions_to_DataFrame(path):
    """
    Load a JSON Lines file of job descriptions into a DataFrame.

    Args:
        path: Location of a text file holding one JSON document per line.

    Returns:
        A ``pandas.DataFrame`` built from the parsed lines, one row per
        non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # List to hold the dictionaries
    data = []
    # Read as UTF-8 explicitly so the result does not depend on the platform's
    # default encoding.
    with open(path, 'r', encoding='utf-8') as file:
        for line in tqdm(file):
            # Blank or whitespace-only lines (e.g. a stray trailing newline)
            # carry no record and would make json.loads fail.
            if not line.strip():
                continue
            # Deserialize each line to a Python dictionary
            data.append(json.loads(line))

    return pd.DataFrame(data)


In [8]:
def _join_section(value):
    """Collapse one section value (a string or an iterable of strings) into one string."""
    try:
        # NOTE(review): list items are concatenated with no separator, as before;
        # confirm whether a newline between items would be preferable.
        return ''.join(value)
    except TypeError:
        # Not an iterable of strings (e.g. None/NaN): leave it untouched so the
        # row is reported as corrupt later instead of aborting the whole run.
        return value


def _compose_job_description(row):
    """Build the single-string description for one row; raises TypeError if a part is not a str."""
    labelled_sections = [
        ':'.join([section, row[section]])
        for section in ("Key Responsibilities", "Skills", "Qualifications")
    ]
    return ' '.join([row["Role Overview"]] + labelled_sections)


def post_process_job_descriptions(data, jobs, output_dir="synthetic_job_descriptions"):
    """
    Assemble a "Job Description" text column, attach posting counts and export to Excel.

    Args:
        data: DataFrame with the columns "Job Title", "Role Overview",
            "Key Responsibilities", "Skills" and "Qualifications". It is modified
            in place: the three section columns are collapsed to strings and the
            "Job Description" and "TITLE_NAME" columns are added.
        jobs: DataFrame with the columns "TITLE_NAME", "COUNT(*)" and
            "Posting Count Rank"; left-merged onto ``data`` by "TITLE_NAME".
        output_dir: Folder the ``<row count>_job_descriptions.xlsx`` file is written to.

    Returns:
        None. The merged frame, sorted by "Posting Count Rank", is written to disk.

    Rows whose overview or sections are not strings get the description
    "corrupt" and are counted in the printed summary.
    """
    for column in ('Key Responsibilities', 'Skills', 'Qualifications'):
        data[column] = data[column].apply(_join_section)

    job_descriptions = []
    corrupt = 0
    for _, row in data.iterrows():
        try:
            # The whole assembly sits inside the guard so that a non-string
            # section is counted as corrupt too, not only a bad overview.
            job_descriptions.append(_compose_job_description(row))
        except TypeError:
            corrupt += 1
            job_descriptions.append("corrupt")

    data["Job Description"] = job_descriptions
    data['TITLE_NAME'] = data['Job Title']
    data = data.drop(columns=['Job Title'])

    data = data.merge(jobs, on="TITLE_NAME", how='left')
    data['Posting Count'] = data['COUNT(*)']
    data = data.drop(columns=['COUNT(*)'])

    print(f"""num corrupt {corrupt}""")

    fname = f"""{data.shape[0]}_job_descriptions.xlsx"""
    path = os.path.join(output_dir, fname)
    print(f"""writting post processed job descriptions to {path}""")
    data.sort_values(by='Posting Count Rank').to_excel(path, index=False)
    

In [9]:
def generate_job_descriptions_LC(jobs, examples, model, start=1,stop=1):

    """
    Generate one structured job description per row of ``jobs.iloc[start:stop]``.

    A few-shot prompt is built from ``examples`` plus the row's "TITLE_NAME", the
    chat model's reply is parsed into a dict with the keys "Job Title",
    "Role Overview", "Key Responsibilities", "Skills" and "Qualifications", and
    that dict is pickled to
    ``synthetic_job_descriptions/<TITLE>_GTP4_Job_Description.pkl``.

    Args:
        jobs: DataFrame with at least the columns "TITLE_NAME" and "TITLE".
        examples: Example job descriptions inserted into the prompt.
        model: Model name passed to ``ChatOpenAI``.
        start: First row position (inclusive) of the slice to process.
        stop: Row position (exclusive) at which to stop.
    """

    # (field name, description) pairs describing the structured reply we ask for
    schema_fields = [
        ("Job Title", "The Job Title provided."),
        ("Role Overview", "The role overview."),
        ("Key Responsibilities", "The role key responsibilities."),
        ("Skills", "The role skills."),
        ("Qualifications", "The role qualifications."),
    ]
    response_schemas = [
        ResponseSchema(name=field, description=text) for field, text in schema_fields
    ]
    parser = StructuredOutputParser.from_response_schemas(response_schemas)
    format_instructions = parser.get_format_instructions()

    llm = ChatOpenAI(temperature=0.0, model=model)

    template_string = """You are an expert in writing informative, and rich job descriptions like 
        in the examples below where each job description start with the string 'Job Title:' \n\n\
        {jd_examples} \n\n. Using the same style and length as the examples provided please write a
        job description for the job in triple backticks ```{job}```.
        {format_instructions}
        """

    prompt_template = ChatPromptTemplate.from_template(template_string)

    selected_jobs = jobs.iloc[start:stop]
    for _, job in tqdm(selected_jobs.iterrows()):

        messages = prompt_template.format_messages(
            jd_examples=examples,
            job=job["TITLE_NAME"],
            format_instructions=format_instructions,
        )

        reply = llm(messages)
        parsed = parser.parse(reply.content)

        # One pickle per job; the file name is derived from the row's "TITLE" value.
        fname = f"""{job["TITLE"]}_GTP4_Job_Description.pkl"""
        with open(os.path.join("synthetic_job_descriptions", fname), 'wb') as file:
            pickle.dump(parsed, file)
    


### Load Lightcast TITLE_NAME database and example job descriptions

In [10]:
# Example job descriptions used as few-shot examples in the prompt
with open("Job_Title_ Occupational_Therapist.txt",'r') as f:
    jd1 = f.read()
with open("Job_Title_Project Manager.txt",'r') as f:
    jd2 = f.read()
examples = jd1 + '\n\n\n' + jd2

# Lightcast job titles with their posting counts, minus the "Unclassified" rows
jobs = pd.read_csv('Lightcast_TITLE-TITLE_NAME-COUNT.csv')
jobs = jobs.loc[jobs['TITLE_NAME'] != "Unclassified"]

# Rank titles by posting count; the highest count gets the lowest rank value
jobs = jobs.assign(**{'Posting Count Rank': jobs['COUNT(*)'].rank(ascending=False)})

# Optional (disabled): restrict to the unmapped validation sample
#jobs_subset = pd.read_excel('TITLE_NAME_validation.xlsx')
#print(jobs_subset.columns)
#jobs = jobs[jobs['TITLE_NAME'].isin(list(jobs_subset['TITLE_NAME_ADDITIONAL']))]
jobs.shape

(73267, 4)

## Fetch job descriptions

In [86]:
# Model used for this run. The earlier `model = "gpt-3.5-turbo"` assignment was
# overwritten immediately and has been removed; swap the value here to change models.
model = "gpt-4-turbo-preview"
# Positional window of `jobs` handled by this run, i.e. jobs.iloc[start:stop].
start = 1400
stop = 1500
generate_job_descriptions_LC(jobs, examples, model, start=start, stop=stop)

100it [44:08, 26.48s/it]


## Read all pickles, write them to a single JSONL file, and load that file as a DataFrame

In [87]:
# Gather the pickled job descriptions from synthetic_job_descriptions/ into one
# JSON Lines file; `path` is the location of that file.
path = job_descriptions_to_jsonl()

0
saved 1500 to path: synthetic_job_descriptions/1500_job_descriptions.jsonl


In [88]:
# Parse the JSON Lines file into a DataFrame (one row per line) and preview the first rows.
data = job_descriptions_to_DataFrame(path)
data.head()

1500it [00:00, 65986.22it/s]


Unnamed: 0,Job Title,Role Overview,Key Responsibilities,Skills,Qualifications
0,Stocker,The Stocker plays a crucial role in retail and...,- Receive and unload inventory from delivery t...,- Strong organizational and time management sk...,- High school diploma or equivalent.\n- Previo...
1,Academic Advisor,The Academic Advisor is essential in guiding s...,- Evaluate students’ educational and career go...,- Strong interpersonal and communication skill...,"- Bachelor’s degree in Education, Counseling, ..."
2,Labor and Delivery Travel Registered Nurse,The Labor and Delivery Travel Registered Nurse...,- Monitor patient's vital signs and contractio...,- Strong clinical skills with a focus on labor...,- Bachelor of Science in Nursing (BSN) or Asso...
3,Construction Superintendent,The Construction Superintendent is a key leade...,- Oversee and direct construction projects fro...,- Proven leadership and human resources manage...,- Proven experience as a construction superint...
4,PHP Developer,The PHP Developer is responsible for creating ...,"- Write clean, well-designed code and produce ...",- Strong knowledge of PHP web frameworks such ...,"- Bachelor's degree in Computer Science, Engin..."


## Post-process job descriptions

In [89]:
# Builds the "Job Description" column, left-merges `jobs` on TITLE_NAME and writes the
# result to an .xlsx file. Note: the call also adds/overwrites columns on `data` itself.
post_process_job_descriptions(data,jobs)

num corrupt 0
writting post processed job descriptions to synthetic_job_descriptions/1500_job_descriptions.xlsx
