# Categorize raw job titles into pre-defined job functions and job levels

# Load libraries

In [2]:
from langchain.output_parsers import StructuredOutputParser, ResponseSchema
from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
import pandas as pd
import json

In [5]:
import os
import openai
from dotenv import load_dotenv, find_dotenv
# Locate the local .env file and load its variables into the process environment
dotenv_path = find_dotenv()
_ = load_dotenv(dotenv_path)

# Hand the key to the openai client; this is None when OPENAI_API_KEY is not set
openai.api_key = os.environ.get('OPENAI_API_KEY')

# Define Model and output format

In [7]:
# Chat model used for the categorization.
# temperature=0 keeps the answers clean rather than creative;
# max_tokens=1000 is the limit passed for the length of the reply.
chat_model = ChatOpenAI(
    temperature=0,
    max_tokens=1000,
)

In [8]:
# Fields the model must return for every job title, as (field name, description) pairs.
# The descriptions are shown to the model as part of the format instructions.
schema_fields = [
    ("input_job titles", "This is the raw job titles from the user"),
    ("job functions", "This is the job function you feel is most closely matched to the users input"),
    ("job level", "This is the job level you feel is most closely matched to the users input"),
    ("match_score", "A score 0-100 of how close you think the match is between user input and your match"),
]

# Define the structured response as a list of schemas, one per field
response_schemas = [
    ResponseSchema(name=field_name, description=field_description)
    for field_name, field_description in schema_fields
]

# Parser built from the schemas above; it also produces the format instructions for the prompt
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

In [9]:
# Render the formatting instructions once, keep them for the prompt, and show them for review
format_instructions = output_parser.get_format_instructions()
print(format_instructions)

The output should be a markdown code snippet formatted in the following schema, including the leading and trailing "\`\`\`json" and "\`\`\`":

```json
{
	"input_job titles": string  // This is the raw job titles from the user
	"job functions": string  // This is the job function you feel is most closely matched to the users input
	"job level": string  // This is the job level you feel is most closely matched to the users input
	"match_score": string  // A score 0-100 of how close you think the match is between user input and your match
}
```


In [21]:
# Prompt text sent to the model. Placeholders in braces are filled in later:
# {format_instructions} is pre-filled from the output parser, while {job_titles},
# {job_functions} and {job_levels} are supplied when the prompt is formatted.
template = """
You will be given a series of job titles from a user.
Find the best corresponding match on the list of standardized names.
The closest match will be the one with the closest semantic meaning. Not just string similarity.

{format_instructions}

Wrap your final output with cloased and open brackets (a list of json objects)

job_titles INPUT: {job_titles}

job_functions:{job_functions}
job_levels: {job_levels}

YOUR RESPONSE:

"""

# The whole prompt is a single human message built from the template above
human_message = HumanMessagePromptTemplate.from_template(template)

prompt = ChatPromptTemplate(
    messages=[human_message],
    input_variables=["job_titles", "job_functions", "job_levels"],
    partial_variables={"format_instructions": format_instructions},
)

# Define Inputs

In [11]:
# Standardized job functions, flattened into one comma-separated string for the prompt
df = pd.read_csv('job_functions.csv')
job_functions = ", ".join(df['functions'].tolist())
job_functions

'unknown, finance, engineering, customer success, retail, operations, design, sales, marketing, administration, support, product, general management, HR, commercial management, analytics, recruiting, legal'

In [14]:
# Standardized job levels, flattened into one comma-separated string for the prompt
df = pd.read_csv('job_levels.csv')
job_levels = ", ".join(df['level'].tolist())
job_levels

'Director, line, CXO, VP, Manager'

In [18]:
# Input: raw job titles to categorize.
# Only the first 10 rows are used, joined into one comma-separated string for the prompt.
df = pd.read_csv('job_titles.csv')
job_titles = ", ".join(df['job_title'].iloc[:10].tolist())
job_titles

'non executive director, chief scientist, test team lead, director of system test, safety manager, photographe   infographiste, program manager, manаger, supervisor operacional, senior delivery manager'

In [22]:
# Fill the remaining template slots with the titles to classify and the allowed labels
user_input = job_titles
_input = prompt.format_prompt(
    job_titles=user_input,
    job_functions=job_functions,
    job_levels=job_levels,
)

# Inspect what will be sent to the model: message count, message type, and the full text
rendered_messages = _input.messages
first_message = rendered_messages[0]
print(f"There are {len(rendered_messages)} message(s)")
print(f"Type: {type(first_message)}")
print("---------------------------")
print(first_message.content)

There are 1 message(s)
Type: <class 'langchain.schema.HumanMessage'>
---------------------------

You will be given a series of job titles from a user.
Find the best corresponding match on the list of standardized names.
The closest match will be the one with the closest semantic meaning. Not just string similarity.

The output should be a markdown code snippet formatted in the following schema, including the leading and trailing "\`\`\`json" and "\`\`\`":

```json
{
	"input_job titles": string  // This is the raw job titles from the user
	"job functions": string  // This is the job function you feel is most closely matched to the users input
	"job level": string  // This is the job level you feel is most closely matched to the users input
	"match_score": string  // A score 0-100 of how close you think the match is between user input and your match
}
```

Wrap your final output with cloased and open brackets (a list of json objects)

job_titles INPUT: non executive director, chief scie

## Run model 


In [23]:
# Send the formatted prompt to the chat model, then show the reply's type and text
chat_messages = _input.to_messages()
output = chat_model(chat_messages)
print(type(output))
print(output.content)

<class 'langchain.schema.AIMessage'>
```json
[
	{
		"input_job titles": "non executive director",
		"job functions": "general management",
		"job level": "Director",
		"match_score": "90"
	},
	{
		"input_job titles": "chief scientist",
		"job functions": "engineering",
		"job level": "CXO",
		"match_score": "95"
	},
	{
		"input_job titles": "test team lead",
		"job functions": "engineering",
		"job level": "line",
		"match_score": "85"
	},
	{
		"input_job titles": "director of system test",
		"job functions": "engineering",
		"job level": "Director",
		"match_score": "95"
	},
	{
		"input_job titles": "safety manager",
		"job functions": "operations",
		"job level": "Manager",
		"match_score": "80"
	},
	{
		"input_job titles": "photographe infographiste",
		"job functions": "design",
		"job level": "line",
		"match_score": "70"
	},
	{
		"input_job titles": "program manager",
		"job functions": "operations",
		"job level": "Manager",
		"match_score": "80"
	},
	{
		"input_job titles": "ma

In [24]:
# Extract the JSON payload from the model reply.
# The model is asked to wrap its answer in a markdown block that opens with "```json"
# and closes with "```". Everything between the two fences is the payload.
raw_reply = output.content
if "```json" in raw_reply:
    # Keep what follows the opening fence ...
    after_opening_fence = raw_reply.split("```json", 1)[1]
    # ... and cut at the closing fence. Splitting only on "```json" would leave the
    # trailing "```" attached, and the result would not be valid JSON.
    # If the reply has no closing fence, the whole remainder is kept.
    json_string = after_opening_fence.split("```", 1)[0].strip()
else:
    # No fenced block: use the reply unchanged
    json_string = raw_reply

In [28]:
# Show the unmodified model reply again for comparison with the extracted JSON
reply_text = output.content
print(reply_text)

```json
[
	{
		"input_job titles": "non executive director",
		"job functions": "general management",
		"job level": "Director",
		"match_score": "90"
	},
	{
		"input_job titles": "chief scientist",
		"job functions": "engineering",
		"job level": "CXO",
		"match_score": "95"
	},
	{
		"input_job titles": "test team lead",
		"job functions": "engineering",
		"job level": "line",
		"match_score": "85"
	},
	{
		"input_job titles": "director of system test",
		"job functions": "engineering",
		"job level": "Director",
		"match_score": "95"
	},
	{
		"input_job titles": "safety manager",
		"job functions": "operations",
		"job level": "Manager",
		"match_score": "80"
	},
	{
		"input_job titles": "photographe infographiste",
		"job functions": "design",
		"job level": "line",
		"match_score": "70"
	},
	{
		"input_job titles": "program manager",
		"job functions": "operations",
		"job level": "Manager",
		"match_score": "80"
	},
	{
		"input_job titles": "manаger",
		"job functions": "operation