# Categorize raw job titles into pre-defined job functions and job levels
Tutorial: https://github.com/gkamradt/langchain-tutorials/blob/main/data_generation/Clean%20and%20Standardize%20Data.ipynb

# Load libraries

In [1]:
from langchain.output_parsers import StructuredOutputParser, ResponseSchema
from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
import pandas as pd
import json

In [2]:
import os
import openai
from dotenv import load_dotenv, find_dotenv
# Read the local .env file (if one is found) before looking up the key.
_ = load_dotenv(find_dotenv())

# Hand the key to the openai client; the lookup yields None when
# OPENAI_API_KEY is not set in the environment.
openai.api_key = os.environ.get('OPENAI_API_KEY')

# Define Model and output format

In [3]:
# temperature=0: we want clean, repeatable classifications rather than creative text.
chat_model = ChatOpenAI(
    max_tokens=1000,
    temperature=0,
)

In [4]:
# Describe the response structure: one ResponseSchema per field the model must
# return for each job title. Each pair below is (field name, field description).
response_schemas = [
    ResponseSchema(name=field_name, description=field_description)
    for field_name, field_description in (
        ("input_job titles", "This is the raw job titles from the user"),
        ("job functions", "This is the job function you feel is most closely matched to the users input"),
        # ("job level", "This is the job level you feel is most closely matched to the users input"),
        ("match_score", "A score 0-100 of how close you think the match is between user input and your match"),
    )
]

# Parser built from the schemas above; it also produces the format instructions
# that get embedded in the prompt.
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

In [5]:
# Render the parser's formatting instructions once, keep them for the prompt,
# and print the same text for inspection.
format_instructions = output_parser.get_format_instructions()
print(format_instructions)

The output should be a markdown code snippet formatted in the following schema, including the leading and trailing "\`\`\`json" and "\`\`\`":

```json
{
	"input_job titles": string  // This is the raw job titles from the user
	"job functions": string  // This is the job function you feel is most closely matched to the users input
	"match_score": string  // A score 0-100 of how close you think the match is between user input and your match
}
```


In [7]:
# Prompt text sent to the chat model. It has three placeholders:
#   {format_instructions} - pre-filled below through partial_variables
#   {job_titles}          - ";"-separated raw titles, supplied when the prompt is formatted
#   {job_functions}       - the list of standardized functions, supplied when the prompt is formatted
# NOTE(review): the text below contains the typo "cloased"; it is left untouched
# here because editing the string changes what the model receives.
template = """
You will be given a list of job titles from a user, delineated by ";"
For each job title, find the best corresponding match on the list of standardized job functions.
The closest match will be the one with the closest semantic meaning. Not just string similarity.

{format_instructions}

Wrap your final output with cloased and open brackets (a list of json objects). Nothing else. 

job_titles INPUT: {job_titles}

job_functions:{job_functions}

YOUR RESPONSE:

"""

# Single-message chat prompt: the whole template is sent as one human message.
prompt = ChatPromptTemplate(
    messages=[
        HumanMessagePromptTemplate.from_template(template)  
    ],
    # Variables the caller must provide at format time.
    input_variables=["job_titles", "job_functions"],
    # Bound once here so callers do not have to pass the format instructions.
    partial_variables={"format_instructions": format_instructions}
)

# Define Inputs

In [10]:
# Load the standardized job functions and keep only the rows whose parent
# function is "Technical"; their subfunctions become the candidate labels.
df = pd.read_csv('job_functions.csv')
# Alternative (all top-level functions): job_functions = ", ".join(df['functions'].values)
df_technical = df[df['func_parent'] == "Technical"]
job_functions = ", ".join(df_technical['subfunctions'].tolist())
job_functions

'analytics-unknown, bi / business information, data science, marketing, product, design-unknown, ux/product, visual/graphic, ai/ml, backend, data engineering, devops/techops, engineering-unknown, frontend, full-stack, hardware, mobile, qa/testing, security & privacy, product-unknown'

In [8]:
# Load the standardized job levels and flatten them into one comma-separated string.
df = pd.read_csv('job_levels.csv')
job_levels = ", ".join(df['level'].tolist())
job_levels

'Director, line, CXO, VP, Manager'

In [30]:
# Raw job titles to categorize.
# Other input files tried previously:
#   df = pd.read_csv('job_titles_uncoded.csv')
#   df = pd.read_csv('job_titles_coded.csv')
df = pd.read_csv('job_titles_uncat.csv')

# Take the rows at positions 50..68 and join them with ";", the delimiter the
# prompt tells the model to expect.
job_titles = ";".join(df['job_title'].iloc[50:69])
job_titles

'iphone and flash game programmer;product development specialist;senior engineer manager;principal engineer;head of engineer;senior data engineer;director engineer   user interface and api;engineer lead;head of data analytics;chief product officer;senior engineer;director of product and operations;co founder and chief technology officer;employee #3, director of engineer;konsulttjanster med tech startup;technical designer and designer, product development;engineer manager, android;head of product;product designer, creative director'

In [31]:
user_input = job_titles

# Fill the two runtime placeholders; the format instructions were bound when
# the prompt was built.
_input = prompt.format_prompt(job_functions=job_functions, job_titles=user_input)

# Preview exactly what will be sent to the model.
print(
    f"There are {len(_input.messages)} message(s)",
    f"Type: {type(_input.messages[0])}",
    "---------------------------",
    _input.messages[0].content,
    sep="\n",
)

There are 1 message(s)
Type: <class 'langchain.schema.HumanMessage'>
---------------------------

You will be given a list of job titles from a user, delineated by ";"
For each job title, find the best corresponding match on the list of standardized job functions.
The closest match will be the one with the closest semantic meaning. Not just string similarity.

The output should be a markdown code snippet formatted in the following schema, including the leading and trailing "\`\`\`json" and "\`\`\`":

```json
{
	"input_job titles": string  // This is the raw job titles from the user
	"job functions": string  // This is the job function you feel is most closely matched to the users input
	"match_score": string  // A score 0-100 of how close you think the match is between user input and your match
}
```

Wrap your final output with cloased and open brackets (a list of json objects). Nothing else. 

job_titles INPUT: iphone and flash game programmer;product development specialist;senior en

## Run model 


In [33]:
# Send the formatted prompt to the chat model, then show the reply's type and text.
output = chat_model(_input.to_messages())
print(type(output), output.content, sep="\n")

KeyboardInterrupt: 

In [34]:
def _extract_json_payload(text: str) -> str:
    """Return the JSON text carried in a chat reply, without any markdown fence.

    Handles three reply shapes:
      * fenced with a "json"-tagged code fence -> the text inside that fence
      * fenced with an untagged code fence     -> the text inside that fence
      * no fence at all                        -> the reply itself

    A fence that is opened but never closed (for example a reply cut short)
    yields everything after the opening fence. Surrounding whitespace is
    stripped in every case.

    Args:
        text: Raw reply content from the model.

    Returns:
        The candidate JSON string, ready to be handed to ``json.loads``.
    """
    fence = "```"
    tagged_fence = fence + "json"

    # Prefer the tagged fence so replies that already worked keep producing
    # exactly the same result as before.
    if tagged_fence in text:
        remainder = text.partition(tagged_fence)[2]
    elif fence in text:
        # Untagged fence: previously this fell through and left the fence
        # markers inside the string, which json.loads cannot parse.
        remainder = text.partition(fence)[2]
    else:
        return text.strip()

    # Keep only what precedes the closing fence (or everything, if there is none).
    return remainder.partition(fence)[0].strip()


json_string = _extract_json_payload(output.content)

In [35]:
# Show the extracted JSON text so it can be checked by eye before parsing.
print(json_string)

[
	{
		"input_job titles": "iphone and flash game programmer",
		"job functions": "mobile",
		"match_score": "100"
	},
	{
		"input_job titles": "product development specialist",
		"job functions": "product",
		"match_score": "100"
	},
	{
		"input_job titles": "senior engineer manager",
		"job functions": "engineering-unknown",
		"match_score": "50"
	},
	{
		"input_job titles": "principal engineer",
		"job functions": "engineering-unknown",
		"match_score": "50"
	},
	{
		"input_job titles": "head of engineer",
		"job functions": "engineering-unknown",
		"match_score": "50"
	},
	{
		"input_job titles": "senior data engineer",
		"job functions": "data engineering",
		"match_score": "100"
	},
	{
		"input_job titles": "director engineer user interface and api",
		"job functions": "engineering-unknown",
		"match_score": "50"
	},
	{
		"input_job titles": "engineer lead",
		"job functions": "engineering-unknown",
		"match_score": "50"
	},
	{
		"input_job titles": "head of data analytics",
		"j

In [36]:
# Ideally output_parser.parse(output.content) would be used here, but it does
# not work in all cases, so the extracted text is parsed with json.loads instead.
structured_data = json.loads(json_string)
# Bare expression: display the parsed records as the cell output.
structured_data

[{'input_job titles': 'iphone and flash game programmer',
  'job functions': 'mobile',
  'match_score': '100'},
 {'input_job titles': 'product development specialist',
  'job functions': 'product',
  'match_score': '100'},
 {'input_job titles': 'senior engineer manager',
  'job functions': 'engineering-unknown',
  'match_score': '50'},
 {'input_job titles': 'principal engineer',
  'job functions': 'engineering-unknown',
  'match_score': '50'},
 {'input_job titles': 'head of engineer',
  'job functions': 'engineering-unknown',
  'match_score': '50'},
 {'input_job titles': 'senior data engineer',
  'job functions': 'data engineering',
  'match_score': '100'},
 {'input_job titles': 'director engineer user interface and api',
  'job functions': 'engineering-unknown',
  'match_score': '50'},
 {'input_job titles': 'engineer lead',
  'job functions': 'engineering-unknown',
  'match_score': '50'},
 {'input_job titles': 'head of data analytics',
  'job functions': 'analytics-unknown',
  'match_

In [37]:
# Tabulate the parsed records, one row per record.
# Earlier variant that placed the results next to a 'functions' column of df:
# pd.concat([pd.DataFrame(structured_data), df.loc[20:29, 'functions']], axis=1)
pd.DataFrame(structured_data)

Unnamed: 0,input_job titles,job functions,match_score
0,iphone and flash game programmer,mobile,100
1,product development specialist,product,100
2,senior engineer manager,engineering-unknown,50
3,principal engineer,engineering-unknown,50
4,head of engineer,engineering-unknown,50
5,senior data engineer,data engineering,100
6,director engineer user interface and api,engineering-unknown,50
7,engineer lead,engineering-unknown,50
8,head of data analytics,analytics-unknown,100
9,chief product officer,product,100
