In [29]:
import os
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import textwrap
import re
from bs4 import BeautifulSoup
from openai import OpenAI
import json

# Data inspection and preprocessing

### A look at the raw dataset

In [30]:
df = pd.read_csv("../Engineer_20230826.csv")
df.head()

Unnamed: 0,RequisitionID,OrigJobTitle,JobTitle,JobDescription
0,,Licensed Stationary Engineer,ENGINEER (all other),Licensed Stationary Engineer \n\n Froedtert So...
1,224907.0,Guidance Navigation and Control (GN&C) Enginee...,ENGINEER (all other),**The Boeing Company** is in search of a **L...
2,331804.0,"Propulsion Engineer - Associate, Mid-Level and...",ENGINEER (all other),"**Job Description**\n\nAt Boeing, we innovate ..."
3,336462.0,Senior Process Controls Engineer,ENGINEER (all other),"**Job Description**\n\nAt Boeing, we innovate ..."
4,338951.0,RF/Microwave Engineer (Level 2 or 3),ENGINEER (all other),"**Job Description**\n\nAt Boeing, we innovate ..."


### Features of dataset

In [31]:
print(f"Columns in data set: {df.columns.tolist()}")

Columns in data set: ['RequisitionID', 'OrigJobTitle', 'JobTitle', 'JobDescription']


### A sample job description

In [32]:
# inspect data
print(f"The elements in the JobDescription column are of data type: {df['JobDescription'].dtype}")

def print_job_desc(col: int):
    job_desc = df.iat[col, 3]
    print(f"Job description at col {col}:\n {textwrap.fill(job_desc, width=175)}")

print_job_desc(col=1000)

The elements in the JobDescription column are of data type: object
Job description at col 1000:
 ATS Company:  PA Solutions \n\n Requisition ID:  10392 \n\n Location:  \n\n Greenville, SC, US, 29615 Lewis Center, OH, US, 43035-9445 Guaynabo, PR, US, 00968-8058 Concord,
NH, US, 3811 Indianapolis, IN, US, 46250 Raleigh, NC, US, 27603 \n\n Date:  Jul 17, 2023 \n\n Automation Engineer \n\nJob Description\n\nProcess Automation Solutions is one of
the leading manufacturer-independent suppliers of complete automation solutions for the process and manufacturing industries. The company currently employs more than1,500
people with a global presence in Europe, the Americas, and Asia. Our operational activities focus on the design of process control systems and their vertical integration into
the overall business process. We offer complete services from the concept to commissioning, from the field level through process control level to corporate management level.
Process Automation Solutions is a 

### Cleaning the dataset

In [33]:
# remove html tags, markdown, string literals, and whitespace from all job descriptions
def clean_text(text):
    if not isinstance(text, str):
        return ""
    # 1. Remove HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()
    # 2. Remove Markdown bold/italic markers (**text**, *text*, __text__)
    text = re.sub(r"\*{1,2}|_{1,2}", "", text)
    # 3. Replace literal '\n' and '\t' with space
    text = text.replace("\\n", " ").replace("\\t", " ")
    # 4. Replace any remaining whitespace (spaces, tabs, newlines) with single space
    text = re.sub(r"\s+", " ", text)
    # 5. Strip leading/trailing whitespace
    text = text.strip()
    return text

df['JobDescription'] = df['JobDescription'].apply(clean_text)
print_job_desc(1000)

Job description at col 1000:
 ATS Company: PA Solutions Requisition ID: 10392 Location: Greenville, SC, US, 29615 Lewis Center, OH, US, 43035-9445 Guaynabo, PR, US, 00968-8058 Concord, NH, US, 3811
Indianapolis, IN, US, 46250 Raleigh, NC, US, 27603 Date: Jul 17, 2023 Automation Engineer Job Description Process Automation Solutions is one of the leading manufacturer-
independent suppliers of complete automation solutions for the process and manufacturing industries. The company currently employs more than1,500 people with a global presence
in Europe, the Americas, and Asia. Our operational activities focus on the design of process control systems and their vertical integration into the overall business process.
We offer complete services from the concept to commissioning, from the field level through process control level to corporate management level. Process Automation Solutions is
a company of ATS Corporation. Overview: This position participates in the design and implementation of c

# Creating a job description sentence classifier

## My aproach
Now that the data is processed, I want to create a training set from our job descriptions. I will choose 500 random job postings from our clean set and use GPT-5-nano to process and assign labels to each sentence in the training set. I want to break up the job postings into sentences that will each be classified into the following four categories:
 - Marketing
 - Description
 - Requirements
 - Legal

### Choosing 500 job postings

In [34]:
# Creating a new dataframe of 500 random rows from df
random_postings = df.sample(n=500)
random_postings.head()

Unnamed: 0,RequisitionID,OrigJobTitle,JobTitle,JobDescription
2801,16089,Industrial Controls Engineer,ENGINEER (all other),"At Ford Motor Company, we believe freedom of m..."
10414,44f4814075a5,Field Engineer (Pipe / Cam),ENGINEER (all other),Turner Staffing Group - Project Engineer Do yo...
9825,34865BR,Transportation Engineer 5-Roadway,ENGINEER (all other),34865BR Requisition ID: 34865BR Business Unit:...
5403,2023-266016,Engineer,ENGINEER (all other),Informacin disponible en espaol a continuacin....
12293,648637BR,Avionics Field Engineer - Advanced Programs - ...,ENGINEER (all other),Description: Lockheed Martin Aeronautics . Be ...


### Creating a GPT_5-nano prompt

In [None]:
# load_dotenv(dotenv_path=os.path.join("..", ".env"))

# openai_api_key = os.getenv("OPENAI_API_KEY")
load_dotenv(dotenv_path=os.path.join("../", ".env"))

openai_api_key = os.getenv("OPENAI_API_KEY")
print(openai_api_key)
client = OpenAI(api_key=openai_api_key)

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


In [27]:
def build_categorization_prompt(job_description):
    prompt = f"""
Analyze the following job posting and categorize each sentence into one of these four categories:
- Marketing: Sentences that sell the company, highlight benefits, or promote the organization
- Description: Sentences describing job duties, responsibilities, and day-to-day work
- Requirements: Sentences listing qualifications, skills, experience, or education needed
- Legal: Sentences about EEO statements, compliance, disclaimers, or legal notices

Job Posting:
{job_description}

Return the result as JSON, with keys 'Marketing', 'Description', 'Requirements', 'Legal', 
and each value a list of sentences.
"""
    return prompt

In [28]:
# Testing on one row
row = random_postings.iloc[0]
desc = row['JobDescription']
prompt = build_categorization_prompt(desc)

response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "system", "content": "You are a helpful assistant that categorizes job posting sentences."},
        {"role": "user", "content": prompt}
    ]
)

content = response.choices[0].message.content
categorized = json.loads(content)

result = ({
    'RequisitionID': row['RequisitionID'],
    'OrigJobTitle': row['OrigJobTitle'],
    'JobTitle': row['JobTitle'],
    'categorized_sentences': categorized
})

print(result)

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [None]:
results = []

for _, row in random_postings.iterrows():
    prompt = build_categorization_prompt(row["JobDescription"])

    try:
        response = client.chat.completions.create(
            model="gpt-5-nano",
            messages=[
                {"role": "system", "content": "You are a helpful assistant that categorizes job posting sentences."},
                {"role": "user", "content": prompt}
            ]
        )

        content = response.choices[0].message.content
        categorized = json.loads(content)
        
        results.append({
            'RequisitionID': row['RequisitionID'],
            'OrigJobTitle': row['OrigJobTitle'],
            'JobTitle': row['JobTitle'],
            'categorized_sentences': categorized
        })

    except Exception as e:
        print(f"Error processing {row['RequisitionID']}: {e}")
        results.append({
            'RequisitionID': row['RequisitionID'],
            'error': str(e)
        })