In [None]:
import pandas as pd
import numpy as np
import textwrap
import re
from bs4 import BeautifulSoup
import openai

# Data inspection and preprocessing

### A look at the raw dataset

In [9]:
df = pd.read_csv("Engineer_20230826.csv")
df.head()

Unnamed: 0,RequisitionID,OrigJobTitle,JobTitle,JobDescription
0,,Licensed Stationary Engineer,ENGINEER (all other),Licensed Stationary Engineer \n\n Froedtert So...
1,224907.0,Guidance Navigation and Control (GN&C) Enginee...,ENGINEER (all other),**The Boeing Company** is in search of a **L...
2,331804.0,"Propulsion Engineer - Associate, Mid-Level and...",ENGINEER (all other),"**Job Description**\n\nAt Boeing, we innovate ..."
3,336462.0,Senior Process Controls Engineer,ENGINEER (all other),"**Job Description**\n\nAt Boeing, we innovate ..."
4,338951.0,RF/Microwave Engineer (Level 2 or 3),ENGINEER (all other),"**Job Description**\n\nAt Boeing, we innovate ..."


### Features of dataset

In [10]:
print(f"Columns in data set: {df.columns.tolist()}")

Columns in data set: ['RequisitionID', 'OrigJobTitle', 'JobTitle', 'JobDescription']


### A sample job description

In [11]:
# inspect data
print(f"The elements in the JobDescription column are of data type: {df['JobDescription'].dtype}")

def print_job_desc(col: int):
    job_desc = df.iat[col, 3]
    print(f"Job description at col {col}:\n {textwrap.fill(job_desc, width=175)}")

print_job_desc(col=1000)

The elements in the JobDescription column are of data type: object
Job description at col 1000:
 ATS Company:  PA Solutions \n\n Requisition ID:  10392 \n\n Location:  \n\n Greenville, SC, US, 29615 Lewis Center, OH, US, 43035-9445 Guaynabo, PR, US, 00968-8058 Concord,
NH, US, 3811 Indianapolis, IN, US, 46250 Raleigh, NC, US, 27603 \n\n Date:  Jul 17, 2023 \n\n Automation Engineer \n\nJob Description\n\nProcess Automation Solutions is one of
the leading manufacturer-independent suppliers of complete automation solutions for the process and manufacturing industries. The company currently employs more than1,500
people with a global presence in Europe, the Americas, and Asia. Our operational activities focus on the design of process control systems and their vertical integration into
the overall business process. We offer complete services from the concept to commissioning, from the field level through process control level to corporate management level.
Process Automation Solutions is a 

### Cleaning the dataset

In [15]:
# remove html tags, markdown, string literals, and whitespace from all job descriptions
def clean_text(text):
    if not isinstance(text, str):
        return ""
    # 1. Remove HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()
    # 2. Remove Markdown bold/italic markers (**text**, *text*, __text__)
    text = re.sub(r"\*{1,2}|_{1,2}", "", text)
    # 3. Replace literal '\n' and '\t' with space
    text = text.replace("\\n", " ").replace("\\t", " ")
    # 4. Replace any remaining whitespace (spaces, tabs, newlines) with single space
    text = re.sub(r"\s+", " ", text)
    # 5. Strip leading/trailing whitespace
    text = text.strip()
    return text

df['JobDescription'] = df['JobDescription'].apply(clean_text)
print_job_desc(1000)

Job description at col 1000:
 ATS Company: PA Solutions Requisition ID: 10392 Location: Greenville, SC, US, 29615 Lewis Center, OH, US, 43035-9445 Guaynabo, PR, US, 00968-8058 Concord, NH, US, 3811
Indianapolis, IN, US, 46250 Raleigh, NC, US, 27603 Date: Jul 17, 2023 Automation Engineer Job Description Process Automation Solutions is one of the leading manufacturer-
independent suppliers of complete automation solutions for the process and manufacturing industries. The company currently employs more than1,500 people with a global presence
in Europe, the Americas, and Asia. Our operational activities focus on the design of process control systems and their vertical integration into the overall business process.
We offer complete services from the concept to commissioning, from the field level through process control level to corporate management level. Process Automation Solutions is
a company of ATS Corporation. Overview: This position participates in the design and implementation of c

# Creating a job description sentence classifier

## My aproach
Now that the data is processed, I want to create a training set from our job descriptions. I will choose 500 random job postings from our clean set and use GPT-5-nano to process and assign labels to each sentence in the training set. I want to break up the job postings into sentences that will each be classified into the following four categories:
 - Marketing
 - Description
 - Requirements
 - Legal

### Choosing 500 job postings

In [13]:
# select 500 random job postings
random_postings = df.sample(n=500)
random_postings.head()

Unnamed: 0,RequisitionID,OrigJobTitle,JobTitle,JobDescription
14454,ENGIN002406,Engineer,ENGINEER (all other),Description ICM Solutions launched in 2020 wit...
3849,200475082,SwiftUI Frameworks Engineer,ENGINEER (all other),SwiftUI Frameworks Engineer Santa Clara Valley...
3235,1930-378,Early Career Mechanical Tooling Engineer,ENGINEER (all other),Interested in making an impact on clean energy...
133,00028061,Simulation Design and Evaluation Engineer,ENGINEER (all other),Title: Simulation Design and Evaluation Engine...
9266,31015,ENGINEER 2,ENGINEER (all other),"ENGINEER 2 Location: Newport News, Virginia, U..."


### Creating a GPT_5-nano prompt

In [None]:
client = OpenAI()

def build_prompt(job_title, job_description):
    return f"""
You are given a job posting. Break the text into sentences and assign each 
sentence to one of four categories: Marketing, Description, Requirements, Legal. 
Return the result as JSON, with keys 'Marketing', 'Description', 'Requirements', 'Legal', 
and each value a list of sentences.

Job Title: {job_title}
Job Description: {job_description}
    """

for _, row in random_postings.iterrows():
    prompt = build_prompt(row["JobTitle"], row["JobDescription"])

    response = openai.ChatCompletion.create(
        model="gpt-5-nano",
        input=prompt
    )

try: 
    
except Exception as e:
    print("Error parsing JSON", e)
response = client.responses.create(

)

print(response.output_text)

Under a silver moon, a gentle unicorn curled in a meadow of fireflies and drifted into a peaceful sleep, guarding the dreams of children with a soft lullaby sigh.
