### Setup

In [None]:
import dotenv
import json
import pandas as pd
import tiktoken
import time

from openai import OpenAI
from pydantic import BaseModel
from datetime import datetime

# Pull settings from the local dotenv file into the process environment
# before any client object is constructed.
env_file = ".env"
dotenv.load_dotenv(dotenv_path=env_file)

In [None]:
# Read data
df = pd.read_csv('~/Downloads/archive (8)/postings.csv')
# NOTE: a bare expression only displays when it is the last line of a cell,
# so this line has no visible effect here.
df.columns

# Collapse embedded newlines, then commas, into single spaces.
# The result is assigned back instead of using `inplace=True` on the column
# selection: in-place mutation of `df[col]` is chained assignment, which warns
# on pandas 2.x and does not update `df` when Copy-on-Write is enabled.
df['description'] = (
    df['description']
    .replace({r'[\n\r]+': ' '}, regex=True)
    .replace({r'[,]+': ' '}, regex=True)
)

# Convert types
# NOTE(review): astype(str) turns missing values (NaN) into the literal string
# 'nan', so a later `isinstance(desc, str)` check will not filter them out --
# confirm whether rows without a description should be dropped instead.
df['description'] = df['description'].astype(str)
df['skills_desc'] = df['skills_desc'].astype(str)



In [None]:
# Work on the first 15,000 rows only.
df2 = df.iloc[:15000]

# Shared API client used by the step functions defined below.
client = OpenAI()

### Define all required step functions

def build_request(job_id, desc, *, max_chars=1000, model="gpt-4o-mini",
                  max_tokens=500):
    """Build one chat-completions request dict for a batch input file.

    The request asks the model to extract five keywords from a job
    description and to answer as a JSON object with a ``keywords`` array of
    strings.

    Args:
        job_id: Identifier of the posting; stored as a string in
            ``custom_id``.
        desc: Job description text.
        max_chars: Number of leading characters of ``desc`` to send, to limit
            token usage.
        model: Model name placed in the request body.
        max_tokens: ``max_tokens`` value placed in the request body.

    Returns:
        The request as a dict, or ``None`` when ``desc`` is not a string.
    """
    # check that desc is a string (e.g. skip NaN / None cells)
    if not isinstance(desc, str):
        return None

    # only send the leading part of the description to limit token usage
    trunc_desc = desc[:max_chars]

    # JSON schema describing the expected answer: {"keywords": [str, ...]}
    keyword_schema = {
        "properties": {
            "keywords": {
                "items": {
                    "type": "string"
                },
                "title": "Keywords",
                "type": "array"
            }
        },
        "required": ["keywords"],
        "type": "object"
    }

    # build the completions request
    return {
        "custom_id": str(job_id),
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": model,
            "max_tokens": max_tokens,
            "messages": [
                {
                    "role": "system",
                    "content": "Take the following job description and extract only 5 top keywords:"
                },
                {
                    "role": "user",
                    "content": trunc_desc
                }
            ],
            "response_format": {
                "type": "json_schema",
                "json_schema": {
                    "name": "KeywordList",
                    "schema": keyword_schema
                }
            }
        }
    }


def build_batch_file(file_path, requests):
    """Write ``requests`` to ``file_path`` as JSON Lines (one object per line).

    Missing parent directories of ``file_path`` are created first, so writing
    to e.g. ``./batches/batch0.jsonl`` works on a fresh checkout.

    Args:
        file_path: Destination path of the ``.jsonl`` file; overwritten if it
            already exists.
        requests: Iterable of JSON-serializable objects.
    """
    import os  # local import keeps this function self-contained

    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(file_path, 'w', encoding='utf-8') as file:
        file.writelines(json.dumps(req) + '\n' for req in requests)


def upload_batch(batch_file):
    """Upload a batch input file to OpenAI with purpose ``"batch"``.

    Args:
        batch_file: Path of the ``.jsonl`` file to upload.

    Returns:
        The file object returned by ``client.files.create`` (also printed).
    """
    # Use a context manager so the handle is closed once the upload returns,
    # even if the API call raises.
    with open(batch_file, "rb") as fh:
        batch_input_file = client.files.create(
            file=fh,
            purpose="batch"
        )
    print(batch_input_file)
    return batch_input_file


def create_batch(batch_input_file):
    """Create a chat-completions Batch from an uploaded input file.

    The uploaded file's name is recorded as the batch's ``description``
    metadata. The created batch is printed and returned.
    """
    batch_params = {
        "input_file_id": batch_input_file.id,
        "endpoint": "/v1/chat/completions",
        "completion_window": "24h",
        "metadata": {"description": batch_input_file.filename},
    }
    batch = client.batches.create(**batch_params)
    print(f'CREATED: {batch}')
    return batch


def time_elapsed(start_ts):
    """Return the time elapsed since ``start_ts`` as a ``timedelta``.

    Args:
        start_ts: Start time as a Unix timestamp in seconds.

    Returns:
        ``datetime.timedelta`` between ``start_ts`` and the current time.
    """
    from datetime import timedelta  # only `datetime` is imported at file level

    # Subtract the raw epoch seconds rather than two naive local datetimes:
    # local wall-clock differences are off by the DST offset when the
    # interval spans a clock change.
    return timedelta(seconds=time.time() - start_ts)


def process_batch(batch, poll_interval=300):
    """Poll an enqueued batch until it finishes, then save its results.

    The batch is re-fetched and printed on every iteration. On ``completed``
    the results are saved via ``retrieve_completions`` and the function
    returns.

    Args:
        batch: Batch object to monitor; only its ``id`` is used for polling.
        poll_interval: Seconds to sleep between status checks.

    Raises:
        RuntimeError: If the batch ends as ``failed``, ``cancelled`` or
            ``expired``.
    """
    pending_states = ('validating', 'in_progress', 'cancelling', 'finalizing')
    failed_states = ('failed', 'cancelled', 'expired')

    while True:
        batch = client.batches.retrieve(batch.id)
        status = batch.status
        print(batch)

        if status in failed_states:
            print('status: ' + status)
            raise RuntimeError(
                'Could not process batch {} (status: {})'.format(batch.id, status)
            )

        if status == 'completed':
            print('status: ' + status)
            print('time elapsed: {}'.format(time_elapsed(batch.created_at)))
            # retrieve_completions returns a message when there is nothing to
            # download; surface it instead of dropping it silently.
            outcome = retrieve_completions(batch)
            if outcome:
                print(outcome)
            return

        if status in pending_states:
            print('status: ' + status)
            print('time elapsed: {}'.format(time_elapsed(batch.created_at)))
        else:
            print('unkown status: ' + status)

        time.sleep(poll_interval)


def retrieve_completions(batch):
    """Download a finished batch's output and error files to the working dir.

    Responses are written to ``<batch.id>.jsonl`` and errors to
    ``ERROR_<batch.id>.jsonl``. The two downloads are independent of each
    other.

    Args:
        batch: Batch object exposing ``id``, ``output_file_id`` and
            ``error_file_id``.

    Returns:
        ``'No output_file_id found'`` when the batch has no output file,
        otherwise ``None``.
    """
    # Handle the error file first and unconditionally: it used to be reachable
    # only when the output download returned a falsy response, so errors were
    # effectively never saved (and were skipped entirely when there was no
    # output file).
    if batch.error_file_id:
        err_file_response = client.files.content(batch.error_file_id)
        error_file = 'ERROR_' + batch.id + '.jsonl'
        with open(error_file, 'w', encoding='utf-8') as file:
            file.write(err_file_response.text)
        print("Errors saved to file")

    if not batch.output_file_id:
        return 'No output_file_id found'

    file_response = client.files.content(batch.output_file_id)
    output_file = batch.id + '.jsonl'
    with open(output_file, 'w', encoding='utf-8') as file:
        file.write(file_response.text)
    print("Responses saved to file")


### Chunk the data frame and create the batch files

In [None]:
def split_dataframe(df, chunk_size):
    """Split ``df`` into consecutive slices of at most ``chunk_size`` rows.

    Returns a list of slices in their original order; the final slice holds
    whatever remains and may be shorter.
    """
    pieces = []
    for start in range(0, len(df), chunk_size):
        stop = start + chunk_size
        pieces.append(df[start:stop])
    return pieces

# Split the working slice into chunks and write one batch file per chunk.
import os

chunk_size = 3000
# NOTE(review): this previously read `df3`, which is never defined in this
# notebook; `df2` is the only slice created above. Confirm that is the frame
# intended here.
chunks = split_dataframe(df2, chunk_size)

# make sure the output directory exists before writing into it
os.makedirs('./batches', exist_ok=True)

for i, df_chunk in enumerate(chunks):
    print(f"Processing Chunk {i}")

    # build all the requests, skipping rows for which no request is built
    requests = []
    for row in df_chunk.itertuples():
        request = build_request(row.job_id, row.description)
        if request:
            requests.append(request)

    # write to file
    output_file = './batches/batch{}.jsonl'.format(i)
    build_batch_file(output_file, requests)
    

### Upload all the batches to OpenAI

In [None]:
# Upload one input file per chunk written by the chunking step.
# The list is named `batchfiles` because that is the name the
# create-and-monitor loop iterates over; the previous code defined
# `batchesfiles` but appended to an undefined `batches`.
batchfiles = []
# Derive the file count from `chunks` instead of a hard-coded number so the
# loop always matches the files that were actually written.
for i in range(len(chunks)):
    batch_file = './batches/batch{}.jsonl'.format(i)
    uploaded = upload_batch(batch_file)
    if uploaded:
        batchfiles.append(uploaded)
print('Batches count: {}'.format(len(batchfiles)))

### Create and monitor the batches

In [None]:
# Create a batch per uploaded file and wait for each one to finish before
# starting the next. Stop at the first failure.
for bf in batchfiles:
    batch = create_batch(bf)
    try:
        process_batch(batch)
    except Exception as e:
        # Keep the failing batch id for inspection and report the reason
        # instead of swallowing the error silently.
        batch_id = batch.id
        print('Batch {} failed: {}'.format(batch_id, e))
        break