In [1]:

%pprint
%matplotlib inline
import sys
import os.path as osp

# Locations that need to be on the module search path for this notebook
executable_path = sys.executable
scripts_folder = osp.join(osp.dirname(executable_path), 'Scripts')
py_folder = osp.abspath('../py')
ffmpeg_folder = r'C:\ffmpeg\bin'

# Each missing folder goes in at index 1 (right after the first entry), so the
# folder processed last ends up closest to the front of sys.path.
for extra_folder in (scripts_folder, py_folder, ffmpeg_folder):
    if extra_folder in sys.path:
        continue
    sys.path.insert(1, extra_folder)
from jobpostlib import (crf, cu, datetime, duration, hau, hc, humanize, ihu, lru, nu, osp, scrfcu, slrcu, ssgdcu, su, t0, time, wsu, speech_engine)
from pandas import DataFrame
import re
import pyperclip
import ipywidgets as widgets
from IPython.display import display

Pretty printing has been turned OFF
Utility libraries created in 5 seconds


In [45]:

# Accumulator for the unique header strings extracted from the LLM generations.
# It starts empty here and is filled by the parsing loop that calls
# `element_strs_set.add(...)`; re-running this cell discards anything collected.
element_strs_set = set()

In [46]:

import cohere

# Create the Cohere client using the key stored under 'Cohere_API_Key'
# in wsu.secrets_json
co_key = wsu.secrets_json['Cohere_API_Key']
co = cohere.Client(co_key)

# Seed examples of Post Date Headers (H-PD) for the model to imitate
base_headers = [
    '<b>Publication Date:</b>',
    '<b>Job Posting :</b>',
    '<b>Job Posting</b>',
    '<div>Published</div>',
]

# The prompt is an instruction line followed by the repr of the seed list
prompt_intro = "Generate (in a python list) professional examples similar to these Post Date Headers (H-PD):\n"
prompt = prompt_intro + str(base_headers)

# Ask for several generations of synthetic headers in a single request
generation_settings = dict(
    model='command-xlarge-nightly',
    num_generations=5,
    max_tokens=200,
    temperature=0.7,
)
response = co.generate(prompt=prompt, **generation_settings)

# Preview only the first generation; the rest remain available on `response`
first_generation = response.generations[0]
print(first_generation.text)

unknown field: parameter model is not a valid field


Here are some professional examples similar to the given Post Date Headers:

- ['<b>Date Posted:</b>', '<b>Date of Publication:</b>', '<b>Posted On:</b>']
- ['<b>Application Deadline:</b>', '<b>Closing Date:</b>', '<b>Last Date to Apply:</b>']
- ['<b>Date Announced:</b>', '<b>Announcement Date:</b>', '<b>Announced On:</b>']
- ['<b>Date Updated:</b>', '<b>Last Updated:</b>', '<b>Update Date:</b>']
- ['<b>Release Date:</b>', '<b>Launch Date:</b>', '<b>Product Release:</b>']
- ['<b>Effective From:</b>', '<b>Start Date:</b>', '<b>Commencement Date:</b>']
- ['<b>Event Date:</b


In [47]:

# Matches one quoted (or back-ticked) item, optionally followed by a comma; the
# captured text may not contain backslashes, quotes, backticks or square brackets.
quoted_item_regex = re.compile(r"""['`]([^\\'`\]\[]+)['`],?""")

for cohere_generation_obj in response.generations:
    generation_text = cohere_generation_obj.text
    for element_str in quoted_item_regex.findall(generation_text):
        html_str = element_str.strip('\'`').strip()

        # Plain text (no leading '<' and no trailing '>') is stored as-is
        looks_like_markup = html_str.startswith('<') or html_str.endswith('>')
        if not looks_like_markup:
            element_strs_set.add(html_str)
            continue

        # Markup-like items are routed through the hau helpers and only the
        # first returned child is kept, stripped of surrounding whitespace.
        # NOTE(review): presumably get_body_soup parses the string as HTML and
        # get_navigable_children flattens it -- confirm against hau.
        body_soup = hau.get_body_soup(html_str)
        navigable_children = hau.get_navigable_children(body_soup, [])
        element_strs_set.add(navigable_children[0].strip())

In [48]:

# Show the collected header strings in sorted order as this cell's output.
sorted(element_strs_set)

['<b>Advertisement Date</b>', '<b>Announced On:</b>', '<b>Announcement Date:</b>', '<b>Announcement Date</b>', '<b>Application Deadline:</b>', '<b>Closing Date:</b>', '<b>Closing Date</b>', '<b>Commencement Date:</b>', '<b>Date Announced:</b>', '<b>Date Posted:</b>', '<b>Date Updated:</b>', '<b>Date of Posting:</b>', '<b>Date of Publication:</b>', '<b>Date:</b>', '<b>Effective From:</b>', '<b>Job Ad Date:</b>', '<b>Last Date to Apply:</b>', '<b>Last Updated:</b>', '<b>Launch Date:</b>', '<b>Post Date</b>', '<b>Posted On:</b>', '<b>Posted on:</b>', '<b>Posting Date</b>', '<b>Product Release:</b>', '<b>Recent Posting:</b>', '<b>Recent Update</b>', '<b>Release Date:</b>', '<b>Start Date:</b>', '<b>Time Stamp</b>', '<b>Time of Posting:</b>', '<b>Timestamp:</b>', '<b>Update Date:</b>', '<div>Application Date</div>', '<div>Date Posted</div>', '<div>Date Published</div>', '<div>Date of Publication</div>', '<div>Date:</div>', '<div>Date</div>', '<div>Update Date</div>', '<h3>Publication</h3>',


----

In [3]:

import openai

def generate_post_date_headers(api_key, base_headers, num_variations=10):
    """
    Generate synthetic variations for Post Date Headers (H-PD) using an LLM.

    The model's reply is split into lines; every non-blank line is treated as
    one synthetic header. If anything goes wrong while calling the API or
    reading its reply, the error is printed and only the base headers are
    returned (best-effort behaviour).

    Args:
        api_key (str): OpenAI API key for authenticating requests.
        base_headers (list): List of existing Post Date Headers to use as reference.
        num_variations (int): Maximum number of synthetic headers to keep from
            the model's reply (the reply may contain fewer).

    Returns:
        list: A new list holding the base headers followed by the kept
        synthetic headers. The caller's ``base_headers`` list is never
        returned directly, so mutating the result cannot alter the input.

    Raises:
        ValueError: If ``base_headers`` is not a list of strings, or if
            ``num_variations`` is a negative integer.
    """
    # Validate the input
    if not isinstance(base_headers, list) or not all(isinstance(header, str) for header in base_headers):
        raise ValueError("Base headers must be a list of strings.")

    # A negative count would silently slice from the END of the generated list
    # (e.g. -1 means "all but the last") instead of limiting how many are kept.
    if isinstance(num_variations, int) and num_variations < 0:
        raise ValueError("Number of variations must be non-negative.")

    # Construct the prompt for the LLM
    prompt = (
        "Generate unique, creative, and professional examples of Post Date Headers (H-PD) "
        "similar to the following examples:\n"
        f"{base_headers}\n"
        "The output should be a list of headers formatted like the examples provided. "
        "Make sure they are concise, relevant, and follow a similar structure."
    )

    try:
        # Set the OpenAI API key (module-level setting on the openai package)
        openai.api_key = api_key

        # Call the OpenAI API to generate text
        response = openai.Completion.create(
            engine="gpt-3.5-turbo-instruct",
            prompt=prompt,
            max_tokens=150,
            n=1,
            temperature=0.7  # Adjust temperature for creativity
        )

        # Extract and clean up the generated text
        generated_text = response.choices[0].text.strip()

        # Convert generated text to a list of headers, one per non-blank line
        synthetic_headers = [header.strip() for header in generated_text.split('\n') if header.strip()]

        # Combine base headers with the synthetic ones (concatenation builds a new list)
        return base_headers + synthetic_headers[:num_variations]

    except Exception as e:
        # Deliberate best-effort: report the failure and fall back to the base headers
        print(f"An error occurred while generating synthetic headers: {e}")
        # Return a copy so both code paths hand back a list independent of the input
        return list(base_headers)

In [4]:

import os

# Read the OpenAI key from the environment (None when the variable is unset)
api_key = os.environ.get('OPENAI_API_KEY')

# Request up to ten extra headers on top of the seed list, then show the result
synthetic_headers = generate_post_date_headers(
    api_key=api_key,
    base_headers=base_headers,
    num_variations=10,
)
print(synthetic_headers)

An error occurred while generating synthetic headers: You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.
['<b>Publication Date:</b>', '<b>Job Posting :</b>', '<b>Job Posting</b>', '<div>Published</div>']


In [5]:

from transformers import pipeline

# Build a text-generation pipeline around the pre-trained GPT-Neo 125M checkpoint
generator = pipeline("text-generation", model="EleutherAI/gpt-neo-125M")

# The prompt is an instruction line followed by the repr of the seed headers
prompt = f"Generate professional examples of Post Date Headers:\n{base_headers}"
print(prompt)

# Generate one sequence with max_length=100 and print its text
generation_kwargs = {'max_length': 100, 'num_return_sequences': 1}
results = generator(prompt, **generation_kwargs)
first_result = results[0]
print(first_result['generated_text'])

config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/526M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/357 [00:00<?, ?B/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Generate professional examples of Post Date Headers:
['<b>Publication Date:</b>', '<b>Job Posting :</b>', '<b>Job Posting</b>', '<div>Published</div>']


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generate professional examples of Post Date Headers:
['<b>Publication Date:</b>', '<b>Job Posting :</b>', '<b>Job Posting</b>', '<div>Published</div>']

I have a list of all the Post Date Headers that I want to create. I have a list of all the Post Date Headers that I want to create. I have a list of all the Post Date
