In [1]:

# Load needed libraries and functions
%matplotlib inline
%pprint
import sys
import os.path as osp

# Add the interpreter's Scripts folder, the project's ../py folder, and the
# ffmpeg binaries folder to the module search path
executable_path = sys.executable
scripts_folder = osp.join(osp.dirname(executable_path), 'Scripts')
py_folder = osp.abspath('../py')
ffmpeg_folder = r'C:\ffmpeg\bin'

# NOTE(review): sys.path only affects Python imports; if ffmpeg_folder is meant
# for locating the ffmpeg executable, the PATH environment variable may be what
# is needed -- confirm
for folder_path in (scripts_folder, py_folder, ffmpeg_folder):

    # Insert at position 1 (leaving the entry at index 0 in place) only when absent
    if folder_path not in sys.path:
        sys.path.insert(1, folder_path)
from jobpostlib import (cu, datetime, nu, humanize, time, lru, wsu)
from nltk.tokenize import sent_tokenize
from pandas import DataFrame
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pyperclip
import re

# Download the NLTK 'punkt' package (sent_tokenize is imported from nltk.tokenize above)
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead -- confirm against the installed version
nltk.download('punkt')

Pretty printing has been turned OFF
Utility libraries created in 6 seconds


[nltk_data] Downloading package punkt to C:\Users\daveb/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:

# Make sure lru carries both is-qualified attributes, building them when either is absent
t1 = time.time()
# lru.basic_quals_dict = None; lru.sync_basic_quals_dict()
if not hasattr(lru, 'ISQUALIFIED_LR') or not hasattr(lru, 'ISQUALIFIED_CV'):
    lru.build_isqualified_logistic_regression_elements(
        sampling_strategy_limit=None,
        verbose=True,
    )

# Report the elapsed wall-clock time (printed whether or not a build was needed)
elapsed_seconds = time.time() - t1
duration_str = humanize.precisedelta(elapsed_seconds, minimum_unit='seconds', format='%0.0f')
speech_str = f'Is-qualified LR elements built in {duration_str}'
print(speech_str)

I have 532,546 is-qualified vocabulary tokens in here
Is-qualified LR elements built in 7 seconds



----

In [3]:

# Load every QualificationStrings node from the database into a de-duplicated data frame
cypher_str = '''
    // Get all qualification strings in the database
    MATCH (qs:QualificationStrings)
    RETURN qs
    ORDER BY qs.qualification_str;'''
row_objs_list = []

# NOTE(review): the query only reads, yet it runs through write_transaction -- confirm
# whether a read transaction is what is intended here
with cu.driver.session() as session:
    row_objs_list = session.write_transaction(cu.do_cypher_tx, cypher_str)

# NOTE(review): when no rows come back, basic_quals_df is never assigned, so any
# later cell that uses it will raise a NameError on a fresh kernel
if row_objs_list:

    # One record per returned row, built from the key/value pairs of its qs entry
    qs_records_list = [dict(row_obj['qs'].items()) for row_obj in row_objs_list]
    basic_quals_df = DataFrame(qs_records_list).drop_duplicates()

    shape_tuple = basic_quals_df.shape
    print(f'basic_quals_df.shape: {shape_tuple}')

basic_quals_df.shape: (20914, 2)


In [4]:

import random

# Record where the search term falls inside each qualified qualification string
search_str = 'prefer'

# Literal (non-regex), case-sensitive substring test; na=False makes null or
# non-string values count as non-matches instead of raising a TypeError
contains_series = basic_quals_df.qualification_str.str.contains(search_str, regex=False, na=False)
mask_series = contains_series & (basic_quals_df.is_qualified == 1)

# Filter once and reuse the matching strings for everything below
matched_strs_series = basic_quals_df.loc[mask_series, 'qualification_str']
qualification_strs_list = matched_strs_series.tolist()

# Show one random example with the term's character offset and the string's length;
# random.choice raises an IndexError on an empty list, so guard against no matches
if qualification_strs_list:
    qualification_str = random.choice(qualification_strs_list)
    print(qualification_str, qualification_str.index(search_str), len(qualification_str))
else:
    print(f'No qualified qualification strings contain "{search_str}"')

# <search_str>_index is the character offset of the first occurrence;
# <search_str>_percentage is that offset divided by the string's length (a 0-1 fraction).
# Rows outside the mask are left as NaN in both columns.
index_series = matched_strs_series.str.find(search_str)
basic_quals_df.loc[mask_series, f'{search_str}_index'] = index_series
basic_quals_df.loc[mask_series, f'{search_str}_percentage'] = index_series / matched_strs_series.str.len()

Master’s degree (Ph.D. preferred) in Economics, Mathematics, Statistics, Physics, Experimental Psychology, Biology, or other quantitative discipline, including significant coursework in statistics, econometrics, or a related field 23 230


In [5]:

# Keep only the rows that have a prefer_percentage and order them by it, ascending
mask_series = basic_quals_df.prefer_percentage.notnull()
df = basic_quals_df[mask_series].sort_values(['prefer_percentage'])
# print(df.prefer_percentage.unique().tolist())

# The last 5 of the first 10 rows, then the first 5 of the last 10 rows
group_columns = ['qualification_str', 'prefer_percentage']
low_end_df = df.head(10).tail(5)
high_end_df = df.tail(10).head(5)
for rows_df in (low_end_df, high_end_df):

    # Iterating the groupby yields each distinct (string, percentage) pair once,
    # in sorted key order rather than in the frame's row order
    for (qualification_str, prefer_percentage), _ in rows_df.groupby(group_columns):
        print(qualification_str, prefer_percentage)

A preference for a self-directed, project-based work style 0.034482758620689655
PhD preferred; MS in Computer Science, Statistics, Math, Engineering, or related field required. 0.041666666666666664
The preferred candidate will have experience working with and/or studying these applications 0.043478260869565216
While our preference lies with candidates who possess real trading experience in these strategies with a track record of at least one year, we are open to considering applicants with simulation-based experience in exceptional cases 0.04329004329004329
You prefer to run an A/B test rather than having long discussions on what changes need to be made in the product. 0.035398230088495575
A Plus in Innovative Model Architecture: Design novel NLP architectures that integrate cutting-edge techniques such as cross-modal attention, graph neural networks, and unsupervised pre-training to solve complex multimodal language understanding tasks is preferred. 0.9622641509433962
Excel in solving

In [None]:

import anthropic

# Authenticate with the key kept in wsu.secrets_json; per the Anthropic SDK,
# omitting api_key would fall back to the ANTHROPIC_API_KEY environment variable
client = anthropic.Anthropic(api_key=wsu.secrets_json['anthropic']['api_key'])

In [10]:

# Ask the model to rewrite qualification strings as one minimum-requirements
# sentence and one preferred-requirements sentence.
# Only strings whose prefer_percentage lies strictly between 0.25 and 0.75 are used.
mask_series = (df.prefer_percentage > 0.25) & (df.prefer_percentage < 0.75)
mid_prefer_df = df[mask_series]

# Compiled once, outside the loop. The raw string keeps \s from being read as an
# invalid string escape. Because the label is a capturing group, split() returns
# the matched 'Minimum'/'Preferred' labels interleaved with the text around them.
requirements_regex = re.compile(r'(Minimum|Preferred) [Rr]equirements?(?: sentence)?:\s+')

# At most 5 randomly sampled rows (unseeded, so the selection varies between runs);
# the groupby yields each distinct (string, percentage) pair once, in sorted key order
sample_size = min(5, mid_prefer_df.shape[0])
sample_groups = mid_prefer_df.sample(sample_size).groupby(['qualification_str', 'prefer_percentage'])
for (qualification_str, prefer_percentage), _ in sample_groups:

    # NOTE(review): confirm this hard-coded model ID is still available
    message = client.messages.create(
        model="claude-3-opus-20240229",
        max_tokens=1000,
        temperature=0,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": f'''Convert this qualification sentence into one minimum requirements sentence and one preferred requirements sentence: "{qualification_str}"'''
                    }
                ]
            }
        ]
    )
    print(qualification_str, prefer_percentage)

    # Split the first content entry's text on the requirement labels; split() with
    # no maxsplit splits on every match, as maxsplit=0 did
    response_text = message.to_dict()['content'][0]['text']
    results_list = requirements_regex.split(response_text)
    print(results_list)

BA/BS required, Masters/PhD preferred in quantitative discipline or related 0.37333333333333335
['', 'Minimum', "- Bachelor's degree in a quantitative discipline or related field.\n\n", 'Preferred', "- Master's degree or PhD in a quantitative discipline or related field."]
Bachelor s degree (B.S., B.A.), preferably in Statistics, or equivalent, At least 6 years clinical trial experience 0.2782608695652174
['', 'Minimum', "Bachelor's degree (B.S., B.A.) or equivalent, and at least 6 years of clinical trial experience.\n\n", 'Preferred', "Bachelor's degree (B.S., B.A.) in Statistics or a related field."]
Experience with at least one programming language (preference for those commonly used in ML or scientific computing such as Python or C++). 0.3669064748201439
['', 'Minimum', 'Experience with at least one programming language.\n\n', 'Preferred', 'Experience with programming languages commonly used in machine learning or scientific computing, such as Python or C++.']
Expertise in at least