In [None]:
import xml.etree.ElementTree as ET
import json

def parse_xml(xml_file):
    tree = ET.parse(xml_file)
    root = tree.getroot()

    jobs_list = []

    for job in root.findall('job'):
        job_dict = {
            "title": job.find('title').text,
            "company": job.find('company').text,
            "posted_date": job.find('posted_date').text,
            "job_reference": job.find('job_reference').text,
            "req_number": job.find('req_number').text,
            "url": job.find('url').text,
            "body": job.find('body').text,
            "city": job.find('city').text,
            "state": job.find('state').text,
            "country": job.find('country').text,
            "location": job.find('location').text,
            "function": job.find('function').text,
            "logo": job.find('logo').text,
            "jobtype": job.find('jobtype').text,
            "education": job.find('education').text,
            "experience": job.find('experience').text,
            "salary": job.find('salary').text,
            "requiredlanguages": job.find('requiredlanguages').text,
            "requiredskills": job.find('requiredskills').text
        }
        jobs_list.append(job_dict)
        
    return jobs_list

xml_file = 'job_sample.xml' 
jobs_list = parse_xml(xml_file)

json_output = json.dumps(jobs_list, indent=4)

with open('job_openings.json', 'w') as json_file:
    json_file.write(json_output)

In [10]:
import pandas as pd

# Replace 'path/to/job_openings.json' with the actual path to your JSON file
df = pd.read_json('job_openings.json')

# number of rows
print(df.shape[0])

# Do something with the DataFrame
print(df.head())

1000
                                      title         company posted_date  \
0                             Shift Manager   Buffalo Wings     3/10/23   
1                       Access Provisioning             CTG      9/7/22   
2                             Shift Manager   Buffalo Wings     3/10/23   
3                                Dishwasher  Cracker Barrel     4/28/22   
4  DIRECTOR OF LABOR RELATIONS (NJ, PA, NY)   Compass Group     1/28/23   

   job_reference  req_number  \
0       10794429        3036   
1        8625904        3493   
2       10798555        3036   
3        6989039        4946   
4       10251570        3398   

                                                 url  \
0  https://www.localjobs.com/job/kissimmee-fl-shi...   
1  https://www.localjobs.com/job/san-diego-ca-acc...   
2  https://www.localjobs.com/job/bristol-va-shift...   
3  https://www.localjobs.com/job/kissimmee-fl-dis...   
4  https://www.localjobs.com/job/charlotte-nc-dir...   

              

In [9]:
# Get the number of missing values for each column
missing_values = df.isnull().sum()

# Print the results
print(missing_values)

title                   0
company                 0
posted_date             0
job_reference           0
req_number              0
url                     0
body                    0
city                   27
state                  13
country                 4
location                0
function                5
logo                    0
jobtype                 0
education            1000
experience           1000
salary               1000
requiredlanguages    1000
requiredskills       1000
dtype: int64


In [28]:
import json
import numpy as np
from langchain import PromptTemplate

template = """/You are a naming consultant for new companies.
What is a good name for a company that makes {product}?"""

GPT_JOBS_PROMT = PromptTemplate.from_template('\nYou were given a Job description. Create a valid flatt JSON object parsable by json.loads, with the attributes {fields} and fill in their content from this job description. Set the variable to null if the information is not derivable. Reply with just the JSON object, keep the attribute values short and if appropriate in keywords.')

with open('../job_openings.json','r') as j:
    docs, tokens = np.sum([[1, len((body + GPT_JOBS_PROMT.format(fields=' field'*5)).split(' '))] if (body := o['body']) else 0 for o in json.load(j)], axis=0)
    tokens_k = tokens/1000
    print(f"Aproximate total docs {docs}, tokens {tokens}:\n {tokens_k * 0.0015}$ (0.0015$ inp)\n {docs * 0.6 * 0.002}$ (0.002$ out, 0.5k tk per doc)")

Aproximate total docs 1000, tokens 754544:
 1.131816$ (0.0015$ inp)
 1.2$ (0.002$ out, 0.5k tk per doc)


In [44]:
import yaml
OPENAI_API_KEY = None
with open("../../key.yaml", "r") as f:
    config = yaml.load(f, Loader=yaml.FullLoader)
    OPENAI_API_KEY = config["openai_key"]

In [47]:
import os
import html2text
from langchain.llms import OpenAI
from langchain.llms.fake import FakeListLLM
import html2text
import re
import asyncio

REPARSE_JOBS = True
MAX_RESPONSE_TOKENS = 600

llm =  FakeListLLM(responses=[
    """{
  "education": null,
  "experience": "Preferably, you have 2 years of restaurant or bar experience.",
  "salary": null,
  "requiredlanguages": null,
  "requiredskills": "Exceptional time management, attention to detail, and guest service skills."
}""",
"""{
  "education": "Bachelors degree or equivalent real-life experience",
  "experience": "3+ years of IAM Experience\\n3+ years of IT experience",
  "salary": null,
  "requiredlanguages": null,
  "requiredskills": [
    "Understanding of core IT service and support practices",
    "High degree of integrity and trust along with the ability to work independently",
    "Strong policy and process knowledge, IT auditing skills and expertise to deal with a variety of technologies and customers",
    "Ability to identify security risks and escalate where appropriate",
    "Demonstrated exceptional organization, troubleshooting and documentation skills",
    "Proven interpersonal and consultative skills to achieve security goals including ability to communicate well with IT teams and customer, both written and verbally",
    "Working knowledge of ITIL processes (Foundations certification preferred)",
    "Experience with IT Service Management Tools (ServiceNow preferred)",
    "Experience administering the following: Active Directory, Quest Active Roles, Quest Change Auditor, LDAP, SAP, RSA",
    "Intermediate level PowerShell and scripting skills"
  ]
}
""",
'\n\n{\n    "education": "MD with additional training in one or more of the following areas: infectious disease, public health, epidemiology, vaccinology, clinical development, or closely related field",\n    "experience": "At least 5 years of experience in a relevant field such as vaccinology, epidemiology, infectious diseases, or public health",\n    "salary": "The annual base salary for this position ranges from $219,600.00 to $365,800.00.",\n    "requiredlanguages": "Fluent in English writing, reading, speaking",\n    "requiredskills": "Ability to discuss scientific aspects of vaccines such as development, safety, implementation, decision-making processes, etc., Scientific expertise as demonstrated by lead or senior authorship of at least 5 peer-reviewed publications in English, Expertise in vaccines and immunization programs with an established international KOL network; maternal immunization program experience highly preferred, strong interpersonal skills, demonstrated strong work ethic, and proven track record of delivering high quality products within timelines"\n}'
])

llm = OpenAI(openai_api_key=OPENAI_API_KEY, max_tokens=MAX_RESPONSE_TOKENS)


async def parse_fields(job, fields):
  job_body = job['body']
  prompt = f'{html2text.html2text(job_body if job_body else "")}' + GPT_JOBS_PROMT.format(fields=','.join(fields))
  response = await llm.agenerate([prompt])
  print(f'Ttokens: {len(prompt.split(" "))} {len(response.generations[0][0].text.split(" "))}')
  try:
    return job, json.loads(re.sub(r"(?<=\w)\n(?=\w)", "\\\\n", response.generations[0][0].text)), None
  except:
    print(f'Couldn\'t parse:\n{response.generations[0][0].text}')
    return job, {}, response.generations[0][0].text

async def transform_jobs(jobs, fields=['education', 'experience', 'salary', 'requiredlanguages', 'requiredskills']):
  tasks = [
    parse_fields(i, fields=[k for k, v in i.items() if v is None and k in fields])
    for i in jobs[650:750]
  ]
  return [(dict(j, **i), t) for j, i, t in await asyncio.gather(*tasks)]

if not os.path.exists('../job_openings_full.json') or REPARSE_JOBS:
  patch = '650_750'
  jo_path = '../job_openings'
  with open(f'{jo_path}/job_openings.json','r') as j, open(f'{jo_path}/job_openings_full_{patch}.json', 'w') as f, open(f'{jo_path}/job_openings_full_failed_{patch}.json', 'w') as b:
    jobs = json.load(j)
    parsed = await transform_jobs(jobs)
    json.dump([p[0] for p in parsed], f, indent=4)
    b.writelines(f'{json.dumps([p, t])}\n' for p,t in parsed if t is not None)



Ttokens: 312 35
Couldn't parse:


{
    "education": null,
    "experience": "serving food, preparing beverages",
    "salary": null,
    "requiredLanguages": null,
    "requiredSkills": "time management, attention to detail, guest service",
}
Ttokens: 243 31
Ttokens: 971 10
Ttokens: 660 23
Ttokens: 653 33
Ttokens: 712 28
Ttokens: 408 44
Ttokens: 321 18
Ttokens: 487 29
Ttokens: 653 21
Ttokens: 346 48
Ttokens: 634 42
Ttokens: 542 34
Ttokens: 189 16
Ttokens: 719 23
Ttokens: 653 31
Ttokens: 728 28
Ttokens: 1175 26
Ttokens: 660 29
Ttokens: 669 27
Ttokens: 758 23
Ttokens: 189 37
Ttokens: 190 26
Ttokens: 629 47
Ttokens: 630 39
Couldn't parse:


```
{
    "education": "Bachelor's",
    "experience": "2 Years",
    "salary": null,
    "requiredLanguages": null,
    "requiredSkills": "clinical informatics, healthcare IT, clinical information systems/technology, organizational skills, problem-solving skills, critical thinking"
}
```
Ttokens: 660 31
Ttokens: 352 40
Ttokens: 851 26
Ttokens: 321 40

In [None]:

from langchain.embeddings import HuggingFaceEmbeddings
import json
import faiss

with open('../job_openings.json', 'r') as f:
    data = json.load(f)

embeddings = HuggingFaceEmbeddings()
# Loop over the objects and create a string for each one
strings = []
for obj in data:
    string = json.dumps(obj)

    strings.append(string)

doc_result = embeddings.embed_documents(strings)



In [12]:
index = faiss.index_factory(len(doc_result[0]), "Flat")
index.train(doc_result)
index.add(doc_result)

In [14]:
import numpy as np
query_result = embeddings.embed_query("software engineer")
distances, neighbors = index.search(query_result.reshape(1,-1).astype(np.float32), k=5)

In [16]:
for neighbor in neighbors[0]:
    print(strings[neighbor])

{"title": "Application Developer - Junior", "company": "CTG", "posted_date": "4/5/23", "job_reference": "11052592", "req_number": "3493", "url": "https://www.localjobs.com/job/indianapolis-in-application-developer-junior", "body": "<p>Application Developer - Junior</p>\n<p>United States</p>\n<p>Information Technology</p>\n<p>Apr 05, 2023Post Date</p>\n<p>23200577Requisition #</p>\n<p>Apply for JobShare this JobSign Up for Job Alerts</p>\n<p>Also known as: System Engineer, Programmer, System Analyst</p>\n<p>TYPICAL WORK PERFORMED:</p>\n<p>> Designs or assists in the design of applications including mobile applications</p>\n<p>> Creates, and/or modifies existing applications</p>\n<p>> Plans, prepares & analyzes unit tests to detect technical or logic errors</p>\n<p>> Develops test data. Tests modules and analyzes results. Verifies validity of new or modified systems.</p>\n<p>> Analyzes tasks and provides accurate estimates for level of effort required to complete</p>\n<p>> Assists, revie

In [None]:
from faiss import write_index, read_index
write_index(index, "large.index")
index = read_index("large.index")

In [18]:
import pickle
import os

from langchain.embeddings.base import Embeddings

from dotenv import load_dotenv

load_dotenv()  # Load environment variables from .env file


def save_embeddings(
    embeddings: Embeddings,
    saving_embeddings_file_name: str = os.getenv("SAVING_EMBEDDINGS_FILE_NAME"),
    saving_embeddings_directory: str = os.getenv("SAVING_EMBEDDINGS_DIRECTORY"),
) -> None:
    """
    Save embeddings to a binary file with the specified file name and directory path.

    Args:
        - embeddings (Embeddings): The embeddings to be saved.
        - saving_embeddings_file_name (str): The name of the file to save the embeddings to.
        - saving_embeddings_directory (str): The path to the directory where the file will be saved.

    Returns:
        - None
    """

    directory = os.path.join(os.getcwd(), saving_embeddings_directory)
    if not os.path.exists(directory):
        os.makedirs(directory)
    file_path = os.path.join(directory, saving_embeddings_file_name + ".pkl")

    # Save embeddings to binary file
    with open(file_path, "wb") as f:
        pickle.dump(embeddings, f)

In [20]:
save_embeddings(doc_result, saving_embeddings_file_name="saved_embeddings", saving_embeddings_directory="../embeddings")

In [23]:
def load_embeddings(embeddings_path) -> Embeddings:
    """
    Loads embeddings from the specified file path using pickle.

    Args:
        - embeddings_path (str): Path to file containing embeddings.

    Returns:
        - Embeddings: Loaded embeddings.
    """

    with open(embeddings_path, "rb") as f:
        embeddings: Embeddings = pickle.load(f)

    return embeddings

In [24]:
loaded_embeddings = load_embeddings("../embeddings/saved_embeddings.pkl")