## Init

In [3]:
from dotenv import load_dotenv
import os
import logging
import requests

from src.mycareersfuture import MyCareersFutureListings


# Notebook-wide logging: timestamped INFO-level records.
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)


# Load environment variables from .env
load_dotenv()

HF_TOKEN = os.getenv("HF_TOKEN")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# os.environ only accepts str values: re-exporting a missing key (None) raises
# TypeError and aborts the whole cell. Re-export only when a value exists and
# warn about anything that is missing instead.
if OPENAI_API_KEY:
    os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
else:
    logger.warning("OPENAI_API_KEY is not set; add it to .env before calling OpenAI")
if not HF_TOKEN:
    logger.warning("HF_TOKEN is not set; add it to .env before calling the Hugging Face API")

# --- Tunable settings: edit these before running the scrape ---

# Search criteria; presumably sent as the request body by the scrape step
# (that cell's code is not shown here) -- TODO confirm.
data = {
    "sessionId": "",
    "search": "data",  # search keyword
    "salary": 6000,  # NOTE(review): looks like a minimum-salary filter -- confirm against the API
    "positionLevels": [
        "Executive",
        "Junior Executive",
        "Fresh/entry level",
    ],
    "postingCompany": [],
}

# First page of results, 20 listings per page.
start_url = "https://api.mycareersfuture.gov.sg/v2/search?limit=20&page=0"

# Path of the JSON file the listings are saved to and loaded back from.
json_save_file = "./jobslist.json"

# Delay, in seconds, handed to MyCareersFutureListings.
SLEEP_DELAY = 0.5


## Run the scrape and save to file

{'metadata': {'jobPostId': 'MCF-2023-0802627', 'updatedAt': '2023-10-20T08:15:38', 'newPostingDate': '2023-10-20', 'totalNumberJobApplication': 0, 'isPostedOnBehalf': False, 'isHideSalary': False, 'isHideHiringEmployerName': False, 'jobDetailsUrl': 'https://www.mycareersfuture.gov.sg/job/information-technology/data-analystdata-engineer-lyneer-corp-a2cc2d749776ff08d1df8ac0d7c2ecfd'}, 'hiringCompany': None, 'address': {'overseasCountry': None, 'foreignAddress1': None, 'foreignAddress2': None, 'block': '3', 'street': 'SHENTON WAY', 'floor': None, 'unit': None, 'building': 'SHENTON HOUSE', 'postalCode': '068805', 'isOverseas': False, 'districts': [{'id': 1, 'location': 'D01 Cecil, Marina, People’s Park, Raffles Place', 'region': 'Central', 'sectors': ['01', '02', '03', '04', '05', '06'], 'regionId': 'Central'}], 'lat': 1.27854545481515, 'lng': 103.850090052093}, 'positionLevels': [{'id': 9, 'position': 'Executive'}], 'schemes': [], 'postedCompany': {'uen': '202228958E', 'name': 'LYNEER COR

## Start here and load from file

In [4]:
# Read the listings back from the path configured in the Init cell.
# NOTE(review): load_json lives in src.mycareersfuture; presumably it parses
# the file written by the scrape step -- confirm.
lst = MyCareersFutureListings(sleep_delay=SLEEP_DELAY)
listings = lst.load_json(json_load_file=json_save_file)

# Peek at a single raw record to see which fields are available.
first_listing = listings[0]
print(first_listing)

{'metadata': {'jobPostId': 'MCF-2023-0802627', 'updatedAt': '2023-10-20T08:15:38', 'newPostingDate': '2023-10-20', 'totalNumberJobApplication': 0, 'isPostedOnBehalf': False, 'isHideSalary': False, 'isHideHiringEmployerName': False, 'jobDetailsUrl': 'https://www.mycareersfuture.gov.sg/job/information-technology/data-analystdata-engineer-lyneer-corp-a2cc2d749776ff08d1df8ac0d7c2ecfd'}, 'hiringCompany': None, 'address': {'overseasCountry': None, 'foreignAddress1': None, 'foreignAddress2': None, 'block': '3', 'street': 'SHENTON WAY', 'floor': None, 'unit': None, 'building': 'SHENTON HOUSE', 'postalCode': '068805', 'isOverseas': False, 'districts': [{'id': 1, 'location': 'D01 Cecil, Marina, People’s Park, Raffles Place', 'region': 'Central', 'sectors': ['01', '02', '03', '04', '05', '06'], 'regionId': 'Central'}], 'lat': 1.27854545481515, 'lng': 103.850090052093}, 'positionLevels': [{'id': 9, 'position': 'Executive'}], 'schemes': [], 'postedCompany': {'uen': '202228958E', 'name': 'LYNEER COR

# Drop unnecessary fields

`listings` still carries a lot of metadata. I'm still deciding which fields are relevant; for now it is reduced to the following:

In [5]:
# Project every raw listing onto the handful of fields used downstream.
reduced = [
    {
        'url': raw['metadata']['jobDetailsUrl'],
        'job_title': raw['title'],
        'job_desc': raw['job_desc'],
        'company': raw['postedCompany']['name'],
        'salary_min': raw['salary']['minimum'],
        'salary_max': raw['salary']['maximum'],
        # Flatten the list of skill records into one comma-separated string.
        'skills': ', '.join(entry['skill'] for entry in raw['skills']),
    }
    for raw in listings
]

# Show the first two reduced records as the cell output.
reduced[:2]


[{'url': 'https://www.mycareersfuture.gov.sg/job/information-technology/data-analystdata-engineer-lyneer-corp-a2cc2d749776ff08d1df8ac0d7c2ecfd',
  'job_title': 'Data Analyst/Data Engineer',
  'job_desc': 'Job Description:\n\n  Provides development and analytical support on various wealth products to ensure project goals are met. Adaptable to internal frameworks. Participates in the rollout of company-wide pilot programs developed as a result of programmed models. Duties primarily include the regular use of discretion, independent judgment, the ability to communicate with multiple levels of management and the utilization of core PRIDE behaviors.\n\nRequired and desired skills/qualifications:\n\n  Desired candidate should be having around 3-5 years of experience.\n  Have strong technical foundation with in-depth knowledge in Big Data Hadoop, Data Reporting, Data Design, Data Analysis, Data governance, Data integration and Data quality.\n  Experience in monitoring, Tuning tasks on Clouder

In [6]:
# Print every job description, followed by a run of blank lines so that
# consecutive postings are easy to tell apart in the output.
for posting in reduced:
    print(posting['job_desc'], "\n\n\n", sep="\n")

Job Description:

  Provides development and analytical support on various wealth products to ensure project goals are met. Adaptable to internal frameworks. Participates in the rollout of company-wide pilot programs developed as a result of programmed models. Duties primarily include the regular use of discretion, independent judgment, the ability to communicate with multiple levels of management and the utilization of core PRIDE behaviors.

Required and desired skills/qualifications:

  Desired candidate should be having around 3-5 years of experience.
  Have strong technical foundation with in-depth knowledge in Big Data Hadoop, Data Reporting, Data Design, Data Analysis, Data governance, Data integration and Data quality.
  Experience in monitoring, Tuning tasks on Cloudera distribution.
  Deep and extensive knowledge with HDFS, Spark, MapReduce, Hive, HBase, Sqoop, Yarn, Airflow.
  Thorough knowledge on Hadoop architecture and various components such as HDFS, Name Node, Data Node,

In [7]:
# Sample resume kept as one plain-text string (line breaks as in the pasted text).
# NOTE(review): this literal embeds personal contact details (email address and
# phone number); consider loading it from an untracked file before sharing or
# committing the notebook.
user_resume = """
Hanafi Haffidz
(Mohammad Hanafi Bin Md Haffidz)
CAIE Associate AI Engineer (Certified)
English, Malay (written, spoken) | Singaporean
hanafi.haffidz@gmail.com | +65 9150 6451
portfolio: gammaraysky.github.io
SUMMARY
Ex-filmmaker turned AI Engineer, with over ten years
experience in media and advertising, both as a freelancer
and company owner.
Experience in data engineering, machine learning, MLOps,
project management, design thinking, training, content
development, storytelling and communication.
EXPERIENCE
AI Apprentice (apprenticeship programme)
AI Singapore (Feb 2023 - Nov 2023)
● Core developer on an automated speech recognition project, collaborating closely with principal investigator
NTU Speechlab to develop a voice activity detection model for far-field dialogue recognition.
● Collaborated with a multidisciplinary team, effectively communicating findings, insights, and technical
concepts to both technical and non-technical stakeholders. Participated in sprint planning, sprint reviews,
technical 'Brown Bag' sharing sessions, and code reviews.
● Conducted exploratory data analysis and feature engineering on audio datasets, utilising a custom Gradio
frontend. This involved visual clustering of speech and non-speech classes for far-field vs. near-field audio
using t-SNE. Experimented with various features, including MFCC, mel-spectrograms, and Wav2Vec 2.0
embeddings.
● Authored a data engineering and orchestration pipeline for ETL processes on audio datasets. Pipeline
encompassed cleaning, validation, format conversions, data chunking, sampling, augmentation, model
training, and evaluation. Tools used included Kedro and Apache Airflow.
● Conducted literature review of current state-of-the-art models and performed model selection and evaluation.
Compared Wav2Vec 2.0, NeMo MarbleNet, PyAnNet, ZFF-VAD, and Robust VAD models with experiment
tracking using MLFlow, Tensorboard, and Pytorch Lightning. Further fine-tuned the selected model through
data augmentation and hyperparameter optimization using Optuna.
● Implemented unit, integration, and endpoint testing, as well as CI/CD pipelines using Gitlab.
● Developed a scalable model-serving application (can run locally or cloud deployed for larger workloads).
Technologies used include FastAPI, Celery, RabbitMQ, Redis, Docker, and Kubernetes.
● Mentored junior apprentices during their deep-skilling phase, focusing on computer vision and classical
supervised learning topics.
● Worked on several side projects, including:
○ Examination question generation, both open-ended and multiple-choice questions (MCQ), utilising
both OpenAI GPT API and open-source large language models.
○ Implemented Retrieval Augmented Generation using a custom fine-tuned Llama 2.0 and LlamaIndex
for a question-answering chatbot proof-of-concept.
○ Developed a multimodal hate speech detection system for image and textual data using ImageBind.
○ Worked on plant disease detection (Object Detection) using smaller CNN models designed for
deployment on edge devices.
Data Analyst/Research Coordinator, Special Projects (freelance/ad hoc)
Singapore Association of Motion Pictures Professionals (2019 - 2023)
● Coordinated focus group studies and townhall meet ups for media industry practitioners & companies to
gather snapshot of media industry (film/TV/advertising).
● Conduct exploratory data analysis, data cleaning, aggregation, visualisations and creations of reports.
Assisted executive committee in mapping out roadmap and key objectives for association, and presentation of
findings to IMDA.
● Consulted on database selection, data gathering, and analytics-related decisions for the association's ongoing
development of a specialised jobs portal platform exclusively for the local media industry. Acted as product
owner, defining design goals, prioritising features, and working closely with the development team throughout
the development of the jobs platform app.
Web Designer/Webmaster/Copywriter (freelance/ad hoc)
Singapore Association of Motion Pictures Professionals (2019 - 2020)
● Designed and maintained association website (https://www.sampp.org.sg). Tech stack included Wordpress,
jQuery, MySQL.
● Planning and copywriting of content for microsite (https://www.sampp.org.sg/mph) to educate media
practitioners on many issues ranging from contracts, legal rights and statuses as a freelancer, to available
government grants or information on permits such as for filming with aerial unmanned aircraft. Also consulted
with lawyers during this process to draft boilerplate legal contracts that are free to use/modify.
● Setup & maintenance of domain, webhost accounts, linking of APIs (e.g. Google Drive, emails, with onsite
newsletters/forms), scripted custom Wordpress plugins.
Documentary Video Producer (freelance) (2017 - 2021)
● Applied design thinking framework and worked closely with clients to scope out requirements and messaging.
● Craft story/script, organise production timeline, budget, hire additional crew or on-camera talent.
● Conduct and film interviews, B-roll footage, produce graphical assets. Edit content with several rounds of
review with client, filming additional content as needed.
● Clients included Facebook, Twitter, Cisco Systems, A*STAR, Ministry of Home Affairs, Channel News Asia,
VICE
● Highlights:
○ Over 30 videos produced for Facebook APAC across Singapore, Malaysia, Japan, Hong Kong,
Taiwan.
○ Docu-series for CNA ("Into the Vault"), MHA ("Frontliners")

"""

# Display the first reduced listing's job description as the cell output.
reduced[0]['job_desc']



'Job Description:\n\n  Provides development and analytical support on various wealth products to ensure project goals are met. Adaptable to internal frameworks. Participates in the rollout of company-wide pilot programs developed as a result of programmed models. Duties primarily include the regular use of discretion, independent judgment, the ability to communicate with multiple levels of management and the utilization of core PRIDE behaviors.\n\nRequired and desired skills/qualifications:\n\n  Desired candidate should be having around 3-5 years of experience.\n  Have strong technical foundation with in-depth knowledge in Big Data Hadoop, Data Reporting, Data Design, Data Analysis, Data governance, Data integration and Data quality.\n  Experience in monitoring, Tuning tasks on Cloudera distribution.\n  Deep and extensive knowledge with HDFS, Spark, MapReduce, Hive, HBase, Sqoop, Yarn, Airflow.\n  Thorough knowledge on Hadoop architecture and various components such as HDFS, Name Node,

# Employing LLMs for Semantic Similarity/RAG/Summarization

- Some experimentation is needed to find the best approach, given the following constraints:
- each JD is probably 3-4 paragraphs of text
- the resume a user may wish to supply is also at least a one-pager of text
- do we try summarizing each of those first, then compare the embeddings of the summaries for semantic similarity?
  - but summarization quality also varies: some models I've tested, when asked to summarize only the skills required, just returned 'data engineer'
- do we try to split every sentence, make a list of embeddings, score every resume sentence against every JD sentence, and somehow keep only the maximum scores/similarities? (this sounds complicated)


### Flan T5 XXL : "Extract the skills required for the below job description"

In [6]:
!pip install -U langchain

Collecting langchain
  Obtaining dependency information for langchain from https://files.pythonhosted.org/packages/ad/b1/6bb5006471264b5d75fcf0e3d7ed8d0bfc4ec335e08e05abf5900c42aa43/langchain-0.0.325-py3-none-any.whl.metadata
  Using cached langchain-0.0.325-py3-none-any.whl.metadata (16 kB)
Using cached langchain-0.0.325-py3-none-any.whl (1.9 MB)
Installing collected packages: langchain
  Attempting uninstall: langchain
    Found existing installation: langchain 0.0.220
    Uninstalling langchain-0.0.220:
      Successfully uninstalled langchain-0.0.220
Successfully installed langchain-0.0.325


DEPRECATION: pytorch-lightning 1.6.5 has a non-standard dependency specifier torch>=1.8.*. pip 23.3 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pytorch-lightning or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
llama-index 0.8.53.post3 requires nest-asyncio<2.0.0,>=1.5.8, but you have nest-asyncio 1.5.6 which is incompatible.

[notice] A new release of pip is available: 23.2.1 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
# Hosted Inference API endpoint for the model under test.
# Alternative endpoint kept for reference:
#   https://api-inference.huggingface.co/models/togethercomputer/RedPajama-INCITE-Chat-3B-v1
API_URL = "https://api-inference.huggingface.co/models/google/flan-t5-xxl"
headers = {"Authorization": f"Bearer {HF_TOKEN}"}

REQUEST_TIMEOUT = 60  # secs


def query(payload, timeout=REQUEST_TIMEOUT):
    """POST ``payload`` as JSON to ``API_URL`` and return the decoded JSON reply.

    Args:
        payload: JSON-serialisable request body, e.g. ``{"inputs": "<prompt>"}``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The parsed JSON body of the response.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
    if not response.ok:
        # Log the body before raising: it carries the API's own explanation,
        # which the HTTPError message alone does not include.
        logger.error("Inference API returned %s: %s", response.status_code, response.text)
    # Fail here with a clear HTTP error rather than later, when a caller
    # indexes into an error payload.
    response.raise_for_status()
    return response.json()

# Ask the model to extract the required skills from at most the first four
# job descriptions and print its raw answer, one line per listing.
for listing in reduced[:4]:
    prompt = f"Extract the skills required for the below job description: \n{listing['job_desc']}"
    output = query({"inputs": prompt})
    generated = output[0]['generated_text']
    print(generated)


Data Architecture, Docker, S3, Git, PySpark, Kubernetes
Data Engineer
Data Analysis, Catalogs, Data Management, Data Quality, SQL, SAP, Data Migration, Attention
Data Engineer


### Flan T5 XXL : "Summarize the job skills requirements in 200 words"

In [6]:
# Same experiment with a summarisation prompt: send at most the first four job
# descriptions and print the text the model generates for each.
for listing in reduced[:4]:
    prompt = f"Summarize the job skills requirements in 200 words: \n{listing['job_desc']}"
    output = query({"inputs": prompt})
    generated = output[0]['generated_text']
    print(generated)

Data Engineer - Design, Architect, Deploy, and maintain solutions on AWS and
Data Engineer - Microsoft Azure - Town Area - MNC, good corporate culture and 5-
Data Analyst for a global mining company. In the capacity of a Data Analyst, your primary
Data Engineer - Fintech - New York, NY - 5+ years of experience in
