# Environment Setup

1. Create a ``.env`` file containing:

```
      HF_TOKEN=<huggingface_token>  # not in use for now
      OPENAI_API_KEY=<openai_token>
```

2. Create a virtual environment:


In [None]:
!conda create --name llamaindex python=3.9
!conda activate llamaindex
!pip install -r requirements.txt

# Get listings from MyCareersFuture website
- edit ``conf/base/config.yml`` to change the search item as needed
- run the below to effect the search and it will save search results as a json file (``./data/scraper_results.json``, or change in the config)

In [None]:
!python 0-scraper.py

# RAG / Semantic Search between user's resume and the saved job listings
- create a file called ``data/resume.txt`` and paste in your resume there.
- run:

In [1]:
import logging
import os

import chromadb
import requests
import yaml
from dotenv import load_dotenv
from langchain.embeddings import HuggingFaceEmbeddings
from llama_index import Document, ServiceContext, StorageContext, VectorStoreIndex
from llama_index.embeddings import LangchainEmbedding  # BertEmbedding
from llama_index.llms import OpenAI
from llama_index.query_engine import RetrieverQueryEngine
from llama_index.response_synthesizers import get_response_synthesizer
from llama_index.vector_stores import ChromaVectorStore

from src.mycareersfuture import MyCareersFutureListings

logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)


def read_yaml_config(file_path):
    with open(file_path, "r") as file:
        config = yaml.safe_load(file)
    return config


file_path = "conf/base/config.yml"
config = read_yaml_config(file_path)
print(config)

# Load environment variables from .env
load_dotenv()

HF_TOKEN = os.getenv("HF_TOKEN")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
os.environ["LLAMA_INDEX_CACHE_DIR"] = "cache"


mcf_listings = MyCareersFutureListings(sleep_delay=config["scraper_delay"])

listings = mcf_listings.load_json(json_load_file=config["scraper_results_file"])


### REDUCE DATASET TO RELEVANT FIELDS ###
reduced = []
for listing in listings:
    reduced.append(
        {
            "url": listing["metadata"]["jobDetailsUrl"],
            "job_title": listing["title"],
            "job_desc": listing["job_desc"],
            "company": listing["postedCompany"]["name"],
            "salary_min": listing["salary"]["minimum"],
            "salary_max": listing["salary"]["maximum"],
            "skills": ", ".join([skill["skill"] for skill in listing["skills"]]),
        }
    )

### CREATE DOCUMENTS FROM ALL THE RETURNED LISTINGS ###
documents = [
    Document(
        text=listing["job_desc"],
        metadata={
            "url": listing["url"],
            "job_title": listing["job_title"],
            "company": listing["company"],
            "salary_min": listing["salary_min"],
            "salary_max": listing["salary_max"],
            "skills": listing["skills"],
        },
        excluded_llm_metadata_keys=["url", "salary_min", "salary_max"],
        excluded_embed_metadata_keys=["url", "salary_min", "salary_max"],
        metadata_separator="::",
        metadata_template="{key}->{value}",
        text_template="Job Listing Metadata: {metadata_str}\n-----\nJob Listing: {content}\n-----\n",
    )
    for listing in reduced
]

### CREATE VECTOR STORE ###
db = chromadb.PersistentClient(path="./chroma_db")
chroma_collection = db.get_or_create_collection("quickstart")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
embed_model = LangchainEmbedding(
    HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
)
service_context_embedding = ServiceContext.from_defaults(embed_model=embed_model)
index = VectorStoreIndex.from_documents(
    documents,
    storage_context=storage_context,
    service_context=service_context_embedding,
)

### SET UP SERVICE CONTEXT ###
service_context_llm = ServiceContext.from_defaults(
    llm=OpenAI(
        model="gpt-3.5-turbo",
        temperature=0.1,
    ),
)

### CREATE RETRIEVER ###
retriever = index.as_retriever(
    similarity_top_k=config["similarity_top_k"])

response_synthesizer = get_response_synthesizer(
    response_mode="compact",
    service_context=service_context_llm,
    use_async=False,
    streaming=False,
)

### CREATE QUERY ENGINE ###
query_engine = RetrieverQueryEngine.from_args(
    retriever=retriever, response_synthesizer=response_synthesizer
)

### LOAD USER'S RESUME ###
with open(config["user_resume_txt_file"], "r", encoding="utf8") as file:
    user_resume = file.read()

### PROMPT TEMPLATE ###
user_input = (
    f"INSTRUCTION:\n{config['instruction_prompt']}\n\n RESUME:\n{user_resume}\n"
)



2023-11-10 15:53:45,017 - chromadb.telemetry.product.posthog - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


{'scraper_query': {'sessionId': '', 'search': 'data', 'salary': 6000, 'positionLevels': ['Executive', 'Junior Executive', 'Fresh/entry level'], 'postingCompany': []}, 'scraper_delay': 0.5, 'scraper_starturl': 'https://api.mycareersfuture.gov.sg/v2/search?limit=20&page=0', 'scraper_results_file': './data/scraper_results.json', 'user_resume_txt_file': './data/resume.txt', 'similarity_top_k': 10, 'instruction_prompt': "Given the user's resume below, which of the given job listings are the best match with the user's job experience? Return the top 3 matches company names and urls."}


  from .autonotebook import tqdm as notebook_tqdm
2023-11-10 15:53:51,374 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2
2023-11-10 15:53:53,002 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device: cpu
Batches: 100%|██████████| 1/1 [00:00<00:00,  1.04it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  1.14it/s]


In [2]:

### RUN QUERY ###
result = query_engine.query(user_input)
print(f"Answer: {str(result)}")

Batches: 100%|██████████| 1/1 [00:00<00:00,  8.67it/s]
2023-11-10 15:55:26,632 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2023-11-10 15:55:29,431 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2023-11-10 15:55:32,574 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2023-11-10 15:55:35,901 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2023-11-10 15:55:38,716 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2023-11-10 15:55:42,114 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Answer: Based on the user's job experience, the top 3 job listings that are the best match are:

1. XYZ Company - Data Scientist (URL: www.xyzcompany.com/careers)
2. ABC Corporation - Data Scientist (URL: www.abccorporation.com/careers)
3. DEF Industries - Data Scientist (URL: www.defindustries.com/careers)


In [3]:
result.source_nodes

[NodeWithScore(node=TextNode(id_='b7ba439e-b5b5-42a6-8573-a304cea84377', embedding=None, metadata={'url': 'https://www.mycareersfuture.gov.sg/job/information-technology/data-scientist-allegis-group-singapore-ab949323fb59c2fafdcea7b958159208', 'job_title': 'Data Scientist', 'company': 'ALLEGIS GROUP SINGAPORE PRIVATE LIMITED', 'salary_min': 8000, 'salary_max': 16000, 'skills': 'Machine Learning, SciPy, Pandas, Data Analysis, Azure, Customer Interaction, Experimentation, Natural Language Processing, Agile Methodologies, PyTorch, SQL, Python, Data Science, Matplotlib, Data Visualization'}, excluded_embed_metadata_keys=['url', 'salary_min', 'salary_max'], excluded_llm_metadata_keys=['url', 'salary_min', 'salary_max'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='21517929-472c-4960-8b1b-1bcd0e383f83', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'url': 'https://www.mycareersfuture.gov.sg/job/information-technology/data-scientist-allegis-group-singapore-ab949323

In [9]:
for node in result.source_nodes:
    print(node.metadata)
    print(node.get_text())
    print('--------------------\n\n')

    # print(node.document.metadata)

{'url': 'https://www.mycareersfuture.gov.sg/job/information-technology/data-scientist-allegis-group-singapore-ab949323fb59c2fafdcea7b958159208', 'job_title': 'Data Scientist', 'company': 'ALLEGIS GROUP SINGAPORE PRIVATE LIMITED', 'salary_min': 8000, 'salary_max': 16000, 'skills': 'Machine Learning, SciPy, Pandas, Data Analysis, Azure, Customer Interaction, Experimentation, Natural Language Processing, Agile Methodologies, PyTorch, SQL, Python, Data Science, Matplotlib, Data Visualization'}
Job Summary
We're seeking a proactive Data Scientist to drive data-driven insights. Your role includes defining project goals, performing exploratory data analysis, selecting appropriate machine learning models, and communicating findings effectively. Collaboration with Data/ML engineers, infrastructure development, and staying updated with industry trends are key.

Responsibilities
· Work closely with stakeholders to define project objectives, gather requirements, and pinpoint data sources.

  Perfo