In [None]:
# %pip install llama-index transformers accelerate bitsandbytes

In [5]:
import logging
import os

import chromadb
import requests
import yaml
from dotenv import load_dotenv
from langchain.embeddings import HuggingFaceEmbeddings
from llama_index import Document, ServiceContext, StorageContext, VectorStoreIndex
from llama_index.embeddings import LangchainEmbedding  # BertEmbedding
from llama_index.llms import OpenAI
from llama_index.query_engine import RetrieverQueryEngine
from llama_index.response_synthesizers import get_response_synthesizer
from llama_index.vector_stores import ChromaVectorStore

from src.mycareersfuture import MyCareersFutureListings

# Configure logging once for the notebook: INFO and above, rendered as
# "<time> - <logger name> - <level> - <message>".
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
# Logger named after this module, for messages emitted by the notebook itself.
logger = logging.getLogger(__name__)


def read_yaml_config(file_path):
    """Load a YAML configuration file and return its parsed content.

    Args:
        file_path: Path of the YAML file to read.

    Returns:
        The object produced by ``yaml.safe_load`` for the file's content.

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the content is not valid YAML.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding (the resume file is read the same way).
    with open(file_path, "r", encoding="utf-8") as file:
        return yaml.safe_load(file)


# Read the pipeline settings (scraper file, top-k, prompt, ...) from YAML.
file_path = "conf/base/config.yml"
config = read_yaml_config(file_path)
print(config)

# Load environment variables from .env
load_dotenv()

HF_TOKEN = os.getenv("HF_TOKEN")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# Assigning None into os.environ raises an opaque TypeError, so fail early
# with a message that says what is actually missing.
if not OPENAI_API_KEY:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; define it in .env or in the environment."
    )
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
os.environ["LLAMA_INDEX_CACHE_DIR"] = "cache"


# Load the previously scraped listings from the JSON file named in the config.
mcf_listings = MyCareersFutureListings(sleep_delay=config["scraper_delay"])

listings = mcf_listings.load_json(json_load_file=config["scraper_results_file"])


### REDUCE DATASET TO RELEVANT FIELDS ###
# Flatten every raw listing to the handful of fields used downstream; the
# skill names are collapsed into a single comma-separated string.
reduced = [
    {
        "url": raw["metadata"]["jobDetailsUrl"],
        "job_title": raw["title"],
        "job_desc": raw["job_desc"],
        "company": raw["postedCompany"]["name"],
        "salary_min": raw["salary"]["minimum"],
        "salary_max": raw["salary"]["maximum"],
        "skills": ", ".join(entry["skill"] for entry in raw["skills"]),
    }
    for raw in listings
]

### CREATE DOCUMENTS FROM ALL THE RETURNED LISTINGS ###
# One Document per reduced listing: the job description is the text and the
# remaining fields travel as metadata. url and both salary bounds are excluded
# from the text shown to the LLM and to the embedding model.
documents = [
    Document(
        text=job["job_desc"],
        metadata={
            field: job[field]
            for field in (
                "url",
                "job_title",
                "company",
                "salary_min",
                "salary_max",
                "skills",
            )
        },
        excluded_llm_metadata_keys=["url", "salary_min", "salary_max"],
        excluded_embed_metadata_keys=["url", "salary_min", "salary_max"],
        metadata_separator="::",
        metadata_template="{key}->{value}",
        text_template="Job Listing Metadata: {metadata_str}\n-----\nJob Listing: {content}\n-----\n",
    )
    for job in reduced
]

### CREATE VECTOR STORE ###
db = chromadb.PersistentClient(path="./chroma_db")
chroma_collection = db.get_or_create_collection("quickstart")
# The collection is persisted on disk, so re-running this cell would add the
# same listings again on top of what an earlier run stored, and retrieval would
# then return duplicates. Clear leftovers so the index holds exactly `documents`.
if chroma_collection.count() > 0:
    chroma_collection.delete(ids=chroma_collection.get(include=[])["ids"])
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
# Local sentence-transformers model used for embedding listings and queries.
embed_model = LangchainEmbedding(
    HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
)
service_context_embedding = ServiceContext.from_defaults(embed_model=embed_model)
index = VectorStoreIndex.from_documents(
    documents,
    storage_context=storage_context,
    service_context=service_context_embedding,
)

### CREATE RETRIEVER ###
# Similarity retriever over the vector index; the number of hits comes from the config.
retriever = index.as_retriever(similarity_top_k=config["similarity_top_k"])

### SET UP SERVICE CONTEXT ###
# Service context carrying the chat model used to synthesize the answer.
service_context_llm = ServiceContext.from_defaults(
    llm=OpenAI(temperature=0.1, model="gpt-3.5-turbo"),
)

# Synchronous, non-streaming synthesizer in "compact" mode.
response_synthesizer = get_response_synthesizer(
    service_context=service_context_llm,
    response_mode="compact",
    streaming=False,
    use_async=False,
)

### CREATE QUERY ENGINE ###
# Retrieval followed by answer synthesis.
query_engine = RetrieverQueryEngine.from_args(
    response_synthesizer=response_synthesizer,
    retriever=retriever,
)

### LOAD USER'S RESUME ###
# The resume path is taken from the config; the file is decoded as UTF-8.
with open(config["user_resume_txt_file"], mode="r", encoding="utf8") as file:
    user_resume = file.read()

### PROMPT TEMPLATE ###
# Instruction from the config first, then the raw resume text.
user_input = "INSTRUCTION:\n{}\n\n RESUME:\n{}\n".format(
    config["instruction_prompt"], user_resume
)



{'scraper_starturl': 'https://api.mycareersfuture.gov.sg/v2/search?limit=20&page=0', 'scraper_results_file': './data/scraper_results.json', 'scraper_delay': 0.5, 'scraper_query': {'sessionId': '', 'search': 'data', 'salary': 6000, 'positionLevels': ['Executive', 'Junior Executive', 'Fresh/entry level'], 'postingCompany': []}, 'user_resume_txt_file': './data/resume.txt', 'similarity_top_k': 20, 'instruction_prompt': "Which of the given job listings best match the user's resume below? Return the company name for the top 5 best fits."}


2023-11-10 15:18:31,332 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2
2023-11-10 15:18:31,729 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device: cpu
Batches: 100%|██████████| 1/1 [00:00<00:00,  1.02it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  1.05it/s]


In [11]:

### RUN QUERY ###
# Send the instruction + resume prompt through the query engine and show the answer.
result = query_engine.query(user_input)
print("Answer: " + str(result))

Batches: 100%|██████████| 1/1 [00:00<00:00, 12.89it/s]
2023-11-10 15:23:32,147 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2023-11-10 15:23:34,697 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2023-11-10 15:23:43,945 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2023-11-10 15:23:53,426 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2023-11-10 15:24:04,533 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2023-11-10 15:24:15,095 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2023-11-10 15:24:26,103 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2023-11-10 15:24:37,419 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/complet

Answer: Based on the user's job experience, the top 3 job listings that are the best match are:

1. Company: AI Singapore
   - Job Title: Data Scientist
   - Duration: Feb 2023 - Nov 2023
   - Key Responsibilities:
     - Collaborated closely with a multidisciplinary team and effectively communicated findings and insights to both technical and non-technical stakeholders.
     - Conducted exploratory data analysis and feature engineering on audio datasets.
     - Authored a data engineering and orchestration pipeline for ETL processes on audio datasets.
     - Conducted literature review of current state-of-the-art models and performed model selection and evaluation.
     - Developed a scalable model-serving application using FastAPI, Celery, RabbitMQ, Redis, Docker, and Kubernetes.
     - Mentored junior apprentices during their deep-skilling phase.

2. Company: Singapore Association of Motion Pictures Professionals
   - Job Title: Data Analyst/Research Coordinator
   - Duration: 2019 

In [1]:
import logging
import os

import chromadb
import requests
from dotenv import load_dotenv
from langchain.embeddings import HuggingFaceEmbeddings
from llama_index import Document, ServiceContext, StorageContext, VectorStoreIndex
from llama_index.embeddings import LangchainEmbedding  # BertEmbedding
from llama_index.llms import OpenAI
from llama_index.query_engine import RetrieverQueryEngine
from llama_index.response_synthesizers import get_response_synthesizer
from llama_index.vector_stores import ChromaVectorStore

from src.mycareersfuture import MyCareersFutureListings

In [2]:
# Pull variables from .env into the process environment.
load_dotenv()
os.environ["LLAMA_INDEX_CACHE_DIR"] = "cache"
HF_TOKEN = os.getenv("HF_TOKEN")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# Assigning None into os.environ raises an opaque TypeError, so fail early
# with a message that says what is actually missing.
if not OPENAI_API_KEY:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; define it in .env or in the environment."
    )
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY


JSON_LOAD_FILE = "./jobslist.json"
SLEEP_DELAY = 0.5

# Load the previously scraped listings from JSON_LOAD_FILE.
mcf_listings = MyCareersFutureListings(sleep_delay=SLEEP_DELAY)
listings = mcf_listings.load_json(json_load_file=JSON_LOAD_FILE)

# Keep only the fields needed for indexing; skill names are joined into one
# comma-separated string per listing.
reduced = [
    {
        "url": raw["metadata"]["jobDetailsUrl"],
        "job_title": raw["title"],
        "job_desc": raw["job_desc"],
        "company": raw["postedCompany"]["name"],
        "salary_min": raw["salary"]["minimum"],
        "salary_max": raw["salary"]["maximum"],
        "skills": ", ".join(entry["skill"] for entry in raw["skills"]),
    }
    for raw in listings
]

# One Document per reduced listing: description as text, remaining fields as
# metadata. url and both salary bounds are excluded from the text given to the
# LLM and to the embedding model.
documents = [
    Document(
        text=job["job_desc"],
        metadata={
            field: job[field]
            for field in (
                "url",
                "job_title",
                "company",
                "salary_min",
                "salary_max",
                "skills",
            )
        },
        excluded_llm_metadata_keys=["url", "salary_min", "salary_max"],
        excluded_embed_metadata_keys=["url", "salary_min", "salary_max"],
        metadata_separator="::",
        metadata_template="{key}->{value}",
        text_template="Metadata: {metadata_str}\n-----\nJob Listing: {content}",
    )
    for job in reduced
]


db = chromadb.PersistentClient(path="./chroma_db")
chroma_collection = db.get_or_create_collection("quickstart")
# The collection is persisted on disk, so re-running this cell would add the
# same listings again on top of what an earlier run stored, and retrieval would
# then return duplicates. Clear leftovers so the index holds exactly `documents`.
if chroma_collection.count() > 0:
    chroma_collection.delete(ids=chroma_collection.get(include=[])["ids"])
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
# Local sentence-transformers model used for embedding listings and queries.
embed_model = LangchainEmbedding(
    HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
)
service_context_embedding = ServiceContext.from_defaults(embed_model=embed_model)
index = VectorStoreIndex.from_documents(
    documents,
    storage_context=storage_context,
    service_context=service_context_embedding,
)

# Similarity retriever over the vector index, returning the 10 closest nodes.
retriever = index.as_retriever(similarity_top_k=10)

# Service context carrying the chat model used to synthesize the answer.
service_context_llm = ServiceContext.from_defaults(
    llm=OpenAI(temperature=0.1, model="gpt-3.5-turbo"),
)

# Synchronous, non-streaming synthesizer in "compact" mode.
response_synthesizer = get_response_synthesizer(
    service_context=service_context_llm,
    response_mode="compact",
    streaming=False,
    use_async=False,
)

# Retrieval followed by answer synthesis.
query_engine = RetrieverQueryEngine.from_args(
    response_synthesizer=response_synthesizer,
    retriever=retriever,
)

# Decode the resume explicitly as UTF-8 (as the config-driven cell does) so the
# prompt text does not depend on the platform's default locale encoding.
with open('./data/resume.txt', 'r', encoding='utf8') as file:
    content = file.read()

# Ask which retrieved listings fit the resume and print the synthesized answer.
user_input = f"Which of the retrieved job listings best match the user's resume below? Return the company name for the top 3 best fits.\n\n RESUME:\n{content}\n"
result = query_engine.query(user_input)
print(f"Answer: {str(result)}")

2023-11-10 14:34:35,398 - chromadb.telemetry.product.posthog - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
  from .autonotebook import tqdm as notebook_tqdm
2023-11-10 14:34:41,142 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2
2023-11-10 14:34:42,770 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device: cpu
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.10s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00,  1.03it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  7.38it/s]
2023-11-10 14:34:50,104 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2023-11-10 14:34:53,309 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2023-11-10 14:34:55,214 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HT

Answer: Based on the skills and experience listed in the resume, the top 3 job listings that best match the user's skills are:

1. FINERGIC SOLUTIONS PTE. LTD. - Data Engineer
2. ABC Company - Data Engineer
3. XYZ Corporation - Data Engineer


In [4]:
# Display the scored nodes attached to the query result, for inspection.
result.source_nodes

[NodeWithScore(node=TextNode(id_='dbe92a83-2528-4d48-bce5-ead57fafb9e4', embedding=None, metadata={'url': 'https://www.mycareersfuture.gov.sg/job/information-technology/data-analystdata-engineer-lyneer-corp-a2cc2d749776ff08d1df8ac0d7c2ecfd', 'job_title': 'Data Analyst/Data Engineer', 'company': 'LYNEER CORP (SINGAPORE) PTE. LTD.', 'salary_min': 5000, 'salary_max': 7000, 'skills': 'Airflow, Data Analysis, Big Data, Hadoop, Agile, Data Integration, Data Quality, Data Governance, MapReduce, Data Design, Tuning, SQL, Debugging, Software Development'}, excluded_embed_metadata_keys=['url', 'salary_min', 'salary_max'], excluded_llm_metadata_keys=['url', 'salary_min', 'salary_max'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='37210b26-5b33-469e-aadd-0a45aa4e6998', node_type=None, metadata={'url': 'https://www.mycareersfuture.gov.sg/job/information-technology/data-analystdata-engineer-lyneer-corp-a2cc2d749776ff08d1df8ac0d7c2ecfd', 'job_title': 'Data Analyst/Data Engi

In [None]:
# from llama_index.retrievers import VectorIndexRetriever
# from llama_index import RetrieverQueryEngine, RagResponseSynthesizer
# from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
# from llama_index import ResponseSynthesizer, ServiceContext
# from transformers import AutoModelForCausalLM, AutoTokenizer
def setup_environment():
    """Load variables from .env and prepare the process environment.

    Sets ``LLAMA_INDEX_CACHE_DIR`` to ``"cache"`` and makes sure
    ``OPENAI_API_KEY`` is present in ``os.environ``.

    Raises:
        RuntimeError: If ``OPENAI_API_KEY`` is missing or empty after loading .env.
    """
    # Load environment variables from .env
    load_dotenv()
    os.environ["LLAMA_INDEX_CACHE_DIR"] = "cache"

    openai_api_key = os.getenv("OPENAI_API_KEY")
    # Assigning None into os.environ raises an opaque TypeError, so fail early
    # with a message that says what is actually missing.
    if not openai_api_key:
        raise RuntimeError(
            "OPENAI_API_KEY is not set; define it in .env or in the environment."
        )
    os.environ["OPENAI_API_KEY"] = openai_api_key


def create_document_from_file(file_path):
    """Wrap the text content of a file in a ``Document``.

    Args:
        file_path: Path of the text file to read; also stored as the
            ``file_path`` metadata entry.

    Returns:
        A ``Document`` whose text is the file content. The ``file_path``
        metadata key is excluded from both the LLM and the embedding text.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Decode explicitly as UTF-8 so the text does not depend on the platform's
    # default locale encoding.
    with open(file_path, 'r', encoding='utf8') as file:
        content = file.read()

    resume = Document(
        text=content,
        metadata={
            'file_path': file_path,
        },
        excluded_llm_metadata_keys=['file_path'],
        excluded_embed_metadata_keys=['file_path'],
        metadata_separator="::",
        metadata_template="{key}->{value}",
        text_template="Metadata: {metadata_str}\n-----\nContent: {content}",
    )

    return resume

def load_documents(json_load_file="./jobslist.json", sleep_delay=0.5):
    """Load scraped job listings and turn each one into a ``Document``.

    Args:
        json_load_file: Path of the JSON file holding the scraped listings.
            Defaults to the path that used to be hard-coded here.
        sleep_delay: Value passed to ``MyCareersFutureListings(sleep_delay=...)``.

    Returns:
        A list with one ``Document`` per listing. The job description is the
        text; url, job title, company, salary bounds and a comma-separated
        skill string are metadata. url and both salary bounds are excluded
        from the text given to the LLM and to the embedding model.
    """
    mcf_listings = MyCareersFutureListings(sleep_delay=sleep_delay)
    listings = mcf_listings.load_json(json_load_file=json_load_file)

    # Keep only the fields needed for indexing.
    reduced = [
        {
            "url": listing["metadata"]["jobDetailsUrl"],
            "job_title": listing["title"],
            "job_desc": listing["job_desc"],
            "company": listing["postedCompany"]["name"],
            "salary_min": listing["salary"]["minimum"],
            "salary_max": listing["salary"]["maximum"],
            "skills": ", ".join(skill["skill"] for skill in listing["skills"]),
        }
        for listing in listings
    ]

    documents = [
        Document(
            text=listing["job_desc"],
            metadata={
                "url": listing["url"],
                "job_title": listing["job_title"],
                "company": listing["company"],
                "salary_min": listing["salary_min"],
                "salary_max": listing["salary_max"],
                "skills": listing["skills"],
            },
            excluded_llm_metadata_keys=["url", "salary_min", "salary_max"],
            excluded_embed_metadata_keys=["url", "salary_min", "salary_max"],
            metadata_separator="::",
            metadata_template="{key}->{value}",
            text_template="Metadata: {metadata_str}\n-----\nJob Listing: {content}",
        )
        for listing in reduced
    ]

    return documents


def setup_index(documents):
    """Embed ``documents`` into the persistent Chroma collection and return the index.

    The collection ``"quickstart"`` under ``./chroma_db`` is emptied first, so
    after the call it holds only what was built from ``documents``.

    Args:
        documents: The ``Document`` objects to embed and store.

    Returns:
        The ``VectorStoreIndex`` built over the Chroma-backed vector store.
    """
    db = chromadb.PersistentClient(path="./chroma_db")
    chroma_collection = db.get_or_create_collection("quickstart")
    # The collection is persisted on disk, so every call would otherwise add the
    # same listings again on top of earlier runs and retrieval would return
    # duplicates. Clear leftovers before indexing.
    if chroma_collection.count() > 0:
        stale_ids = chroma_collection.get(include=[])["ids"]
        chroma_collection.delete(ids=stale_ids)
    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    # Local sentence-transformers model used for embedding listings and queries.
    embed_model = LangchainEmbedding(
        HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    )
    service_context_embedding = ServiceContext.from_defaults(embed_model=embed_model)
    return VectorStoreIndex.from_documents(
        documents,
        storage_context=storage_context,
        service_context=service_context_embedding,
    )


def setup_query_engine(index, similarity_top_k=10):
    """Build a retriever-backed query engine on top of ``index``.

    Args:
        index: The vector index to retrieve from.
        similarity_top_k: Number of nodes the retriever returns per query.
            Defaults to 10, the value that used to be hard-coded here.

    Returns:
        A ``RetrieverQueryEngine`` that retrieves from ``index`` and
        synthesizes the answer with gpt-3.5-turbo in "compact" mode.
    """
    # Service context carrying the chat model used to synthesize the answer.
    service_context_llm = ServiceContext.from_defaults(
        llm=OpenAI(
            model="gpt-3.5-turbo",
            temperature=0.1,
        ),
    )
    retriever = index.as_retriever(similarity_top_k=similarity_top_k)

    # Synchronous, non-streaming synthesizer.
    response_synthesizer = get_response_synthesizer(
        response_mode="compact",
        service_context=service_context_llm,
        use_async=False,
        streaming=False,
    )

    return RetrieverQueryEngine.from_args(
        retriever=retriever, response_synthesizer=response_synthesizer
    )


def get_logger():
    """Configure INFO-level logging and return this module's logger.

    Returns:
        The ``logging.Logger`` registered under this module's ``__name__``.
    """
    log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    logging.basicConfig(format=log_format, level=logging.INFO)
    return logging.getLogger(__name__)

In [None]:
# Decode the resume explicitly as UTF-8 (as the config-driven cell does) so the
# text does not depend on the platform's default locale encoding.
with open('./data/resume.txt', 'r', encoding='utf8') as file:
    content = file.read()


In [None]:
def main(resume_path="./data/resume.txt"):
    """Run the whole pipeline: index the listings, query with the resume, print the answer.

    Args:
        resume_path: Path of the plain-text resume inserted into the prompt.
    """
    # Called for its side effect of configuring logging; the logger is not used here.
    get_logger()
    setup_environment()

    documents = load_documents()
    index = setup_index(documents)
    query_engine = setup_query_engine(index)

    # Read the resume here instead of relying on a `content` global left behind
    # by another cell, which is undefined on a fresh kernel.
    with open(resume_path, "r", encoding="utf8") as resume_file:
        resume_text = resume_file.read()

    user_input = f"Which of the retrieved job listings best match the user's resume below? Return the company name for the top 3 best fits.\n\n RESUME:\n{resume_text}\n"
    result = query_engine.query(user_input)
    print(f"Answer: {str(result)}")


if __name__ == "__main__":
    main()


In [None]:
# Configure logging and the environment before touching any external service.
logger = get_logger()
setup_environment()

# Listings -> vector index -> query engine.
documents = load_documents()
index = setup_index(documents)
query_engine = setup_query_engine(index)

# Ask which retrieved listings fit the resume text held in `content`.
user_input = "Which of the retrieved job listings best match the user's resume below? Return the company name for the top 3 best fits.\n\n RESUME:\n{}\n".format(
    content
)
result = query_engine.query(user_input)
print("Answer: " + str(result))

In [None]:
# The retrieved nodes are exposed as `source_nodes` (the attribute the earlier
# inspection cell reads); `result.nodes` is not an attribute of the response.
result.source_nodes

In [None]:
# Display the retriever object for inspection.
retriever

In [None]:
# Start by loading your data in the form of Document objects. LlamaIndex has several data loaders which can help you load Documents via the load_data method.

from llama_index import Document

# data_loader = DataLoader('path_to_your_data')
# documents = data_loader.load_data()



# One Document per reduced listing: description as text, remaining fields as
# metadata. url and both salary bounds are excluded from the text given to the
# LLM and to the embedding model.
documents = [
    Document(
        text=job["job_desc"],
        metadata={
            field: job[field]
            for field in (
                "url",
                "job_title",
                "company",
                "salary_min",
                "salary_max",
                "skills",
            )
        },
        excluded_llm_metadata_keys=["url", "salary_min", "salary_max"],
        excluded_embed_metadata_keys=["url", "salary_min", "salary_max"],
        metadata_separator="::",
        metadata_template="{key}->{value}",
        text_template="Metadata: {metadata_str}\n-----\nContent: {content}",
    )
    for job in reduced
]

In [None]:
# The ServiceContext in LlamaIndex is a utility container that bundles commonly used resources during the indexing and querying stages of a LlamaIndex pipeline or application.

from langchain.embeddings import HuggingFaceEmbeddings
from llama_index import ServiceContext, VectorStoreIndex
from llama_index.embeddings import LangchainEmbedding

# `BertEmbedding` and `LlamaIndex` are not names llama_index provides. Use the
# embedding wrapper and index class that the working cells of this notebook
# already rely on. Swap the model name for another sentence-transformers model
# if needed.
embed_model = LangchainEmbedding(
    HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
)
service_context = ServiceContext.from_defaults(embed_model=embed_model)

# Create the index from the documents loaded earlier and the service context;
# `from_documents` builds it, so no separate build step is required.
index = VectorStoreIndex.from_documents(documents, service_context=service_context)

In [None]:




# You can now use the AutoMergingRetriever to fetch relevant context from the index given a user query.
# NOTE(review): the working cells of this notebook obtain a retriever via
# `index.as_retriever(...)`; confirm that `AutoMergingRetriever` is importable
# from the top-level package and accepts a bare index — TODO verify.

from llama_index import AutoMergingRetriever

retriever = AutoMergingRetriever(index)

# Finally, create the RetrieverQueryEngine using the retriever and a response synthesizer. The retriever fetches relevant IndexNode objects from the index, and the response synthesizer is used to generate a natural language response based on the retrieved nodes and the user query.
# NOTE(review): elsewhere in this notebook `RetrieverQueryEngine` is imported
# from `llama_index.query_engine` and the synthesizer comes from
# `get_response_synthesizer`; confirm that `RagResponseSynthesizer` and
# `ResponseSynthesizer` exist in the installed version — TODO verify.
# `AutoModelForSeq2SeqLM` is imported but not used in this cell.

from llama_index import RetrieverQueryEngine, RagResponseSynthesizer
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from llama_index import ResponseSynthesizer, ServiceContext
from transformers import AutoModelForCausalLM, AutoTokenizer

# Placeholder model id: this cell cannot run until it is replaced.
model_id = "your_model_id"  # Replace with the ID of your desired Huggingface model
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", load_in_8bit=True)


# NOTE(review): other cells call `ServiceContext.from_defaults` with `llm=` or
# `embed_model=`; confirm that `response_synthesizer=` is an accepted keyword.
# The `service_context` assigned here is not passed to anything later in this cell.
response_synthesizer = ResponseSynthesizer(model=model, tokenizer=tokenizer)
service_context = ServiceContext.from_defaults(response_synthesizer=response_synthesizer)


# response_synthesizer = RagResponseSynthesizer('path_to_llama_2.0_model')
query_engine = RetrieverQueryEngine(retriever, response_synthesizer)

# Placeholder query text.
response = query_engine.query('Your query here')
print(response)




In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# The statements below were indented under a top-level import, which makes the
# cell fail with an IndentationError before anything runs; they belong at
# module level.
model_id = "your_model_id"  # Replace with the ID of your desired Huggingface model
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", load_in_4bit=True)

# NOTE(review): the other embedding wrapper in this notebook is imported from
# `llama_index.embeddings`; confirm that `OptimumEmbedding` is exported from the
# top-level package and that it accepts a causal-LM model — TODO verify.
from llama_index import OptimumEmbedding, ServiceContext

embed_model = OptimumEmbedding(model=model, tokenizer=tokenizer)
service_context = ServiceContext.from_defaults(embed_model=embed_model)