This notebook uses Sentence Transformers to fine-tune an embedding model on the Indeed jobs dataset. It performs the following steps:

1. Sample a set of documents from the Indeed jobs dataset
2. Generate synthetic questions for each sampled document using GPT-4o mini
3. Fine-tune an embedding model on the synthetic question–document pairs



## Install Dependencies

In [1]:
!pip install -qU sentence_transformers datasets pyarrow==15.0.2

In [2]:
!pip install -qU langchain_openai langchain_huggingface langchain_core langchain langchain_community langchain-text-splitters

In [3]:
!pip install -qU faiss-cpu unstructured==0.15.7 python-pptx==1.0.2 nltk==3.9.1

In [4]:
import os
import getpass

# Ask for the key only when it is not already configured, so re-running the
# cell (or "Run All") neither blocks on input nor overwrites a working key.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter Your OpenAI API Key: ")

Enter Your OpenAI API Key: ··········


In [5]:
import os
from typing import Tuple
import torch
from operator import itemgetter
from langchain_core.prompts.chat import SystemMessagePromptTemplate, ChatPromptTemplate, PromptTemplate
from langchain_core.messages import SystemMessage, ChatMessage
from langchain_core.runnables import Runnable, RunnableParallel, RunnablePassthrough
from langchain.agents import Tool, AgentType, initialize_agent
from langchain.memory import ConversationBufferMemory, ConversationSummaryBufferMemory
from langchain_openai.chat_models import ChatOpenAI
from langchain.agents import AgentExecutor
from langchain_community.document_loaders import PyMuPDFLoader, PyPDFLoader
from langchain.agents.format_scratchpad import format_log_to_str
from langchain.agents.output_parsers import ReActSingleInputOutputParser
from langchain.tools.render import render_text_description
from langchain_core.output_parsers import StrOutputParser
from langchain.chains.conversation.base import ConversationChain
from langchain.agents import AgentExecutor, create_react_agent
from langchain.agents.tools import tool
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceEmbeddings
from langchain_community.document_loaders import UnstructuredURLLoader
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain.output_parsers import PydanticOutputParser
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  exec(code_obj, self.user_global_ns, self.user_ns)


In [6]:
import numpy as np
from typing import List
from langchain_core.documents.base import Document
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

def get_median_document_length(documents: List[Document]):
    """Compute the median character count of `page_content` over `documents`.

    Raises an AssertionError when `documents` is empty.
    """
    assert len(documents), "Documents cannot be empty"
    return np.median([len(doc.page_content) for doc in documents])

# Load the CSV through CSVLoader, then re-chunk using the median document
# length as the chunk size, with half of that as overlap.
loader = CSVLoader('jobs.csv')
documents = loader.load()
# np.median returns a float (x.5 for an even number of documents); the splitter
# takes integer sizes, so convert once here instead of passing floats through.
median_len = int(get_median_document_length(documents))
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=median_len,
    chunk_overlap=median_len // 2,
    length_function=len,
)
documents = text_splitter.split_documents(documents)

In [7]:
from uuid import uuid4
def assign_ids(documents):
  """Assigns a unique id to each document.

  A freshly generated UUID4 string is stored under ``metadata['id']`` of every
  document; the documents are modified in place.

  Args:
    documents: iterable of objects exposing a mutable ``metadata`` mapping.

  Returns:
    None.
  """
  seen = set()
  for d in documents:
    doc_id = str(uuid4())
    # Regenerate if this id was already handed out earlier in this call.
    while doc_id in seen:
      doc_id = str(uuid4())
    # Record the id; without this the collision check above can never trigger.
    seen.add(doc_id)
    d.metadata['id'] = doc_id

# Tag every split document with metadata['id']; later cells key the corpus and
# the question -> document mapping on this id.
assign_ids(documents)

In [9]:
SAMPLE_SIZE = 3000
SEED = 42  # fixed seed so the same subset is drawn on every run

# Fail early with a clear message instead of producing a short sample.
assert len(documents) >= SAMPLE_SIZE, (
    f"Need at least {SAMPLE_SIZE} documents to sample from, found {len(documents)}"
)
random_idxs = np.random.default_rng(SEED).permutation(len(documents))[:SAMPLE_SIZE]
# Index the Python list directly; there is no need to wrap the documents in a numpy array.
training_documents = [documents[i] for i in random_idxs]
assert len(training_documents) == SAMPLE_SIZE

## Generating Questions using GPT-4o mini

In [10]:
# Question-generation prompt with two placeholders: {n} (how many questions to
# produce) and {context} (the document text). The numbered-list format requested
# here is what get_questions_and_contexts later strips from each reply line.
prompt = ChatPromptTemplate.from_template("""
Given the context below, please generate questions which can be answered using the provided context.

You must generate exactly {n} questions per context in the following format:

1. Question# 1
2. Question# 2
.. and so on

Context:
{context}

""")

In [11]:
# Chat model used to write the synthetic questions.
# temperature=0: presumably chosen to keep generations as repeatable as possible.
llm = ChatOpenAI(temperature=0, model='gpt-4o-mini')
# Pipe the formatted prompt into the model; invoke() takes the prompt variables
# ("n", "context") and returns the model's message.
question_chain = prompt | llm

In [12]:
# Sample run: ask for two questions about a single sampled document to check
# the chain end to end before generating the full dataset.
r = question_chain.invoke({"n": 2, "context": training_documents[15].page_content})

In [13]:
# Raw text of the reply: the numbered list of generated questions.
r.content

'1. What is the primary role of a Guest Experience Specialist at Market 24/7?\n2. What types of products does Market 24/7 offer in its self-checkout vending markets?'

In [14]:
from tqdm.auto import tqdm
from collections import defaultdict
def _strip_enumeration(line):
  """Returns `line` without a leading ``<digits>.`` list marker and surrounding whitespace."""
  marker, dot, rest = line.partition(".")
  if dot and marker.strip().isdigit():
    return rest.strip()
  return line.strip()


def get_questions_and_contexts(question_chain, documents, num_questions=2):
  """Given `documents` generates questions which can be answered using the `documents`

  `question_chain` is invoked once per document with the prompt variables
  ``n`` and ``context``; every non-blank line of the reply becomes one question.

  Args:
    question_chain: object whose ``invoke(dict)`` returns a reply exposing the
      generated text as ``.content``.
    documents: the given context documents; each needs ``page_content`` and
      ``metadata['id']``.
    num_questions: number of questions requested per document.

  Returns:
    question_store, document_store
    question_store: A dictionary from unique question id -> generated question
    document_store: A dictionary from unique question id -> document id
  """
  question_store = {}
  document_store = {}

  for d in tqdm(documents):
    response = question_chain.invoke({
        "n": num_questions,
        "context": d.page_content
        })
    for line in response.content.split("\n"):
      question = _strip_enumeration(line)
      if not question:
        # Blank lines (e.g. a trailing newline or spacing between items) are
        # not questions; storing them would create empty queries.
        continue
      question_id = str(uuid4())
      # The keys of question_store double as the set of ids already in use.
      while question_id in question_store:
        question_id = str(uuid4())
      question_store[question_id] = question
      document_store[question_id] = d.metadata['id']

  return question_store, document_store

In [15]:
from sklearn.model_selection import train_test_split
# 80/10/10 train/validation/test split of the sampled documents. random_state
# pins the shuffle so the splits (and the datasets saved from them) can be
# reproduced.
# NOTE: this rebinds `training_documents`; re-running this cell without first
# re-running the sampling cell shrinks the training split again.
training_documents, val_documents = train_test_split(training_documents, test_size=0.20, random_state=42)
val_documents, test_documents = train_test_split(val_documents, test_size=0.50, random_state=42)

In [20]:
# Generate questions for the training split; the chain is invoked once per document.
training_question_store, training_document_store = get_questions_and_contexts(question_chain, training_documents)

  0%|          | 0/2400 [00:00<?, ?it/s]

In [21]:
# Generate questions for the validation split; the chain is invoked once per document.
val_question_store, val_document_store = get_questions_and_contexts(question_chain, val_documents)

  0%|          | 0/300 [00:00<?, ?it/s]

In [22]:
# Generate questions for the test split; the chain is invoked once per document.
test_question_store, test_document_store = get_questions_and_contexts(question_chain, test_documents)

  0%|          | 0/300 [00:00<?, ?it/s]

In [23]:
import json
def _corpus_of(docs):
  """Maps each document's metadata id to its page content."""
  return {doc.metadata['id']: doc.page_content for doc in docs}

# One id -> text lookup table per split.
training_corpus = _corpus_of(training_documents)
val_corpus = _corpus_of(val_documents)
test_corpus = _corpus_of(test_documents)


def write_data(data, filename):
  """Serializes `data` as JSON into `filename`, replacing any existing content."""
  with open(filename, 'w') as out_file:
    json.dump(data, out_file)

# Each split is saved as one JSON object bundling its three lookup tables:
# question id -> question, question id -> document id, document id -> text.
training_data = dict(
    questions=training_question_store,
    contexts=training_document_store,
    corpus=training_corpus,
)
write_data(training_data, "training_data.jsonl")

val_data = dict(
    questions=val_question_store,
    contexts=val_document_store,
    corpus=val_corpus,
)
write_data(val_data, "val_data.jsonl")

test_data = dict(
    questions=test_question_store,
    contexts=test_document_store,
    corpus=test_corpus,
)
write_data(test_data, "test_data.jsonl")

In [None]:
import json
def _read_json(path):
  """Loads and returns the JSON object stored in the file at `path`."""
  with open(path, 'r') as in_file:
    return json.load(in_file)

# Restore the three splits from disk so question generation can be skipped.
training_data = _read_json('training_data.jsonl')
training_question_store = training_data["questions"]
training_document_store = training_data["contexts"]
training_corpus = training_data["corpus"]

val_data = _read_json('val_data.jsonl')
val_question_store = val_data["questions"]
val_document_store = val_data["contexts"]
val_corpus = val_data["corpus"]

test_data = _read_json('test_data.jsonl')
test_question_store = test_data["questions"]
test_document_store = test_data["contexts"]
test_corpus = test_data["corpus"]

In [24]:
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from sentence_transformers import InputExample
from sentence_transformers import SentenceTransformer


# Base embedding model to fine-tune; the same checkpoint is evaluated later as the baseline.
model_id = "Snowflake/snowflake-arctic-embed-m"
model = SentenceTransformer(model_id)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/84.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/107 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/738 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [25]:
# One positive pair per generated question: [question, text of its source document].
examples = [
    InputExample(texts=[question, training_corpus[training_document_store[question_id]]])
    for question_id, question in training_question_store.items()
]

In [26]:
# https://huggingface.co/blog/matryoshka
from sentence_transformers.losses import MatryoshkaLoss, MultipleNegativesRankingLoss
BATCH_SIZE = 32
# MultipleNegativesRankingLoss uses the other passages in a batch as negatives.
# `examples` lists the questions of one document back to back, so an unshuffled
# loader keeps putting the same passage into a batch twice, where it counts as
# a (false) negative for its sibling question. Shuffling spreads siblings
# across batches and varies batch composition between epochs.
train_dl = DataLoader(
    examples, batch_size=BATCH_SIZE, shuffle=True
)

# Matryoshka training applies the inner loss at each of these embedding sizes.
matryoshka_dimensions = [768, 512, 256, 128, 64]
inner_train_loss = MultipleNegativesRankingLoss(model)
train_loss = MatryoshkaLoss(
    model, inner_train_loss, matryoshka_dims=matryoshka_dimensions
)

In [27]:
from sentence_transformers.evaluation import InformationRetrievalEvaluator

# Validation inputs for the retrieval evaluator.
corpus = val_corpus
queries = val_question_store

# Convert to format needed by retriever: question id -> list holding its one relevant document id
relevant_docs = {qid: [doc_id] for qid, doc_id in val_document_store.items()}

evaluator = InformationRetrievalEvaluator(queries, corpus, relevant_docs)

In [30]:
import os
# Turn off Weights & Biases reporting for the training run below.
# NOTE(review): the training output flags this environment variable as
# deprecated (to be removed in v5).
os.environ["WANDB_DISABLED"] = "true"

In [31]:
# NOTE(review): the captured log below only shows evaluations up to step 400 —
# confirm whether the full 50 epochs were intended / actually run.
EPOCHS = 50

# https://huggingface.co/blog/how-to-train-sentence-transformers
# Fine-tune on the (question, document) pairs with the Matryoshka loss, run the
# validation retrieval evaluator every 50 steps, and save the model to 'jobs_arctic'.
model.fit(
    train_objectives=[(train_dl, train_loss)],
    epochs=EPOCHS,
    warmup_steps=0,
    output_path='jobs_arctic',
    show_progress_bar=True,
    evaluator=evaluator,
    evaluation_steps=50,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss,Validation Loss,Cosine Accuracy@1,Cosine Accuracy@3,Cosine Accuracy@5,Cosine Accuracy@10,Cosine Precision@1,Cosine Precision@3,Cosine Precision@5,Cosine Precision@10,Cosine Recall@1,Cosine Recall@3,Cosine Recall@5,Cosine Recall@10,Cosine Ndcg@10,Cosine Mrr@10,Cosine Map@100,Dot Accuracy@1,Dot Accuracy@3,Dot Accuracy@5,Dot Accuracy@10,Dot Precision@1,Dot Precision@3,Dot Precision@5,Dot Precision@10,Dot Recall@1,Dot Recall@3,Dot Recall@5,Dot Recall@10,Dot Ndcg@10,Dot Mrr@10,Dot Map@100
50,No log,No log,0.776144,0.861111,0.888889,0.908497,0.776144,0.287037,0.177778,0.09085,0.776144,0.861111,0.888889,0.908497,0.844977,0.824361,0.827473,0.776144,0.861111,0.888889,0.908497,0.776144,0.287037,0.177778,0.09085,0.776144,0.861111,0.888889,0.908497,0.844977,0.824361,0.827473
100,No log,No log,0.78268,0.875817,0.893791,0.921569,0.78268,0.291939,0.178758,0.092157,0.78268,0.875817,0.893791,0.921569,0.854801,0.833166,0.836258,0.78268,0.875817,0.893791,0.921569,0.78268,0.291939,0.178758,0.092157,0.78268,0.875817,0.893791,0.921569,0.854801,0.833166,0.836258
150,No log,No log,0.792484,0.882353,0.898693,0.919935,0.792484,0.294118,0.179739,0.091993,0.792484,0.882353,0.898693,0.919935,0.85973,0.840151,0.843507,0.792484,0.882353,0.898693,0.919935,0.792484,0.294118,0.179739,0.091993,0.792484,0.882353,0.898693,0.919935,0.85973,0.840151,0.843507
153,No log,No log,0.792484,0.882353,0.898693,0.918301,0.792484,0.294118,0.179739,0.09183,0.792484,0.882353,0.898693,0.918301,0.859535,0.840336,0.843924,0.792484,0.882353,0.898693,0.918301,0.792484,0.294118,0.179739,0.09183,0.792484,0.882353,0.898693,0.918301,0.859535,0.840336,0.843924
200,No log,No log,0.784314,0.879085,0.895425,0.918301,0.784314,0.293028,0.179085,0.09183,0.784314,0.879085,0.895425,0.918301,0.854762,0.834095,0.837901,0.784314,0.879085,0.895425,0.918301,0.784314,0.293028,0.179085,0.09183,0.784314,0.879085,0.895425,0.918301,0.854762,0.834095,0.837901
250,No log,No log,0.794118,0.888889,0.905229,0.929739,0.794118,0.296296,0.181046,0.092974,0.794118,0.888889,0.905229,0.929739,0.865165,0.844125,0.847211,0.794118,0.888889,0.905229,0.929739,0.794118,0.296296,0.181046,0.092974,0.794118,0.888889,0.905229,0.929739,0.865165,0.844125,0.847211
300,No log,No log,0.79085,0.883987,0.908497,0.923203,0.79085,0.294662,0.181699,0.09232,0.79085,0.883987,0.908497,0.923203,0.861438,0.84106,0.844416,0.79085,0.883987,0.908497,0.923203,0.79085,0.294662,0.181699,0.09232,0.79085,0.883987,0.908497,0.923203,0.861438,0.84106,0.844416
306,No log,No log,0.795752,0.892157,0.910131,0.926471,0.795752,0.297386,0.182026,0.092647,0.795752,0.892157,0.910131,0.926471,0.866089,0.846102,0.849288,0.795752,0.892157,0.910131,0.926471,0.795752,0.297386,0.182026,0.092647,0.795752,0.892157,0.910131,0.926471,0.866089,0.846102,0.849288
350,No log,No log,0.785948,0.882353,0.911765,0.934641,0.785948,0.294118,0.182353,0.093464,0.785948,0.882353,0.911765,0.934641,0.864163,0.841237,0.844105,0.785948,0.882353,0.911765,0.934641,0.785948,0.294118,0.182353,0.093464,0.785948,0.882353,0.911765,0.934641,0.864163,0.841237,0.844105
400,No log,No log,0.79085,0.888889,0.911765,0.937908,0.79085,0.296296,0.182353,0.093791,0.79085,0.888889,0.911765,0.937908,0.867064,0.844105,0.846612,0.79085,0.888889,0.911765,0.937908,0.79085,0.296296,0.182353,0.093791,0.79085,0.888889,0.911765,0.937908,0.867064,0.844105,0.846612


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

In [32]:
# Bundle the saved model directory into a single archive (e.g. for download).
!zip -r jobs_arctic.zip jobs_arctic

  adding: jobs_arctic/ (stored 0%)
  adding: jobs_arctic/model.safetensors (deflated 8%)
  adding: jobs_arctic/2_Normalize/ (stored 0%)
  adding: jobs_arctic/special_tokens_map.json (deflated 80%)
  adding: jobs_arctic/config_sentence_transformers.json (deflated 38%)
  adding: jobs_arctic/tokenizer.json (deflated 71%)
  adding: jobs_arctic/1_Pooling/ (stored 0%)
  adding: jobs_arctic/1_Pooling/config.json (deflated 57%)
  adding: jobs_arctic/tokenizer_config.json (deflated 74%)
  adding: jobs_arctic/README.md (deflated 72%)
  adding: jobs_arctic/vocab.txt (deflated 53%)
  adding: jobs_arctic/config.json (deflated 48%)
  adding: jobs_arctic/modules.json (deflated 62%)
  adding: jobs_arctic/sentence_bert_config.json (deflated 4%)


In [33]:
from langchain_community.vectorstores import FAISS

def evaluate(data, embedding, k=10):
  """Computes the top-`k` retrieval hit rate of `embedding` on `data`.

  Every corpus entry is indexed in a FAISS vector store built with `embedding`.
  A question counts as a hit when the id of its source document is among the
  `k` documents retrieved for it.

  Args:
    data: dict with keys 'questions' (question id -> question text),
      'contexts' (question id -> document id) and 'corpus' (document id -> text).
    embedding: embedding object handed to ``FAISS.from_documents``.
    k: number of documents retrieved per question (defaults to 10).

  Returns:
    Fraction of questions that were hits, between 0 and 1.

  Raises:
    ValueError: if `data` contains no questions, since the rate is undefined.
  """
  questions = data['questions']
  if not questions:
    # Checked before indexing so an empty split fails fast with a clear
    # message rather than a ZeroDivisionError at the end.
    raise ValueError("Cannot compute a hit rate without any questions")
  corpus = data['corpus']
  contexts = data['contexts']
  documents = [Document(page_content=content, metadata={"id": doc_id}) for doc_id, content in corpus.items()]
  vectorstore = FAISS.from_documents(documents, embedding)
  retriever = vectorstore.as_retriever(search_kwargs={"k": k})

  num_hits = 0
  for q_id, question in questions.items():
    retrieved_ids = {d.metadata['id'] for d in retriever.invoke(question)}
    num_hits += int(contexts[q_id] in retrieved_ids)
  return num_hits/len(questions)

In [34]:
# Baseline: the original checkpoint, scored on the held-out test split.
baseline_embeddings = HuggingFaceEmbeddings(model_name="Snowflake/snowflake-arctic-embed-m")
baseline_hit_rate = evaluate(test_data, baseline_embeddings)

  baseline_embeddings = HuggingFaceEmbeddings(model_name="Snowflake/snowflake-arctic-embed-m")


In [35]:
# Test-split hit rate of the base model, before fine-tuning.
print(baseline_hit_rate)

0.6057692307692307


In [36]:
# Fine-tuned model: loaded from the local 'jobs_arctic' directory written by
# model.fit, scored on the same test split as the baseline.
finetuned_embeddings = HuggingFaceEmbeddings(model_name="jobs_arctic")
finetuned_hit_rate = evaluate(test_data, finetuned_embeddings)

Some weights of BertModel were not initialized from the model checkpoint at jobs_arctic and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [37]:
# Test-split hit rate after fine-tuning; compare with the baseline value above.
print(finetuned_hit_rate)

0.9230769230769231


In [38]:
from huggingface_hub import notebook_login

# Interactive Hugging Face login widget; done before pushing the model in the next cell.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [44]:
# Reload the fine-tuned model saved under 'jobs_arctic' (this rebinds `model`)
# and publish it to the Hugging Face Hub.
# NOTE(review): exist_ok=True presumably allows pushing to a repository that
# already exists — confirm against the installed sentence-transformers version.
model = SentenceTransformer("jobs_arctic")
model.push_to_hub("deman539/snowflake-arctic-embed-m-finetuned-indeed-jobs", exist_ok=True,  commit_message="finetuned model")

Some weights of BertModel were not initialized from the model checkpoint at jobs_arctic and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

'https://huggingface.co/deman539/snowflake-arctic-embed-m-finetuned-indeed-jobs/commit/e22674e7cf04e628800619d384110b7637e30fb7'