### RAGAS


In [1]:
from dotenv import load_dotenv
import os
# Merge the key/value pairs from a local .env file into the process environment.
load_dotenv()

# Credentials read back from the environment; each is None when the variable is unset.
# Note the first name is spelled OPEN_API_KEY even though it reads OPENAI_API_KEY.
OPEN_API_KEY = os.environ.get("OPENAI_API_KEY")
QDRANT_API_KEY = os.environ.get("QDRANT_API_KEY")
QDRANT_URL = os.environ.get("QDRANT_URL")


In [7]:
from langchain.document_loaders import JSONLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Qdrant
from qdrant_client import QdrantClient

# Load JSON data
# The jq expression iterates over the elements of the top-level value in
# careers.json and keeps only the job_name, description and spec fields of each.
# NOTE(review): the name `loader` is rebound to a torch DataLoader later in this
# notebook, so this cell must run before that one.
loader = JSONLoader(
    file_path="careers.json",
    jq_schema=".[] | {job_name: .job_name, description: .description, spec: .spec}",
    text_content=False  # We extract structured fields
)
# One Document per selected element.
documents = loader.load()

In [8]:
# Alias, not a copy: both names refer to the same list, so metadata written to
# these documents in later cells is visible through `documents` as well.
training_documents = documents

In [9]:
import uuid

# Give every document a unique string ID stored under metadata["id"].
# The IDs are later used as dictionary keys and serialised to JSON, so they
# must always be plain strings.
id_set = set()

for document in training_documents:
  doc_id = str(uuid.uuid4())
  # Collision guard: regenerate until unused. The retry must also be converted
  # to str, otherwise it can never match the strings held in id_set and a raw
  # UUID object would end up in the metadata.
  while doc_id in id_set:
    doc_id = str(uuid.uuid4())
  id_set.add(doc_id)
  document.metadata["id"] = doc_id

In [60]:
# Hold out the last 7 documents (4 for validation, 3 for test) and train on the
# rest. The boundaries are derived from the actual corpus size instead of a
# hard-coded 10, so the three splits are always disjoint and together cover
# every document. With exactly 10 documents this yields the same 3/4/3 split.
N_VAL_DOCUMENTS = 4
N_TEST_DOCUMENTS = 3

n_documents = len(training_documents)
train_end = max(n_documents - N_VAL_DOCUMENTS - N_TEST_DOCUMENTS, 0)
val_end = max(n_documents - N_TEST_DOCUMENTS, train_end)

training_split_documents = training_documents[:train_end]
val_split_documents = training_documents[train_end:val_end]
test_split_documents = training_documents[val_end:]

In [11]:
from langchain_openai import ChatOpenAI

# Chat model used to generate the synthetic questions.
# temperature=0 requests the least random sampling the API offers.
qa_chat_model = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0
)

In [12]:
from langchain_core.prompts import ChatPromptTemplate

# Prompt asking the model for a numbered list of questions about one context.
# It has two placeholders: {n_questions} (how many questions to write) and
# {context} (the document text). The backslash after the opening quotes keeps
# the string from starting with a newline.
# The "1. QUESTION #1" format matters: create_questions() strips this numbering
# when it parses the reply.
qa_prompt = """\
Given the following context, you must generate questions based on only the provided context.

You are to generate {n_questions} questions which should be provided in the following format:

1. QUESTION #1
2. QUESTION #2
...

Context:
{context}
"""

# Wrap the raw string in a chat prompt template so it can be piped into a model.
qa_prompt_template = ChatPromptTemplate.from_template(qa_prompt)

In [13]:
# Compose prompt and model: the formatted prompt is fed to the chat model.
# The chain is invoked with a dict providing "n_questions" and "context".
question_generation_chain = qa_prompt_template | qa_chat_model

In [22]:
from tqdm import tqdm
import asyncio

async def create_questions(documents, n_questions, question_generation_chain):
    """Generate synthetic questions for each document and link them to their source.

    One request per document is built up front and all of them are awaited
    together with ``asyncio.gather``. Every non-empty line of a reply becomes
    one question after its list numbering ("1. ", "2) ", ...) is removed.

    Args:
        documents: Iterable of objects exposing ``metadata["id"]`` and
            ``page_content``.
        n_questions: Number of questions requested per document; passed to the
            chain as ``n_questions``.
        question_generation_chain: Object with an async ``ainvoke(dict)`` method
            whose result has a ``content`` string.

    Returns:
        A ``(questions, relevant_docs)`` tuple where ``questions`` maps a fresh
        question ID to the question text and ``relevant_docs`` maps the same ID
        to a one-element list holding the ID of the originating document.
    """
    import re  # local import: only this function needs it

    # Matches list numbering at the start of a line, e.g. "1. " or "12) ".
    # Requiring whitespace (or end of line) after the delimiter keeps digits
    # that belong to the question itself, e.g. "3D printing" or "3.5 GHz".
    numbering = re.compile(r"^\d+[.)](?:\s+|$)")

    doc_ids = []  # kept in the same order as `tasks` so results can be zipped back
    tasks = []
    for doc in documents:
        doc_ids.append(doc.metadata["id"])
        tasks.append(
            question_generation_chain.ainvoke(
                {"n_questions": n_questions, "context": doc.page_content}
            )
        )

    # gather() preserves the order of its arguments in the returned list.
    results = await asyncio.gather(*tasks)

    questions = {}
    relevant_docs = {}
    # len(doc_ids) rather than len(documents): works for any iterable input.
    for doc_id, result in tqdm(zip(doc_ids, results), total=len(doc_ids)):
        for line in result.content.split("\n"):
            question_text = numbering.sub("", line.strip(), count=1).strip()
            # Skip blank lines and filler such as "..." that carries no text.
            if not any(char.isalnum() for char in question_text):
                continue

            question_id = str(uuid.uuid4())
            questions[question_id] = question_text
            relevant_docs[question_id] = [doc_id]

    return questions, relevant_docs

In [61]:
# Generate 2 questions per training document. The bare top-level `await` relies
# on the notebook's running event loop and is not valid in a plain script.
training_questions, training_relevant_contexts = await create_questions(training_split_documents, 2,question_generation_chain)

100%|██████████| 3/3 [00:00<00:00, 4044.65it/s]


In [62]:
# Same generation step for the validation split (2 questions per document).
val_questions, val_relevant_contexts = await create_questions(val_split_documents, 2, question_generation_chain)

100%|██████████| 4/4 [00:00<00:00, 16416.06it/s]


In [63]:
# Same generation step for the test split (2 questions per document).
test_questions, test_relevant_contexts = await create_questions(test_split_documents, 2, question_generation_chain)

100%|██████████| 3/3 [00:00<00:00, 2794.34it/s]


In [65]:
import json

# Document ID -> document text for the training split.
training_corpus = dict(
    (doc.metadata["id"], doc.page_content) for doc in training_split_documents
)

# Bundle questions, question -> document links and the corpus in one mapping.
train_dataset = dict(
    questions=training_questions,
    relevant_contexts=training_relevant_contexts,
    corpus=training_corpus,
)

# Persist the bundle. Despite the .jsonl suffix this is a single JSON object.
with open("training_dataset.jsonl", "w") as f:
  json.dump(train_dataset, fp=f)

In [67]:
# Document ID -> document text for the validation split.
val_corpus = dict(
    (doc.metadata["id"], doc.page_content) for doc in val_split_documents
)

# Bundle questions, question -> document links and the corpus in one mapping.
val_dataset = dict(
    questions=val_questions,
    relevant_contexts=val_relevant_contexts,
    corpus=val_corpus,
)

# Persist the bundle. Despite the .jsonl suffix this is a single JSON object.
with open("val_dataset.jsonl", "w") as f:
  json.dump(val_dataset, fp=f)

In [68]:
# Inspect the validation corpus (document ID -> text).
# NOTE(review): this displays the whole mapping; consider showing only a few
# entries to keep the saved output small.
val_corpus

{'ada7cec4-c749-46a4-9fc7-f817314193c5': '{"job_name": "Ing\\u00e9nieur DevOps", "description": "L\\u2019ing\\u00e9nieur DevOps automatise les processus de d\\u00e9veloppement, d\\u00e9ploiement et gestion des infrastructures. Il utilise des outils comme Docker, Kubernetes, Terraform, CI/CD, Ansible pour assurer la scalabilit\\u00e9 et la s\\u00e9curit\\u00e9 des syst\\u00e8mes. Il travaille en collaboration avec les d\\u00e9veloppeurs et les administrateurs syst\\u00e8me pour int\\u00e9grer les bonnes pratiques et optimiser les ressources cloud.", "spec": "Salari\\u00e9 / Consulting possible | Remote partiel | Travail en \\u00e9quipe avec DevOps et d\\u00e9veloppeurs"}',
 '66c17982-81da-45d8-bd2c-d42164b5f1fd': '{"job_name": "Ing\\u00e9nieur en Cloud Computing", "description": "L\\u2019ing\\u00e9nieur Cloud d\\u00e9ploie et g\\u00e8re les infrastructures cloud sur AWS, Azure, GCP. Il optimise la scalabilit\\u00e9 et la performance des services cloud et met en place des architectures s

In [69]:
# Document ID -> document text for the held-out test split.
# Named test_corpus (it was previously called train_corpus, a copy-paste
# leftover) so it cannot be mistaken for training data.
test_corpus = {test_item.metadata["id"] : test_item.page_content for test_item in test_split_documents}

# Bundle questions, question -> document links and the corpus in one mapping.
test_dataset = {
    "questions" : test_questions,
    "relevant_contexts" : test_relevant_contexts,
    "corpus" : test_corpus
}

# Persist the bundle. Despite the .jsonl suffix this is a single JSON object.
with open("test_dataset.jsonl", "w") as f:
  json.dump(test_dataset, f)

In [71]:
from sentence_transformers import SentenceTransformer

# Base embedding model that the rest of the notebook fine-tunes.
# Constructing it from a model ID presumably downloads the checkpoint on first
# use — TODO confirm against the environment this runs in.
model_id = "Snowflake/snowflake-arctic-embed-l"
model = SentenceTransformer(model_id)

In [72]:
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from sentence_transformers import InputExample

In [73]:
# Number of (question, document) pairs per training batch.
BATCH_SIZE = 10

In [74]:
# Views onto the training bundle; these three names are rebound to the
# validation data in a later cell.
corpus = train_dataset['corpus']
queries = train_dataset['questions']
relevant_docs = train_dataset['relevant_contexts']

# One (question, source document text) pair per generated question. Only the
# first linked document of each question is used.
examples = [
    InputExample(texts=[question, corpus[relevant_docs[question_id][0]]])
    for question_id, question in queries.items()
]

In [75]:
# Batch the training pairs. The examples are built document by document, so
# without shuffling, questions about the same document always sit next to each
# other. shuffle=True re-orders the pairs every epoch so that the batch
# composition (used as in-batch negatives by the ranking loss) varies.
# NOTE: this rebinds `loader`, which earlier held the JSON document loader.
loader = DataLoader(
    examples, batch_size=BATCH_SIZE, shuffle=True
)

In [76]:
from sentence_transformers.losses import MatryoshkaLoss, MultipleNegativesRankingLoss

# Truncation sizes at which the Matryoshka loss is applied. The list used to be
# hard-coded starting at 768, which leaves out the full output size of any model
# whose embeddings are larger than that (presumably the case for the "-l"
# checkpoint loaded above — confirm). Derive the largest size from the model so
# the full-length embedding is trained as well.
candidate_dimensions = [768, 512, 256, 128, 64]
full_dimension = model.get_sentence_embedding_dimension()
if full_dimension is None:
    # The model does not report its size; fall back to the fixed list.
    matryoshka_dimensions = candidate_dimensions
else:
    matryoshka_dimensions = [full_dimension] + [
        dim for dim in candidate_dimensions if dim < full_dimension
    ]

# Ranking loss over (question, document) pairs, wrapped so it is evaluated at
# every size in matryoshka_dimensions.
inner_train_loss = MultipleNegativesRankingLoss(model)
train_loss = MatryoshkaLoss(
    model, inner_train_loss, matryoshka_dims=matryoshka_dimensions
)

In [77]:
from sentence_transformers.evaluation import InformationRetrievalEvaluator

# Rebind the three working names to the validation bundle.
corpus, queries, relevant_docs = (
    val_dataset[key] for key in ("corpus", "questions", "relevant_contexts")
)

# Retrieval evaluator over the validation questions and their source documents.
evaluator = InformationRetrievalEvaluator(
    queries=queries, corpus=corpus, relevant_docs=relevant_docs
)

In [78]:
# Number of passes over the training pairs.
EPOCHS = 10

In [79]:
# Preview of the warm-up step count: 10% of (batches per epoch * epochs),
# truncated to an integer. The same expression is recomputed in the next cell.
int(len(loader) * EPOCHS * 0.1)

1

In [80]:
# Warm up over the first 10% of all optimisation steps (batches per epoch * epochs).
warmup_steps = int(len(loader) * EPOCHS * 0.1)

# Fine-tune the model, score it with the validation evaluator, and write the
# result to the 'finetuned_arctic_ft' directory.
model.fit(
    train_objectives=[(loader, train_loss)],
    evaluator=evaluator,
    epochs=EPOCHS,
    evaluation_steps=50,
    warmup_steps=warmup_steps,
    output_path='finetuned_arctic_ft',
    show_progress_bar=True,
)

                                                                     

Step,Training Loss,Validation Loss,Cosine Accuracy@1,Cosine Accuracy@3,Cosine Accuracy@5,Cosine Accuracy@10,Cosine Precision@1,Cosine Precision@3,Cosine Precision@5,Cosine Precision@10,Cosine Recall@1,Cosine Recall@3,Cosine Recall@5,Cosine Recall@10,Cosine Ndcg@10,Cosine Mrr@10,Cosine Map@100
1,No log,No log,1.0,1.0,1.0,1.0,1.0,0.333333,0.2,0.1,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,No log,No log,1.0,1.0,1.0,1.0,1.0,0.333333,0.2,0.1,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,No log,No log,1.0,1.0,1.0,1.0,1.0,0.333333,0.2,0.1,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,No log,No log,1.0,1.0,1.0,1.0,1.0,0.333333,0.2,0.1,1.0,1.0,1.0,1.0,1.0,1.0,1.0
5,No log,No log,1.0,1.0,1.0,1.0,1.0,0.333333,0.2,0.1,1.0,1.0,1.0,1.0,1.0,1.0,1.0
6,No log,No log,1.0,1.0,1.0,1.0,1.0,0.333333,0.2,0.1,1.0,1.0,1.0,1.0,1.0,1.0,1.0
7,No log,No log,1.0,1.0,1.0,1.0,1.0,0.333333,0.2,0.1,1.0,1.0,1.0,1.0,1.0,1.0,1.0
8,No log,No log,1.0,1.0,1.0,1.0,1.0,0.333333,0.2,0.1,1.0,1.0,1.0,1.0,1.0,1.0,1.0
9,No log,No log,1.0,1.0,1.0,1.0,1.0,0.333333,0.2,0.1,1.0,1.0,1.0,1.0,1.0,1.0,1.0
10,No log,No log,1.0,1.0,1.0,1.0,1.0,0.333333,0.2,0.1,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [None]:
from huggingface_hub import notebook_login

# Show the Hugging Face login widget; this needs manual interaction, so the
# cell cannot complete in an unattended "Run All".
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Hugging Face account under which the model repository is published.
hf_username = "koffiwind"

In [None]:
# Upload the model to the Hub repository "<hf_username>/jobs_model".
model.push_to_hub(f"{hf_username}/jobs_model")

In [None]:
# NOTE(review): looks like a leftover debugging cell; nothing else in the
# notebook depends on it — consider deleting.
print("h")