In [1]:
!pip install datasets
!pip install langchain_fireworks

Collecting langchain_fireworks
  Downloading langchain_fireworks-0.2.0-py3-none-any.whl.metadata (4.0 kB)
Collecting fireworks-ai>=0.13.0 (from langchain_fireworks)
  Downloading fireworks_ai-0.15.3-py3-none-any.whl.metadata (5.3 kB)
Collecting langchain-core<0.4.0,>=0.3.0 (from langchain_fireworks)
  Downloading langchain_core-0.3.5-py3-none-any.whl.metadata (6.3 kB)
Collecting openai<2.0.0,>=1.10.0 (from langchain_fireworks)
  Downloading openai-1.47.0-py3-none-any.whl.metadata (24 kB)
Collecting httpx (from fireworks-ai>=0.13.0->langchain_fireworks)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting httpx-sse (from fireworks-ai>=0.13.0->langchain_fireworks)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain-core<0.4.0,>=0.3.0->langchain_fireworks)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting langsmith<0.2.0,>=0.1.125 (from langchain-core<0.4.0,>=0.3.0->langchain_fire

In [2]:
from langchain_fireworks import ChatFireworks
import pandas as pd

#### Using llama-v3p1-405b-instruct with few-shot learning to create a synthetic dataset of question-and-answer pairs.

#### This would be followed by scoring the synthetic data for correctness using NVIDIA's Nemotron model.


In [12]:

import os

# Chat model used to generate the synthetic question/answer pairs.
# The API key is read from the FIREWORKS_API_KEY environment variable instead
# of being hardcoded, so the secret never ends up in the saved notebook.
# A missing variable raises KeyError here, before any request is made.
llama_405B = ChatFireworks(
    model="accounts/fireworks/models/llama-v3p1-405b-instruct",
    temperature=0.7,
    api_key=os.environ["FIREWORKS_API_KEY"],
)


In [32]:
# System prompt for the Q&A generation chain. It asks the model for exactly
# "n" question-answer pairs on a TOPIC, returned as a bare JSON array, and
# includes one worked example (Machine Learning) of the expected shape.
# The literal JSON braces are doubled ({{ }}) because this text is handed to
# ChatPromptTemplate, where single braces mark template variables such as
# {topic} and {n} in the human message.
qa_cot_prompt = """\
You are a highly skilled expert with deep knowledge of various topics.
Your task is to prepare thoughtful and informative questions and corresponding answers on the given TOPIC.
You would be given how many question and answer pairs (denoted by "n") you have to form on this TOPIC.
Please reason step by step about how to frame each question, then provide a concise, accurate answer based on your understanding of the topic.
Generate exactly "n" question-answer pairs.

Respond only in the following JSON format:
[
  {{"question": "...", "answer": "..."}},
  {{"question": "...", "answer": "..."}},
  ...
]

No additional text or explanations should be included in your response.

Examples:
Topic: Machine Learning
[
  {{"question": "What is supervised learning?", "answer": "Supervised learning is a type of machine learning where the model is trained using labeled data. The model learns to make predictions or decisions based on the input-output pairs provided during training."}},
  {{"question": "What is overfitting in machine learning?", "answer": "Overfitting occurs when a machine learning model learns the details and noise in the training data to the extent that it negatively impacts the model's performance on new, unseen data. It happens when the model is too complex and fits the training data too closely."}}
]
"""

from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import JsonOutputParser

# Two-message chat template: the fixed system instructions, followed by a
# human turn that supplies the topic and the number of pairs to generate.
system_message = ("system", qa_cot_prompt)
human_message = (
    "human",
    "Your TOPIC to analyze: {topic}. You need to generate {n} question-answer pairs.",
)
qa_prompt = ChatPromptTemplate.from_messages([system_message, human_message])

# Pipeline: fill the template -> call the model -> parse the reply as JSON.
chain = qa_prompt | llama_405B | JsonOutputParser()

# Collect up to 10 batches of generated Q&A pairs. This stays best-effort:
# the first failed request ends the loop, but the reason is now reported
# instead of being silently discarded. Catching Exception (rather than a bare
# except) also lets KeyboardInterrupt stop the cell as expected.
res = []
for attempt in range(10):
    try:
        response = chain.invoke({"topic": "Artificial Intelligence", "n": 200})
    except Exception as exc:
        print(f"Request {attempt + 1} failed; stopping early: {exc!r}")
        break
    res.append(response)


In [86]:
# Collect up to 10 batches of generated Q&A pairs into `responses`.
# NOTE(review): an earlier cell issues the same 10 requests into `res`, which
# no visible cell reads afterwards - consider reusing it instead of paying
# for the API calls twice.
# The loop stays best-effort (first failure ends it), but the error is
# reported and KeyboardInterrupt is no longer swallowed by a bare except.
responses = []
for attempt in range(10):
    try:
        response = chain.invoke({"topic": "Artificial Intelligence", "n": 200})
    except Exception as exc:
        print(f"Request {attempt + 1} failed; stopping early: {exc!r}")
        break
    responses.append(response)

# Flatten the batches into two parallel lists, `questions` and `answers`.
#
# Every pair contributes exactly one entry to BOTH lists so they can never
# drift out of step. Previously a pair without a "question" key appended only
# an answer placeholder, which shifted every later answer onto the wrong
# question once the lists were zipped together.
#
# A malformed pair (not a dict, or missing either key) gets the answer
# placeholder "None", which the filtering step recognises and drops.
MISSING_VALUE = "None"

questions, answers = [], []

for response_item in responses:
    for qa_pair in response_item:
        pair = qa_pair if isinstance(qa_pair, dict) else {}
        is_complete = "question" in pair and "answer" in pair
        questions.append(pair.get("question", MISSING_VALUE))
        answers.append(pair["answer"] if is_complete else MISSING_VALUE)


In [93]:

# Keep only the pairs whose answer is not the "None" placeholder.
valid_qa_pairs = []
for question_text, answer_text in zip(questions, answers):
    if answer_text != "None":
        valid_qa_pairs.append({"question": question_text, "answer": answer_text})

# One row per surviving pair, with "question" and "answer" columns.
df = pd.DataFrame(valid_qa_pairs)

display(df)


> Generated nearly 350 rows of data.

In [94]:
# Show the DataFrame of valid question/answer pairs.
df

Unnamed: 0,question,answer
0,What is Artificial Intelligence (AI)?,Artificial Intelligence (AI) refers to the sim...
1,What are the main goals of AI?,"The main goals of AI include reasoning, proble..."
2,What is Machine Learning?,Machine Learning is a subset of AI that involv...
3,What is Deep Learning?,Deep Learning is a subset of Machine Learning ...
4,What is Natural Language Processing (NLP)?,NLP is a branch of AI that deals with the inte...
...,...,...
346,What is the role of Artificial Intelligence in...,AI has the potential to revolutionize telecomm...
347,What is the role of Artificial Intelligence in...,AI has the potential to revolutionize tourism ...
348,What is the role of Artificial Intelligence in...,AI has the potential to revolutionize utilitie...
349,What is the impact of Artificial Intelligence ...,AI has the potential to significantly impact s...


In [99]:


# Persist the generated question/answer pairs to disk as CSV.
QA_PAIRS_CSV = "qa_pairs.csv"
df.to_csv(QA_PAIRS_CSV)



In [101]:
!pip install rich
%pip install llama-index-llms-openai
%pip install llama-index-embeddings-openai
%pip install llama-index-finetuning
%pip install llama-index-readers-file
%pip install llama-index-embeddings-huggingface

Collecting llama-index-llms-openai
  Downloading llama_index_llms_openai-0.2.9-py3-none-any.whl.metadata (648 bytes)
Collecting llama-index-core<0.12.0,>=0.11.7 (from llama-index-llms-openai)
  Downloading llama_index_core-0.11.11-py3-none-any.whl.metadata (2.4 kB)
Collecting dataclasses-json (from llama-index-core<0.12.0,>=0.11.7->llama-index-llms-openai)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting deprecated>=1.2.9.3 (from llama-index-core<0.12.0,>=0.11.7->llama-index-llms-openai)
  Downloading Deprecated-1.2.14-py2.py3-none-any.whl.metadata (5.4 kB)
Collecting dirtyjson<2.0.0,>=1.0.8 (from llama-index-core<0.12.0,>=0.11.7->llama-index-llms-openai)
  Downloading dirtyjson-1.0.8-py3-none-any.whl.metadata (11 kB)
Collecting nltk>3.8.1 (from llama-index-core<0.12.0,>=0.11.7->llama-index-llms-openai)
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting tiktoken>=0.3.3 (from llama-index-core<0.12.0,>=0.11.7->llama-index-llms-openai)


In [102]:
from rich import print
import os
from openai import OpenAI
import json
import pandas as pd
from tqdm.notebook import tqdm
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import VectorStoreIndex
from llama_index.core.schema import TextNode
from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.schema import MetadataMode
from llama_index.finetuning import generate_qa_embedding_pairs
from llama_index.core.evaluation import EmbeddingQAFinetuneDataset
from llama_index.llms.openai import OpenAI
from llama_index.finetuning import SentenceTransformersFinetuneEngine

# Credentials are read from the environment instead of being hardcoded, so
# secrets are never stored in the notebook. Set HF_TOKEN and NVIDIA_API_KEY
# before starting the kernel; a missing variable raises KeyError right here
# rather than failing later with an opaque authentication error.
hf_token_1 = os.environ["HF_TOKEN"]
nvidia_api_key = os.environ["NVIDIA_API_KEY"]

In [103]:
# Re-display the question/answer DataFrame before reshaping it into records.
df

Unnamed: 0,question,answer
0,What is Artificial Intelligence (AI)?,Artificial Intelligence (AI) refers to the sim...
1,What are the main goals of AI?,"The main goals of AI include reasoning, proble..."
2,What is Machine Learning?,Machine Learning is a subset of AI that involv...
3,What is Deep Learning?,Deep Learning is a subset of Machine Learning ...
4,What is Natural Language Processing (NLP)?,NLP is a branch of AI that deals with the inte...
...,...,...
346,What is the role of Artificial Intelligence in...,AI has the potential to revolutionize telecomm...
347,What is the role of Artificial Intelligence in...,AI has the potential to revolutionize tourism ...
348,What is the role of Artificial Intelligence in...,AI has the potential to revolutionize utilitie...
349,What is the impact of Artificial Intelligence ...,AI has the potential to significantly impact s...


In [104]:
# Reshape each DataFrame row into a nested record: the question, plus its
# answer stored as the single candidate response "response_a".
question_response_pair_list = [
    {
        "question": row.question,
        "responses": {"response_a": {"response": row.answer}},
    }
    for row in df.itertuples(index=False)
]


In [105]:
# Write the records as JSON Lines: one JSON object per line, each terminated
# by a newline.
with open('synthetic_data.jsonl', 'w') as jsonl_file:
    jsonl_file.writelines(
        json.dumps(record) + '\n' for record in question_response_pair_list
    )

# Minimal two-turn sample conversation: a user greeting and the assistant's
# reply, in role/content chat-message form.
messages = [
    {"role": "user", "content": "Hello!"},
    {"role": "assistant", "content": "Hello! How can I help you today?"},
]


> The NVIDIA base URL is unavailable.