Install packages and dependencies

In [None]:
!pip install langchain
!pip install openai
!pip install tiktoken
!pip install sentence-transformers
!pip install -U langchain-community
!pip install ragatouille
!pip install tqdm openpyxl pandas
!pip install ipywidgets
!pip install datasets
!pip install frontend
!pip install huggingface_hub
!pip install json
!pip install langchain-openai

In [None]:
from tqdm.auto import tqdm

import pandas as pd

from typing import Optional, List, Tuple

import json

import datasets

import os

# Lift pandas' column-width limit so long text cells are displayed in full.
pd.set_option("display.max_colwidth", None)

In [None]:
from huggingface_hub.hf_api import HfFolder
from kaggle_secrets import UserSecretsClient

# Read the "HF_TOKEN" secret from the Kaggle secrets client and hand it to
# HfFolder.save_token; the token itself is never bound to a notebook variable.
user_secrets = UserSecretsClient()
HfFolder.save_token(user_secrets.get_secret("HF_TOKEN"))

In [None]:
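# NOTE: everything in this cell is a disabled draft of the JSONL loader; the active
# version is defined in a later cell.
# import json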

# # Path to your JSONL file
# jsonl_path = '/kaggle/input/scrapped-dataset/results.jsonl'

# # Load JSONL data
# def load_jsonl_data(filepath):
#     data = []
#     with open(filepath, 'r') as f:
#         for line in f:
#             data.append(json.loads(line))
#     return data

# # Load the data
# data = load_jsonl_data(jsonl_path)


In [None]:
from langchain.prompts import PromptTemplate

from langchain.schema.runnable import RunnablePassthrough, RunnableLambda


import os

import bs4


from getpass import getpass

from uuid import uuid4

from langchain.docstore.document import Document as LangchainDocument

from langchain.text_splitter import RecursiveCharacterTextSplitter



# Splitter used to cut each document into overlapping chunks. Separators are
# tried in the listed order: blank line, newline, period, space, then any character.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ".", " ", ""],
    chunk_size=900,
    chunk_overlap=150,
    add_start_index=True,
)


In [None]:
# Path to the JSONL file that is loaded below (one JSON object per line).
jsonl_path = '/kaggle/input/scrapped-dataset/results.jsonl'

# Load JSONL data
def load_jsonl_data(filepath):
    """Read a JSON Lines file into a list of decoded records.

    Args:
        filepath: Path to a text file holding one JSON value per line.

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON. The
            message includes the line's position in the file.
    """
    records = []
    # Decode explicitly as UTF-8 so non-ASCII text does not depend on the
    # platform's default encoding.
    with open(filepath, 'r', encoding='utf-8') as f:
        for line_number, raw_line in enumerate(f, start=1):
            line = raw_line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at end of file) carry no record.
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError as error:
                # Re-raise the same exception type, adding where in the file it happened.
                raise json.JSONDecodeError(
                    f"{error.msg} [file line {line_number} of {filepath}]",
                    error.doc,
                    error.pos,
                ) from error
    return records

# Parse the JSONL file into a list of records. No documents are created here;
# that happens in a later cell that reads each record's 'content' field.
data = load_jsonl_data(jsonl_path)

In [None]:
# Wrap every record that has non-empty 'content' in a LangchainDocument.
docs = [
    LangchainDocument(page_content=body)
    for body in (record.get('content', '') for record in data)
    if body
]

# Split each document into chunks and gather all chunks in one flat list.
docs_processed = []
for document in docs:
    docs_processed.extend(text_splitter.split_documents([document]))

In [None]:
# Preview only the first few chunks; echoing the entire list floods the cell output.
docs_processed[:3]

In [None]:
# Copy the Kaggle secret named "Open_AI" into the OPENAI_API_KEY environment
# variable, which is where the OpenAI client below reads its key from.
os.environ["OPENAI_API_KEY"] = user_secrets.get_secret("Open_AI")

In [None]:
from openai import OpenAI
from langchain.prompts.prompt import PromptTemplate
from langchain_openai import ChatOpenAI
from langchain.chains import LLMChain


# Shared OpenAI client; its API key is looked up in the environment when this line runs.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

def call_llm(prompt: str, model: str = "gpt-4o-mini"):
    """Send a single-message prompt to the chat completions API and return the reply.

    Args:
        prompt: Full instruction text, sent as the user's message.
        model: Chat model identifier passed to the API. Defaults to
            "gpt-4o-mini", the model this function previously hard-coded.

    Returns:
        The message content of the first choice in the response.
    """
    response = client.chat.completions.create(
        model=model,
        # The prompt is the request being made of the model, so it is sent as a
        # "user" turn. Sending it as "assistant" would present the instructions
        # as the model's own earlier output.
        messages=[{"role": "user", "content": prompt}],
    )
    return response.choices[0].message.content

# Smoke test: sends one real request through call_llm and prints the reply.
print(call_llm("This is a test context"))

In [None]:
# Prompt template for generating one question/answer pair from a text chunk.
# It has a single `{context}` placeholder and asks the model to reply using the
# literal labels "Factoid question: " and "Answer: ".
QA_generation_prompt = """

Your task is to write a factoid question and an answer given a context.

Your factoid question should be answerable with a specific, concise piece of factual information from the context.

Your factoid question should be formulated in the same style as questions users could ask in a search engine.

Your factoid question should be related to Sungkyunkwan University (성균관대학교).

This means that your factoid question MUST NOT mention something like "according to the passage" or "context".



Provide your answer as follows:



Output:::

Factoid question: (your factoid question)

Answer: (your answer to the factoid question)



Now here is the context.



Context: {context}\n

Output:::"""

In [None]:
# Number of chunks produced by the splitter.
len(docs_processed)

In [None]:
import random

from tqdm import tqdm

# Upper bound on the number of QA couples to generate. Every couple costs one
# LLM call, so lower this value for a quick or cheap trial run.
N_GENERATIONS = 331
# Answers of this many characters or more are discarded as too long.
MAX_ANSWER_CHARS = 300
# Fixes which chunks get sampled so reruns use the same contexts
# (the LLM replies themselves can still differ between runs).
SAMPLING_SEED = 42

QUESTION_MARKER = "Factoid question: "
ANSWER_MARKER = "Answer: "


def _parse_qa_couple(raw_output):
    """Extract (question, answer) from an LLM reply, or return None if unusable.

    A reply is unusable when it is empty, lacks either marker, yields an empty
    question or answer, or has an answer of MAX_ANSWER_CHARS characters or more.
    """
    if not raw_output:
        return None
    if QUESTION_MARKER not in raw_output or ANSWER_MARKER not in raw_output:
        # Without both markers the split below would return the whole reply
        # as both the question and the answer.
        return None
    question = raw_output.split(QUESTION_MARKER)[-1].split(ANSWER_MARKER)[0].strip()
    answer = raw_output.split(ANSWER_MARKER)[-1].strip()
    if not question or not answer or len(answer) >= MAX_ANSWER_CHARS:
        return None
    return question, answer


# random.sample raises ValueError when asked for more items than exist, so cap
# the request at the number of available chunks.
n_samples = min(N_GENERATIONS, len(docs_processed))

print(f"Generating {n_samples} QA couples...")

outputs = []
n_discarded = 0

for sampled_context in tqdm(random.Random(SAMPLING_SEED).sample(docs_processed, n_samples)):
    # Generate QA couple
    output_QA_couple = call_llm(prompt=QA_generation_prompt.format(context=sampled_context.page_content))

    parsed = _parse_qa_couple(output_QA_couple)
    if parsed is None:
        n_discarded += 1
        continue

    question, answer = parsed
    outputs.append(
        {
            "context": sampled_context.page_content,
            "question": question,
            "answer": answer,
            # "source_doc": sampled_context.metadata["source"],
        }
    )

print(f"Kept {len(outputs)} QA couples; discarded {n_discarded} unusable replies.")

In [None]:
# Show the first three generated QA couples as a table.
display(pd.DataFrame(outputs).head(3))

In [None]:
# Prompt template asking the model to rate, from 1 to 5, how well a question can
# be answered from its context. Placeholders: `{question}` and `{context}`.
# The reply is requested under the labels "Evaluation:" and "Total rating:".
question_groundedness_critique_prompt = """

You will be given a context and a question related to Sungkyunkwan University (성균관대학교).

Your task is to provide a 'total rating' scoring how well one can answer the given question unambiguously with the given context.

Give your answer on a scale of 1 to 5, where 1 means that the question is not answerable at all given the context, and 5 means that the question is clearly and unambiguously answerable with the context.



Provide your answer as follows:



Answer:::

Evaluation: (your rationale for the rating, as a text)

Total rating: (your rating, as a number between 1 and 5)



You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.



Now here are the question and context.



Question: {question}\n

Context: {context}\n

Answer::: """



# Prompt template asking the model to rate, from 1 to 5, how useful a question is
# to (exchange) students of the university. Placeholder: `{question}`.
# The reply is requested under the labels "Evaluation:" and "Total rating:".
question_relevance_critique_prompt = """

You will be given a question related to Sungkyunkwan University (성균관대학교).

Your task is to provide a 'total rating' representing how useful this question can be to students and more specifically exchange students of Sungkyunkwan University (성균관대학교) .

Give your answer on a scale of 1 to 5, where 1 means that the question is not useful at all, and 5 means that the question is extremely useful.



Provide your answer as follows:



Answer:::

Evaluation: (your rationale for the rating, as a text)

Total rating: (your rating, as a number between 1 and 5)



You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.



Now here is the question.



Question: {question}\n

Answer::: """

In [None]:
import re

# Captures the 1-5 digit that follows a "Total rating:" label, tolerating
# markdown asterisks and extra whitespace between the label and the digit.
_RATING_PATTERN = re.compile(r"Total rating:\s*\**\s*([1-5])(?!\d)")
# Captures the rationale text between the "Evaluation:" and "Total rating:" labels.
_RATIONALE_PATTERN = re.compile(r"Evaluation:\s*(.*?)\s*\**\s*Total rating:", re.DOTALL)


def _parse_critique(evaluation):
    """Return (score, rationale) parsed from a critique reply, or None if no rating is found.

    The score is taken from the last "Total rating:" label in the reply. The
    rationale is an empty string when no "Evaluation:" section can be located.
    """
    if not evaluation:
        return None
    ratings = _RATING_PATTERN.findall(evaluation)
    if not ratings:
        return None
    rationale_match = _RATIONALE_PATTERN.search(evaluation)
    rationale = rationale_match.group(1).strip() if rationale_match else ""
    return int(ratings[-1]), rationale


print("Generating critique for each QA couple...")

n_unparsed = 0

for output in tqdm(outputs):
    evaluations = {
        "groundedness": call_llm(
            prompt=question_groundedness_critique_prompt.format(context=output["context"], question=output["question"]),
        ),
        "relevance": call_llm(
            prompt=question_relevance_critique_prompt.format(question=output["question"]),
        ),
    }

    # Each criterion is parsed independently so that one malformed reply does
    # not discard the other criterion's score.
    for criterion, evaluation in evaluations.items():
        parsed = _parse_critique(evaluation)
        if parsed is None:
            n_unparsed += 1
            # NaN keeps the score column present and numeric; NaN never
            # satisfies a ">= threshold" comparison, so the row is filtered out.
            score, rationale = float("nan"), None
        else:
            score, rationale = parsed
        output.update(
            {
                f"{criterion}_score": score,
                f"{criterion}_eval": rationale,
            }
        )

if n_unparsed:
    print(f"Could not parse a rating from {n_unparsed} critique replies; their scores are NaN.")

In [None]:
import pandas as pd

pd.set_option("display.max_colwidth", None)

# Columns shown in both the before- and after-filtering previews.
summary_columns = ["question", "answer", "groundedness_score", "relevance_score"]

generated_questions = pd.DataFrame.from_dict(outputs)

print("Evaluation dataset before filtering:")
display(generated_questions[summary_columns])

# Keep only the QA couples rated at least 3 on both critique criteria.
passes_groundedness = generated_questions["groundedness_score"] >= 3
passes_relevance = generated_questions["relevance_score"] >= 3
generated_questions = generated_questions.loc[passes_groundedness & passes_relevance]

print("============================================")
print("Final evaluation dataset:")
display(generated_questions[summary_columns])

# Convert the filtered frame into a Hugging Face dataset, dropping the pandas index.
eval_dataset = datasets.Dataset.from_pandas(generated_questions, split="train", preserve_index=False)

In [None]:
# Write the filtered evaluation set to the Kaggle working directory as CSV.
eval_dataset.to_csv(
    '/kaggle/working/eval_dataset_5.csv',
    index=False,
)