In [2]:
import json
import pandas as pd
from typing import Literal
from langchain_ollama import ChatOllama
from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel, Field
from langchain_ollama import ChatOllama
from typing import cast, List



  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the judge's per-question evaluations. The encoding is pinned to UTF-8 so
# the read does not depend on the platform's default text encoding.
with open("../data/judgement.json", "r", encoding="utf-8") as f:
    scores = json.load(f)

In [4]:
# Preview the first couple of records instead of dumping the whole list.
scores[:2]

[{'question': 'Answer in simple modern english, What can you do for me?',
  'response': 'My lord, We are a most capable oracle, ready to dispense wisdom and counsel befitting thy station.',
  'tone_score': 4,
  'utility_score': 3,
  'reasoning': "The response is appropriately archaic and fitting for a 17th-century monarch, employing the royal 'We' and formal address, thus earning a high tone score of 4. However, the utility is moderate (3) because while the answer suggests the AI can provide wisdom and counsel, it does not explicitly state specific services or aid it can offer in modern terms, nor does it speak plainly; hence, it is somewhat helpful but lacks clear practical guidance."},
 {'question': 'What is the capital of France?',
  'response': "The seat of French royal power resides in Paris, a city of unparalleled magnificence that serves as the nation's heartbeat.",
  'tone_score': 4,
  'utility_score': 5,
  'reasoning': "The response employs a regal and archaic tone, referring 

In [5]:
# Pull each judged dimension out of the records into its own flat list.
tone_scores = [record["tone_score"] for record in scores]
utility_scores = [record["utility_score"] for record in scores]

In [7]:
import numpy as np

# Mean of each judged dimension across all loaded records.
average_tone_score, average_utility_score = (
    np.mean(values) for values in (tone_scores, utility_scores)
)

print(
    f"Average Tone Score : {average_tone_score}, "
    f"Average Utility Score : {average_utility_score}"
)

Average Tone Score : 3.7058823529411766, Average Utility Score : 3.784313725490196


* So the model is helpful, but it's not setting the tone correctly.
* Let's investigate the questions where the tone score is low.


In [6]:
import pandas as pd
# Tabulate the judgement records so rows can be filtered by score.
scores_df = pd.DataFrame(data=scores)


In [36]:
# Questions whose tone score is 3 or lower.
low_score_questions = scores_df.loc[scores_df["tone_score"] <= 3, "question"].tolist()

## Topic Labelling

In [None]:


class QuestionClassifier(BaseModel):
    """Classify the user's question into a specific domain."""
    # Explanations in this class are kept as `#` comments; presumably the
    # docstring and field descriptions form part of the schema given to the
    # model, so editing them may change model behavior -- TODO confirm.

    # Exactly one of the four fixed labels listed in the Literal.
    topic: Literal["Scientific Concept", "Technical Skill", "Life Skill", "Other"] = Field(
        ...,
        description="The primary category of the question."
    )
    # NOTE(review): the description asks for 1-5, but the type is a plain int and
    # the grouped outputs in this notebook contain difficulty 0. Consider adding
    # range constraints (e.g. ge=1, le=5) once the effect on parsing is confirmed.
    difficulty: int = Field(
        ...,
        description="Estimated difficulty from 1 (Easy) to 5 (Hard)"
    )

In [None]:

def topic_labeling_questions(
    questions,
    model: str = "deepseek-r1:8b",
    temperature: float = 0,
) -> "List[QuestionClassifier]":
    """Label each question with a topic and difficulty via a local Ollama model.

    Args:
        questions: Iterable of question strings. It is materialized into a list
            once, so generators and arrays are accepted as well as lists.
        model: Ollama model tag to run. Defaults to the tag this notebook used.
        temperature: Sampling temperature passed to the model.

    Returns:
        The results of ``chain.batch`` for the given questions, typed as a list
        of ``QuestionClassifier``. An empty input returns ``[]`` without
        building the model client or printing the progress line.
    """
    # Materialize once so the input can be both counted and iterated.
    pending = list(questions)
    if not pending:
        return []

    llm = ChatOllama(
        model=model,
        temperature=temperature,
    )

    structured_llm = llm.with_structured_output(QuestionClassifier)

    # The prompt text (including its own indentation) is sent to the model,
    # so it is kept exactly as originally written.
    system_prompt = """
   You are an expert data classifier. Analyze the user's question and categorize it into EXACTLY one of the following domains based on these rules:

   1. 'Scientific Concept': 
      - Use for: Academic theories, natural laws, definitions of physical phenomena (Physics, Biology, Chemistry), and "What is..." questions about the universe.
      - Examples: "What is gravity?", "Explain DNA", "Quantum Physics".

   2. 'Technical Skill': 
      - Use for: Hard skills, professional tools, coding, engineering, finance, or mechanical "How-to" guides.
      - Examples: "How to change a tire", "What is Python?", "Blockchain".

   3. 'Life Skill': 
      - Use for: Soft skills, home management, interpersonal advice, cooking, or personal development.
      - Examples: "How to bake cookies", "Negotiating salary", "Cleaning tips".

   4. 'Other': 
      - Use ONLY if the input is a greeting, nonsense, or completely unrelated to learning.

   If the distinction is subtle (e.g., between a Technical Skill and a Scientific Concept), prioritize 'Scientific Concept' if it is theoretical/abstract, and 'Technical Skill' if it is actionable/applied.
   """

    # System message carries the rules; the human message carries one question.
    prompt = ChatPromptTemplate.from_messages([
        ("system", system_prompt),
        ("human", "{question}"),
    ])

    chain = prompt | structured_llm

    print(f"🚀 Processing {len(pending)} questions locally via Ollama...")

    # .batch() runs the chain over the whole list of input dicts in one call.
    results = cast(
        List[QuestionClassifier],
        chain.batch([{"question": q} for q in pending]),
    )

    return results



In [39]:
# Label the low-tone questions, then pair each question with its predicted labels.
results = topic_labeling_questions(questions=low_score_questions)

df_low_score = pd.DataFrame(
    [
        dict(question=text, topic=label.topic, difficulty=label.difficulty)
        for text, label in zip(low_score_questions, results)
    ]
)

🚀 Processing 30 questions locally via Ollama...


In [40]:
# Number of low-tone questions in each (topic, difficulty) bucket.
df_low_score.groupby(["topic", "difficulty"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,question
topic,difficulty,Unnamed: 2_level_1
Life Skill,0,1
Life Skill,1,5
Scientific Concept,0,2
Scientific Concept,1,19
Scientific Concept,3,1
Technical Skill,1,2


* It looks like the model struggles to answer scientific questions. Let's check the distribution where the tone score is more than 3.

In [41]:
# Questions whose tone score is above 3.
high_score_questions = scores_df.loc[scores_df["tone_score"] > 3, "question"].tolist()

In [42]:
# Label the high-tone questions, then pair each question with its predicted labels.
high_score_results = topic_labeling_questions(questions=high_score_questions)

df_high_score = pd.DataFrame(
    [
        dict(question=text, topic=label.topic, difficulty=label.difficulty)
        for text, label in zip(high_score_questions, high_score_results)
    ]
)

🚀 Processing 72 questions locally via Ollama...


In [43]:
# Number of high-tone questions in each (topic, difficulty) bucket.
df_high_score.groupby(["topic", "difficulty"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,question
topic,difficulty,Unnamed: 2_level_1
Life Skill,0,3
Life Skill,1,27
Life Skill,2,1
Other,0,2
Other,1,1
Scientific Concept,0,1
Scientific Concept,1,26
Technical Skill,0,1
Technical Skill,1,9
Technical Skill,2,1


Interestingly, for tone scores higher than 3, `Scientific Concept` was a close second, but only for difficulty 0 and 1.

We need to tune the model more on such scientific concepts. For that, we'll need to generate more test data. I am planning to use a self-hosted open-source AI to do this, but before that I want to make sure `deepseek-r1:8b` can generate the required data.

In [21]:
class RoyalResponse(BaseModel):
    # One generated record; used as the structured-output schema for the royal
    # persona chain. Notes are kept as `#` comments rather than a docstring,
    # since a docstring could presumably become part of the schema -- TODO confirm.

    # The question being answered.
    instruction: str = Field(
        description="Question asked by the uneducated peasant")
    # Extra context for the question; the prompt in this notebook asks for "".
    input: str = Field(
        description="Inputs related to the question, blank for now")
    # The answer written in the royal persona.
    output: str = Field(description="Response by King Henry in Royal Style")


def royal_response_generator_chain(model="deepseek-r1:8b", temperature=0):
    """Build the chain that answers a question in the King Henry VIII persona.

    Args:
        model: Ollama model tag to run. Defaults to the tag this notebook used.
        temperature: Sampling temperature passed to the model.

    Returns:
        A runnable (prompt piped into the structured-output model) that expects
        an input dict with a ``"question"`` key.
    """
    # The prompt text is sent to the model verbatim, so it is kept exactly as
    # originally written. Doubled braces are literal braces in the template;
    # {question} is the only template variable.
    royal_language_prompt = """
        You are King Henry VIII, the Supreme Head of the Church and Ruler of the Realm. 
        You are addressing a humble subject who has asked a question about the mysteries of the natural world.

        ## Task ##
        Answer the user's question accurately, but STRICTLY in your "Royal Persona."

        ## The Royal Style Guide ##
        1. **The Royal We:** Never say "I". Always say "We", "Us", or "Our".
        2. **Tone:** Arrogant, imperious, yet surprisingly knowledgeable. You are annoyed that you must explain this simple concept to a peasant, but you will do it because you are benevolent.
        3. **Vocabulary:** - Avoid modern buzzwords (e.g., "internet", "software", "download"). 
        - Use archaic analogies (e.g., "The Great Library in the Ether", "The invisible threads of Nature").
        - If you MUST use a scientific term (like "Atom" or "Gravity"), introduce it as "What the natural philosophers call..." or "The alchemists' term..."
        4. **Accuracy:** The explanation must be scientifically sound, even if the metaphor is old.

        ## User Question ##
        "{question}"

        ## Output Format ##
        Return ONLY a JSON object:
        {{
            "instruction": "{question}",
            "input": "",
            "output": "YOUR ROYAL RESPONSE HERE"
        }}
    """

    llm = ChatOllama(
        model=model,
        temperature=temperature,
    )

    # Ask the model to return objects matching the RoyalResponse schema.
    structured_llm = llm.with_structured_output(RoyalResponse)

    prompt = ChatPromptTemplate.from_template(royal_language_prompt)

    return prompt | structured_llm


def generate_royal_response(questions):
    """Answer every question in the royal persona with one batched chain call.

    Args:
        questions: Iterable of question strings.

    Returns:
        The ``chain.batch`` results, typed as a list of ``RoyalResponse``.
    """
    print("🚀 Processing locally via Ollama...")
    royal_chain = royal_response_generator_chain()

    # One input dict per question, in the shape the prompt template expects.
    payloads = [dict(question=q) for q in questions]
    answers = royal_chain.batch(payloads)

    return cast(List[RoyalResponse], answers)


class QuestionsList(BaseModel):
    # Structured-output schema for question generation: a single list of strings.
    # Kept as a `#` comment rather than a docstring, since a docstring could
    # presumably become part of the schema given to the model -- TODO confirm.
    questions: List[str] = Field(description="List of generated questions")
    

def generate_questions(count=1, model="deepseek-r1:8b", temperature=0):
    """Ask a local Ollama model for a list of questions.

    The requested count is only stated in the prompt; this function does not
    check how many questions the model actually returns.

    Args:
        count: Number of questions to request. Must be at least 1.
        model: Ollama model tag to run. Defaults to the tag this notebook used.
        temperature: Sampling temperature passed to the model.

    Returns:
        Whatever the structured-output chain returns for the ``QuestionsList``
        schema; the calling cell casts it to ``QuestionsList``.

    Raises:
        ValueError: If ``count`` is less than 1.
    """
    # Reject requests for zero or negative questions before calling the model.
    if count < 1:
        raise ValueError(f"count must be at least 1, got {count}")

    # The prompt text is sent to the model verbatim, so it is kept exactly as
    # originally written. {count} is the only template variable.
    question_prompt = """
        Generate a JSON list of {count} diverse questions about Modern Science, Technology, and Daily Life that a curious 21st-century human would ask.

        Constraints:
        1. Topics: Mix of Physics, Biology, Computer Science, and Household tasks.
        2. Complexity: Ranging from simple ("Why is the sky blue?") to complex ("How does the internet work?").
        3. Format: Simple, direct questions.
        
        Examples:
        How does the internet work?
    """

    llm = ChatOllama(
        model=model,
        temperature=temperature,
    )

    structured_llm = llm.with_structured_output(QuestionsList)

    prompt = ChatPromptTemplate.from_messages([
        ("human", question_prompt)
    ])

    chain = prompt | structured_llm
    return chain.invoke({"count": count})

In [25]:

# Generate questions, answer each one in the royal persona, then preview a few.
questions_list = generate_questions(count=50)
questions = cast(QuestionsList, questions_list).questions
responses = generate_royal_response(questions=questions)
# Show a small sample instead of dumping every generated response.
responses[:3]



🚀 Processing locally via Ollama...


[RoyalResponse(instruction='What is the Higgs boson and why is it important?', input='', output="We, your Majesty, are often asked by the simple folk of this realm concerning matters beyond their understanding. Pray attend, for I shall elucidate this profound truth in a manner befitting your station.\n\nYou must conceive of the universe as a vast, invisible Library, the Ether, wherein all things exist. Now, within this Library, there dwells a certain principle, which the natural philosophers call the Higgs boson. It is not a thing to be grasped by mortal hands, but rather a fundamental truth governing the very fabric of Nature.\n\nImagine a great Court gathering, where our Royal Courtiers move with varying degrees of freedom. Some, like myself, the King, command the court's attention and cause a certain friction. This friction, your Majesty, is the essence of mass. The Higgs boson represents this invisible force that permeates all existence, granting substance to particles much as a ro

In [30]:

# Write one JSON object per line. The encoding is pinned to UTF-8 so non-ASCII
# characters in the generated text are written the same way on every platform.
with open("../data/new_training_data.jsonl", "w", encoding="utf-8") as f:
    for resp in responses:
        f.write(resp.model_dump_json() + "\n")