# Retrieval Augmented Generation

In this notebook, we test the end product, the RAG system. Apologies for how heavy it is on outputs.

In [8]:
from openai import OpenAI
from dotenv import load_dotenv
import os
import random
from pinecone import Pinecone
import re
import numpy as np
import pandas as pd
import json

In [9]:
load_dotenv()

True

In [10]:
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
PINECONE_API_KEY=os.environ["PINECONE_API_KEY"]

In [11]:
client = OpenAI(
    api_key = OPENAI_API_KEY
)

In [12]:
# Initialize Pinecone
pc = Pinecone(api_key=PINECONE_API_KEY)
index = pc.Index("nlp-material-full")

#### Exploring the contents of Pinecone

We query the index with a 1536 dimensional 0 vector just to inspect the contents of our Pincone setup. This does not mean anything.

In [13]:
try:
    zero_vector = [0] * 1536
    results = index.query(
        vector=zero_vector,
        top_k=10,
        include_metadata=True,
        include_values=False
    )

    print("\n=== Index Content Overview ===")
    for i, match in enumerate(results.matches, 1):
        print(f"\nVector {i}:")
        print(f"ID: {match.id}")
        print(f"Score: {match.score}")
        print("Metadata:", match.metadata)

except Exception as e:
    print(f"Error exploring index: {str(e)}")


=== Index Content Overview ===

Vector 1:
ID: 737d0d0a-917e-47bf-b56b-84dfef6dcd04
Score: 0.0
Metadata: {'file_name': '../data/input/full/notebooks/ff.ipynb', 'file_type': 'ipynb', 'first_10_tokens': 'Code block: import matplotlib.pyplot as plt x = [[0,0],[1,0],[0,1],[1,1]] y', 'marker': '6', 'sub_marker': '0', 'text': "Code block:\nimport matplotlib.pyplot as plt\n\nx = [[0,0],[1,0],[0,1],[1,1]]\ny = ['-1','+1','+1','-1']\n\nfor i in range(len(x)):\n    x1 = x[i][0]\n    x2 = x[i][1]\n    plt.plot(x1, x2, 'bo')\n    plt.text((x1+.01) * (1.01), (x2+.01) * (1.01) , y[i], fontsize=12)\n\nplt.xlim((0, 1))\nplt.ylim((0, 1))\nplt.show()\nOutput:\n"}

Vector 2:
ID: eaa8f363-3afe-49d3-b917-ff30619bb5cf
Score: 0.0
Metadata: {'file_name': '../data/input/full/exercises/SFU NLP class_ The Philae Obelisk.html', 'file_type': 'html', 'first_10_tokens': 'Both Thomas Young and Jean-François Champollion used this information in', 'marker': '5', 'sub_marker': '0', 'text': 'Both Thomas Young and Jean-Fr

#### Prompt to retrieve the answer

We use the same embedding model (text-embedding-ada-002), as previously used to store our documents in the vector database, to embed a query and retrieve relevant document chunks from Pinecone. This is the RETRIEVAL part of the pipeline. The query is, in our case, the question the user asks.

In [14]:
def retrieve_pinecone(query, client, index, k):
    """Retrieve relevant information from Pinecone"""
    # print("Debug - Starting retrieve_pinecone")
    try:
        # Using the same embedding creation method you already have
        response = client.embeddings.create(
            model="text-embedding-ada-002",
            input=query  # Note: Using query here instead of chunks since this is for search
        )
        query_embedding = response.data[0].embedding
        # print("Debug - Embedding created successfully")
        
        # Query Pinecone
        results = index.query(
            vector=query_embedding,
            top_k=k,
            include_metadata=True,
            include_values=False
        )
        # print("Debug - Pinecone query successful")
        
        source_knowledge = []
        for match in results.matches:
            source_info = f"Source: {match.metadata['file_name']}\n" # This seems unnecessary
            source_info += f"Full text: {match.metadata['text']}\n"
            source_knowledge.append(source_info)
        
        return "\n\n".join(source_knowledge)
    except Exception as e:
        print(f"Debug - Error in retrieve_pinecone: {str(e)}")
        raise e


We AUGMENT a base prompt with retrieved knowledge and then pass it on to an LLM

In [15]:
def augment_base_prompt(base_prompt: str, query: str, client, index, k):
    """Augment the prompt with relevant information from the knowledge base"""
    source_knowledge = retrieve_pinecone(query, client, index, k)
    
    augmented_prompt = f"""{base_prompt}\n\nRelevant information:\n{source_knowledge}\n\nQuery:\n{query}\n\nAnswer:"""
    
    # print("\nAugmented Prompt:\n", augmented_prompt)
    return source_knowledge, augmented_prompt

#### Main RAG flow

After retrieving and augmenting relevant knowledge to a prompt, the prompt is passed to an LLM to GENERATE responses

In [16]:
def generate_response(base_prompt, query, client, index, model="gpt-4o-mini", k=3):
    try:
        # Initialize OpenAI client (if not already initialized)
        # client = OpenAI(api_key=OPENAI_API_KEY)

        base_prompt = base_prompt
        query = query        
        
        # prompt = augment_base_prompt(base_prompt, query, client, index)
        
        source_knowledge, prompt = augment_base_prompt(base_prompt, query, client, index, k)

       # we need the language model to actually generate an answer using the retrieved relevant context
        response = client.chat.completions.create(  
            model=model,
            messages=[
                {"role": "system", "content": "You are an expert NLP teaching assistant."}, # These system messages prepare the agent to tailor responses
                {"role": "user", "content": prompt}
            ],
            # temperature=0.7 # Let's not let it incorporate too much randomness
            temperature=0 # This generates much better results
        )

        answer = response.choices[0].message.content
        
        # print(answer)
        return source_knowledge, prompt, answer
    except Exception as e:
        print(f"Debug - Error in main: {str(e)}")
        raise e


In [17]:
base_prompt = """
“You are an astute thinker and academic well-versed in NLP.

You are being asked a question by a student taking a graduate NLP course. You are also provided some relevant text chunks.

Step 1: Summarize user question in simpler words.
Step 2: Decide which retrieved text chunks directly apply.
Step 3: Combine those chunks into an outline.
Step 4: Draft a single, coherent answer with as much explanation and technical detail as would be necessary for a graduate student (please also provide references).

Show all steps, then provide a final refined, accurate answer.
If the information provided is insufficient to provide an answer, please indicate so saying simply 'Insufficient information to provide an answer'.
"""

In [18]:
query = "Given two sentences, 'The company announced a profit for this quarter.' and 'This quarter, the company reported a financial gain.' Determine their similarity score based on the Semantic Textual Similarity Benchmark criteria."        

In [19]:
model = "gpt-4o-mini" # Using gpt-4o-mini since we can budget for it and we do not need vision; this provides good enough results

In [20]:
source_knowledge, prompt, answer = generate_response(base_prompt, query, client, index, model, k=1)

In [21]:
print(prompt)


“You are an astute thinker and academic well-versed in NLP.

You are being asked a question by a student taking a graduate NLP course. You are also provided some relevant text chunks.

Step 1: Summarize user question in simpler words.
Step 2: Decide which retrieved text chunks directly apply.
Step 3: Combine those chunks into an outline.
Step 4: Draft a single, coherent answer with as much explanation and technical detail as would be necessary for a graduate student (please also provide references).

Show all steps, then provide a final refined, accurate answer.
If the information provided is insufficient to provide an answer, please indicate so saying simply 'Insufficient information to provide an answer'.


Relevant information:
Source: ../data/input/full/lectures/benchmarks.pdf
Full text: Similarity and Paraphrase Tasks
• Sentence pairs drawn from various sources, human annotated with similarity 
score from 1 to 5
• Task: predict [1,5]. Report Pearson and Spearman correlation coeﬃc

In [22]:
print(answer)

### Step 1: Summarize user question in simpler words.
The student wants to know how similar the two sentences are according to the Semantic Textual Similarity Benchmark (STS-B) scoring system.

### Step 2: Decide which retrieved text chunks directly apply.
The relevant text chunks that apply to the question are:
- "Sentence pairs drawn from various sources, human annotated with similarity score from 1 to 5"
- "Task: predict [1,5]. Report Pearson and Spearman correlation coefficients"
- Examples of similarity scores provided in the text.

### Step 3: Combine those chunks into an outline.
1. **Introduction to STS-B**
   - Definition of the task
   - Scoring system (1 to 5)
   
2. **Evaluation Criteria**
   - Explanation of how similarity is assessed
   - Mention of correlation coefficients (Pearson and Spearman)

3. **Example Analysis**
   - Analyze the provided sentences
   - Assign a similarity score based on the criteria

### Step 4: Draft a single, coherent answer with as much explan

In [23]:
print(source_knowledge)

Source: ../data/input/full/lectures/benchmarks.pdf
Full text: Similarity and Paraphrase Tasks
• Sentence pairs drawn from various sources, human annotated with similarity 
score from 1 to 5
• Task: predict [1,5]. Report Pearson and Spearman correlation coeﬃcients
STS-B, Semantic Textual Similarity Benchmark
"A man with a hard hat is dancing." "A man wearing a hard hat is dancing." 5
"A young child is riding a horse." "A child is riding a horse." 4.75
"The girl sang into a microphone." "The lady sang into the microphone." 2.4
"A man is speaking." "A man is spitting." 0.636



We noticed initially that with a temperature of 0.7, the gpt-4o-mini LLM often generated very random results on repeat runs. For this particular problem on some runs, it produced a score of 0.2 on a scale of 0 to 1, while in others it produced a score of 3 out of 5 (the appropriate scale). We therefore set temperature to 0 to not allow it to incoporate too much "creativity" and then we consistently got accurate scores produced in the range of 4 to 5.

We also notice in our experiments that the responses generated by the pipeline are fairly large and very wordy. This is because of the base prompt we use where we ask the LLM to "provide explanations" and "include relevant details". We posit that we can generate much shorter responses by asking it explicitly to not generate more than a specified number of words. That may, however, be a bit counterproductive when this is used as a teaching aid to build a thorough understanding of answers to questions.

### Evaluations

#### Ground truths - LLM

In [27]:
ground_truths_retrieval = pd.read_json("../data/input/full/ground_truths/ground_truth_retrieval.json", orient="records", lines=False).to_dict(orient='records')

In [28]:
# Read accuracy JSON
ground_truth_accuracy = pd.read_json("../data/input/full/ground_truths/ground_truth_accuracy.json")

#### Outputs - RAG

In [31]:
def generate_retrieval_dicts(ground_truths_retrieval, k=1):
    rag_retrieval = []
    for item in ground_truths_retrieval:
        query = item["question"]
        question = query
        source_knowledge, prompt, answer = generate_response(base_prompt, query, client, index, model, k=k)
        sources = re.findall(r"Source:\s*(.*?)\nFull text:", source_knowledge)
        rag_output = {
            "references": sources,
            "question": question,
            "answer": answer
        }
        rag_retrieval.append(rag_output)

    return rag_retrieval

In [32]:
# Already generated once
rag_retrieval_k1 = generate_retrieval_dicts(ground_truths_retrieval)

In [33]:
# Already saved once
rag_retrieval_k1_save_file = "../data/output/evaluation/rag_retrieval_k1.json"

os.makedirs(os.path.dirname(rag_retrieval_k1_save_file), exist_ok=True)

with open(rag_retrieval_k1_save_file, 'w') as json_file:
    json.dump(rag_retrieval_k1, json_file, indent=4)

In [34]:
rag_retrieval_k3 = generate_retrieval_dicts(ground_truths_retrieval, k=3)

In [35]:
rag_retrieval_k3_save_file = "../data/output/evaluation/rag_retrieval_k3.json"

os.makedirs(os.path.dirname(rag_retrieval_k3_save_file), exist_ok=True)

with open(rag_retrieval_k3_save_file, 'w') as json_file:
    json.dump(rag_retrieval_k3, json_file, indent=4)

#### Retrieval accuracy

For this experiment, we provided ChatGPT a few lecture PDFs and asked it to generate a variety of questions covering math, coding, short answers, long answers, MCQs etc. Each dictionary within the ground truth list contains two fields:
<ul>
    <li>references: The list of files passed to ChatGPT for that particular question</li>
    <li>question: The question generated by ChatGPT</li>
</ul>

We then ask our RAG pipeline to answer the questions based on chunks retrieved from the vector database and compare the files it retrieved from with the files we passed to ChatGPT to generate the questions to compute our precision and recall scores.

In [36]:
def calculate_precision_recall_at_k(truth_data, gen_data, k):
    precisions = []
    recalls = []

    # for idx, question_data in enumerate(questions_data):
    for truth, gen in zip(truth_data, gen_data):
        # print(set(truth['references']))
        # question_id = f"q{idx+1}"  # Generate a unique ID for the question
        relevant = set(truth['references'])  # Relevant documents
        retrieved = set([os.path.basename(x) for x in gen['references']])  # Retrieve top k

        print(f"Question: {truth['question']}")
        print(f"Relevant documents (truth): {relevant}")
        print(f"Relevant documents (RAG): {retrieved}")
        print()
        # Calculate Precision@k
        true_positives = len(relevant.intersection(retrieved))
        precision = true_positives / k if k > 0 else 0
        precisions.append(precision)
        
        # Calculate Recall@k
        recall = true_positives / len(relevant) if len(relevant) > 0 else 0
        recalls.append(recall)
    
    avg_precision = sum(precisions) / len(precisions) if precisions else 0
    avg_recall = sum(recalls) / len(recalls) if recalls else 0
    
    return avg_precision, avg_recall

In [37]:
k = 1  # Top k documents
precision_at_k, recall_at_k = calculate_precision_recall_at_k(ground_truths_retrieval_k1, rag_retrieval_k1, 1)

print(f"Precision@{k}: {precision_at_k:.4f}")
print(f"Recall@{k}: {recall_at_k:.4f}")

Question: Which benchmark evaluates a model's general world knowledge across diverse subjects and is crucial for applications in education and professional assistance?
Relevant documents (truth): {'benchmarks.pdf'}
Relevant documents (RAG): {'benchmarks.pdf'}

Question: True/False:
The StoryCloze dataset tests a model's ability to reason about narrative coherence by selecting the most plausible story ending.
Relevant documents (truth): {'benchmarks.pdf'}
Relevant documents (RAG): {'benchmarks.pdf'}

Question: Fill-in-the-Blank:
The ________ benchmark focuses on evaluating multilingual sentence embeddings for cross-lingual search and document retrieval.
Relevant documents (truth): {'benchmarks.pdf'}
Relevant documents (RAG): {'benchmarks.pdf'}

Question: Match the following benchmarks with their evaluation focus:
1. MTEB
2. SWAG
3. Lambada
4. MMLU
a) Narrative comprehension
b) Multilingual sentence embeddings
c) General world knowledge
d) Grounded commonsense inference
Relevant document

When the RAG pipeline is allowed to fetch only the most relevant chunk, it generates a precision and recall of 61.67% which for RAG standards are not that great. Ideally we would want it to be in the range of 80-90% to inspire more confidence. However closer inspection reveals a few interesting details.

Take for instance this output sample:

Question: Write a Python function to calculate cosine similarity between two word vectors.
Relevant documents (truth): {'wordvectors.pdf'}
Relevant documents (RAG): {'word2vec.ipynb'}

We see that the RAG pipeline found the word2vec.ipynb to be the most relevant file or at least to contain the most relevant chunk to this question which is about word vectors. Cosine similarities are indeed mentioned in both wordvectors.pdf and word2vec.ipynb but the Python function would be more likely to be found in the notebook and not the lecture PDF. Therefore the pipeline's retrieval makes sense based on the query it is provided. 

Another example of a "false negative" like this:

Question: If the word 'Transformerify' is tokenized using BPE with the current vocabulary containing 'Transformer' and 'ify,' how will the word be split?
Relevant documents (truth): {'tokenization.pdf'}
Relevant documents (RAG): {'bpe.ipynb'}

These are issues with the ground truths that we use to evaluate performance. A human labeled, globally acceptable dataset would be a better choice here.

In [38]:
k = 3  # Top k documents
precision_at_k, recall_at_k = calculate_precision_recall_at_k(ground_truths_retrieval_k1, rag_retrieval_k3, 3)

print(f"Precision@{k}: {precision_at_k:.4f}")
print(f"Recall@{k}: {recall_at_k:.4f}")

Question: Which benchmark evaluates a model's general world knowledge across diverse subjects and is crucial for applications in education and professional assistance?
Relevant documents (truth): {'benchmarks.pdf'}
Relevant documents (RAG): {'benchmarks.pdf'}

Question: True/False:
The StoryCloze dataset tests a model's ability to reason about narrative coherence by selecting the most plausible story ending.
Relevant documents (truth): {'benchmarks.pdf'}
Relevant documents (RAG): {'benchmarks.pdf'}

Question: Fill-in-the-Blank:
The ________ benchmark focuses on evaluating multilingual sentence embeddings for cross-lingual search and document retrieval.
Relevant documents (truth): {'benchmarks.pdf'}
Relevant documents (RAG): {'benchmarks.pdf'}

Question: Match the following benchmarks with their evaluation focus:
1. MTEB
2. SWAG
3. Lambada
4. MMLU
a) Narrative comprehension
b) Multilingual sentence embeddings
c) General world knowledge
d) Grounded commonsense inference
Relevant document

At k=3, the precision fell to 27.78% but the recall improved to 83.33%. The reasoning is similar - similar "false negatives" show up in our test runs.

#### Generation accuracy

To assess the accuracy in generation of our RAG pipeline, we assess its effectiveness in generating both answers and questions. Given that we want our RAG pipeline to be heavily grounded in the NLP, this course in particular, perhaps the only fair human-labeled dataset would be one where we ask students of this course to generate questions and the answers to those questions based on the contents of this course. However, no such dataset exists, to the best of our knowledge, so we resort to LLM judges. We use gpt-4o-mini to both and generate and judge the outputs, as we explain later.

We ask LLMs to judge:
<ol>
    <li>Answers on the metrics of Accuracy, Clarity, Conciseness and Completeness on a scale of 1 to 10</li>
    <li>Questions on the metrics of Relevance (to specified topic), Clarity, Coverage and Difficulty (of answering) on a scale of 1 to 10</li>
</ol>

In [39]:
def processing_category(x, category):
    x["category"] = category
    return x

In [40]:
col_names = ground_truth_accuracy.columns
accuracy_questions = []
for col_name in col_names:
    questions = ground_truth_accuracy[col_name].apply(lambda x: processing_category(x, col_name)).to_list()
    accuracy_questions += (questions)

In [41]:
# Ask gpt-4o-mini and RAG to answer questions
def generate_llm_grad_answers(client, model, question):
    response = client.chat.completions.create(  
        model=model,
        messages=[
            {"role": "system", "content": "You are a grad level NLP student."},
            {"role": "user", "content": question}
        ],
        # temperature=0.7 # Let's not let it incorporate too much randomness
        temperature=0
    )
    
    answer = response.choices[0].message.content

    return answer

In [42]:
llm_rag_qa_accuracy = []
for question in accuracy_questions:
    query = question["question"]
    llm_answer = generate_llm_grad_answers(client, model, query)
    _, _, rag_answer = generate_response(base_prompt, query, client, index, model, k=1)
    sample = {
        "category": question["category"],
        "type": question["type"],
        "question": query,
        "llm_answer": llm_answer,
        "rag_answer": rag_answer
    }
    llm_rag_qa_accuracy.append(sample)

In [43]:
llm_rag_qa_accuracy

[{'category': 'Benchmarks',
  'type': 'math',
  'question': 'Given a precision of 0.8 and recall of 0.6, compute the F1-score.',
  'llm_answer': 'The F1-score is the harmonic mean of precision and recall. It can be calculated using the formula:\n\n\\[\nF1 = 2 \\times \\frac{(Precision \\times Recall)}{(Precision + Recall)}\n\\]\n\nGiven:\n- Precision = 0.8\n- Recall = 0.6\n\nSubstituting the values into the formula:\n\n\\[\nF1 = 2 \\times \\frac{(0.8 \\times 0.6)}{(0.8 + 0.6)}\n\\]\n\nCalculating the numerator:\n\n\\[\n0.8 \\times 0.6 = 0.48\n\\]\n\nCalculating the denominator:\n\n\\[\n0.8 + 0.6 = 1.4\n\\]\n\nNow substituting back into the F1 formula:\n\n\\[\nF1 = 2 \\times \\frac{0.48}{1.4}\n\\]\n\nCalculating the fraction:\n\n\\[\n\\frac{0.48}{1.4} \\approx 0.342857\n\\]\n\nNow multiplying by 2:\n\n\\[\nF1 \\approx 2 \\times 0.342857 \\approx 0.685714\n\\]\n\nThus, the F1-score is approximately **0.686** (rounded to three decimal places).',
  'rag_answer': '### Step 1: Summarize user

In [50]:
llm_rag_qa_accuracy_save_file = "../data/output/evaluation/llm_rag_qa_accuracy.json"

os.makedirs(os.path.dirname(llm_rag_qa_accuracy_save_file), exist_ok=True)

with open(llm_rag_qa_accuracy_save_file, 'w') as json_file:
    json.dump(llm_rag_qa_accuracy, json_file, indent=4)

We first ask our RAG pipeline and gpt-4o-mini with no extra knowledge to answer a few questions (that are also generated by ChatGPT but with no reference document provided; ChatGPT was asked to blindly generate questions on specific topics) on a few particular topics. In both cases, we set the temperature to 0, to completely restrict creative responses, hoping that they would be more "to the point". We save this output for future use (visualization in experiments/visualizations and output).

The next two cells outline outline our functions and the prompts within to evaluate the questions and the answers.

When evaluating a question, we give the LLM judge examples of mathematical, coding and descriptive questions that we found in our exercises and midterms and prep material and, based on those, ask it to score the question from 0 to 10 on:
<ol>
    <li>Direct relevance to the NLP topic specified</li>
    <li>Clarity of the writing style and the demands of the question</li>
    <li>Difficulty level for an average graduate student</li>
    <li>Coverage of the NLP topic</li>
</ol>
We asked the LLM to take on the role of a grad-level NLP professor who wants to set a midterm with questions that are very relevant but not too difficult to answer.

When evaluating an answer, we give the LLM judge an example of a mathematical question and the expected answer from one of our midterm prep materials and, based on that, score the generated answer to the question from 0 to 10 on:
<ol>
    <li>Accuracy of the answer to the question</li>
    <li>Completeness of the answer as to whether it </li>
    <li>Clarity of the answer in terms of its writing style and the message convered</li>
    <li>Conciseness</li>
</ol>
Similarly, we asked the LLM to take on the role of a grad-level NLP professor who is grading midterms and likes accurate, concise answers and penalizes answers that are too long.

In [51]:
# Ask gpt-4o-mini to evaluate questions
def generate_llm_question_evaluations(client, model, question, topic):
    query = f"""
    Sample mid-term questions for a graduate level NLP course are:

1. Mathematical: "Consider a Jelinek-Mercer style interpolation smoothing scheme for P(wi |ti):
Pjm(ti |ti−1) = Λ[ti−1]·P(ti |ti−1) + (1−Λ[ti−1])·P(ti)
Λ is an array with a value Λ[ti] for each part of speech tag ti. Provide a condition on Λ that must be
satisfied to ensure that Pjm is a well-defined probability model."

2. Coding: "Implement prefix-tuning for BERT using PEFT"

3. Descriptive: "For the CFG G given below:
S → A |c
A → B a
B → b S
Assign probabilities to each rule in the CFG above so that for each string w ∈L(G):
P(w)= exp
|w|−1
×ln(0.3) + ln(0.7)
2
2
where, |w|is the length of string w, exp is exponentiation, and ln is log base e. Using an example,
briefly explain why your PCFG provides the desired P(w) for any w."

Assign a score between 0 to 10 for the following metrics:
Direct relevance to NLP topics (Relevance)
Clarity of question (Clarity)
Difficulty level considering a graduate student (Difficulty)
Coverage of NLP topic (Coverage)

You are assessing this question:
'{question}'

It is based on the topic:
"{topic}"

Give me only the scores in a JSON format
    """
    
    response = client.chat.completions.create(  
        model=model,
        messages=[
            {"role": "system", "content": "You are a grad level NLP professor about to set your mid-term paper for your grad students. You want questions that are very relevant to NLP and dislike topics that are vague or not directly relevant to NLP. You want your questions to be clearly written and not too difficult to answer."},
            {"role": "user", "content": query}
        ],
        # temperature=0.7 # Let's not let it incorporate too much randomness
        temperature=0.4
    )
    
    answer = response.choices[0].message.content
    json_str = answer.strip("```json\n").strip("```")
    data = json.loads(json_str)

    return data

In [52]:
# Ask gpt-4o-mini to evaluate answers
def generate_llm_answer_evaluations(client, model, question, answer):
    query = f"""
    A sample question and its answer for a graduate level NLP course are:

Question: "Backoﬀ smoothing for P(Wi |Wi−1) is defined as follows:
Pbo(Wi | Wi−1)=
    
c∗(Wi−1,Wi)
c(Wi−1) if c(Wi−1,Wi) >0
α(Wi−1)Pbo(Wi) otherwise
where c∗(Wi−1,Wi)= c(Wi−1,Wi)−D for some 0 <D <1 and α(wi−1) is chosen to make sure that
Pbo(Wi | Wi−1) is a proper probability. Provide the equation to compute α(Wi−1). Assume that
Wi Pbo(Wi)= 1."

Answer: "α(Wi−1)= 1−
Wi
c∗(Wi−1,Wi)
c(Wi−1)"

Assign a score between 0 to 10 for the following metrics:
Accuracy of the answer to the question (Accuracy)
Completeness of the answer to the question (Completeness)
Clarity of the answer to the question (Clarity)
Conciseness of the answer to the question (Conciseness)

You are assessing this question answer pair:

Question: {question}

Answer: {answer}

Give me only the scores in a JSON format
    """
    
    response = client.chat.completions.create(  
        model=model,
        messages=[
            {"role": "system", "content": "You are a grad level NLP professor about to grade students' scripts very strictly. You want highly accurate and clearly written answers. You value conciseness and heavily penalize answers that are too long and not to the point."},
            {"role": "user", "content": query}
        ],
        # temperature=0.7 # Let's not let it incorporate too much randomness
        temperature=0.4
    )
    
    answer = response.choices[0].message.content

    json_str = answer.strip("```json\n").strip("```")
    data = json.loads(json_str)

    return data

In [53]:
sample = llm_rag_qa_accuracy[60]
sample_question = sample["question"]
llm_answer = sample["llm_answer"]
rag_answer = sample["rag_answer"]

print(sample_question)
print()
print(f"LLM Answer:")
print(llm_answer)
print()
print(f"RAG Answer:")
print(rag_answer)
print()
print(f"LLM scores: {generate_llm_answer_evaluations(client, model, sample_question, llm_answer)}")
print()
print(f"RAG scores: {generate_llm_answer_evaluations(client, model, sample_question, rag_answer)}")
print()

Derive the relationship between model size, dataset size, and performance as per Kaplan et al.

LLM Answer:
In the paper "Scaling Laws for Neural Language Models" by Kaplan et al. (2020), the authors explore the relationship between model size, dataset size, and performance in the context of training neural language models. They present empirical findings that suggest a power-law relationship among these three factors.

### Key Concepts

1. **Model Size (N)**: This refers to the number of parameters in the neural network. Larger models typically have more capacity to learn complex patterns in the data.

2. **Dataset Size (D)**: This is the amount of training data available, often measured in tokens or examples. More data generally allows models to learn better representations and generalize well.

3. **Performance (P)**: This is usually measured in terms of some evaluation metric, such as perplexity on a validation set or accuracy on a downstream task.

### Scaling Laws

Kaplan et al. 

In [54]:
sample = llm_rag_qa_accuracy[71]
sample_question = sample["question"]
sample_topic = sample["category"]

print(f"Question:")
print(sample_question)
print()
print(f"Question scores: {generate_llm_question_evaluations(client, model, sample_question, sample_topic)}")

Question:
Explain how tokenization impacts the performance and generalization of NLP models.

Question scores: {'Relevance': 9, 'Clarity': 8, 'Difficulty': 5, 'Coverage': 7}


In [55]:
llm_rag_qa_accuracy_stats = []
for sample in llm_rag_qa_accuracy:
    question = sample["question"]
    llm_answer = sample["llm_answer"]
    rag_answer = sample["rag_answer"]
    topic = sample["category"]
    llm_answer_scores = generate_llm_answer_evaluations(client, model, question, llm_answer)
    rag_answer_scores = generate_llm_answer_evaluations(client, model, question, rag_answer)
    question_scores = generate_llm_question_evaluations(client, model, question, topic)

    stats = {
        "category": topic,
        "question": question,
        "llm_answer": llm_answer,
        "llm_answer_scores": llm_answer_scores, 
        "rag_answer": rag_answer,
        "rag_answer_scores": rag_answer_scores
    }

    llm_rag_qa_accuracy_stats.append(stats)

In [56]:
llm_rag_qa_accuracy_stats

[{'category': 'Benchmarks',
  'question': 'Given a precision of 0.8 and recall of 0.6, compute the F1-score.',
  'llm_answer': 'The F1-score is the harmonic mean of precision and recall. It can be calculated using the formula:\n\n\\[\nF1 = 2 \\times \\frac{(Precision \\times Recall)}{(Precision + Recall)}\n\\]\n\nGiven:\n- Precision = 0.8\n- Recall = 0.6\n\nSubstituting the values into the formula:\n\n\\[\nF1 = 2 \\times \\frac{(0.8 \\times 0.6)}{(0.8 + 0.6)}\n\\]\n\nCalculating the numerator:\n\n\\[\n0.8 \\times 0.6 = 0.48\n\\]\n\nCalculating the denominator:\n\n\\[\n0.8 + 0.6 = 1.4\n\\]\n\nNow substituting back into the F1 formula:\n\n\\[\nF1 = 2 \\times \\frac{0.48}{1.4}\n\\]\n\nCalculating the fraction:\n\n\\[\n\\frac{0.48}{1.4} \\approx 0.342857\n\\]\n\nNow multiplying by 2:\n\n\\[\nF1 \\approx 2 \\times 0.342857 \\approx 0.685714\n\\]\n\nThus, the F1-score is approximately **0.686** (rounded to three decimal places).',
  'llm_answer_scores': {'Accuracy': 10,
   'Completeness': 10

In [57]:
llm_rag_qa_accuracy_stats_save_file = "../data/output/evaluation/llm_rag_qa_accuracy_stats.json"

os.makedirs(os.path.dirname(llm_rag_qa_accuracy_stats_save_file), exist_ok=True)

with open(llm_rag_qa_accuracy_stats_save_file, 'w') as json_file:
    json.dump(llm_rag_qa_accuracy_stats, json_file, indent=4)

We run this on 80 sample questions and then save this output for future use (visualization in experiments/visualizations and output).

In [58]:
categories = list(set([question["category"] for question in accuracy_questions]))
print(categories)

['Probability Recap for NLP', 'Few Shot Learners', 'Fast Attention', 'Cross-attention', 'Language Models', 'PEFT', 'Feedforward Neural Networks', 'VAEs', 'Decoding', 'Benchmarks', 'Intro to NLP and History', 'Pre-training Transformers', 'Compression', 'Linear Sequence Models', 'Self-Attention', 'Word Vectors', 'Tokenization', 'KNN LM', 'Instruct Tuning', 'Scaling Laws for LLMs']


When generating questions we change the base prompt to mention that we want to generate 10 graduate-level NLP questions around a particular topic and ask the LLM to take on the role of an NLP teaching assistant (in our formulation, the TA generates the questions and the professor judges their appropriateness and then also grades the answers).

The next few cells outline how we establish that. We find that our RAG pipeline is able to generate 200 questions in just 197.6 seconds (~3.25 minutes).

It is important to note that when generating questions we ask the RAG pipeline to retrieve the top 5 most relevant documents based on the topic specified.

In [60]:
def augment_base_prompt_for_gen_q(base_prompt: str, topic: str, client, index, k):
    """Augment the prompt with relevant information from the knowledge base"""
    source_knowledge = retrieve_pinecone(topic, client, index, k)
    
    augmented_prompt = f"""{base_prompt}\n\nRelevant information:\n{source_knowledge}\n\nTopic:\n{topic}\n\nQuestions:"""
    
    # print("\nAugmented Prompt:\n", augmented_prompt)
    return source_knowledge, augmented_prompt

In [61]:
def generate_response_for_gen_q(base_prompt, topic, client, index, model="gpt-4o-mini", k=3):
    try:
        # Initialize OpenAI client (if not already initialized)
        client = OpenAI(api_key=OPENAI_API_KEY)

        base_prompt = base_prompt
        topic = topic        
        
        # prompt = augment_base_prompt(base_prompt, query, client, index)
        
        source_knowledge, prompt = augment_base_prompt_for_gen_q(base_prompt, topic, client, index, k)

       # we need the language model to actually generate an answer using the retrieved relevant context
        response = client.chat.completions.create(  
            model=model,
            messages=[
                {"role": "system", "content": "You are an expert NLP teaching assistant."},
                {"role": "user", "content": prompt}
            ],
            # temperature=0.7 # Let's not let it incorporate too much randomness
            temperature=0.4
        )

        answer = response.choices[0].message.content
        # json_str = answer.strip("```json\n").strip("```")
        # answer = json.loads(json_str)
        
        # print(answer)
        return source_knowledge, prompt, answer
    except Exception as e:
        print(f"Debug - Error in main: {str(e)}")
        raise e

In [62]:
# Ask RAG system to generate questions
base_prompt_gen_q = f"""
You are an astute thinker and academic well-versed in NLP.

You are going to generate 10 grad-level NLP questions for the given topic. The questions can cover math, coding, short theoretical answers and descriptive answers.

Given the following topic and relevant information from the course materials, please provide the required questions. Your response should:
1. Directly address the topic
2. Be based on the provided relevant text from the course materials
3. Use academic language appropriate for graduate-level NLP courses
4. Include relevant technical details if applicable
5. Mention which lecture materials you're referencing in your answer

Your response should be a valid JSON where each element in the list is a dictionary of the type
'topic': # the topic the question is about
'type': # type of question generated,
'question': # the question itself,


If the provided information is insufficient to generate questions from, state this clearly and explain what additional information would be needed.
"""

In [63]:
import time

k = 5

start_time = time.time()
for category in categories:
    topic = f"The questions should be about {category}"
    source_knowledge, prompt, answer = generate_response_for_gen_q(base_prompt_gen_q, topic, client, index, model, k)
    print(answer)

end_time = time.time()
execution_time = end_time - start_time
print(f"Time taken to generate 200 questions (10 per topic): {execution_time:.6f}s")

```json
[
    {
        "topic": "Probability Recap for NLP",
        "type": "theoretical",
        "question": "Discuss the significance of probability theory in Natural Language Processing as outlined in Part 1 of the lecture materials. How does it facilitate the modeling of language?"
    },
    {
        "topic": "Probability Recap for NLP",
        "type": "mathematical",
        "question": "Given a discrete random variable X representing the occurrence of words in a corpus, derive the formula for the probability mass function (PMF) and explain its relevance in NLP applications."
    },
    {
        "topic": "Probability Recap for NLP",
        "type": "coding",
        "question": "Implement a Python function that calculates the conditional probability P(A|B) using a frequency count from a given text corpus. Explain the importance of this calculation in language modeling."
    },
    {
        "topic": "Probability Recap for NLP",
        "type": "descriptive",
        "questi

In [65]:
questions_list = [
    {
        "topic": "Probability Recap for NLP",
        "type": "theoretical",
        "question": "Discuss the significance of probability theory in Natural Language Processing as outlined in Part 1 of the lecture materials. How does it facilitate the modeling of language?"
    },
    {
        "topic": "Probability Recap for NLP",
        "type": "mathematical",
        "question": "Given a discrete random variable X representing the occurrence of words in a corpus, derive the formula for the probability mass function (PMF) and explain its relevance in NLP applications."
    },
    {
        "topic": "Probability Recap for NLP",
        "type": "coding",
        "question": "Implement a Python function that calculates the conditional probability P(A|B) using a frequency count from a given text corpus. Explain the importance of this calculation in language modeling."
    },
    {
        "topic": "Probability Recap for NLP",
        "type": "descriptive",
        "question": "Explain the concept of joint probability and its application in NLP, particularly in the context of n-gram models as discussed in the lecture materials."
    },
    {
        "topic": "Probability Recap for NLP",
        "type": "theoretical",
        "question": "What is Bayes' theorem, and how is it applied in Natural Language Processing tasks such as spam detection and sentiment analysis? Reference the relevant sections from the course materials."
    },
    {
        "topic": "Probability Recap for NLP",
        "type": "mathematical",
        "question": "Calculate the likelihood of a sequence of words in a bigram model using the provided corpus data. Discuss the implications of your findings for language generation tasks."
    },
    {
        "topic": "Probability Recap for NLP",
        "type": "coding",
        "question": "Write a Python script that implements a simple Naive Bayes classifier for text classification. Discuss how the underlying probability concepts from the lecture materials are utilized in your implementation."
    },
    {
        "topic": "Probability Recap for NLP",
        "type": "theoretical",
        "question": "Describe the role of probability distributions in NLP, particularly focusing on the Gaussian distribution and its applications in word embeddings and topic modeling."
    },
    {
        "topic": "Probability Recap for NLP",
        "type": "descriptive",
        "question": "Discuss the concept of maximum likelihood estimation (MLE) in the context of training probabilistic models for NLP. How does MLE relate to the principles outlined in the lecture materials?"
    },
    {
        "topic": "Probability Recap for NLP",
        "type": "theoretical",
        "question": "Analyze the relationship between entropy and information gain in the context of Natural Language Processing. How do these concepts influence model selection and evaluation?"
    },
    {
        "topic": "Few Shot Learners",
        "type": "theoretical",
        "question": "Discuss the primary mechanisms through which few-shot learning algorithms generalize from limited training data, as outlined in the lecture on efficient few-shot learning."
    },
    {
        "topic": "Few Shot Learners",
        "type": "coding",
        "question": "Implement a simple few-shot learning model using a framework of your choice (e.g., PyTorch or TensorFlow). Describe the architecture and training process based on the principles discussed in the lectures."
    },
    {
        "topic": "Few Shot Learners",
        "type": "mathematical",
        "question": "Define the loss function commonly used in few-shot learning scenarios. How does it differ from traditional loss functions in supervised learning, as discussed in the course materials?"
    },
    {
        "topic": "Few Shot Learners",
        "type": "descriptive",
        "question": "Elaborate on the role of transfer learning in few-shot learning, referencing the insights provided in the lecture by Anoop Sarkar on LLMs as few-shot learners."
    },
    {
        "topic": "Few Shot Learners",
        "type": "theoretical",
        "question": "What are the limitations of few-shot learning models, particularly in the context of language models, as highlighted in the efficient few-shot learning lecture?"
    },
    {
        "topic": "Few Shot Learners",
        "type": "coding",
        "question": "Write a function that evaluates the performance of a few-shot learning model on a benchmark dataset such as BoolQ. What metrics would you include, and why?"
    },
    {
        "topic": "Few Shot Learners",
        "type": "mathematical",
        "question": "Explain the concept of 'support set' and 'query set' in few-shot learning. How do these concepts influence the training and evaluation phases, as discussed in the course materials?"
    },
    {
        "topic": "Few Shot Learners",
        "type": "descriptive",
        "question": "Analyze the impact of dataset diversity on the performance of few-shot learning models. How does this relate to the findings presented in the efficient few-shot learning lecture?"
    },
    {
        "topic": "Few Shot Learners",
        "type": "theoretical",
        "question": "Critically assess the performance of LLMs as few-shot learners compared to traditional few-shot learning methods. What advantages and challenges do LLMs present, based on the lectures?"
    },
    {
        "topic": "Few Shot Learners",
        "type": "coding",
        "question": "Design a few-shot learning experiment using a synthetic dataset. Describe the steps taken to generate the dataset and the expected outcomes based on the principles discussed in the lectures."
    },
    {
        "topic": "Fast Attention",
        "type": "theoretical",
        "question": "Explain the fundamental differences between Standard Attention and Position Infused Attention as discussed in the Fast Attention lecture materials."
    },
    {
        "topic": "Fast Attention",
        "type": "mathematical",
        "question": "Derive the computational complexity of Standard Attention and discuss how it can be optimized in the context of Fast Attention mechanisms."
    },
    {
        "topic": "Fast Attention",
        "type": "coding",
        "question": "Implement a basic version of Position Infused Attention in Python. What are the key components that differentiate it from Standard Attention?"
    },
    {
        "topic": "Fast Attention",
        "type": "descriptive",
        "question": "Discuss the implications of using Position Infused Attention in natural language processing tasks. How does it enhance model performance compared to traditional attention mechanisms?"
    },
    {
        "topic": "Fast Attention",
        "type": "theoretical",
        "question": "What are the limitations of Standard Attention mechanisms that Fast Attention aims to address, particularly in terms of scalability and efficiency?"
    },
    {
        "topic": "Fast Attention",
        "type": "mathematical",
        "question": "Given a sequence length of \( n \) and a model dimension of \( d \), calculate the memory requirements for both Standard Attention and Fast Attention. How do these requirements impact large-scale NLP applications?"
    },
    {
        "topic": "Fast Attention",
        "type": "coding",
        "question": "Write a function in TensorFlow or PyTorch that implements Fast Attention. What optimizations can you incorporate to improve its performance on large datasets?"
    },
    {
        "topic": "Fast Attention",
        "type": "descriptive",
        "question": "Critically evaluate the role of attention mechanisms in transformer architectures, particularly focusing on how Fast Attention modifies the traditional approach."
    },
    {
        "topic": "Fast Attention",
        "type": "theoretical",
        "question": "How does the concept of cross-attention differ from standard attention, and what advantages does it offer in multi-modal learning scenarios as per the cross-attention lecture materials?"
    },
    {
        "topic": "Fast Attention",
        "type": "mathematical",
        "question": "If the attention scores are computed using a dot-product mechanism, derive the gradients with respect to the input embeddings for both Standard and Fast Attention. How do these gradients influence the learning process?"
    },
    {
        "topic": "Cross-attention",
        "type": "theoretical",
        "question": "Define cross-attention in the context of transformer architectures and explain how it differs from self-attention."
    },
    {
        "topic": "Cross-attention",
        "type": "coding",
        "question": "Write a Python function that implements the cross-attention mechanism using PyTorch. Include comments to explain each step of the process."
    },
    {
        "topic": "Cross-attention",
        "type": "math",
        "question": "Given the queries \( Q \), keys \( K \), and values \( V \) matrices, derive the formula for the output of the cross-attention mechanism. Explain each component of the formula."
    },
    {
        "topic": "Cross-attention",
        "type": "descriptive",
        "question": "Discuss the role of cross-attention in multi-modal learning scenarios, providing examples of how it can be applied in tasks such as image captioning or video analysis."
    },
    {
        "topic": "Cross-attention",
        "type": "theoretical",
        "question": "What are the implications of using cross-attention for sequence-to-sequence tasks? Discuss its impact on model performance and efficiency."
    },
    {
        "topic": "Cross-attention",
        "type": "short theoretical",
        "question": "What are the key components that constitute the input \( x \) in the context of cross-attention, and how do they influence the attention mechanism?"
    },
    {
        "topic": "Cross-attention",
        "type": "coding",
        "question": "Implement a simple example of cross-attention in TensorFlow. Include a brief explanation of how the attention weights are computed."
    },
    {
        "topic": "Cross-attention",
        "type": "math",
        "question": "If the dimensions of the queries, keys, and values are \( d_k \), \( d_k \), and \( d_v \) respectively, what is the computational complexity of the cross-attention mechanism? Explain your reasoning."
    },
    {
        "topic": "Cross-attention",
        "type": "descriptive",
        "question": "Analyze the advantages and potential limitations of cross-attention compared to traditional attention mechanisms. Provide examples from recent literature."
    },
    {
        "topic": "Cross-attention",
        "type": "theoretical",
        "question": "In what scenarios would you prefer to use cross-attention over self-attention? Justify your answer with relevant examples."
    },
    {
        "topic": "Language Models",
        "type": "theoretical",
        "question": "Discuss the significance of evaluating language models in the context of natural language processing. What metrics are commonly used for this evaluation, and how do they reflect the model's performance?"
    },
    {
        "topic": "Language Models",
        "type": "coding",
        "question": "Implement a simple n-gram language model in Python. Your implementation should include functionality to calculate the probability of a given sequence of words and to generate text based on the learned probabilities."
    },
    {
        "topic": "Language Models",
        "type": "descriptive",
        "question": "Explain the concept of perplexity in language models. How does it relate to the model's ability to predict a sequence of words, and why is it considered an important metric?"
    },
    {
        "topic": "Language Models",
        "type": "math",
        "question": "Given a dataset of sentences, calculate the bigram probabilities for the following sentence: 'The cat sat on the mat'. Show your calculations and explain the steps involved in deriving these probabilities."
    },
    {
        "topic": "Language Models",
        "type": "theoretical",
        "question": "What are the challenges associated with understanding language models in terms of nuance and author intent? How can these challenges affect the evaluation of machine translation outputs?"
    },
    {
        "topic": "Language Models",
        "type": "theoretical",
        "question": "Critically analyze the sentence editing task as described in the WMT 2009–2010. What does this task reveal about the understanding of language models in relation to machine translation?"
    },
    {
        "topic": "Language Models",
        "type": "descriptive",
        "question": "Describe the role of context in language models. How do modern architectures, such as transformers, leverage context to improve language understanding and generation?"
    },
    {
        "topic": "Language Models",
        "type": "coding",
        "question": "Create a function that evaluates a language model's output against a set of reference translations. Discuss how you would implement metrics such as BLEU or ROUGE in your evaluation."
    },
    {
        "topic": "Language Models",
        "type": "theoretical",
        "question": "Discuss the implications of using deep learning techniques for language modeling. How do these techniques compare to traditional statistical approaches in terms of performance and interpretability?"
    },
    {
        "topic": "Language Models",
        "type": "theoretical",
        "question": "Examine the importance of training data quality and diversity in the development of robust language models. How can biases in training data affect the outputs of these models?"
    },
    {
      "topic": "PEFT",
      "type": "theoretical",
      "question": "Discuss the significance of Content Understanding Tests in evaluating machine translation outputs. How do these tests assess the ability of monolingual speakers to comprehend translated content?"
    },
    {
      "topic": "PEFT",
      "type": "coding",
      "question": "Write a Python function that simulates a sentence editing task similar to the WMT 2009–2010. The function should take a machine-translated sentence as input and return a modified version that aims to improve fluency without access to the source text."
    },
    {
      "topic": "PEFT",
      "type": "math",
      "question": "Given a set of machine translation outputs, calculate the percentage of outputs that allow a monolingual speaker to answer basic factual questions correctly. Assume you have a dataset of 100 translations, with 75 allowing correct answers."
    },
    {
      "topic": "PEFT",
      "type": "descriptive",
      "question": "Elaborate on the challenges of devising questions for Content Understanding Tests. What factors contribute to the complexity of evaluating understanding in machine translation?"
    },
    {
      "topic": "PEFT",
      "type": "theoretical",
      "question": "Analyze the role of nuance and author intent in machine translation evaluation. How can these elements be quantitatively assessed in a Content Understanding Test?"
    },
    {
      "topic": "PEFT",
      "type": "coding",
      "question": "Implement a simple algorithm that evaluates the correctness of edits made by person B in the sentence editing task. What metrics would you use to determine the success of the edits?"
    },
    {
      "topic": "PEFT",
      "type": "theoretical",
      "question": "What are the implications of the findings from Content Understanding Tests for the development of more advanced machine translation systems?"
    },
    {
      "topic": "PEFT",
      "type": "descriptive",
      "question": "Discuss the importance of temporal and causal relationships in machine translation outputs. How do these relationships affect a monolingual speaker's understanding of the translated text?"
    },
    {
      "topic": "PEFT",
      "type": "math",
      "question": "If a Content Understanding Test consists of 10 questions, and a monolingual speaker answers 7 correctly, calculate the accuracy rate. How does this metric inform the evaluation of machine translation systems?"
    },
    {
      "topic": "PEFT",
      "type": "theoretical",
      "question": "Critically evaluate the sentence editing task as a method for assessing translation fluency. What are its strengths and weaknesses in the context of Content Understanding Tests?"
    },
    {
        "topic": "Feedforward Neural Networks",
        "type": "theoretical",
        "question": "Explain the architecture of a typical feedforward neural network and discuss the role of activation functions in this context."
    },
    {
        "topic": "Feedforward Neural Networks",
        "type": "mathematical",
        "question": "Given a feedforward neural network with one hidden layer containing 5 neurons, derive the output of the network for a single input vector \( x \) using the following weights and biases: weights \( W_1 \) for the input to hidden layer and \( W_2 \) for hidden to output layer, and biases \( b_1 \) and \( b_2 \)."
    },
    {
        "topic": "Feedforward Neural Networks",
        "type": "coding",
        "question": "Implement a simple feedforward neural network from scratch in Python using NumPy. The network should have one hidden layer and utilize the ReLU activation function."
    },
    {
        "topic": "Feedforward Neural Networks",
        "type": "descriptive",
        "question": "Discuss the differences between feedforward neural networks and recurrent neural networks, particularly in terms of architecture and use cases."
    },
    {
        "topic": "Feedforward Neural Networks",
        "type": "theoretical",
        "question": "What are the limitations of feedforward neural networks in processing sequential data, and how do these limitations impact their application in NLP tasks?"
    },
    {
        "topic": "Feedforward Neural Networks",
        "type": "mathematical",
        "question": "If a feedforward neural network uses a softmax activation function in the output layer, derive the loss function used for training the network and explain its significance in classification tasks."
    },
    {
        "topic": "Feedforward Neural Networks",
        "type": "coding",
        "question": "Using a deep learning library (e.g., TensorFlow or PyTorch), create a feedforward neural network model to classify text data. Include the necessary preprocessing steps for the input data."
    },
    {
        "topic": "Feedforward Neural Networks",
        "type": "theoretical",
        "question": "Describe the backpropagation algorithm used in training feedforward neural networks. How does it adjust weights based on the error gradient?"
    },
    {
        "topic": "Feedforward Neural Networks",
        "type": "descriptive",
        "question": "Analyze the impact of overfitting in feedforward neural networks and discuss strategies to mitigate this issue during training."
    },
    {
        "topic": "Feedforward Neural Networks",
        "type": "theoretical",
        "question": "What is the role of regularization techniques in feedforward neural networks, and how do they contribute to improving model generalization?"
    },
    {
        "topic": "Decoding",
        "type": "theoretical",
        "question": "Explain the role of the initial hypothesis in the decoding process of statistical machine translation as discussed in Chapter 6. How does this initial hypothesis influence subsequent decoding steps?"
    },
    {
        "topic": "Decoding",
        "type": "mathematical",
        "question": "Given a translation model with a vocabulary size of V and a sentence of length N, derive the computational complexity of the decoding process using the Viterbi algorithm. What factors contribute to the time complexity?"
    },
    {
        "topic": "Decoding",
        "type": "coding",
        "question": "Implement a simple beam search decoding algorithm for a statistical machine translation model in Python. Ensure that your implementation can handle a small vocabulary and demonstrate how it manages multiple hypotheses."
    },
    {
        "topic": "Decoding",
        "type": "descriptive",
        "question": "Discuss the significance of the decoding step in statistical machine translation. How does it differ from the training phase, and what are the key challenges faced during decoding as outlined in Chapter 6?"
    },
    {
        "topic": "Decoding",
        "type": "theoretical",
        "question": "What are the implications of using a greedy decoding strategy versus a beam search strategy in statistical machine translation? Refer to the advantages and disadvantages highlighted in Chapter 6."
    },
    {
        "topic": "Decoding",
        "type": "short theoretical",
        "question": "Define 'decoding' in the context of statistical machine translation and explain its importance in generating fluent and coherent target language output."
    },
    {
        "topic": "Decoding",
        "type": "theoretical",
        "question": "How does the concept of 'coverage' relate to the decoding process in statistical machine translation? Discuss its implications on the quality of the output as mentioned in Chapter 6."
    },
    {
        "topic": "Decoding",
        "type": "evaluation",
        "question": "Based on the content understanding tests described in Chapter 8, how can the effectiveness of a decoding strategy be evaluated in terms of the target language speaker's comprehension of the translated output?"
    },
    {
        "topic": "Decoding",
        "type": "coding",
        "question": "Write a function that simulates the decoding process by generating multiple candidate translations for a given input sentence. Include a scoring mechanism based on a simple language model."
    },
    {
        "topic": "Decoding",
        "type": "theoretical",
        "question": "Discuss the challenges of ensuring fluency and accuracy in the output of a decoding process. How do these challenges relate to the editing tasks described in the evaluation section?"
    },
    {
        "topic": "Benchmarks",
        "type": "theoretical",
        "question": "Discuss the significance of using accuracy and F1 score as evaluation metrics in the context of imbalanced classes, as highlighted in the Quora Question Pairs (QQP) dataset."
    },
    {
        "topic": "Benchmarks",
        "type": "coding",
        "question": "Write a Python function that calculates the F1 score given the true positive, false positive, and false negative counts from a similarity task such as QQP."
    },
    {
        "topic": "Benchmarks",
        "type": "descriptive",
        "question": "Describe the nature of the data used in the QQP dataset and how it contributes to the challenges of training models for similarity and paraphrase tasks."
    },
    {
        "topic": "Benchmarks",
        "type": "mathematical",
        "question": "Given a dataset with 1000 question pairs, where 370 pairs are positive (similar) and 630 pairs are negative (dissimilar), calculate the precision, recall, and F1 score assuming your model predicts 300 true positives, 70 false positives, and 70 false negatives."
    },
    {
        "topic": "Benchmarks",
        "type": "theoretical",
        "question": "Analyze the implications of class imbalance in the QQP dataset on model evaluation and the potential strategies to mitigate its effects."
    },
    {
        "topic": "Benchmarks",
        "type": "coding",
        "question": "Implement a simple logistic regression model in Python using scikit-learn to classify question pairs from the QQP dataset and evaluate its performance using accuracy and F1 score."
    },
    {
        "topic": "Benchmarks",
        "type": "descriptive",
        "question": "Examine the types of paraphrase pairs presented in the QQP dataset and discuss the linguistic features that may influence their classification."
    },
    {
        "topic": "Benchmarks",
        "type": "theoretical",
        "question": "What are the potential biases that might arise from using community-generated data such as that from Quora for training NLP models, and how can these biases affect benchmark results?"
    },
    {
        "topic": "Benchmarks",
        "type": "mathematical",
        "question": "If a model achieves an accuracy of 80% on the QQP dataset, discuss how this figure might be misleading in the context of class imbalance and propose a more informative metric."
    },
    {
        "topic": "Benchmarks",
        "type": "theoretical",
        "question": "Critically evaluate the effectiveness of the QQP dataset as a benchmark for measuring progress in NLP tasks related to similarity and paraphrase detection."
    },
    {
        "topic": "Intro to NLP and History",
        "type": "theoretical",
        "question": "Discuss the evolution of Natural Language Processing (NLP) from rule-based systems to statistical methods, highlighting key milestones and their impact on the field. Reference the historical context provided in the course materials."
    },
    {
        "topic": "Intro to NLP and History",
        "type": "descriptive",
        "question": "Describe the significance of the Turing Test in the context of NLP's historical development. How does it relate to the goals of NLP as outlined in the lecture materials?"
    },
    {
        "topic": "Intro to NLP and History",
        "type": "coding",
        "question": "Implement a simple text classification model using a linear model approach as described in the lecture on linear models. Provide the code and explain each step of the process."
    },
    {
        "topic": "Intro to NLP and History",
        "type": "math",
        "question": "Define precision, recall, and F1-score in the context of NLP classification tasks. Calculate these metrics given a confusion matrix where True Positives = 30, False Positives = 10, True Negatives = 50, and False Negatives = 10."
    },
    {
        "topic": "Intro to NLP and History",
        "type": "theoretical",
        "question": "Examine the role of machine learning in the advancement of NLP, particularly in the transition from traditional linguistic approaches to data-driven methods. Cite specific examples from the course materials."
    },
    {
        "topic": "Intro to NLP and History",
        "type": "short theoretical",
        "question": "What are the primary challenges faced by early NLP systems, and how have these challenges shaped modern approaches to NLP as discussed in the historical overview?"
    },
    {
        "topic": "Intro to NLP and History",
        "type": "descriptive",
        "question": "Analyze the contributions of the Chomskyan revolution to the field of NLP. How did generative grammar influence computational approaches to language processing?"
    },
    {
        "topic": "Intro to NLP and History",
        "type": "coding",
        "question": "Write a Python function that tokenizes a given text input into words and sentences. Discuss the significance of tokenization in NLP tasks as presented in the lecture materials."
    },
    {
        "topic": "Intro to NLP and History",
        "type": "math",
        "question": "Using the concept of n-grams discussed in the course, calculate the bigrams for the sentence 'Natural Language Processing is fascinating'. Provide the resulting bigrams and explain their utility in NLP tasks."
    },
    {
        "topic": "Intro to NLP and History",
        "type": "theoretical",
        "question": "Critically evaluate the impact of deep learning on the field of NLP. What are the key innovations introduced by deep learning models, and how do they differ from earlier statistical methods?"
    },
    {
        "topic": "Pre-training Transformers",
        "type": "theoretical",
        "question": "Discuss the significance of representation learning in the context of pre-training transformers, as outlined in the lecture materials."
    },
    {
        "topic": "Pre-training Transformers",
        "type": "coding",
        "question": "Implement a simple transformer model in Python using PyTorch, focusing on the pre-training phase. Highlight the key components that are essential for this phase as discussed in the provided notebook."
    },
    {
        "topic": "Pre-training Transformers",
        "type": "mathematical",
        "question": "Derive the attention mechanism used in transformers, including the scaled dot-product attention formula. Explain its relevance during the pre-training phase based on the lecture content."
    },
    {
        "topic": "Pre-training Transformers",
        "type": "descriptive",
        "question": "Describe the process of fine-tuning a pre-trained transformer model on a specific NLP task. What are the implications of this process for transfer learning, as mentioned in the course materials?"
    },
    {
        "topic": "Pre-training Transformers",
        "type": "theoretical",
        "question": "What are the key differences between supervised and unsupervised pre-training of transformers? Cite examples from the lecture materials to support your answer."
    },
    {
        "topic": "Pre-training Transformers",
        "type": "coding",
        "question": "Write a function to visualize the attention weights from a pre-trained transformer model. Discuss how this visualization can aid in understanding the model's decision-making process."
    },
    {
        "topic": "Pre-training Transformers",
        "type": "mathematical",
        "question": "Explain the role of the positional encoding in transformer architectures. Provide the mathematical formulation as presented in the lecture materials and discuss its impact on the model's performance."
    },
    {
        "topic": "Pre-training Transformers",
        "type": "theoretical",
        "question": "Analyze the role of masked language modeling in the pre-training of transformers. How does this technique contribute to the model's ability to generalize across different NLP tasks?"
    },
    {
        "topic": "Pre-training Transformers",
        "type": "descriptive",
        "question": "Evaluate the advantages and disadvantages of using transformer models over traditional RNNs for pre-training in NLP tasks, as discussed in the relevant lectures."
    },
    {
        "topic": "Pre-training Transformers",
        "type": "theoretical",
        "question": "Discuss the concept of multi-head attention in transformers. How does it enhance the model's capacity to learn complex representations during the pre-training phase?"
    },
    {
        "topic": "Compression",
        "type": "theoretical",
        "question": "Discuss the significance of achieving an entropy of 1.21 bits per character (bpc) in the context of text compression, particularly in relation to Shannon's upper bound of 1.3 bpc."
    },
    {
        "topic": "Compression",
        "type": "mathematical",
        "question": "Given a text file of size 1696.7 MB compressed to 256.5 MB using RNNME-200, calculate the compression ratio and express it as a percentage."
    },
    {
        "topic": "Compression",
        "type": "coding",
        "question": "Implement a simple arithmetic coding algorithm in Python to compress a given string of text. Explain how your implementation relates to the compression techniques discussed in the lecture materials."
    },
    {
        "topic": "Compression",
        "type": "theoretical",
        "question": "Explain the role of predictors in data compression, as mentioned in the context of the formula 'Data compressor = Predictor + Arithmetic coding'."
    },
    {
        "topic": "Compression",
        "type": "descriptive",
        "question": "Describe the various techniques employed to improve compression rates as outlined in the lecture materials, specifically focusing on the use of multiple models and different learning rates."
    },
    {
        "topic": "Compression",
        "type": "theoretical",
        "question": "Compare and contrast the performance of gzip and PAQ8o10t based on their bits per character (bpc) results. What implications do these results have for practical applications of text compression?"
    },
    {
        "topic": "Compression",
        "type": "coding",
        "question": "Write a function that simulates the compression of text data using a skip-gram model. Discuss how this model contributes to the overall compression effectiveness."
    },
    {
        "topic": "Compression",
        "type": "theoretical",
        "question": "What are the advantages and limitations of using RNN-based models for text compression as opposed to traditional methods like gzip, based on the findings in the lecture materials?"
    },
    {
        "topic": "Compression",
        "type": "mathematical",
        "question": "If the original text file has an entropy of 8 bits per character and is compressed to 1.21 bpc, calculate the percentage reduction in entropy achieved through compression."
    },
    {
        "topic": "Compression",
        "type": "descriptive",
        "question": "Discuss the implications of using state-of-the-art compression programs like PAQ8o10t in real-world applications, considering the results presented in the lecture on text compression."
    },
    {
      "topic": "Linear Sequence Models",
      "type": "theoretical",
      "question": "Discuss the differences between log-linear models and feedforward neural networks in the context of linear sequence modeling. What are the advantages and disadvantages of each approach?"
    },
    {
      "topic": "Linear Sequence Models",
      "type": "coding",
      "question": "Implement a simple feedforward neural network in Python using a library of your choice (e.g., TensorFlow or PyTorch) to model a linear sequence. Provide the code and explain each component of your implementation."
    },
    {
      "topic": "Linear Sequence Models",
      "type": "mathematical",
      "question": "Given a linear sequence model, derive the update rule for weights using Stochastic Gradient Descent (SGD). Explain how the learning rate affects convergence."
    },
    {
      "topic": "Linear Sequence Models",
      "type": "descriptive",
      "question": "Explain the significance of the XOR problem as a motivating example for understanding feedforward neural networks in the context of linear sequence models."
    },
    {
      "topic": "Linear Sequence Models",
      "type": "theoretical",
      "question": "What are computation graphs, and how do they facilitate the implementation of linear sequence models in neural networks?"
    },
    {
      "topic": "Linear Sequence Models",
      "type": "mathematical",
      "question": "Consider a linear sequence model with a single hidden layer. Derive the forward propagation equations, including the activation functions used."
    },
    {
      "topic": "Linear Sequence Models",
      "type": "coding",
      "question": "Write a function that takes a sequence of inputs and outputs the predictions of a trained feedforward neural network. Include the necessary preprocessing steps for the input data."
    },
    {
      "topic": "Linear Sequence Models",
      "type": "theoretical",
      "question": "Compare and contrast the training dynamics of log-linear models versus feedforward neural networks when applied to linear sequence data."
    },
    {
      "topic": "Linear Sequence Models",
      "type": "descriptive",
      "question": "Describe the role of activation functions in feedforward neural networks and their impact on the model's ability to learn linear sequences."
    },
    {
      "topic": "Linear Sequence Models",
      "type": "mathematical",
      "question": "Given a dataset of sequences, outline the steps to train a linear sequence model using Stochastic Gradient Descent, including loss function selection and convergence criteria."
    },
    {
        "topic": "Self-Attention",
        "type": "theoretical",
        "question": "Explain the concept of self-attention and its significance in the context of neural network architectures, particularly in relation to the Transformer model as discussed in the lecture materials."
    },
    {
        "topic": "Self-Attention",
        "type": "coding",
        "question": "Implement a basic self-attention mechanism in Python. Assume you have an input tensor of shape (batch_size, seq_length, embedding_dim). Provide the code and explain each step."
    },
    {
        "topic": "Self-Attention",
        "type": "math",
        "question": "Derive the mathematical formulation of self-attention, including the computation of the attention scores and the output representation. Reference the equations presented in the lecture materials."
    },
    {
        "topic": "Self-Attention",
        "type": "descriptive",
        "question": "Discuss how self-attention differs from traditional attention mechanisms, particularly in terms of computational efficiency and parallelization, as highlighted in the provided course materials."
    },
    {
        "topic": "Self-Attention",
        "type": "theoretical",
        "question": "What are the advantages of using self-attention over recurrent neural networks (RNNs) in processing sequential data? Cite specific examples from the lecture materials."
    },
    {
        "topic": "Self-Attention",
        "type": "coding",
        "question": "Using TensorFlow or PyTorch, create a simple model that incorporates self-attention layers. Describe the architecture and the role of self-attention within it."
    },
    {
        "topic": "Self-Attention",
        "type": "math",
        "question": "Given a set of input vectors, calculate the self-attention scores and the resulting output vectors. Use a small numerical example to illustrate your calculations."
    },
    {
        "topic": "Self-Attention",
        "type": "theoretical",
        "question": "Critically evaluate the role of self-attention in the context of natural language processing tasks, such as machine translation and text summarization, as discussed in the course materials."
    },
    {
        "topic": "Self-Attention",
        "type": "descriptive",
        "question": "Describe the process of multi-head self-attention and its advantages over single-head attention, referencing the relevant sections in the lecture materials."
    },
    {
        "topic": "Self-Attention",
        "type": "theoretical",
        "question": "How does self-attention facilitate the capture of long-range dependencies in sequences compared to traditional approaches? Provide a detailed explanation based on the course content."
    },
    {
        "topic": "Word Vectors",
        "type": "theoretical",
        "question": "Discuss the advantages and limitations of using one-hot vectors in representing words in natural language processing. How do these limitations lead to the development of more sophisticated models like Word2Vec and GloVe?"
    },
    {
        "topic": "Word Vectors",
        "type": "coding",
        "question": "Implement a Python function that generates one-hot vectors for a given list of words. Ensure your implementation can handle words not present in the vocabulary."
    },
    {
        "topic": "Word Vectors",
        "type": "mathematical",
        "question": "Explain the mathematical principles behind Singular Value Decomposition (SVD) and demonstrate how it can be applied to reduce the dimensionality of a word vector space."
    },
    {
        "topic": "Word Vectors",
        "type": "theoretical",
        "question": "Compare and contrast Word2Vec and GloVe in terms of their underlying algorithms and the type of word relationships they capture. Which model would you recommend for a specific NLP task and why?"
    },
    {
        "topic": "Word Vectors",
        "type": "descriptive",
        "question": "Describe the evaluation metrics commonly used to assess the quality of word vectors. How do these metrics relate to the semantic relationships captured by the vectors?"
    },
    {
        "topic": "Word Vectors",
        "type": "coding",
        "question": "Using the Gensim library, write a script that trains a Word2Vec model on a sample corpus. Include steps for preprocessing the text and visualizing the resulting word vectors."
    },
    {
        "topic": "Word Vectors",
        "type": "theoretical",
        "question": "What role does context play in the generation of word vectors? Discuss how models like Word2Vec utilize context windows to derive semantic meaning."
    },
    {
        "topic": "Word Vectors",
        "type": "mathematical",
        "question": "Given a matrix representation of word co-occurrences, perform SVD on this matrix and interpret the resulting singular values and vectors in the context of word similarity."
    },
    {
        "topic": "Word Vectors",
        "type": "descriptive",
        "question": "Explain how the concept of vector arithmetic can be applied to word vectors. Provide examples of how this can be used to derive relationships such as 'king - man + woman = queen'."
    },
    {
        "topic": "Word Vectors",
        "type": "theoretical",
        "question": "Discuss the impact of dimensionality reduction techniques, such as PCA and SVD, on the efficiency and effectiveness of word vector representations in NLP tasks."
    },
    {
        "topic": "Tokenization",
        "type": "theoretical",
        "question": "Discuss the importance of tokenization in Natural Language Processing and how it affects subsequent processing stages. Reference the implications of tokenization as outlined in the lecture materials."
    },
    {
        "topic": "Tokenization",
        "type": "coding",
        "question": "Implement a Python function that takes a string input and returns a list of tokens using a basic whitespace-based tokenization approach. Ensure to handle punctuation as demonstrated in the tokenization lecture."
    },
    {
        "topic": "Tokenization",
        "type": "descriptive",
        "question": "Explain the differences between word-based and character-based tokenization methods. Provide examples of scenarios where each method might be preferable based on the content type."
    },
    {
        "topic": "Tokenization",
        "type": "math",
        "question": "Given a text with 100 words, if a tokenization algorithm produces an average of 1.2 tokens per word, calculate the total number of tokens generated. Discuss the implications of this ratio on the efficiency of NLP models."
    },
    {
        "topic": "Tokenization",
        "type": "theoretical",
        "question": "Critically analyze the challenges of tokenizing languages with no clear word boundaries, such as Chinese. Refer to the examples provided in the course materials to illustrate your points."
    },
    {
        "topic": "Tokenization",
        "type": "coding",
        "question": "Using the NLTK library in Python, write a script that tokenizes the following sentence: 'There, said Bob.' Include the output and explain how the library handles punctuation."
    },
    {
        "topic": "Tokenization",
        "type": "theoretical",
        "question": "What role does tokenization play in the preprocessing pipeline of NLP tasks such as sentiment analysis and machine translation? Discuss with reference to the lecture on tokenization."
    },
    {
        "topic": "Tokenization",
        "type": "descriptive",
        "question": "Describe the Byte Pair Encoding (BPE) tokenization method and its advantages over traditional tokenization techniques. How does BPE address issues of vocabulary size and out-of-vocabulary words?"
    },
    {
        "topic": "Tokenization",
        "type": "math",
        "question": "If a tokenization process results in 150 tokens from a document of 300 characters, calculate the average characters per token. Discuss how this metric can inform decisions about tokenization strategies."
    },
    {
        "topic": "Tokenization",
        "type": "theoretical",
        "question": "Evaluate the impact of tokenization on the performance of deep learning models in NLP. How does the choice of tokenization strategy influence model training and inference?"
    },
    {
        "topic": "KNN LM",
        "type": "theoretical",
        "question": "Explain the fundamental principles behind the k-Nearest Neighbors Language Model (kNN LM) and how it differs from traditional n-gram models."
    },
    {
        "topic": "KNN LM",
        "type": "mathematical",
        "question": "Given a dataset of sentences, derive the mathematical formulation for calculating the probability of a word given its context using the kNN LM approach."
    },
    {
        "topic": "KNN LM",
        "type": "coding",
        "question": "Implement a basic kNN LM in Python using a sample text corpus. Outline the steps taken and the libraries utilized for this implementation."
    },
    {
        "topic": "KNN LM",
        "type": "descriptive",
        "question": "Describe the two-step prediction process in kNN LM as outlined in the lecture materials. What are the key operations performed in each step?"
    },
    {
        "topic": "KNN LM",
        "type": "theoretical",
        "question": "Discuss the advantages and limitations of using kNN LM in comparison to other language modeling techniques such as neural networks or statistical models."
    },
    {
        "topic": "KNN LM",
        "type": "mathematical",
        "question": "If k is set to 5 in a kNN LM, how does this choice impact the model's performance in terms of bias and variance? Provide a detailed analysis."
    },
    {
        "topic": "KNN LM",
        "type": "coding",
        "question": "Write a function that computes the k-nearest neighbors for a given word in a kNN LM. What data structures would you use to optimize this function?"
    },
    {
        "topic": "KNN LM",
        "type": "theoretical",
        "question": "What role does distance metric play in the kNN LM framework? Compare and contrast the use of Euclidean distance versus cosine similarity in this context."
    },
    {
        "topic": "KNN LM",
        "type": "descriptive",
        "question": "Elaborate on the significance of the training set size in kNN LM. How does increasing the size of the training data influence the model's accuracy and computational efficiency?"
    },
    {
        "topic": "KNN LM",
        "type": "mathematical",
        "question": "Formulate the expected computational complexity of the kNN LM during both training and inference phases. What optimizations can be applied to reduce this complexity?"
    },
    {
        "topic": "Instruct Tuning",
        "type": "theoretical",
        "question": "Discuss the concept of Instruct Tuning in the context of language models. How does it differ from traditional fine-tuning methods?"
    },
    {
        "topic": "Instruct Tuning",
        "type": "coding",
        "question": "Write a Python function that implements a simple Instruct Tuning mechanism for a pre-trained language model using a given set of instructions and corresponding outputs."
    },
    {
        "topic": "Instruct Tuning",
        "type": "math",
        "question": "Explain the discrete optimization challenges associated with Instruct Tuning. What mathematical techniques could be employed to address these challenges?"
    },
    {
        "topic": "Instruct Tuning",
        "type": "descriptive",
        "question": "Describe the role of prompts in Instruct Tuning. How do they influence the output of language models?"
    },
    {
        "topic": "Instruct Tuning",
        "type": "theoretical",
        "question": "What are the key advantages of using Instruct Tuning over other prompt-based tuning methods? Provide examples to support your answer."
    },
    {
        "topic": "Instruct Tuning",
        "type": "coding",
        "question": "Using a popular NLP library, implement a simple experiment to demonstrate the effectiveness of Instruct Tuning on a specific task. What metrics would you use to evaluate performance?"
    },
    {
        "topic": "Instruct Tuning",
        "type": "theoretical",
        "question": "Critically analyze the statement: 'Instruct Tuning is primarily about optimizing the selection of words to guide model outputs.' What implications does this have for model training?"
    },
    {
        "topic": "Instruct Tuning",
        "type": "math",
        "question": "Formulate the optimization problem associated with Instruct Tuning. What are the variables and constraints involved?"
    },
    {
        "topic": "Instruct Tuning",
        "type": "descriptive",
        "question": "Examine the relationship between Instruct Tuning and transfer learning. How can Instruct Tuning enhance the transferability of language models across tasks?"
    },
    {
        "topic": "Instruct Tuning",
        "type": "theoretical",
        "question": "Discuss the implications of Instruct Tuning on the interpretability of language model outputs. How does it affect our understanding of model behavior?"
    },
    {
        "topic": "Scaling Laws for LLMs",
        "type": "theoretical",
        "question": "Explain the significance of power laws in the context of model performance versus model size as discussed in the lecture 'Scaling Laws for LLMs'."
    },
    {
        "topic": "Scaling Laws for LLMs",
        "type": "math",
        "question": "Given the power law relationship f(x) = (a/x)^k, derive the implications of this equation for model performance as the number of parameters N increases, assuming D and C remain constant."
    },
    {
        "topic": "Scaling Laws for LLMs",
        "type": "coding",
        "question": "Write a Python function that simulates the relationship between model performance and the number of parameters N, dataset size D, and compute C based on the power law described in the lecture."
    },
    {
        "topic": "Scaling Laws for LLMs",
        "type": "descriptive",
        "question": "Discuss how the scaling of model parameters N and dataset size D in tandem contributes to predictable improvements in model performance, referencing the key points from the lecture 'Lessons from scaling LLMs'."
    },
    {
        "topic": "Scaling Laws for LLMs",
        "type": "theoretical",
        "question": "Critically evaluate the statement: 'Performance depends strongly on scale, weakly on model shape.' How does this relate to the findings presented in the lecture?"
    },
    {
        "topic": "Scaling Laws for LLMs",
        "type": "math",
        "question": "If the amount of compute C is increased by a factor of 10 while keeping N and D constant, describe the expected impact on model performance based on the scaling laws discussed."
    },
    {
        "topic": "Scaling Laws for LLMs",
        "type": "descriptive",
        "question": "What are the implications of the observation that training curves follow predictable power laws? Discuss how this affects the design of future LLMs."
    },
    {
        "topic": "Scaling Laws for LLMs",
        "type": "theoretical",
        "question": "Analyze the relevance of hyperparameters such as width vs. depth in the context of scaling laws for LLMs, as mentioned in the lecture materials."
    },
    {
        "topic": "Scaling Laws for LLMs",
        "type": "coding",
        "question": "Implement a simple simulation in a programming language of your choice that visualizes the relationship between model size N, dataset size D, and compute C on model performance, as derived from the scaling laws."
    },
    {
        "topic": "Scaling Laws for LLMs",
        "type": "descriptive",
        "question": "Summarize the key findings from the lecture 'Scaling Laws for LLMs' regarding the relationship between the number of model parameters, dataset size, and compute, and their collective impact on performance."
    }
]

  "question": "Given a sequence length of \( n \) and a model dimension of \( d \), calculate the memory requirements for both Standard Attention and Fast Attention. How do these requirements impact large-scale NLP applications?"
  "question": "Given the queries \( Q \), keys \( K \), and values \( V \) matrices, derive the formula for the output of the cross-attention mechanism. Explain each component of the formula."
  "question": "What are the key components that constitute the input \( x \) in the context of cross-attention, and how do they influence the attention mechanism?"
  "question": "If the dimensions of the queries, keys, and values are \( d_k \), \( d_k \), and \( d_v \) respectively, what is the computational complexity of the cross-attention mechanism? Explain your reasoning."
  "question": "Given a feedforward neural network with one hidden layer containing 5 neurons, derive the output of the network for a single input vector \( x \) using the following weights and bias

In [66]:
questions_generated_save_file = "../data/output/evaluation/questions_generated.json"

os.makedirs(os.path.dirname(questions_generated_save_file), exist_ok=True)

with open(questions_generated_save_file, 'w') as json_file:
    json.dump(questions_list, json_file, indent=4)

In [67]:
questions_list

[{'topic': 'Probability Recap for NLP',
  'type': 'theoretical',
  'question': 'Discuss the significance of probability theory in Natural Language Processing as outlined in Part 1 of the lecture materials. How does it facilitate the modeling of language?'},
 {'topic': 'Probability Recap for NLP',
  'type': 'mathematical',
  'question': 'Given a discrete random variable X representing the occurrence of words in a corpus, derive the formula for the probability mass function (PMF) and explain its relevance in NLP applications.'},
 {'topic': 'Probability Recap for NLP',
  'type': 'coding',
  'question': 'Implement a Python function that calculates the conditional probability P(A|B) using a frequency count from a given text corpus. Explain the importance of this calculation in language modeling.'},
 {'topic': 'Probability Recap for NLP',
  'type': 'descriptive',
  'question': 'Explain the concept of joint probability and its application in NLP, particularly in the context of n-gram models a

In [68]:
# Ask gpt-4o-mini to evaluate questions
questions_generated_stats = []
for sample in questions_list:
    question = sample["question"]
    topic = sample["topic"]
    question_scores = generate_llm_question_evaluations(client, model, question, topic)

    stats = {
        "category": topic,
        "type": sample["type"],
        "question": question,
        "question_scores": question_scores
    }

    questions_generated_stats.append(stats)

In [69]:
questions_generated_stats_save_file = "../data/output/evaluation/questions_generated_stats.json"

os.makedirs(os.path.dirname(questions_generated_stats_save_file), exist_ok=True)

with open(questions_generated_stats_save_file, 'w') as json_file:
    json.dump(questions_generated_stats, json_file, indent=4)

We ask the LLM judge to evaluate the questions and save the output for future use (in visualization notebook and outputs).