In [1]:
import json
import numpy as np
import pandas as pd
from pprint import pprint as original_pprint
from dateutil import parser
from sentence_transformers import SentenceTransformer
import joblib
from sklearn.metrics.pairwise import cosine_similarity
import requests
import os 
from together import Together

  from tqdm.autonotebook import tqdm, trange
2025-09-11 00:14:12.676936: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-09-11 00:14:12.813484: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1757549652.865693     617 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1757549652.885121     617 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1757549653.014105     617 computation_placer.cc:177] computation placer already regist

In [2]:
def pprint(*args, **kwargs):
    print(json.dumps(*args, indent = 2))

In [3]:
def format_date(date_string):
    # Parse the input string into a datetime object
    date_object = parser.parse(date_string)
    # Format the date to "YYYY-MM-DD"
    formatted_date = date_object.strftime("%Y-%m-%d")
    return formatted_date

In [4]:
# Read the CSV without parsing dates

def read_dataframe(path):
    df = pd.read_csv(path)

    # Apply the custom date formatting function to the relevant columns
    df['published_at'] = df['published_at'].apply(format_date)
    df['updated_at'] = df['updated_at'].apply(format_date)

    # Convert the DataFrame to dictionary after formatting
    df= df.to_dict(orient='records')
    return df

In [5]:
NEWS_DATA = read_dataframe("news_data_dedup.csv")

In [6]:
pprint(NEWS_DATA[9:11])

[
  {
    "guid": "5dae28f191cfd1047f67c409e616fc3f",
    "title": "Paris's Moulin Rouge loses windmill sails overnight",
    "description": "The cause of the sails' collapse from the roof of the world famous cabaret club is not yet clear.",
    "venue": "BBC",
    "url": "https://www.bbc.co.uk/news/world-europe-68895836",
    "published_at": "2024-04-25",
    "updated_at": "2024-04-26"
  },
  {
    "guid": "d2c3ff79d4e068911d05416ca061cd51",
    "title": "Ukraine uses longer-range US missiles for first time",
    "description": "Missiles secretly delivered this month have been used to strike Russian targets in Crimea, US media say.",
    "venue": "BBC",
    "url": "https://www.bbc.co.uk/news/world-europe-68893196",
    "published_at": "2024-04-25",
    "updated_at": "2024-04-26"
  }
]


In [7]:
def query_news(indices):
    """
    Retrieves elements from a dataset based on specified indices.

    Parameters:
    indices (list of int): A list containing the indices of the desired elements in the dataset.
    dataset (list or sequence): The dataset from which elements are to be retrieved. It should support indexing.

    Returns:
    list: A list of elements from the dataset corresponding to the indices provided in list_of_indices.
    """
     
    output = [NEWS_DATA[index] for index in indices]

    return output

In [8]:
# Fetching some indices
indices = [3, 6, 9]
pprint(query_news(indices))

[
  {
    "guid": "e696224ac208878a5cec8bdc9f97c632",
    "title": "Europe risks dying and faces big decisions - Macron",
    "venue": "BBC",
    "url": "https://www.bbc.co.uk/news/world-europe-68898887",
    "published_at": "2024-04-25",
    "updated_at": "2024-04-26"
  },
  {
    "guid": "4f585bad8f61b715fbafe2f022ab0ae8",
    "title": "Supreme Court divided on whether Trump has immunity",
    "description": "The justices discussed immunity, coups, pardons, Operation Mongoose - and the future of democracy.",
    "venue": "BBC",
    "url": "https://www.bbc.co.uk/news/world-us-canada-68901817",
    "published_at": "2024-04-25",
    "updated_at": "2024-04-26"
  },
  {
    "guid": "5dae28f191cfd1047f67c409e616fc3f",
    "title": "Paris's Moulin Rouge loses windmill sails overnight",
    "description": "The cause of the sails' collapse from the roof of the world famous cabaret club is not yet clear.",
    "venue": "BBC",
    "url": "https://www.bbc.co.uk/news/world-europe-68895836",
    "

In [9]:
model = SentenceTransformer('BAAI/bge-base-en-v1.5')




In [10]:
EMBEDDINGS = joblib.load("embeddings.joblib")

In [11]:
def retrieve(query, top_k = 5):
    query_embedding = model.encode(query)

    similarity_scores = cosine_similarity(query_embedding.reshape(1,-1), EMBEDDINGS)[0]
    print("similarity_scores: ")
    print(similarity_scores)
    
    similarity_indices = np.argsort(-similarity_scores)
    print("similarity_indices: ")
    print(similarity_indices)

    top_k_indices = similarity_indices[:top_k]
    print("top_k_indices: ")
    print(top_k_indices)

    return top_k_indices

In [12]:
# Let's test the retrieve function!
indices = retrieve("Concerts in North America", top_k = 1)
print(indices)

similarity_scores: 
[0.31009883 0.44885677 0.35684338 0.4466366  0.38281614 0.41626477
 0.42662114 0.4817115  0.3568998  0.4668556  0.4303986  0.3675533
 0.4753781  0.33592293 0.29099596 0.5324107  0.45125043 0.4384882
 0.5158867  0.432179   0.4893912  0.36603883 0.42270267 0.46550712
 0.40377954 0.47620988 0.41322684 0.39763775 0.41954857 0.41435674
 0.4219189  0.30349818 0.4121761  0.391993   0.53126055 0.48985094
 0.37431082 0.39333832 0.3074522  0.4170124  0.34213907 0.43197995
 0.3818015  0.37477124 0.4233189  0.4569085  0.40358943 0.4474628
 0.39995104 0.45552167 0.525492   0.4681095  0.46175188 0.45258328
 0.42194098 0.4411291  0.4710812  0.44207317 0.39605466 0.3927312
 0.31604257 0.36798668 0.4565735  0.43626523 0.41357404 0.48608878
 0.45045173 0.43029943 0.38733917 0.40272287 0.34274215 0.47779727
 0.44844964 0.41019428 0.39239278 0.5418557  0.46609408 0.44679552
 0.33927435 0.48479512 0.4214564  0.46446365 0.37487867 0.41311666
 0.40165597 0.34799054 0.39625546 0.37487867 0

In [13]:
# Now let's query the corresponding news_
retrieved_documents = query_news(indices)
pprint(retrieved_documents)

[
  {
    "guid": "927257674585bb6ef669cf2c2f409fa7",
    "title": "\u2018The working class can\u2019t afford it\u2019: the shocking truth about the money bands make on tour",
    "description": "As Taylor Swift tops $1bn in tour revenue, musicians playing smaller venues are facing pitiful fees and frequent losses. Should the state step in to save our live music scene?When you see a band playing to thousands of fans in a sun-drenched festival field, signing a record deal with a major label or playing endlessly from the airwaves, it\u2019s easy to conjure an image of success that comes with some serious cash to boot \u2013 particularly when Taylor Swift has broken $1bn in revenue for her current Eras tour. But looks can be deceiving. \u201cI don\u2019t blame the public for seeing a band playing to 2,000 people and thinking they\u2019re minted,\u201d says artist manager Dan Potts. \u201cBut the reality is quite different.\u201dPost-Covid there has been significant focus on grassroots mus

In [14]:
# GRADED CELL 

def get_relevant_data(query: str, top_k: int = 5) -> list[dict]:
    """
    Retrieve and return the top relevant data items based on a given query.

    This function performs the following steps:
    1. Retrieves the indices of the top 'k' relevant items from a dataset based on the provided `query`.
    2. Fetches the corresponding data for these indices from the dataset.

    Parameters:
    - query (str): The search query string used to find relevant items.
    - top_k (int, optional): The number of top items to retrieve. Default is 5.

    Returns:
    - list[dict]: A list of dictionaries containing the data associated 
      with the top relevant items.

    """
    ### START CODE HERE ###

    # Retrieve the indices of the top_k relevant items given the query
    relevant_indices = retrieve(query, top_k = top_k)

    # Obtain the data related to the items using the indices from the previous step
    relevant_data = query_news(relevant_indices)

    ### END CODE HERE
    
    return relevant_data

In [15]:
query = "Greatest storms in the US"
relevant_data = get_relevant_data(query, top_k = 1)
pprint(relevant_data)

similarity_scores: 
[0.33428088 0.40383196 0.30251238 0.4122001  0.32496014 0.34122095
 0.413675   0.5411361  0.37657276 0.5064984  0.46372762 0.4110004
 0.44385228 0.35879418 0.26817995 0.5002453  0.38362932 0.4532919
 0.4399386  0.42236438 0.45083436 0.33098906 0.44390836 0.4437565
 0.3987965  0.3830702  0.38726878 0.4146625  0.370234   0.4010813
 0.40839228 0.35195774 0.42295873 0.3872332  0.46948913 0.44858378
 0.2953934  0.40085855 0.32363832 0.34641054 0.3775559  0.42703983
 0.36544478 0.40514338 0.42989415 0.44655496 0.44625825 0.44185874
 0.3798787  0.4587327  0.4967316  0.47817338 0.4292118  0.4149161
 0.34172374 0.41054395 0.39017394 0.38287258 0.4071352  0.43528208
 0.3503903  0.3501415  0.48371175 0.51977843 0.36108133 0.4157025
 0.4593592  0.39391303 0.38297325 0.44654033 0.27444646 0.47242004
 0.36611277 0.3833584  0.4108893  0.49321547 0.44604796 0.3688018
 0.4815142  0.41769344 0.3692529  0.4172175  0.4034897  0.37370053
 0.36876538 0.36324608 0.41622525 0.4034897  0.41

In [16]:
# GRADED CELL

def format_relevant_data(relevant_data):
    """
    Retrieves the top_k most relevant documents based on a given query and constructs an augmented prompt for a RAG system.

    Parameters:
    relevant_data (list): A list with relevant data.

    Returns:
    str: An augmented prompt with the top_k relevant documents, formatted for use in a Retrieval-Augmented Generation (RAG) system."
    """

    ### START CODE HERE ###

    # Create a list so store the formatted documents
    formatted_documents = []
    
    # Iterates over each relevant document.
    for document in relevant_data:

        # Formats each document into a structured layout string. Remember that each document is in one different line. So you should add a new line character after each document added.
        formatted_document = "Title: {news_title}, Description: {news_description}, Published at: {news_published_date}\nURL: {news_URL}".format(news_title = document['title'], news_description = document['description'], news_published_date = document['published_at'], news_URL = document['url'])
        
        # Append the formatted document string to the formatted_documents list
        formatted_documents.append(formatted_document)
    
    ### END CODE HERE ###
    
    # Returns the final augmented prompt string.

    return "\n".join(formatted_documents)

In [17]:
example_data = NEWS_DATA[4:8]

In [18]:
print(format_relevant_data(example_data))

Title: Prosecutors ask for halt to case against Spain PM's wife, Description: Pedro Sánchez is deciding whether to resign after a case against his wife by an anti-corruption group., Published at: 2024-04-25
URL: https://www.bbc.co.uk/news/world-europe-68895727
Title: WATCH: Would you pay a tourist fee to enter Venice?, Description: From Thursday visitors making a trip to the famous city at peak times will be charged a trial entrance fee., Published at: 2024-04-25
URL: https://www.bbc.co.uk/news/world-europe-68898441
Title: Supreme Court divided on whether Trump has immunity, Description: The justices discussed immunity, coups, pardons, Operation Mongoose - and the future of democracy., Published at: 2024-04-25
URL: https://www.bbc.co.uk/news/world-us-canada-68901817
Title: More than 150 killed as heavy rains pound Tanzania, Description: The prime minister warns that El Niño-triggered heavy rains are likely to continue into May., Published at: 2024-04-25
URL: https://www.bbc.co.uk/news/

In [19]:
# EDITABLE CELL

def generate_final_prompt(query, top_k=5, use_rag=True, prompt=None):
    """
    Generates a final prompt based on a user query, optionally incorporating relevant data using retrieval-augmented generation (RAG).

    Args:
        query (str): The user query for which the prompt is to be generated.
        top_k (int, optional): The number of top relevant data pieces to retrieve and incorporate. Default is 5.
        use_rag (bool, optional): A flag indicating whether to use retrieval-augmented generation (RAG)
                                  by including relevant data in the prompt. Default is True.
        prompt (str, optional): A template string for the prompt. It can contain placeholders {query} and {documents}
                                for formatting with the query and formatted relevant data, respectively.

    Returns:
        str: The generated prompt, either consisting solely of the query or expanded with relevant data
             formatted for additional context.
    """
    # If RAG is not being used, format the prompt with just the query or return the query directly
    if not use_rag:
        return query

    # Retrieve the top_k relevant data pieces based on the query
    relevant_data = get_relevant_data(query, top_k=top_k)

    # Format the retrieved relevant data
    retrieve_data_formatted = format_relevant_data(relevant_data)

    # If no custom prompt is provided, use the default prompt template
    if prompt is None:
        prompt = (
            f"Answer the user query below. There will be provided additional information for you to compose your answer. "
            f"The relevant information provided is from 2024 and it should be added as your overall knowledge to answer the query, "
            f"you should not rely only on this information to answer the query, but add it to your overall knowledge."
            f"Query: {query}\n"
            f"2024 News: {retrieve_data_formatted}"
        )
    else:
        # If a custom prompt is provided, format it with the query and formatted relevant data
        prompt = prompt.format(query=query, documents=retrieve_data_formatted)

    return prompt

In [20]:
print(generate_final_prompt("Tell me about the US GDP in the past 3 years."))

similarity_scores: 
[0.31420416 0.38261265 0.3328526  0.47128075 0.38573584 0.31963015
 0.4424041  0.45235923 0.4090824  0.37868035 0.45131734 0.37728548
 0.45384216 0.4462779  0.33310586 0.4885968  0.30069387 0.4506549
 0.46701476 0.33816284 0.5186873  0.38359654 0.3772116  0.45479962
 0.42561674 0.46945527 0.43398032 0.3806609  0.43246925 0.3465603
 0.40847626 0.35321584 0.37367526 0.4166789  0.44078586 0.39295426
 0.3588365  0.37538493 0.32352093 0.4586662  0.40955648 0.46376
 0.38388592 0.43588227 0.42575163 0.4223114  0.41776016 0.46941748
 0.34458032 0.4389196  0.43662363 0.49327987 0.5021156  0.480199
 0.4337067  0.44634807 0.4149751  0.43421197 0.45876896 0.3928178
 0.37935418 0.4112357  0.36234406 0.39138144 0.36158636 0.49998713
 0.4287178  0.4044523  0.47960556 0.43334362 0.30478635 0.44253084
 0.41354758 0.45094112 0.40848717 0.48628256 0.43297377 0.46792483
 0.39319435 0.4073817  0.33066684 0.47768    0.36226887 0.41962647
 0.4203979  0.34320074 0.39222336 0.36226887 0.474

In [21]:
def generate_with_single_input(prompt: str, 
                               role: str = 'assistant', 
                               top_p: float = None, 
                               temperature: float = None,
                               max_tokens: int = 500,
                               model: str = "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
                               together_api_key = None,
                              **kwargs):
    
    if top_p is None:
        top_p = 'none'
    if temperature is None:
        temperature = 'none'

    payload = {
            "model": model,
            "messages": [{'role': role, 'content': prompt}],
            "top_p": top_p,
            "temperature": temperature,
            "max_tokens": max_tokens,
            **kwargs
                  }
    if (not together_api_key) and ('TOGETHER_API_KEY' not in os.environ):
        url = os.path.join('https://proxy.dlai.link/coursera_proxy/together', 'v1/chat/completions')   
        response = requests.post(url, json = payload, verify=False)
        if not response.ok:
            raise Exception(f"Error while calling LLM: f{response.text}")
        try:
            json_dict = json.loads(response.text)
        except Exception as e:
            raise Exception(f"Failed to get correct output from LLM call.\nException: {e}\nResponse: {response.text}")
    else:
        if together_api_key is None:
            together_api_key = os.environ['TOGETHER_API_KEY']
        if top_p == 'none':
            payload['top_p'] = None
        if temperature == 'none':
            payload['temperature'] = None
        client = Together(api_key =  together_api_key)
        json_dict = client.chat.completions.create(**payload).model_dump()
        json_dict['choices'][-1]['message']['role'] = json_dict['choices'][-1]['message']['role'].name.lower()
    try:
        output_dict = {'role': json_dict['choices'][-1]['message']['role'], 'content': json_dict['choices'][-1]['message']['content']}
    except Exception as e:
        raise Exception(f"Failed to get correct output dict. Please try again. Error: {e}")
    return output_dict

In [22]:
def llm_call(query, top_k = 5, use_rag = True, prompt = None):
    """
    Calls the LLM to generate a response based on a query, optionally using retrieval-augmented generation.

    Args:
        query (str): The user query that will be processed by the language model.
        use_rag (bool, optional): A flag that indicates whether to use retrieval-augmented generation by 
                                  incorporating relevant documents into the prompt. Default is True.

    Returns:
        str: The content of the response generated by the language model.
    """
    

    # Get the prompt with the query + relevant documents
    prompt = generate_final_prompt(query, top_k, use_rag, prompt)

    # Call the LLM
    generated_response = generate_with_single_input(prompt)

    # Get the content
    generated_message = generated_response['content']
    
    return generated_message

In [23]:
query = "Tell me about the US GDP in the past 3 years."

In [24]:
os.environ['TOGETHER_API_KEY'] = "76d81fd6490bf6aa4277b46347bf1af6b951e468bedd5eef3abc37b851d75b31"

In [25]:
print(llm_call(query, use_rag = True))

similarity_scores: 
[0.31420416 0.38261265 0.3328526  0.47128075 0.38573584 0.31963015
 0.4424041  0.45235923 0.4090824  0.37868035 0.45131734 0.37728548
 0.45384216 0.4462779  0.33310586 0.4885968  0.30069387 0.4506549
 0.46701476 0.33816284 0.5186873  0.38359654 0.3772116  0.45479962
 0.42561674 0.46945527 0.43398032 0.3806609  0.43246925 0.3465603
 0.40847626 0.35321584 0.37367526 0.4166789  0.44078586 0.39295426
 0.3588365  0.37538493 0.32352093 0.4586662  0.40955648 0.46376
 0.38388592 0.43588227 0.42575163 0.4223114  0.41776016 0.46941748
 0.34458032 0.4389196  0.43662363 0.49327987 0.5021156  0.480199
 0.4337067  0.44634807 0.4149751  0.43421197 0.45876896 0.3928178
 0.37935418 0.4112357  0.36234406 0.39138144 0.36158636 0.49998713
 0.4287178  0.4044523  0.47960556 0.43334362 0.30478635 0.44253084
 0.41354758 0.45094112 0.40848717 0.48628256 0.43297377 0.46792483
 0.39319435 0.4073817  0.33066684 0.47768    0.36226887 0.41962647
 0.4203979  0.34320074 0.39222336 0.36226887 0.474

In [26]:
print(llm_call(query, use_rag = False))

The US GDP (Gross Domestic Product) for the past 3 years (2021-2023) is as follows:

1. **2021**: The US GDP in 2021 was $22.67 trillion. The economy experienced a significant rebound from the COVID-19 pandemic, with a growth rate of 5.7% in 2021. This growth was driven by government stimulus, vaccination efforts, and a strong labor market.

2. **2022**: The US GDP in 2022 was $25.42 trillion. The economy continued to grow, but at a slower pace than in 2021. The growth rate in 2022 was 2.1%, which was lower than expected due to inflation, supply chain disruptions, and the impact of the Russian invasion of Ukraine.

3. **2023 (Q1)**: The US GDP in Q1 2023 was $25.53 trillion. The economy has continued to grow, but at a slow pace. The growth rate in Q1 2023 was 1.1%, which was lower than expected due to ongoing inflation, a strong US dollar, and a slowdown in global economic growth.

Please note that these figures are estimates and may be subject to revision. Additionally, the GDP growth

In [27]:
import ipywidgets as widgets
from IPython.display import display, Markdown

def display_widget(llm_call_func):
    def on_button_click(b):
        # Clear outputs
        output1.clear_output()
        output2.clear_output()
        status_output.clear_output()
        # Display "Generating..." message
        status_output.append_stdout("Generating...\n")
        query = query_input.value
        top_k = slider.value
        prompt = prompt_input.value.strip() if prompt_input.value.strip() else None
        response1 = llm_call_func(query, use_rag=True, top_k=top_k, prompt=prompt)
        response2 = llm_call_func(query, use_rag=False, top_k=top_k, prompt=prompt)
        # Update responses
        with output1:
            display(Markdown(response1))
        with output2:
            display(Markdown(response2))
        # Clear "Generating..." message
        status_output.clear_output()

    query_input = widgets.Text(
        description='Query:',
        placeholder='Type your query here',
        layout=widgets.Layout(width='100%')
    )

    prompt_input = widgets.Textarea(
        description='Augmented prompt layout:',
        placeholder=("Type your prompt layout here, don't forget to add {query} and {documents} "
                     "where you want them to be placed! Leaving this blank will default to the "
                     "prompt in generate_final_prompt. Example:\nThis is a query: {query}\nThese are the documents: {documents}"),
        layout=widgets.Layout(width='100%', height='100px'),
        style={'description_width': 'initial'}
    )

    slider = widgets.IntSlider(
        value=5,  # default value
        min=1,
        max=20,
        step=1,
        description='Top K:',
        style={'description_width': 'initial'}
    )

    output1 = widgets.Output(layout={'border': '1px solid #ccc', 'width': '45%'})
    output2 = widgets.Output(layout={'border': '1px solid #ccc', 'width': '45%'})
    status_output = widgets.Output()

    submit_button = widgets.Button(
        description="Get Responses",
        style={'button_color': '#f0f0f0', 'font_color': 'black'}
    )
    submit_button.on_click(on_button_click)

    label1 = widgets.Label(value="With RAG", layout={'width': '45%', 'text_align': 'center'})
    label2 = widgets.Label(value="Without RAG", layout={'width': '45%', 'text_align': 'center'})

    display(widgets.HTML("""
    <style>
        .custom-output {
            background-color: #f9f9f9;
            color: black;
            border-radius: 5px;
        }
        .widget-textarea, .widget-button {
            background-color: #f0f0f0 !important;
            color: black !important;
            border: 1px solid #ccc !important;
        }
        .widget-output {
            background-color: #f9f9f9 !important;
            color: black !important;
        }
        textarea {
            background-color: #fff !important;
            color: black !important;
            border: 1px solid #ccc !important;
        }
    </style>
    """))

    display(query_input, prompt_input, slider, submit_button, status_output)
    hbox_labels = widgets.HBox([label1, label2], layout={'justify_content': 'space-between'})
    hbox_outputs = widgets.HBox([output1, output2], layout={'justify_content': 'space-between'})

    def style_outputs(*outputs):
        for output in outputs:
            output.layout.margin = '5px'
            output.layout.height = '300px'
            output.layout.padding = '10px'
            output.layout.overflow = 'auto'
            output.add_class("custom-output")

    style_outputs(output1, output2)
    # Display label and output boxes
    display(hbox_labels)
    display(hbox_outputs)

In [None]:
display_widget(llm_call)

HTML(value='\n    <style>\n        .custom-output {\n            background-color: #f9f9f9;\n            color…

Text(value='', description='Query:', layout=Layout(width='100%'), placeholder='Type your query here')

Textarea(value='', description='Augmented prompt layout:', layout=Layout(height='100px', width='100%'), placeh…

IntSlider(value=5, description='Top K:', max=20, min=1, style=SliderStyle(description_width='initial'))

Button(description='Get Responses', style=ButtonStyle(button_color='#f0f0f0'))

Output()