# Lesson 2 - Retrieval Augmented Generation (RAG)

### Import packages

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec
from tqdm.auto import tqdm
from DLAIUtils import Utils

import ast
import os
import pandas as pd

### Set up Pinecone

In [3]:
# Utils is the helper class imported from DLAIUtils; it is used here (and again
# for the OpenAI key further down) to obtain API keys instead of hardcoding them.
utils = Utils()
PINECONE_API_KEY = utils.get_pinecone_api_key()

In [4]:
# Connect to Pinecone and start from a clean index on every run.
pinecone = Pinecone(api_key=PINECONE_API_KEY)
INDEX_NAME = "dl-ai-wiki"

# Drop any index left over from a previous run so stale vectors cannot
# leak into this session's results.
existing_names = [existing.name for existing in pinecone.list_indexes()]
if INDEX_NAME in existing_names:
    pinecone.delete_index(INDEX_NAME)

# The dimension must equal the length of the vectors that are upserted and
# queried later in the notebook.
pinecone.create_index(
    name=INDEX_NAME,
    dimension=1536,
    metric="cosine",
    spec=ServerlessSpec(cloud="aws", region="us-east-1"),
)

# Handle used by every later upsert / query cell.
index = pinecone.Index(INDEX_NAME)

### Load the dataset

#### Download the dataset


In [5]:
# Only fetch the archive when wiki.csv is not already in the working directory,
# so re-running the notebook does not download it again.
# NOTE(review): this assumes the zip extracts to a file named wiki.csv — confirm.
if not os.path.exists("wiki.csv"):
    !wget -q -O lesson2-wiki.csv.zip "https://www.dropbox.com/scl/fi/yxzmsrv2sgl249zcspeqb/lesson2-wiki.csv.zip?rlkey=paehnoxjl3s5x53d1bedt4pmc&dl=0"
    !unzip lesson2-wiki.csv.zip

#### Load the dataset into a DataFrame

<p style="background-color:#fff1d7; padding:15px; "> <b>(Note: <code>max_articles_num = 500</code>):</b> To achieve a more comprehensive context for the Large Language Model (LLM), a larger number of articles is generally more beneficial. In this lab, we've initially set <code>max_articles_num</code> to 500 for speedier results, allowing you to observe the outcomes faster. Once you've done an initial run, consider increasing this value to 750 or 1,000. You'll likely notice that the context provided to the LLM becomes richer and better. You can experiment by gradually raising this variable for different queries to observe the improvements in the LLM's contextual understanding.</p>

In [6]:
# Upper bound on the number of CSV rows read; raise it (e.g. 750 or 1000) to
# give the retriever more material at the cost of a slower run.
max_articles_num = 500
# Only the first max_articles_num rows are loaded. Per the preview below, each
# row carries an id, a metadata string and a values (embedding) string.
df = pd.read_csv("wiki.csv", nrows=max_articles_num)
df.head()

Unnamed: 0,id,metadata,values
1,1-0,"{'chunk': 0, 'source': 'https://simple.wikiped...","[-0.011254455894231796, -0.01698738895356655, ..."
2,1-1,"{'chunk': 1, 'source': 'https://simple.wikiped...","[-0.0015197008615359664, -0.007858820259571075..."
3,1-2,"{'chunk': 2, 'source': 'https://simple.wikiped...","[-0.009930099360644817, -0.012211072258651257,..."
4,1-3,"{'chunk': 3, 'source': 'https://simple.wikiped...","[-0.011600767262279987, -0.012608098797500134,..."
5,1-4,"{'chunk': 4, 'source': 'https://simple.wikiped...","[-0.026462381705641747, -0.016362832859158516,..."


### Prepare the embeddings and Upsert to Pinecone

In [7]:
# Number of vectors sent to Pinecone per upsert request.
BATCH_SIZE = 250

prepped = []

for _, row in tqdm(df.iterrows(), total=df.shape[0]):
    # 'metadata' and 'values' are stored in the CSV as Python-literal strings
    # (a dict and a list of floats), so parse them back into real objects.
    prepped.append({
        'id': row['id'],
        'values': ast.literal_eval(row['values']),
        'metadata': ast.literal_eval(row['metadata']),
    })
    if len(prepped) >= BATCH_SIZE:
        index.upsert(prepped)
        prepped = []

# Flush the final partial batch. Without this, any rows left over when the row
# count is not an exact multiple of BATCH_SIZE would silently never be indexed
# (e.g. after changing max_articles_num to 600).
if prepped:
    index.upsert(prepped)
    prepped = []

100%|██████████| 500/500 [00:07<00:00, 63.09it/s] 


In [8]:
# Sanity check after the upsert: display the index statistics, including the
# per-namespace and total vector counts.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 500}},
 'total_vector_count': 500}

### Connect to OpenAI

In [9]:
# Obtain the OpenAI key through the same Utils helper used for Pinecone, then
# build the client that the embedding and chat-completion cells call.
OPENAI_API_KEY = utils.get_openai_api_key()
openai_client = OpenAI(api_key=OPENAI_API_KEY)

### Helper function to generate embeddings

In [10]:
def get_embeddings(articles, model="text-embedding-ada-002"):
    """Request embeddings for ``articles`` from the OpenAI embeddings endpoint.

    Args:
        articles: Text to embed, forwarded unchanged as the ``input`` argument.
            Callers in this notebook pass either a single string or a list
            of strings.
        model: Name of the embedding model to use.

    Returns:
        The response object returned by ``openai_client.embeddings.create``,
        unmodified. Callers read the vector from ``.data[0].embedding``.
    """
    embeddings_api = openai_client.embeddings
    response = embeddings_api.create(input=articles, model=model)
    return response

### Run the Query

In [11]:
query = "What is the Berlin wall?"

# Embed the question, then fetch the three nearest stored chunks together
# with their metadata (which holds the original text).
embed = get_embeddings(query)
results = index.query(
    vector=embed.data[0].embedding,
    top_k=3,
    include_metadata=True,
)

# Show the raw text of each match, one per line.
text = [match['metadata']["text"] for match in results['matches']]
print(*text, sep="\n")

August 13  1961: Building of the Berlin Wall begins.
 August 14  1945: Japan announces its surrender at the end of World War II.
 August 14/15  1947: India is partitioned at independence from the UK, as the new mainly Islamic state of Pakistan is created.
 August 15  1960: The Republic of the Congo becomes independent.
 August 15  1971: Bahrain becomes independent.
 August 16  1977: Elvis Presley dies aged 42, leading to a worldwide outpouring of grief.
 August 17  1945: Indonesia declares independence from the Netherlands.
 August 17  1960: Gabon becomes independent.
 August 17  1962: Peter Fechter becomes the first person to be shot dead at the Berlin Wall.
 August 19  43 BC: Augustus becomes Roman consul.
 August 19  14: Augustus dies.
 August 19  1919: Afghanistan becomes independent.
 August 19  1991: The August Coup against Mikhail Gorbachev, in the Soviet Union, begins.
 August 20  1940: Leon Trotsky is fatally wounded with an ice pick in Mexico.
 August 20  1968: The Prague Spr

### Build the Prompt

In [12]:
query = "Write an article titled: What is the berlin wall?"

# Retrieve the three chunks closest to the query embedding.
embed = get_embeddings([query])
res = index.query(
    vector=embed.data[0].embedding,
    top_k=3,
    include_metadata=True,
)

# Keep only the stored text of each match; this becomes the LLM's context.
contexts = [match['metadata']['text'] for match in res['matches']]

# The prompt is: fixed instructions, the retrieved contexts separated by a
# horizontal rule, then the question itself.
prompt_start = (
    "Answer the question based on the context below.\n\n"
    "Context:\n"
)
prompt_end = f"\n\nQuestion: {query}\nAnswer:"
context_block = "\n\n---\n\n".join(contexts)

prompt = "".join([prompt_start, context_block, prompt_end])

print(prompt)

Answer the question based on the context below.

Context:
August 13  1961: Building of the Berlin Wall begins.
 August 14  1945: Japan announces its surrender at the end of World War II.
 August 14/15  1947: India is partitioned at independence from the UK, as the new mainly Islamic state of Pakistan is created.
 August 15  1960: The Republic of the Congo becomes independent.
 August 15  1971: Bahrain becomes independent.
 August 16  1977: Elvis Presley dies aged 42, leading to a worldwide outpouring of grief.
 August 17  1945: Indonesia declares independence from the Netherlands.
 August 17  1960: Gabon becomes independent.
 August 17  1962: Peter Fechter becomes the first person to be shot dead at the Berlin Wall.
 August 19  43 BC: Augustus becomes Roman consul.
 August 19  14: Augustus dies.
 August 19  1919: Afghanistan becomes independent.
 August 19  1991: The August Coup against Mikhail Gorbachev, in the Soviet Union, begins.
 August 20  1940: Leon Trotsky is fatally wounded wi

### Get the Summary

In [16]:
# Send the retrieval-augmented prompt as a single user message.
# temperature=0 keeps sampling as deterministic as the API allows, and
# max_tokens caps the length of the generated article.
res = openai_client.chat.completions.create(
    model="gpt-4o",
    messages=[{"role": "user", "content": prompt}],
    temperature=0,
    max_tokens=650,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0,
    stop=None,
)

# Print a separator line followed by the text of the first returned choice.
print('-' * 80, res.choices[0].message.content, sep="\n")

--------------------------------------------------------------------------------
**What is the Berlin Wall?**

The Berlin Wall was a physical barrier that divided the city of Berlin from 1961 to 1989. It was constructed by the German Democratic Republic (GDR), commonly known as East Germany, starting on August 13, 1961. The Wall was a symbol of the Cold War and represented the ideological divide between the communist Eastern Bloc and the capitalist Western Bloc.

**Historical Context**

After World War II, Germany was divided into four occupation zones controlled by the United States, the United Kingdom, France, and the Soviet Union. Berlin, although located entirely within the Soviet zone, was similarly divided into four sectors. Tensions between the Soviet Union and the Western Allies escalated, leading to the establishment of two separate German states in 1949: the Federal Republic of Germany (West Germany) and the German Democratic Republic (East Germany).

**Construction and Purpo