# RAG From Scratch

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn

In [2]:
"""
Downloading a word encoder.
We could use word2vec, but GloVe downloads much faster. For our purposes they're conceptually identical.
"""
import sys

#!{sys.executable} -m pip install gensim

import gensim.downloader as gd

# Load the pretrained 'glove-twitter-25' word vectors through gensim's downloader.
# Later cells look words up in `word_encoder`.
word_encoder= gd.load('glove-twitter-25')

# Sanity check: indexing by a word returns its embedding, a 25-element float32
# vector (see the output below).
word_encoder['apple']

array([ 0.85337  ,  0.011645 , -0.033377 , -0.31981  ,  0.26126  ,
        0.16059  ,  0.010724 , -0.15542  ,  0.75044  ,  0.10688  ,
        1.9249   , -0.45915  , -3.3887   , -1.2152   , -0.054263 ,
       -0.20555  ,  0.54706  ,  0.4371   ,  0.25194  ,  0.0086557,
       -0.56612  , -1.1762   ,  0.010479 , -0.55316  , -0.15816  ],
      dtype=float32)

In [3]:
# defining a function for embedding an entire document to a single mean vector

def embed_sequence(sequence):
    """Embed a whole piece of text as the mean of its word vectors.

    The text is split on whitespace and every token is looked up in the
    notebook-level ``word_encoder``. Tokens the encoder does not know are
    skipped instead of raising ``KeyError``: the result is a mean, so
    dropping an occasional word changes it very little.

    Parameters
    ----------
    sequence : str
        Text to embed. Tokens are looked up exactly as written.

    Returns
    -------
    numpy.ndarray
        1-D vector: the element-wise mean of the known tokens' embeddings.

    Raises
    ------
    ValueError
        If no token is in the encoder's vocabulary (this includes empty or
        whitespace-only input), because there is nothing to average.
    """
    # split() with no argument collapses runs of whitespace and ignores
    # leading/trailing blanks. split(' ') would yield '' tokens in those
    # cases, which would then be looked up as if they were words.
    tokens = sequence.split()

    # Keep only the tokens the encoder can actually embed.
    known_tokens = [token for token in tokens if token in word_encoder]

    if not known_tokens:
        raise ValueError(f"no known words to embed in {sequence!r}")

    # Indexing with a list returns one row per token; average over the rows.
    vects = word_encoder[known_tokens]

    return np.mean(vects, axis=0)


embed_sequence('its a sunny day today')

array([-6.3483393e-01,  1.3683620e-01,  2.0645106e-01, -2.1831200e-01,
       -1.8181981e-01,  2.6023200e-01,  1.3276964e+00,  1.7272198e-01,
       -2.7881199e-01, -4.2115799e-01, -4.7215199e-01, -5.3013992e-02,
       -4.6326599e+00,  4.3883198e-01,  3.6487383e-01, -3.6672002e-01,
       -2.6924044e-03, -3.0394283e-01, -5.5415201e-01, -9.1787003e-02,
       -4.4997922e-01, -1.4819117e-01,  1.0654800e-01,  3.7024397e-01,
       -4.6688594e-02], dtype=float32)

In [4]:
# calculating the distance between two embedding vectors (using Manhattan distance)

from scipy.spatial.distance import cdist

def calc_distance(embedding1, embedding2, metric='cityblock'):
    """Return the distance between two embedding vectors.

    Parameters
    ----------
    embedding1, embedding2 : array-like
        1-D vectors of equal length.
    metric : str or callable, optional
        Any metric accepted by ``scipy.spatial.distance.cdist``. Defaults to
        ``'cityblock'`` (Manhattan distance), so existing two-argument calls
        behave exactly as before.

    Returns
    -------
    float
        The distance between the two vectors under ``metric``; a smaller
        value means the vectors are closer together.
    """
    # cdist compares collections of vectors (2-D inputs), so wrap each
    # single vector as a one-row matrix.
    rows1 = np.expand_dims(embedding1, axis=0)
    rows2 = np.expand_dims(embedding2, axis=0)

    # One row against one row gives a 1x1 matrix; return its only entry.
    return cdist(rows1, rows2, metric=metric)[0, 0]


# Demo: a smaller distance means the two mean embeddings are closer together.
print('Similar phrases:')
f1= embed_sequence('sunny day today')
f2= embed_sequence('rainy morning presently')
print(calc_distance(f1, f2))

# f1 is reused as the reference phrase and compared against an unrelated phrase.
print('Different phrases:')
f3= embed_sequence('perhaps reality is painful')
print(calc_distance(f1, f3))

Similar phrases:
8.496297497302294
Different phrases:
11.832107525318861


# Retrieval and Augmentation

In [5]:
"""
Defining documents
For simplicity we only included words the embedder knows. We could just filter out any words the
embedder doesn't know, though. After all, the retrieval is done on a mean of all embeddings, so a
missing word or two is of little consequence.
"""

# Knowledge base for retrieval: maps a short document name to that document's text.
# NOTE(review): spellings such as "allergins" and "resteraunt" are part of the data. Each word is
# looked up in the word encoder, so confirm a corrected spelling is in its vocabulary before editing.
documents= {"menu": "ratatouille is a stew thats twelve dollars and fifty cents also gazpacho is a salad thats thirteen dollars and ninety eight cents also hummus is a dip thats eight dollars and seventy five cents also meat sauce is a pasta dish thats twelve dollars also penne marinera is a pasta dish thats eleven dollars also shrimp and linguini is a pasta dish thats fifteen dollars",
            "events": "on thursday we have karaoke and on tuesdays we have trivia",
            "allergins": "the only item on the menu common allergen is hummus which contain pine nuts",
            "info": "the resteraunt was founded by two brothers in two thousand and three"}

In [6]:
# defining a function that retrieves the most relevant document

def retrieve_relevant(prompt, documents=documents):
    """Find the document whose mean embedding is closest to the prompt's.

    Parameters
    ----------
    prompt : str
        The user's question.
    documents : dict, optional
        Mapping of document name -> document text. Defaults to the
        notebook-level ``documents`` as bound when this function was defined.

    Returns
    -------
    tuple of (str, str)
        Name and text of the closest document, or ``('', '')`` when
        ``documents`` is empty.
    """
    # Nothing to compare against: return the same empty result as before,
    # without embedding the prompt.
    if not documents:
        return '', ''

    # The prompt embedding is the same for every document, so compute it
    # once instead of once per loop iteration.
    prompt_embedding = embed_sequence(prompt)

    # Start from infinity rather than an arbitrary large number, so any
    # finite distance is accepted as the first candidate.
    min_dist = float('inf')
    r_docname = ''
    r_doc = ''

    for docname, doc in documents.items():
        dist = calc_distance(prompt_embedding, embed_sequence(doc))

        # Strict '<' keeps the earliest document when distances tie.
        if dist < min_dist:
            min_dist = dist
            r_docname = docname
            r_doc = doc

    return r_docname, r_doc


# Run the retriever on a few sample questions, printing a divider between results.
sample_prompts= ['what pasta dishes do you have',
                 'what events do you guys do',
                 'what pasta dishes do you guys have']

for position, prompt in enumerate(sample_prompts):
    # The divider goes between results only, so the first prompt gets none.
    if position > 0:
        print('----')
    print(f'Finding relevent doc for: "{prompt}"')
    print(retrieve_relevant(prompt))
# The last prompt shows a weakness of mean-embedding retrieval: compared with the first
# prompt, the one extra word "guys" is enough to change which document is retrieved
# (see the output below).

Finding relevent doc for: "what pasta dishes do you have"
('menu', 'ratatouille is a stew thats twelve dollars and fifty cents also gazpacho is a salad thats thirteen dollars and ninety eight cents also hummus is a dip thats eight dollars and seventy five cents also meat sauce is a pasta dish thats twelve dollars also penne marinera is a pasta dish thats eleven dollars also shrimp and linguini is a pasta dish thats fifteen dollars')
----
Finding relevent doc for: "what events do you guys do"
('events', 'on thursday we have karaoke and on tuesdays we have trivia')
----
Finding relevent doc for: "what pasta dishes do you guys have"
('info', 'the resteraunt was founded by two brothers in two thousand and three')


# Augmenting and Generating

In [7]:
"""
Defining retrieval and augmentation: a function that retrieves the most relevant document and
builds the augmented prompt, which can be passed straight to the LLM.
"""

def retrieve_and_augment(prompt, documents=documents):
    """Build the augmented prompt text for a user question.

    Retrieves the closest document with ``retrieve_relevant`` and places
    it, under a short instruction line, ahead of the original prompt.

    Parameters
    ----------
    prompt : str
        The user's question.
    documents : dict, optional
        Mapping of document name -> document text, passed on to
        ``retrieve_relevant``.

    Returns
    -------
    str
        Instruction line, the retrieved document between ``====`` markers,
        the original prompt, and a trailing ``response:`` cue.
    """
    best_name, best_text = retrieve_relevant(prompt, documents)

    # One entry per output line; the empty entry yields the blank line
    # that separates the document section from the prompt.
    sections = [
        'Answer the prompt based on the folowing documents:',
        f'==== document: {best_name} ====',
        best_text,
        '====',
        '',
        f'prompt: {prompt}',
        'response:',
    ]

    return '\n'.join(sections)


# Show the exact augmented text that retrieve_and_augment builds for one sample prompt.
prompt= 'what events do you guys do'
print(f'Prompt for: "{prompt}":\n')
print(retrieve_and_augment(prompt))

Prompt for: "what events do you guys do":

Answer the prompt based on the folowing documents:
==== document: events ====
on thursday we have karaoke and on tuesdays we have trivia
====

prompt: what events do you guys do
response:


In [None]:
# using RAG with OpenAI's gpt model

#!{sys.executable} -m pip install openai

import os
from openai import OpenAI

# Initialize the API client. The key is read from the OPENAI_API_KEY environment
# variable so it is never hardcoded in the notebook.
client= OpenAI(
    api_key=os.environ.get('OPENAI_API_KEY')
)

prompts= ['what pasta dishes do you have',
          'what events do you guys do',
          'oh cool what is karaoke'
         ]

for prompt in prompts:

    # Retrieve the closest document and wrap it around the user's prompt.
    ra_prompt= retrieve_and_augment(prompt)

    # The chat completions endpoint takes a list of role/content messages (it has
    # no `prompt` argument), and the reply text is at choices[0].message.content
    # (`.text` belongs to the legacy completions response).
    completion= client.chat.completions.create(
        model='gpt-3.5-turbo',
        messages=[{'role': 'user', 'content': ra_prompt}],
        max_tokens=80,
    )
    response= completion.choices[0].message.content

    print(f"Prompt: '{prompt}'")
    print(f"Response: '{response}'")

In [None]:
# towardsdatascience.com/retrieval-augmented-generation-intuitively-and-exhaustively-explain-6a39d6fe6fc9

In [None]:
# https://github.com/openai/openai-python