In [None]:
%pip install langchain faiss-cpu cohere langchain-community rank_bm25 pandas scikit-learn

In [6]:
import cohere
import numpy as np
import re
import pandas as pd
from tqdm import tqdm
from langchain.vectorstores import FAISS
from secret import cohere_api_key

# The API key is read from the local `secret` module (see the import above) rather than
# being pasted into the notebook. Remember not to share it publicly.
api_key = cohere_api_key

# Build the Cohere client used by every cell below.
# API keys can be created and retrieved from os.cohere.ai
co = cohere.Client(api_key)

From the forthcoming book "Hands on Large Language Models" by Jay Alammar & Maarten Grootendorst

## Create a database of text related to Interstellar

In [7]:
text = """
Interstellar is a 2014 epic science fiction film co-written, directed, and produced by Christopher Nolan. 
It stars Matthew McConaughey, Anne Hathaway, Jessica Chastain, Bill Irwin, Ellen Burstyn, Matt Damon, and Michael Caine. 
Set in a dystopian future where humanity is struggling to survive, the film follows a group of astronauts who travel through a wormhole near Saturn in search of a new home for mankind.

Brothers Christopher and Jonathan Nolan wrote the screenplay, which had its origins in a script Jonathan developed in 2007. 
Caltech theoretical physicist and 2017 Nobel laureate in Physics[4] Kip Thorne was an executive producer, acted as a scientific consultant, and wrote a tie-in book, The Science of Interstellar. 
Cinematographer Hoyte van Hoytema shot it on 35 mm movie film in the Panavision anamorphic format and IMAX 70 mm. 
Principal photography began in late 2013 and took place in Alberta, Iceland, and Los Angeles. 
Interstellar uses extensive practical and miniature effects and the company Double Negative created additional digital effects.

Interstellar premiered on October 26, 2014, in Los Angeles. 
In the United States, it was first released on film stock, expanding to venues using digital projectors. 
The film had a worldwide gross over $677 million (and $773 million with subsequent re-releases), making it the tenth-highest grossing film of 2014. 
It received acclaim for its performances, direction, screenplay, musical score, visual effects, ambition, themes, and emotional weight. 
It has also received praise from many astronomers for its scientific accuracy and portrayal of theoretical astrophysics. Since its premiere, Interstellar gained a cult following,[5] and now is regarded by many sci-fi experts as one of the best science-fiction films of all time.
Interstellar was nominated for five awards at the 87th Academy Awards, winning Best Visual Effects, and received numerous other accolades"""

# Split into a list of sentences on '.', trim surrounding spaces/newlines, and drop
# fragments that end up empty (e.g. after a trailing '.' or a run like '..') so that
# no blank "sentence" is ever embedded or indexed.
texts = [
    sentence
    for sentence in (fragment.strip(' \n') for fragment in text.split('.'))
    if sentence
]

In [8]:
# Embed every sentence as a search document and keep only the raw vectors
response = co.embed(texts=texts, input_type="search_document").embeddings

# Stack the vectors into a single array: one row per sentence
embeds = np.array(response)
print(embeds.shape)

(15, 4096)


In [9]:
# Add the embeddings to a vector database: a flat L2 FAISS index sized to the
# embedding dimension
import faiss

dim = embeds.shape[1]
index = faiss.IndexFlatL2(dim)
print(index.is_trained)

# The vectors are converted to float32 before being added to the index
index.add(embeds.astype(np.float32))

True


In [10]:
def search(query, number_of_results=3):
    """Embed `query` with Cohere and look up its nearest sentences in the FAISS index.

    Relies on the notebook globals `co` (Cohere client), `index` (FAISS index) and
    `texts` (the sentences, in the same order they were added to the index).

    Args:
        query: Free-text search query.
        number_of_results: Maximum number of neighbors to request from the index.

    Returns:
        pandas.DataFrame with one row per neighbor found, in the order returned by
        the index, with columns `texts` (the sentence) and `distance` (the distance
        reported by `index.search`). Fewer than `number_of_results` rows are
        returned when the index holds fewer vectors than requested.
    """
    # 1. Get the query's embedding
    query_embed = co.embed(
        texts=[query],
        input_type="search_query",
    ).embeddings[0]

    # 2. Retrieve the nearest neighbors
    distances, similar_item_ids = index.search(np.float32([query_embed]), number_of_results)

    # FAISS pads missing neighbors with id -1 when fewer than `number_of_results`
    # vectors are available. Indexing `texts` with -1 would silently return the
    # last sentence, so drop the padded entries.
    found = similar_item_ids[0] >= 0
    neighbor_ids = similar_item_ids[0][found]
    neighbor_distances = distances[0][found]

    # 3. Format the results
    texts_np = np.array(texts)  # Convert texts list to numpy for easier indexing
    results = pd.DataFrame(data={'texts': texts_np[neighbor_ids],
                                 'distance': neighbor_distances})

    # 4. Print and return the results
    print(f"Query:'{query}'\nNearest neighbors:")
    return results

In [11]:
# Lift pandas' display limits on column width and row count so the full sentences
# are visible in the rendered DataFrame.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None) 

# Run the embedding-based search; the bare `results` on the last line displays the DataFrame.
query = "how precise was the science"
results = search(query)
results

Query:'how precise was the science'
Nearest neighbors:


Unnamed: 0,texts,distance
0,It has also received praise from many astronomers for its scientific accuracy and portrayal of theoretical astrophysics,10757.366211
1,"Caltech theoretical physicist and 2017 Nobel laureate in Physics[4] Kip Thorne was an executive producer, acted as a scientific consultant, and wrote a tie-in book, The Science of Interstellar",11566.133789
2,Interstellar uses extensive practical and miniature effects and the company Double Negative created additional digital effects,11922.839844


### Compare to keyword search

In [12]:
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction import _stop_words
import string

def bm25_tokenizer(text):
    """Turn `text` into the list of lowercase tokens used for BM25.

    The text is lowercased and split on whitespace; punctuation is stripped from
    both ends of every token, and tokens that end up empty or that appear in
    scikit-learn's English stop-word list are discarded.
    """
    stop_words = _stop_words.ENGLISH_STOP_WORDS
    stripped = (word.strip(string.punctuation) for word in text.lower().split())
    return [word for word in stripped if word and word not in stop_words]

# Tokenize every sentence (tqdm reports progress) ...
tokenized_corpus = [bm25_tokenizer(passage) for passage in tqdm(texts)]

# ... and build the BM25 index over the tokenized sentences
bm25 = BM25Okapi(tokenized_corpus)

def keyword_search(query, top_k=3, num_candidates=15):
    """Print the best BM25 (lexical) matches for `query`.

    Relies on the notebook globals `bm25` (BM25 index), `bm25_tokenizer` and
    `texts` (the indexed sentences).

    Args:
        query: Free-text search query.
        top_k: Number of hits to print.
        num_candidates: Number of highest-scoring sentences to consider before
            sorting. Clamped to the corpus size; values <= 0 yield no hits.

    Returns:
        None. The hits are printed as "<score>\t<sentence>" lines.
    """
    print("Input question:", query)

    ##### BM25 search (lexical search) #####
    bm25_scores = bm25.get_scores(bm25_tokenizer(query))

    # np.argpartition raises ValueError when asked for more elements than exist,
    # so never request more candidates than there are scored sentences.
    n_candidates = min(num_candidates, len(bm25_scores))
    if n_candidates > 0:
        top_n = np.argpartition(bm25_scores, -n_candidates)[-n_candidates:]
    else:
        top_n = []

    bm25_hits = [{'corpus_id': idx, 'score': bm25_scores[idx]} for idx in top_n]
    bm25_hits = sorted(bm25_hits, key=lambda x: x['score'], reverse=True)

    # Report the requested top_k rather than a hard-coded 3
    print(f"Top-{top_k} lexical search (BM25) hits")
    for hit in bm25_hits[0:top_k]:
        print("\t{:.3f}\t{}".format(hit['score'], texts[hit['corpus_id']].replace("\n", " ")))

100%|██████████| 15/15 [00:00<00:00, 95614.83it/s]


In [13]:
# Run the same query through BM25 keyword search to compare it with the embedding search above
keyword_search(query = "how precise was the science")

Input question: how precise was the science
Top-3 lexical search (BM25) hits
	1.789	Interstellar is a 2014 epic science fiction film co-written, directed, and produced by Christopher Nolan
	1.373	Caltech theoretical physicist and 2017 Nobel laureate in Physics[4] Kip Thorne was an executive producer, acted as a scientific consultant, and wrote a tie-in book, The Science of Interstellar
	0.000	It stars Matthew McConaughey, Anne Hathaway, Jessica Chastain, Bill Irwin, Ellen Burstyn, Matt Damon, and Michael Caine


What happens if no relevant document exists?

In [14]:
# What if the answer doesn't appear in the text?
# The search still returns the nearest sentences even though none of them answers the query.
query = "what is the mass of the moon"
results = search(query)
results

Query:'what is the mass of the moon'
Nearest neighbors:


Unnamed: 0,texts,distance
0,It has also received praise from many astronomers for its scientific accuracy and portrayal of theoretical astrophysics,14125.734375
1,Cinematographer Hoyte van Hoytema shot it on 35 mm movie film in the Panavision anamorphic format and IMAX 70 mm,14178.546875
2,"The film had a worldwide gross over $677 million (and $773 million with subsequent re-releases), making it the tenth-highest grossing film of 2014",14203.515625


You may want to add a distance threshold to filter out irrelevant results.

## Reranking

In [15]:
# Ask the Cohere rerank endpoint to score every sentence against the query.
# top_n=3 requests three results; return_documents=True is what makes
# `result.document.text` available when the results are printed.
query = "how precise was the science"
results = co.rerank(query=query, documents=texts, top_n=3, return_documents=True)

In [16]:
# Print position, relevance score and sentence text for each reranked result
for idx, result in enumerate(results.results):
  print(idx, result.relevance_score , result.document.text)

0 0.1698185 It has also received praise from many astronomers for its scientific accuracy and portrayal of theoretical astrophysics
1 0.07004896 The film had a worldwide gross over $677 million (and $773 million with subsequent re-releases), making it the tenth-highest grossing film of 2014
2 0.0043994132 Caltech theoretical physicist and 2017 Nobel laureate in Physics[4] Kip Thorne was an executive producer, acted as a scientific consultant, and wrote a tie-in book, The Science of Interstellar


In [17]:
def keyword_and_reranking_search(query, top_k=3, num_candidates=10):
    """Print BM25 hits for `query`, then the same candidates re-ranked by Cohere.

    Relies on the notebook globals `bm25` (BM25 index), `bm25_tokenizer`,
    `texts` (the indexed sentences) and `co` (Cohere client).

    Args:
        query: Free-text search query.
        top_k: Number of hits to print for each stage.
        num_candidates: Number of highest-scoring BM25 sentences passed to the
            reranker. Clamped to the corpus size; values <= 0 yield no hits.

    Returns:
        None. Both result lists are printed as "<score>\t<sentence>" lines.
    """
    print("Input question:", query)

    ##### BM25 search (lexical search) #####
    bm25_scores = bm25.get_scores(bm25_tokenizer(query))

    # np.argpartition raises ValueError when asked for more elements than exist,
    # so never request more candidates than there are scored sentences.
    n_candidates = min(num_candidates, len(bm25_scores))
    if n_candidates > 0:
        top_n = np.argpartition(bm25_scores, -n_candidates)[-n_candidates:]
    else:
        top_n = []

    bm25_hits = [{'corpus_id': idx, 'score': bm25_scores[idx]} for idx in top_n]
    bm25_hits = sorted(bm25_hits, key=lambda x: x['score'], reverse=True)

    # Report the requested top_k rather than a hard-coded 3
    print(f"Top-{top_k} lexical search (BM25) hits")
    for hit in bm25_hits[0:top_k]:
        print("\t{:.3f}\t{}".format(hit['score'], texts[hit['corpus_id']].replace("\n", " ")))

    # Add re-ranking
    docs = [texts[hit['corpus_id']] for hit in bm25_hits]

    print(f"\nTop-{top_k} hits by rank-API ({len(bm25_hits)} BM25 hits re-ranked)")
    if not docs:
        # Nothing to re-rank; skip the API call instead of sending an empty document list
        return

    results = co.rerank(query=query, documents=docs, top_n=top_k, return_documents=True)
    for hit in results.results:
        print("\t{:.3f}\t{}".format(hit.relevance_score, hit.document.text.replace("\n", " ")))

In [18]:
# Same query as before: prints the raw BM25 hits first, then the candidates after re-ranking
keyword_and_reranking_search(query = "how precise was the science")

Input question: how precise was the science
Top-3 lexical search (BM25) hits
	1.789	Interstellar is a 2014 epic science fiction film co-written, directed, and produced by Christopher Nolan
	1.373	Caltech theoretical physicist and 2017 Nobel laureate in Physics[4] Kip Thorne was an executive producer, acted as a scientific consultant, and wrote a tie-in book, The Science of Interstellar
	0.000	Interstellar uses extensive practical and miniature effects and the company Double Negative created additional digital effects

Top-3 hits by rank-API (10 BM25 hits re-ranked)
	0.004	Caltech theoretical physicist and 2017 Nobel laureate in Physics[4] Kip Thorne was an executive producer, acted as a scientific consultant, and wrote a tie-in book, The Science of Interstellar
	0.004	Set in a dystopian future where humanity is struggling to survive, the film follows a group of astronauts who travel through a wormhole near Saturn in search of a new home for mankind
	0.003	Brothers Christopher and Jonat

## Retrieval Augmented Generation (RAG)

#### Using the Cohere API

In [19]:
query = 'income generated'

# Step 1 - Retrieval
# Embedding search only here; ideally this would be a hybrid of keyword and embedding search
results = search(query)

# Step 2 - Grounded generation
# Hand each retrieved sentence to the chat endpoint as a document
docs_dict = [{'text': passage} for passage in results['texts']]
response = co.chat(message=query, documents=docs_dict)

Query:'income generated'
Nearest neighbors:


In [20]:
# The generated answer text from the chat response
print(response.text)

The film generated a worldwide gross of over $677 million, or $773 million with subsequent re-releases.


In [21]:
# The citations attached to the answer: each one gives a character span of the
# answer text and the ids of the documents that support it
print(response.citations)

[ChatCitation(start=21, end=57, text='worldwide gross of over $677 million', document_ids=['doc_0']), ChatCitation(start=62, end=103, text='$773 million with subsequent re-releases.', document_ids=['doc_0'])]


In [None]:
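# Install the Hugging Face CLI first:
#   pip install huggingface-hub
#
# Then, in a terminal, log in and download the quantized Phi-3 model
# into the current directory:
#   huggingface-cli login
#   huggingface-cli download microsoft/Phi-3-mini-4k-instruct-gguf Phi-3-mini-4k-instruct-q4.gguf --local-dir . --local-dir-use-symlinks False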

In [None]:
%pip install llama-cpp-python==0.2.83 sentence-transformers

In [22]:
from langchain import LlamaCpp
#from llama_cpp import Llama

# Load the local GGUF model through LangChain's LlamaCpp wrapper.
# Make sure the model path is correct for your system! The relative path below
# expects the file in the notebook's working directory.
# NOTE(review): n_gpu_layers=-1 presumably offloads all layers to the GPU and
# n_ctx is presumably the context window size in tokens — confirm against the
# LlamaCpp documentation.
llm = LlamaCpp(
    model_path="Phi-3-mini-4k-instruct-q4.gguf",
    n_gpu_layers=-1,
    max_tokens=500,
    n_ctx=2048,
    seed=42,
    verbose=False
)

In [23]:
from langchain.embeddings.huggingface import HuggingFaceEmbeddings

# Embedding model for converting text to numerical representations.
# It is used below to build the local vector database, in place of the Cohere
# embeddings used earlier in the notebook.
embedding_model = HuggingFaceEmbeddings(
    model_name='thenlper/gte-small'
)

In [24]:
from langchain.vectorstores import FAISS

# Create a local vector database: embed the `texts` sentences with
# `embedding_model` and store them in a LangChain FAISS vector store
db = FAISS.from_texts(texts, embedding_model)

In [25]:
from langchain import PromptTemplate

# Create a prompt template with two placeholders: {context} for the retrieved
# passages and {question} for the user's query.
# NOTE(review): the <|user|> / <|end|> / <|assistant|> markers presumably follow
# the Phi-3 chat format — confirm against the model card.
template = """<|user|>
Relevant information:
{context}

Provide a concise answer to the following question using the relevant information provided above:
{question}<|end|>
<|assistant|>"""
prompt = PromptTemplate(
    template=template,
    input_variables=["context", "question"]
)

from langchain.chains import RetrievalQA

# RAG pipeline: wires the local LLM, the vector-store retriever and the custom
# prompt into a single question-answering chain.
# NOTE(review): chain_type='stuff' presumably inserts all retrieved documents
# into the prompt's {context} slot — confirm against the LangChain docs.
rag = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type='stuff',
    retriever=db.as_retriever(),
    chain_type_kwargs={
        "prompt": prompt
    },
    verbose=True
)

In [26]:
# Run the local RAG chain; the returned dict holds the 'query' and the generated 'result'
rag.invoke('Income generated')




[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


{'query': 'Income generated',
 'result': ' Interstellar, with its worldwide gross of over $677 million (and $773 million after subsequent re-releases), became the tenth-highest grossing film in 2014. It was shot on 35 mm movie film using the Panavision anamorphic format and IMAX 70 mm, utilizing a mix of practical effects, miniature models, and digital effects produced by Double Negative. In the United States, it initially premiered in venues projecting onto film stock before transitioning to digital projectors.'}