Install required libraries first:

In [None]:
# Install required packages
!pip install openai==0.27.10 faiss-gpu tiktoken


Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2


Step 0: Set Up the OpenAI API Key

In [None]:
# Import necessary libraries
import os
import numpy as np
import pickle

In [None]:
import openai
import json

# Parse the JSON credentials file straight from the open file handle
with open('secrets/config.json', 'r') as config:
    creds = json.load(config)

# Hand the stored key to the openai client.
# NOTE(review): the key is stored under "AZURE_OPENAI_KEY" but no api_type /
# api_base is configured here -- confirm which endpoint this key is meant for.
openai.api_key = creds["AZURE_OPENAI_KEY"]

Step 1: Read the Large Text Document

In [None]:
# Load the whole document to be queried into memory as a single string
with open('/content/large-text-file-to-query.txt', encoding='utf-8') as source_file:
    text = source_file.read()

Step 2: Split the Text into Chunks

In [None]:
import tiktoken

def split_text_into_chunks(text, chunk_size=500, overlap=50):
    """Split ``text`` into overlapping chunks whose size is measured in tokens.

    The text is tokenized with tiktoken's ``cl100k_base`` encoding, a window of
    ``chunk_size`` tokens is slid over the token list in steps of
    ``chunk_size - overlap``, and every window is decoded back to a string.

    Args:
        text: The document to split.
        chunk_size: Maximum number of tokens per chunk; must be positive.
        overlap: Number of tokens shared between consecutive chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings in document order. The last chunk may hold
        fewer than ``chunk_size`` tokens. An empty ``text`` yields ``[]``.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range. Without
            this check, ``overlap >= chunk_size`` would make the window step
            zero or negative and the chunking loop would never terminate.
    """
    # Validate before doing any tokenizer work so bad arguments fail fast.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, "
            f"got overlap={overlap}, chunk_size={chunk_size}"
        )

    # Initialize the tokenizer and tokenize the entire text once
    tokenizer = tiktoken.get_encoding('cl100k_base')
    tokens = tokenizer.encode(text)

    # Each window starts `step` tokens after the previous one, so consecutive
    # windows share `overlap` tokens.
    step = chunk_size - overlap
    return [
        tokenizer.decode(tokens[start:start + chunk_size])
        for start in range(0, len(tokens), step)
    ]

# Chunk the loaded document using the function's default chunk_size and overlap
chunks = split_text_into_chunks(text)
print(f"Number of chunks: {len(chunks)}")

Number of chunks: 1125


Step 3: Generate Embeddings and Build the Vector Index

In [None]:
def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding vector for ``text``.

    Sends ``text`` to ``openai.Embedding.create`` with the given ``model`` and
    returns the ``embedding`` field of the first entry in the response's
    ``data`` list.
    """
    result = openai.Embedding.create(model=model, input=text)
    first_item = result['data'][0]
    return first_item['embedding']

In [None]:
# Generate embeddings for each chunk
from tqdm import tqdm

# Generate embeddings with a progress bar. Whitespace-only chunks are skipped,
# and both lists are appended together so embeddings[i] always belongs to
# chunk_texts[i].
embeddings = []
chunk_texts = []

for chunk in tqdm(chunks, desc="Generating embeddings"):
    if not chunk.strip():
        continue
    embeddings.append(get_embedding(chunk))
    chunk_texts.append(chunk)


Generating embeddings: 100%|██████████| 1125/1125 [06:09<00:00,  3.05it/s]


In [None]:
import faiss

# Stack the collected embeddings into a single float32 matrix
embedding_matrix = np.asarray(embeddings, dtype='float32')
embedding_size = embedding_matrix.shape[1]

# Create an IndexFlatL2 of matching dimensionality and add every row to it
index = faiss.IndexFlatL2(embedding_size)
index.add(embedding_matrix)
print(f"FAISS index has {index.ntotal} vectors.")


FAISS index has 1125 vectors.


Step 4: Save the Index and Chunks (Optional)

In [None]:
# Persist the FAISS index to disk
faiss.write_index(index, "faiss_index.index")

# Persist the chunk texts next to it so search hits can be mapped back to text
with open('chunk_texts.pkl', mode='wb') as chunk_file:
    pickle.dump(chunk_texts, chunk_file)


Step 5: Load the Index and Chunks (Optional)

In [None]:
# Load the FAISS index written by the save step
index = faiss.read_index("faiss_index.index")

# Load the chunk texts written by the save step.
# NOTE(security): pickle.load can execute arbitrary code during
# deserialization -- only load a chunk_texts.pkl that this notebook produced,
# never one obtained from an untrusted source.
with open('chunk_texts.pkl', 'rb') as f:
    chunk_texts = pickle.load(f)


Step 6: Define the Search and Answer Generation Functions

In [None]:
def search(query, top_k_hits=5):
    """Find the stored vectors closest to ``query``.

    The query is embedded with ``get_embedding``, converted to a one-row
    float32 matrix and passed to the module-level ``index``.

    Args:
        query: The question text to embed and search for.
        top_k_hits: Maximum number of neighbours to return.

    Returns:
        A tuple ``(indices, distances)`` of equally long 1-D arrays for the
        single query row. It may contain fewer than ``top_k_hits`` entries:
        FAISS reports missing neighbours with the label ``-1`` when the index
        holds fewer vectors than requested, and those placeholder entries are
        dropped here so callers never treat ``-1`` as a real position.
    """
    query_embedding = get_embedding(query)
    query_vector = np.array([query_embedding]).astype('float32')
    distances, indices = index.search(query_vector, top_k_hits)

    # Only one query row was submitted, so take row 0 of each result matrix.
    hit_indices, hit_distances = indices[0], distances[0]

    # Drop the -1 placeholders (and their matching distances).
    found = hit_indices >= 0
    return hit_indices[found], hit_distances[found]

In [None]:
def generate_answer(query, top_k_hits=5, model="gpt-4"):
    """Answer ``query`` using the most relevant chunks as context.

    Retrieves up to ``top_k_hits`` chunks via ``search``, joins them into a
    context section, and asks a chat model to answer the question from it.

    Args:
        query: The question to answer.
        top_k_hits: Maximum number of chunks to retrieve as context.
        model: Chat model name passed to ``openai.ChatCompletion.create``.
            Defaults to ``"gpt-4"``, the value that was previously hard-coded.

    Returns:
        The text content of the first choice in the chat completion response.
    """
    indices, _distances = search(query, top_k_hits)

    # FAISS marks missing neighbours with -1 when the index holds fewer than
    # top_k_hits vectors. Skip them: chunk_texts[-1] would otherwise silently
    # pull the LAST chunk into the context.
    relevant_chunks = [chunk_texts[i] for i in indices if i >= 0]

    # Combine the relevant chunks
    context = "\n\n".join(relevant_chunks)

    # Construct the prompt
    prompt = f"Answer the following question using the context provided.\n\nContext:\n{context}\n\nQuestion: {query}\n\nAnswer:"

    # Ask the chat model to generate the answer
    response = openai.ChatCompletion.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant that answers questions based on the provided context."},
            {"role": "user", "content": prompt}
        ]
    )

    return response['choices'][0]['message']['content']

Step 7: Query the Document

In [None]:
# Question to ask about the document
query = "Is there any communication suggesting that Sean expected to repay the note (bridge loan) to DJ using the CPUcoin cryptocurrency?"

# Retrieve context, ask the model, and show the reply under a heading line
answer = generate_answer(query)
print("Response:", answer, sep="\n")

Response:
No, there is no specific communication suggesting that Sean expected to repay the bridge loan to DJ using the CPUcoin cryptocurrency. Sean refers to the bridge loan as a convertible note that converts into equity but does not explicitly state that CPUcoin would be used for repayment.


In [None]:
# Question to ask about the document
query = "Is there any communication about Keith?"

# Retrieve the five closest chunks, ask the model, and show the reply
answer = generate_answer(query, top_k_hits=5)
print("Response:", answer, sep="\n")

Response:
Yes, there is communication about Keith. DJ has been regularly updating his distribution list on Keith's health condition. In the most recent update, he explained that Keith was released from UCSF Sunday evening and was able to sleep well in his own bed. He also mentioned having to take Keith to his twice weekly clinic for anti-rejection medications, dealing with some bureaucratic issues, and assisting Keith with hygiene and mobility at home. Previously, DJ reported that Keith's recovery has been swift and he has been able to spend time out of bed and receive visitors.
