Install required libraries first:

In [None]:
# Install required packages
!pip install openai==0.27.10 faiss-cpu tiktoken




Step 0: Set Up the OpenAI API Key

In [None]:
# Import necessary libraries
import os
import openai
import faiss
import numpy as np
import tiktoken
import pickle

In [27]:
import openai
import json

# Read the credentials from a local JSON file so the key itself never appears
# in the notebook. The encoding is given explicitly: without it, open() decodes
# with the platform's locale encoding, which is not guaranteed to be UTF-8.
with open('config.json', 'r', encoding='utf-8') as config:
    data = config.read()

creds = json.loads(data)

# OpenAI API key is set here
# NOTE(review): the key is stored under "AZURE_OPENAI_KEY", but this cell does
# not configure any Azure-specific api_type/api_base — confirm which endpoint
# is intended.
openai.api_key = creds["AZURE_OPENAI_KEY"]

Step 1: Read the Large Text Document

In [None]:
# Load the entire transcript into memory as one UTF-8 string.
with open('/content/DJ and Sean Barger skype transcript', encoding='utf-8') as transcript_file:
    text = transcript_file.read()

Step 2: Split the Text into Chunks

In [9]:
def split_text_into_chunks(text, max_tokens=500, tokenizer=None):
    """Group the sentences of ``text`` into chunks of roughly ``max_tokens`` tokens.

    The text is split on the literal separator ``'. '``. Sentences are then
    accumulated, in order, until adding the next one would push the running
    token count past ``max_tokens``; at that point the current chunk is closed
    and a new one is started with that sentence.

    Args:
        text: The document to split.
        max_tokens: Token budget per chunk. A single sentence that is longer
            than the budget is kept whole, so such a chunk can exceed it.
        tokenizer: Optional object with an ``encode(str)`` method returning a
            sequence of tokens. Defaults to tiktoken's ``cl100k_base`` encoding.

    Returns:
        A list of non-empty, whitespace-stripped chunk strings. Empty or
        whitespace-only input yields an empty list.
    """
    if tokenizer is None:
        tokenizer = tiktoken.get_encoding('cl100k_base')

    chunks = []
    current_chunk = ''
    current_tokens = 0

    sentences = text.split('. ')
    last_position = len(sentences) - 1

    for position, sentence in enumerate(sentences):
        # split() consumed the '. ' separators. Restore one after every piece
        # except the last, which was never followed by a separator (restoring
        # it there would invent a trailing period).
        if position < last_position:
            sentence += '. '
        if not sentence.strip():
            # Only the final piece can be blank (text ending in '. ').
            continue

        token_count = len(tokenizer.encode(sentence))

        # Close the current chunk only if it actually holds something; an
        # oversized first sentence must not produce an empty leading chunk.
        if current_chunk and current_tokens + token_count > max_tokens:
            chunks.append(current_chunk.strip())
            current_chunk = sentence
            current_tokens = token_count
        else:
            current_chunk += sentence
            current_tokens += token_count

    if current_chunk.strip():
        chunks.append(current_chunk.strip())

    return chunks

# Split the loaded transcript using the splitter's default token budget,
# then report how many chunks were produced.
chunks = split_text_into_chunks(text)
print(f"Number of chunks: {len(chunks)}")

Number of chunks: 1117


Step 3: Generate Embeddings and Build the Vector Index

In [12]:
def get_embedding(text, model="text-embedding-3-small"):
    """Request an embedding for ``text`` from the OpenAI Embedding endpoint.

    Args:
        text: Value sent to the endpoint as ``input``.
        model: Name of the embedding model to request.

    Returns:
        The ``embedding`` field of the first entry in the response's ``data`` list.
    """
    api_result = openai.Embedding.create(model=model, input=text)
    first_item = api_result['data'][0]
    return first_item['embedding']

In [13]:
# Embed every chunk in order, keeping the chunk texts aligned with their vectors
embeddings = []
chunk_texts = []

for idx in range(len(chunks)):
    chunk = chunks[idx]
    # Embed first: if the request fails, neither list gains an entry for this chunk.
    embeddings.append(get_embedding(chunk))
    chunk_texts.append(chunk)

    # Report progress every tenth chunk and once more at the very end
    reached_end = (idx + 1) == len(chunks)
    if reached_end or (idx + 1) % 10 == 0:
        print(f"Processed chunk {idx + 1}/{len(chunks)}")


Processed chunk 10/1117
Processed chunk 20/1117
Processed chunk 30/1117
Processed chunk 40/1117
Processed chunk 50/1117
Processed chunk 60/1117
Processed chunk 70/1117
Processed chunk 80/1117
Processed chunk 90/1117
Processed chunk 100/1117
Processed chunk 110/1117
Processed chunk 120/1117
Processed chunk 130/1117
Processed chunk 140/1117
Processed chunk 150/1117
Processed chunk 160/1117
Processed chunk 170/1117
Processed chunk 180/1117
Processed chunk 190/1117
Processed chunk 200/1117
Processed chunk 210/1117
Processed chunk 220/1117
Processed chunk 230/1117
Processed chunk 240/1117
Processed chunk 250/1117
Processed chunk 260/1117
Processed chunk 270/1117
Processed chunk 280/1117
Processed chunk 290/1117
Processed chunk 300/1117
Processed chunk 310/1117
Processed chunk 320/1117
Processed chunk 330/1117
Processed chunk 340/1117
Processed chunk 350/1117
Processed chunk 360/1117
Processed chunk 370/1117
Processed chunk 380/1117
Processed chunk 390/1117
Processed chunk 400/1117
Processed

In [14]:
# Stack the embedding lists into a float32 matrix, one row per chunk
embedding_matrix = np.asarray(embeddings, dtype='float32')
embedding_size = embedding_matrix.shape[1]

# Create a flat L2 index of matching dimensionality and load every vector into it
index = faiss.IndexFlatL2(embedding_size)
index.add(embedding_matrix)
print(f"FAISS index has {index.ntotal} vectors.")


FAISS index has 1117 vectors.


Step 4: Save the Index and Chunks (Optional)

In [15]:
# Persist the FAISS index to disk
faiss.write_index(index, "faiss_index.index")

# Persist the chunk texts alongside it so search hits can be mapped back to text
with open('chunk_texts.pkl', 'wb') as chunk_file:
    pickle.dump(chunk_texts, chunk_file)


Step 5: Load the Index and Chunks (Optional)

In [16]:
# Restore the FAISS index from disk
index = faiss.read_index("faiss_index.index")

# Restore the chunk texts stored next to it.
# NOTE: unpickling can execute arbitrary code — only load files you wrote yourself.
with open('chunk_texts.pkl', 'rb') as chunk_file:
    chunk_texts = pickle.load(chunk_file)


Step 6: Define the Search and Answer Generation Functions

In [17]:
def search(query, k=5):
    """Look up the indexed vectors nearest to ``query``.

    The query is embedded with ``get_embedding`` and searched against the
    module-level FAISS ``index``.

    Args:
        query: Text to search for.
        k: Maximum number of neighbours to return.

    Returns:
        A pair ``(indices, distances)`` of 1-D arrays for the single query,
        in the order FAISS returned them. Fewer than ``k`` entries are
        returned when the index holds fewer than ``k`` vectors.
    """
    query_embedding = get_embedding(query)
    # FAISS expects a 2-D float32 array: one row per query.
    query_vector = np.array([query_embedding]).astype('float32')
    distances, indices = index.search(query_vector, k)

    # When fewer than k neighbours exist, FAISS fills the remaining slots with
    # label -1. Drop them so callers never use -1 as a list index (which would
    # silently select the last element).
    found = indices[0] >= 0
    return indices[0][found], distances[0][found]

In [18]:
def generate_answer(query, k=5, model="gpt-4"):
    """Answer ``query`` from the most relevant indexed chunks.

    The ``k`` nearest chunks are retrieved with ``search``, joined into a
    context block, and sent together with the question to the chat model.

    Args:
        query: The question to answer.
        k: Number of chunks to retrieve as context.
        model: Chat model name passed to ``openai.ChatCompletion.create``.

    Returns:
        The message content of the first choice in the chat response.
    """
    indices, _ = search(query, k)

    # FAISS marks missing neighbours with -1; used as a list index that would
    # silently pull in the *last* chunk, so skip those placeholders.
    relevant_chunks = [chunk_texts[i] for i in indices if i >= 0]

    # Combine the relevant chunks
    context = "\n\n".join(relevant_chunks)

    # Construct the prompt
    prompt = f"Answer the following question using the context provided.\n\nContext:\n{context}\n\nQuestion: {query}\n\nAnswer:"

    # Ask the chat model to generate the answer
    response = openai.ChatCompletion.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant that answers questions based on the provided context."},
            {"role": "user", "content": prompt}
        ]
    )

    return response['choices'][0]['message']['content']

Step 7: Query the Document

In [21]:
# Question to put to the indexed transcript
query = "Is there any communication suggesting that Sean expected to repay the note (bridge loan) to DJ using the CPUcoin cryptocurrency?"

# Retrieve context, ask the model, and show the header and reply on separate lines
answer = generate_answer(query)
print("Response:", answer, sep="\n")

Response:
Yes, there is communication that suggests that Sean Barger expected to repay the bridge loan to DJ. In his message on 01/03/2020 19:18:25, Sean said that he would work on the term sheet that pays down the convertible note with 10% of funds raised at CPUcoin as and when the funds come in, in return for all tokens in DJ's possession.


In [28]:
# Question to put to the indexed transcript
query = "Is there any communication about Keith?"

# Retrieve context and ask the model, then print the header followed by the reply
answer = generate_answer(query)
for line in ("Response:", answer):
    print(line)

Response:
Yes, there has been extensive communication about Keith. The updates mention how he's recovering, expressing that Keith's joyful personality has won over the staff, and that he was expected to move from the ICU back to the 9th floor ward. He had regained enough energy to accept visitors and take phone calls. Later updates, however, highlighted that Keith was to be readmitted to investigate high bilirubin levels.
