# Tutorial for building a graph from a document for Q/A and Similarity

## Import Libraries

In [1]:
# for parsing documents with structure
# sudo apt install pandoc
# pip install pandoc
import pandoc 
# for embeddings DB with meta data
import pinecone
# for large language model use (davinci-003)
# this is a paid API
# pip install openai
import openai
# hugging face transformers for efficient access to trained embeddings models
from transformers import pipeline
from sentence_transformers import SentenceTransformer, CrossEncoder

# for parsing web-pages
import requests
from requests.exceptions import ConnectTimeout
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
import re
import tqdm

from pypdf import PdfReader
import nltk
nltk.download('punkt')
import os
from urllib.parse import urlparse

from tika import parser  
requests.encoding = 'ISO-8859-1'

import sys
sys.setrecursionlimit(999999)

2023-04-28 07:25:14.625462: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[nltk_data] Downloading package punkt to /home/titan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Define Inputs

In [2]:
VECTOR_INDEX_NAME = 'audio'
# we encode and insert in batches of 64
batch_size = 64

OPENAI_COMPLETION_ENGINE = 'text-davinci-003'
OPENAI_CONTEXT_LENGTH_LIMIT = 4000
EMBED_MODEL = 'multi-qa-mpnet-base-dot-v1'
EMBED_CROSS_MODEL = 'cross-encoder/ms-marco-MiniLM-L-6-v2'
READER_MODEL = 'deepset/electra-base-squad2'

openai_api_key = "sk-KrOiHnlQW3dDLnU7aE8PooCP5MGhvYzP9XX2uSEG"
pinecone_api_key = 'c65fa925-08e1-4af0-b08b-1104c6ffba25' # https://app.pinecone.io/projects
google_api_key = 'AIzaSyAIIY6OsTws8dTfoyxNmJLmnfmH2f859Fw' # https://console.cloud.google.com/apis/dashboard

tika_headers = {
            "X-Tika-OCRLanguage": "eng",
            "X-Tika-OCRTimeout": "300"
        }

web_header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
        }

## Open AI functions

In [3]:
def openai_complete(context, prompt, max_answer_len=256) -> str:
        text_context = (context[:OPENAI_CONTEXT_LENGTH_LIMIT] + '...') \
            if len(context) > OPENAI_CONTEXT_LENGTH_LIMIT else context

        openai_prompt = f"{text_context}\n\n{prompt}"

        response = openai.Completion.create(
            engine=OPENAI_COMPLETION_ENGINE,
            prompt=openai_prompt,
            temperature=0.7,
            max_tokens=max_answer_len,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0
        )
        return response.choices[0].text

def openai_summary(text, max_answer_len=255) -> str:
        if len(text) < max_answer_len:
            return text
        return openai_complete(text, "TLDR:", max_answer_len)[0]
    
def openai_question(question, context, max_answer_len=512) -> str:
        context = context[0:9000]
        openai_prompt = f"Answer the question: {question}\n\nContext: {context}\n\nAnswer:"

        response = openai.Completion.create(
            engine=OPENAI_COMPLETION_ENGINE,
            prompt=openai_prompt,
            temperature=0.7,
            max_tokens=max_answer_len,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0
        )
        return response.choices[0].text.strip()
    
def openai_question_conv(question, context, previous_context, max_answer_len=512) -> str:
        context = context[0:9000]
        openai_prompt = f"Answer the question: {question}\n\nContext: {context}\n\nConversation History: {previous_context}\n\nAnswer:"

        response = openai.Completion.create(
            engine=OPENAI_COMPLETION_ENGINE,
            prompt=openai_prompt,
            temperature=0.7,
            max_tokens=max_answer_len,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0
        )
        return response.choices[0].text.strip()

## Embedding definitions

In [4]:
retriever = SentenceTransformer(EMBED_MODEL)
EMBED_DIMENSIONS = retriever.get_sentence_embedding_dimension()

cross_encoder = CrossEncoder(EMBED_CROSS_MODEL)
reader = pipeline(tokenizer=READER_MODEL, model=READER_MODEL, task='question-answering')

openai.api_key = openai_api_key

pinecone.init(
    api_key=pinecone_api_key
)

if VECTOR_INDEX_NAME not in pinecone.list_indexes():
    pinecone.create_index(
        name=VECTOR_INDEX_NAME,
        dimension=EMBED_DIMENSIONS,
        metric='dotproduct',
    )
vector_index = pinecone.Index(VECTOR_INDEX_NAME)

## Webpage parsing

In [5]:
def clean_html(raw_html):
    # Remove extra white space
    clean_text = re.sub('\s+', ' ', raw_html).strip()
    
    # Remove special characters and symbols except for punctuation
    clean_text = re.sub('[^A-Za-z0-9\s.,?!]+', '', clean_text)
    
    return clean_text

       
def get_domain_name(url):
    parsed_uri = urlparse(url)
    return f"{parsed_uri.scheme}://{parsed_uri.netloc}"

def tokenize_sentences(text):
    """
    Tokenize sentences in the given text using the nltk library.
    
    Args:
        text (str): The input text.
    
    Returns:
        list: A list of sentence strings.
    """
    # Use the nltk library to tokenize sentences
    return nltk.sent_tokenize(text)
           
def extract_text(pdf_file):
    reader = PdfReader(pdf_file)
    
    # Initialize an empty string to store the extracted text
    text = ""
    
    # Iterate through each page of the PDF file
    for page_num in range(len(reader.pages)):
        # Extract the text from each page
        page = reader.pages[page_num]
        page_text = page.extract_text()
        
        # Concatenate the extracted text to the final string
        text += page_text
   
    # Return the extracted text
    return text

def extract_text_tika(pdf_file):
    # opening pdf file
    parsed_pdf = parser.from_file(pdf_file, requestOptions={'timeout': 600})
  
    # saving content of pdf
    # you can also bring text only, by parsed_pdf['text'] 
    # parsed_pdf['content'] returns string 
    text = parsed_pdf['content']
    # Clean the extracted text
    text = re.sub('\n+', ' ', text)  # Replace multiple carriage returns with a single space
    text = re.sub(' +', ' ', text)   # Replace multiple spaces with a single space
    # Return the extracted text
    return text.strip()
    
def download_file(url, file_name):
    # open in binary mode
    headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
        }
    response = requests.get(url, headers=headers, timeout=300)
    with open(file_name, "wb") as file:
        file.write(response.content)
    return
        
      
visited = set()
pdf_files_downloaded = set()
audio_files_downloaded = set()
image_files_downloaded = set()

def download_pdf(url, filename):
    if filename.endswith("JFK-JulOct2017Release.zip") or filename.endswith("JFK-April2018.zip") or filename.endswith("jfk2022-ocr.zip"):
        return
    try:
        response = requests.get(url, timeout=600, headers=web_header)
    except ConnectTimeout:
        print('Request time out for pdf file: ' + url)
        return False

    if response.status_code == 200:
        with open(filename, "wb") as f:
            f.write(response.content)
        return True
    else:
        print(f'Error {response.status_code} for pdf file: {url}')
    return False

def download_audio(url, filename):
    try:
        response = requests.get(url, timeout=300, headers=web_header)
    except ConnectTimeout:
        print('Request time out for audio file: ' + url)
        return False

    if response.status_code == 200:
        with open(filename, "wb") as f:
            f.write(response.content)
        return True
    else:
        print(f'Error {response.status_code} for audio file: {url}')
    return False

def download_image(url, filename):
    try:
        response = requests.get(url, timeout=120, headers=web_header)
    except ConnectTimeout:
        print('Request time out for image file: ' + url)
        return False

    if response.status_code == 200:
        with open(filename, "wb") as f:
            f.write(response.content)
        return True
    else:
        print(f'Error {response.status_code} for image file: {url}')
    return False

def crawl_and_download_pdfs(website_url, base_url):
    if website_url in visited:
        return
    page_text = []

    visited.add(website_url)

    try:
        response = requests.get(website_url, timeout=30)
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {website_url}: {e}")
        return

    try:
        soup = BeautifulSoup(response.content, "html.parser", from_encoding="iso-8859-1")
    except Exception as e:
        print(f"Error parsing {website_url}: {e}")
        return

    text = soup.get_text()
    page_text.append([clean_html(text), website_url])
    links = soup.find_all('a')

    for link in links:
        href = link.get("href")

        if not href or href.startswith("mailto") or "?" in href:
            continue

        absolute_url = urljoin(base_url, href)
        if absolute_url.startswith(base_url) or "theblackvault.com" in absolute_url:
            try:
                # Handle different file types separately
                if href.lower().endswith((".pdf", ".zip", ".srt")):
                    filename = os.path.join("pdf_downloads", href.split("/")[-1])
                    if filename not in pdf_files_downloaded:
                        if not os.path.isfile(filename):
                            print(f"Downloading {absolute_url}")
                            down = download_pdf(absolute_url, filename)
                            if down:
                                print(f"Downloaded {absolute_url}")
                                pdf_files_downloaded.add((absolute_url, filename))
                        else:
                            print(f"Added pdf {absolute_url}")
                            pdf_files_downloaded.add((absolute_url, filename))
                elif href.lower().endswith((".mp3", ".mp4", ".wav", ".avi", ".wmv")):
                    filename = os.path.join("audio_downloads", href.split("/")[-1])
                    if filename not in audio_files_downloaded:
                        if not os.path.isfile(filename):
                            print(f"Downloading {absolute_url}")
                            down = download_audio(absolute_url, filename)
                            if down:
                                print(f"Downloaded {absolute_url}")
                                audio_files_downloaded.add((absolute_url, filename))
                        else:
                            print(f"Added video {absolute_url}")
                            audio_files_downloaded.add((absolute_url, filename))
                elif href.lower().endswith((".jpg", ".gif", ".webp", ".jpeg", ".png")):
                    filename = os.path.join("image_downloads", href.split("/")[-1])
                    if filename not in image_files_downloaded:
                        if not os.path.isfile(filename):
                            print(f"Downloading {absolute_url}")
                            down = download_image(absolute_url, filename)
                            if down:
                                print(f"Downloaded {absolute_url}")
                                image_files_downloaded.add((absolute_url, filename))
                        else:
                            print(f"Added image {absolute_url}")
                            image_files_downloaded.add((absolute_url, filename))
                else:
                    crawl_and_download_pdfs(absolute_url, base_url)
            except Exception as e:
                print(f"Error downloading {absolute_url}: {e}")
                continue

    return page_text, pdf_files_downloaded, audio_files_downloaded, image_files_downloaded

## Text

In [6]:
def append_text_to_file(file_path, text_to_append):
    with open(file_path, 'a', encoding='utf-8') as file:
        file.write(text_to_append)
    return

## Crawl the websites to index text and save PDF files

In [None]:
#domains = [('https://www.theblackvault.com/documentarchive/', 'Black Vault')]
domains = [('https://www.theblackvault.com/documentarchive/', 'Black Vault')]

for domain in domains:
    channel_name = domain[1]
    base_url = get_domain_name(domain[0])
    
    if not os.path.exists("pdf_downloads"):
        os.makedirs("pdf_downloads")
        
    if not os.path.exists("audio_downloads"):
        os.makedirs("audio_downloads")

    if not os.path.exists("image_downloads"):
        os.makedirs("image_downloads")        
        
    page_text, pdf_files, audio_files, image_files = crawl_and_download_pdfs(domain[0], base_url)

Added image https://www.theblackvault.com/community/wp-content/uploads/2020/06/unnamed-file.jpg
Added image https://insidetheblackvault.com/wp-content/uploads/2019/01/greenewald.png
Added image https://insidetheblackvault.com/wp-content/uploads/2019/01/doublesided.png
Added image https://insidetheblackvault.com/wp-content/uploads/2019/01/cover-coin-e1546648303905.png
Added image https://insidetheblackvault.com/wp-content/uploads/2019/01/2019-01-03_16-16-59.jpg
Added image https://insidetheblackvault.com/wp-content/uploads/2019/01/cover-e1546531327405.png
Added image https://insidetheblackvault.com/wp-content/uploads/2019/01/book-money-second.png
Added image https://insidetheblackvault.com/wp-content/uploads/2019/01/greenewald.png
Added image https://insidetheblackvault.com/wp-content/uploads/2019/01/doublesided.png
Added image https://www.theblackvault.com/documentarchive/wp-content/uploads/2023/04/3.jpg
Added image https://www.theblackvault.com/documentarchive/wp-content/uploads/2022/

Added image https://www.theblackvault.com/documentarchive/wp-content/uploads/2021/12/unnamed-file-3.jpg
Added pdf https://documents2.theblackvault.com/documents/navy/DON-NAVY-2022-000883.pdf
Added pdf https://documents2.theblackvault.com/documents/navy/DON-NAVY-2022-000883.pdf
Added pdf https://documents2.theblackvault.com/documents/navy/DON-NAVY-2022-000883.pdf
Added image https://www.theblackvault.com/documentarchive/wp-content/uploads/2022/09/b-1.jpg
Added video https://www.theblackvault.com/documentarchive/podcast-player/16719/ep-110-chris-rutkowski-on-canadas-ufos-government-ufo-consultation-and-nearly-five-decades-of-research.mp3
Added pdf https://www.theblackvault.com/documentarchive/wp-content/uploads/2022/09/TBVR-Ep110_otter_ai.srt
Added image https://www.theblackvault.com/documentarchive/wp-content/uploads/2022/08/unnamed-file-2.jpg
Added video https://www.theblackvault.com/documentarchive/podcast-player/16702/ep-109-the-reaction-to-my-deep-dive-into-luis-elizondos-ig-complai

Added image https://www.theblackvault.com/documentarchive/wp-content/uploads/2022/08/unnamed-file.jpg
Added video https://www.theblackvault.com/documentarchive/podcast-player/16680/ep-107-the-hottel-memo-finally-revealed-in-full-so-whats-it-say.mp3
Added pdf https://www.theblackvault.com/documentarchive/wp-content/uploads/2022/08/TBVR-Ep107_otter_ai.srt
Added image https://www.theblackvault.com/documentarchive/wp-content/uploads/2020/10/b-1.jpg
Added image https://www.theblackvault.com/documentarchive/wp-content/uploads/2020/10/10-5-2020-6-38-15-AM.jpg
Added image https://www.theblackvault.com/documentarchive/wp-content/uploads/2018/09/2018-09-11_8-43-51.png
Added pdf https://documents.theblackvault.com/documents/fbifiles/gangsterera/067E-HQ-10565.pdf
Added pdf https://documents.theblackvault.com/documents/fbifiles/historical/theodoregunderson-fbi1.pdf
Added pdf https://documents2.theblackvault.com/documents/fbifiles/fbi/tedgunderson-fbi2.pdf
Added image https://www.theblackvault.com/d

Added image https://www.theblackvault.com/documentarchive/wp-content/uploads/2022/07/a-2.jpg
Added video https://www.theblackvault.com/documentarchive/podcast-player/16621/ep-105-exploring-the-dods-new-all-domain-anomaly-resolution-office.mp3
Added pdf https://www.theblackvault.com/documentarchive/wp-content/uploads/2022/07/TBVR-Ep105_otter_ai.srt
Added image https://www.theblackvault.com/documentarchive/wp-content/uploads/2022/07/a-2.jpg
Added image https://www.theblackvault.com/documentarchive/wp-content/uploads/2022/07/7-20-2022-2-16-04-PM.jpg
Added image https://www.theblackvault.com/documentarchive/wp-content/uploads/2022/07/7-20-2022-2-16-20-PM.jpg
Added image https://www.theblackvault.com/documentarchive/wp-content/uploads/2022/07/7-20-2022-2-18-59-PM.jpg
Added image https://www.theblackvault.com/documentarchive/wp-content/uploads/2022/07/7-20-2022-2-19-12-PM.jpg
Added image https://www.theblackvault.com/documentarchive/wp-content/uploads/2022/07/Dr.-Sean-M.-Kirkpatrick-EM.jpg
A

Added image https://www.theblackvault.com/documentarchive/wp-content/uploads/2015/03/John_Edwards_Pittsburgh_2007-scaled.jpg
Added pdf https://documents.theblackvault.com/documents/financial/Edwards.pdf
Added pdf https://documents.theblackvault.com/documents/financial/Edwards.pdf
Added image https://www.theblackvault.com/documentarchive/wp-content/uploads/2016/08/2016-08-16_14-25-24.jpg
Added pdf https://documents.theblackvault.com/documents/nro/F-2018-00116.pdf
Added pdf https://documents.theblackvault.com/documents/nro/F-2018-00084.pdf
Added pdf https://documents.theblackvault.com/documents/nro/NRO-BudgetJustification-2014.pdf
Added pdf https://documents.theblackvault.com/documents/nro/nrobudjetjustification-2013.pdf
Added pdf https://documents.theblackvault.com/documents/nro/fy2010cbjb.pdf
Added pdf https://documents.theblackvault.com/documents/nro/https://documents.theblackvault.com/documents/nro/fy2010cbjb.pdf
Added pdf https://documents.theblackvault.com/documents/nro/F-2016-0013

## Index the PDF files

In [11]:
fileName = "train.txt"
channel_name = 'Black Vault'
for pdf_data in pdf_files:
    # set window (length of text chunk) and stride
    window = 1
    stride = 1  # smaller stride creates overlap
    
    data = []
    results = []
    
    new_data = []
    
    window = 6  # number of sentences to combine
    stride = 3  # number of sentences to 'stride' over, used to create overlap
    
    text = ""
    try:
        if not os.path.isfile("./"+filename):
            continue
        print(f"processing file {pdf_data[1]}")
        text = tokenize_sentences(extract_text_tika(pdf_data[1]))
        #print(text)
        append_text_to_file("./"+fileName, text)
    except:
        print(f"error reading {pdf_data[1]}")
        continue
    if text[0].startswith("PDF"):
        print('skipping PDF')
        continue
    else:
        new_data.append({
            'text': text,
            'id': pdf_data[0],
            'url': pdf_data[0],
            "name":channel_name,
            "title":channel_name,
        })
        
    # loop through in batches of 64
    index = pinecone.Index(VECTOR_INDEX_NAME)
    
    for j in range(0, len(new_data), batch_size):
        if len(new_data) > 0:
            # find end position of batch (for when we hit end of data)
            j_end = min(len(new_data), j+batch_size)
            #print(j, j_end, len(new_data))
            try:
                # extract the metadata like text, start/end positions, etc
                batch_meta = [{
                    "text": new_data[x]["text"],
                    "url": new_data[x]["url"],
                    "name": new_data[x]["name"],
                    "title": new_data[x]["title"]
                } for x in range(j, j_end)]
                # extract only text to be encoded by embedding model
                batch_text = [row['text'] for row in new_data[j:j_end]]
                # create the embedding vectors
                batch_embeds = retriever.encode(batch_text).tolist()
                # extract IDs to be attached to each embedding and metadata
                batch_ids = [row['id'] for row in new_data[j:j_end]]
                # 'upsert' (eg insert) IDs, embeddings, and metadata to index
                to_upsert = list(zip(batch_ids, batch_embeds, batch_meta))
                index.upsert(to_upsert)
            except:
                continue

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [10]:
pdf_files

{('https://www.theblackvault.com/docid-32365377.pdf',
  'pdf_downloads/docid-32365377.pdf'),
 ('https://www.theblackvault.com/ARRBmails09733.pdf',
  'pdf_downloads/ARRBmails09733.pdf'),
 ('https://www.theblackvault.com/FOIA%20Backlog%20Report-%2005-11-2015_Redacted.pdf',
  'pdf_downloads/FOIA%20Backlog%20Report-%2005-11-2015_Redacted.pdf'),
 ('https://www.theblackvault.com/ARRBmails04367.pdf',
  'pdf_downloads/ARRBmails04367.pdf'),
 ('https://www.theblackvault.com/ARRBmails06062.pdf',
  'pdf_downloads/ARRBmails06062.pdf'),
 ('https://www.theblackvault.com/ARRBmails03127.pdf',
  'pdf_downloads/ARRBmails03127.pdf'),
 ('https://www.theblackvault.com/docid-32327810.pdf',
  'pdf_downloads/docid-32327810.pdf'),
 ('https://www.theblackvault.com/docid-32263513.pdf',
  'pdf_downloads/docid-32263513.pdf'),
 ('https://www.theblackvault.com/104-10010-10057_1.pdf',
  'pdf_downloads/104-10010-10057_1.pdf'),
 ('https://www.theblackvault.com/docid-32293105.pdf',
  'pdf_downloads/docid-32293105.pdf'),


## Query pinecone index for answer to question

In [60]:
# Define the query or question to ask
query = "How can Sterling and Rose help organizations setup DAOs?"
# Create and embedding representing the question
xq = retriever.encode(query).tolist()
# Search the index for the top (k) answers 
results = vector_index.query(xq, top_k=10, include_metadata=True)
print(results)

{'matches': [{'id': 'https://stirlingandrose.com/category/web3-metaverse/daos/',
              'metadata': {'name': 'Sterling and Rose',
                           'text': ['DAOs  Stirling  Rose Skip to content Home '
                                    'Expertise Fellowship People Insights Menu '
                                    'Home Expertise Fellowship People Insights '
                                    'Category DAOs DAOs, Web 3.0  Metaverse '
                                    'Bodies Without Organs Can a DAO that is '
                                    'run entirely on smart contracts raise the '
                                    'possibility that data structures could do '
                                    'the functions of companies?',
                                    'Dr Jason Allen31 August 2022 About Us '
                                    'Terms of Use Privacy Policy Contact Us '
                                    'Menu About Us Terms of Use Privacy Policy 

In [61]:
context = ""
for result in results['matches']:
    for line in result['metadata']['text']:
        context += line+" "
openai_answser = openai_question(query, context)

In [62]:
openai_answser

'Sterling and Rose can help organizations setup DAOs by providing legal advice and guidance on the complex regulations and legal frameworks surrounding DAOs. They can also provide support with the identification of potential risks and liabilities, the creation of legal contracts, and the management of data rights. Additionally, they can help organizations with the development of AOs (autonomous organisations) and their integration into existing legal frameworks.'

In [66]:
# Define the query or question to ask
query = "What are the benefits of Sterling and Rose helping your organization setup a DAO?"
# Create and embedding representing the question
xq = retriever.encode(query).tolist()
# Search the index for the top (k) answers 
results = vector_index.query(xq, top_k=10, include_metadata=True)
context = ""
for result in results['matches']:
    for line in result['metadata']['text']:
        context += line+" "
openai_answer = openai_question(query, context)
openai_answer

'The benefits of Sterling and Rose helping your organization set up a DAO include cost reduction, access to a broader user base for feedback, increased positive feedback from outside groups, legal accountability for AOs, legal personality to protect internal and external stakeholders, and a balance between recognizing the separate functionality of AOs and ensuring humans are held accountable for their actions and decisions related to the AO.'

In [67]:
previous_context = openai_summary(query+" "+context+" "+openai_answer)

In [69]:
# Define the conversational query or question to ask
query = "Explain the detailed benefits for an organizations to have a DAO in partnership with Sterling and Rose, \
    with emphasis on why Sterling and Rose would be a good partner"
# Create and embedding representing the question
xq = retriever.encode(query).tolist()
# Search the index for the top (k) answers 
results = vector_index.query(xq, top_k=10, include_metadata=True)
context = ""
for result in results['matches']:
    for line in result['metadata']['text']:
        context += line+" "
openai_answer = openai_question_conv(query, context, previous_context)
openai_answer

'Partnering with Sterling and Rose can provide organizations with a number of detailed benefits. Sterling and Rose is a law firm that specializes in legal issues related to Artificial Intelligence, data structures and Decentralized Autonomous Organizations (DAOs). As such, they can provide organizations with legal expertise on how to use DAOs to operate efficiently and securely. Additionally, Sterling and Rose can provide organizations with valuable insight into the laws and regulations surrounding DAOs and the data structures that power them. By partnering with Sterling and Rose, organizations can ensure they are compliant with the relevant laws and regulations and can access the legal advice and guidance necessary to ensure the success of their DAOs. Furthermore, partnering with Sterling and Rose can help organizations build relationships with other organizations and stakeholders in the DAO space and provide them with access to a network of potential partners and collaborators. Final

In [70]:
previous_context = openai_summary(query+" "+context+" "+openai_answer)
# Define the conversational query or question to ask
query = "How do I get started with Sterling and Rose to setup a DAO, who do I contact"
# Create and embedding representing the question
xq = retriever.encode(query).tolist()
# Search the index for the top (k) answers 
results = vector_index.query(xq, top_k=10, include_metadata=True)
context = ""
for result in results['matches']:
    for line in result['metadata']['text']:
        context += line+" "
openai_answer = openai_question_conv(query, context, previous_context)
openai_answer

'To get started with Sterling and Rose to setup a DAO, you can contact their team at infostirlingandrose.com or 1800 178 218. You can also follow them on LinkedIn to stay up to date with their services.'