# Tutorial for building a graph from a document for Q/A and Similarity

## Import Libraries

In [36]:
# for parsing documents with structure
# sudo apt install pandoc
# pip install pandoc
import pandoc 
# for embeddings DB with meta data
import pinecone
# for large language model use (davinci-003)
# this is a paid API
# pip install openai
import openai
# hugging face transformers for efficient access to trained embeddings models
from transformers import pipeline
from sentence_transformers import SentenceTransformer, CrossEncoder

# for parsing web-pages
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import re
import tqdm

from pypdf import PdfReader
import nltk
nltk.download('punkt')
import os
from urllib.parse import urlparse

from tika import parser  

[nltk_data] Downloading package punkt to /home/titan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Define Inputs

In [52]:
VECTOR_INDEX_NAME = 'audio'
# we encode and insert in batches of 64
batch_size = 64

OPENAI_COMPLETION_ENGINE = 'text-davinci-003'
OPENAI_CONTEXT_LENGTH_LIMIT = 4000
EMBED_MODEL = 'multi-qa-mpnet-base-dot-v1'
EMBED_CROSS_MODEL = 'cross-encoder/ms-marco-MiniLM-L-6-v2'
READER_MODEL = 'deepset/electra-base-squad2'

openai_api_key = "sk-KrOiHnlQW3dDLnU7aE8PooCP5MGhvYzP9XX2uSEG"
pinecone_api_key = 'c65fa925-08e1-4af0-b08b-1104c6ffba25' # https://app.pinecone.io/projects
google_api_key = 'AIzaSyAIIY6OsTws8dTfoyxNmJLmnfmH2f859Fw' # https://console.cloud.google.com/apis/dashboard

tika_headers = {
            "X-Tika-OCRLanguage": "eng",
            "X-Tika-OCRTimeout": "300"
        }

## Open AI functions

In [68]:
def openai_complete(context, prompt, max_answer_len=256) -> str:
        text_context = (context[:OPENAI_CONTEXT_LENGTH_LIMIT] + '...') \
            if len(context) > OPENAI_CONTEXT_LENGTH_LIMIT else context

        openai_prompt = f"{text_context}\n\n{prompt}"

        response = openai.Completion.create(
            engine=OPENAI_COMPLETION_ENGINE,
            prompt=openai_prompt,
            temperature=0.7,
            max_tokens=max_answer_len,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0
        )
        return response.choices[0].text

def openai_summary(text, max_answer_len=255) -> str:
        if len(text) < max_answer_len:
            return text
        return openai_complete(text, "TLDR:", max_answer_len)[0]
    
def openai_question(question, context, max_answer_len=512) -> str:
        context = context[0:9000]
        openai_prompt = f"Answer the question: {question}\n\nContext: {context}\n\nAnswer:"

        response = openai.Completion.create(
            engine=OPENAI_COMPLETION_ENGINE,
            prompt=openai_prompt,
            temperature=0.7,
            max_tokens=max_answer_len,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0
        )
        return response.choices[0].text.strip()
    
def openai_question_conv(question, context, previous_context, max_answer_len=512) -> str:
        context = context[0:9000]
        openai_prompt = f"Answer the question: {question}\n\nContext: {context}\n\nConversation History: {previous_context}\n\nAnswer:"

        response = openai.Completion.create(
            engine=OPENAI_COMPLETION_ENGINE,
            prompt=openai_prompt,
            temperature=0.7,
            max_tokens=max_answer_len,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0
        )
        return response.choices[0].text.strip()

## Embedding definitions

In [4]:
retriever = SentenceTransformer(EMBED_MODEL)
EMBED_DIMENSIONS = retriever.get_sentence_embedding_dimension()

cross_encoder = CrossEncoder(EMBED_CROSS_MODEL)
reader = pipeline(tokenizer=READER_MODEL, model=READER_MODEL, task='question-answering')

openai.api_key = openai_api_key

pinecone.init(
    api_key=pinecone_api_key
)

if VECTOR_INDEX_NAME not in pinecone.list_indexes():
    pinecone.create_index(
        name=VECTOR_INDEX_NAME,
        dimension=EMBED_DIMENSIONS,
        metric='dotproduct',
    )
vector_index = pinecone.Index(VECTOR_INDEX_NAME)

## Webpage parsing

In [57]:
def clean_html(raw_html):
    # Remove extra white space
    clean_text = re.sub('\s+', ' ', raw_html).strip()
    
    # Remove special characters and symbols except for punctuation
    clean_text = re.sub('[^A-Za-z0-9\s.,?!]+', '', clean_text)
    
    return clean_text

def crawl_domain(url):
    # Parse the base URL
    base_url = urlparse(url).scheme + '://' + urlparse(url).hostname
    
    # Initialize a set to store visited URLs
    visited_urls = set()
    
    # Initialize a list to store text information
    text_info = []
    pdf_files = []
    
    # Define a recursive function to crawl links within the domain
    def crawl(url):
        # Check if the URL has already been visited
        if url in visited_urls:
            return
        
        # Add the URL to the visited set
        visited_urls.add(url)
        
        # Define headers to make the request look more like a legitimate request from a web browser
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
        }
        
        # Make a request to the URL with headers
        response = requests.get(url, headers=headers)
        
        # Parse the HTML content with BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Extract the text information from the page
        text = soup.get_text()
        
        # Clean the raw html text
        text = clean_html(text)
        
        # Append the text information to the list
        if text.startswith("PDF"):
            a = urlparse(url)
            filename = os.path.basename(a.path)
            print("Downloading file: ", filename)
            download_file(url, filename)
            print("Finished downloading file: ", filename)
            pdf_files.append((url, filename))
        else:
            text_info.append((url, text))
        
        # Find all links on the page
        links = soup.find_all('a')
        
        # Recursively crawl links within the domain
        for link in links:
            href = link.get('href')
            if href and base_url in href:
                crawl(href)
    
    # Start crawling from the base URL
    crawl(base_url)
    
    # Return the text information
    return text_info, pdf_files

def tokenize_sentences(text):
    """
    Tokenize sentences in the given text using the nltk library.
    
    Args:
        text (str): The input text.
    
    Returns:
        list: A list of sentence strings.
    """
    # Use the nltk library to tokenize sentences
    return nltk.sent_tokenize(text)
           
def extract_text(pdf_file):
    reader = PdfReader(pdf_file)
    
    # Initialize an empty string to store the extracted text
    text = ""
    
    # Iterate through each page of the PDF file
    for page_num in range(len(reader.pages)):
        # Extract the text from each page
        page = reader.pages[page_num]
        page_text = page.extract_text()
        
        # Concatenate the extracted text to the final string
        text += page_text
   
    # Return the extracted text
    return text

def extract_text_tika(pdf_file):
    # opening pdf file
    parsed_pdf = parser.from_file(pdf_file, requestOptions={'timeout': 600})
  
    # saving content of pdf
    # you can also bring text only, by parsed_pdf['text'] 
    # parsed_pdf['content'] returns string 
    text = parsed_pdf['content']
    # Clean the extracted text
    text = re.sub('\n+', ' ', text)  # Replace multiple carriage returns with a single space
    text = re.sub(' +', ' ', text)   # Replace multiple spaces with a single space
    # Return the extracted text
    return text.strip()
    
def download_file(url, file_name):
    # open in binary mode
    headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
        }
    response = requests.get(url, headers=headers)
    with open(file_name, "wb") as file:
        file.write(response.content)

## Crawl the websites to index text and save PDF files

In [19]:
domains = [('https://stirlingandrose.com/', 'Sterling and Rose')]

for domain in domains:
    channel_name = domain[1]
    text_info, pdf_files = crawl_domain(domain[0])
    for scrubed_data in text_info:
        # set window (length of text chunk) and stride
        window = 1
        stride = 1  # smaller stride creates overlap
        
        data = []
        results = []
        
        new_data = []
        
        window = 6  # number of sentences to combine
        stride = 3  # number of sentences to 'stride' over, used to create overlap
        
        text = tokenize_sentences(scrubed_data[1])
        if text[0].startswith("PDF"):
            print('skipping PDF')
        else:
            new_data.append({
                'text': text,
                'id': scrubed_data[0],
                'url': scrubed_data[0],
                "name":channel_name,
                "title":"channel_name",
            })
            
        # loop through in batches of 64
        index = pinecone.Index(VECTOR_INDEX_NAME)
        
        for j in range(0, len(new_data), batch_size):
            if len(new_data) > 0:
                # find end position of batch (for when we hit end of data)
                j_end = min(len(new_data), j+batch_size)
                print(j, j_end, len(new_data))
                try:
                    # extract the metadata like text, start/end positions, etc
                    batch_meta = [{
                        "text": new_data[x]["text"],
                        "url": new_data[x]["url"],
                        "name": new_data[x]["name"],
                        "title": new_data[x]["title"]
                    } for x in range(j, j_end)]
                    # extract only text to be encoded by embedding model
                    batch_text = [row['text'] for row in new_data[j:j_end]]
                    # create the embedding vectors
                    batch_embeds = retriever.encode(batch_text).tolist()
                    # extract IDs to be attached to each embedding and metadata
                    batch_ids = [row['id'] for row in new_data[j:j_end]]
                    # 'upsert' (eg insert) IDs, embeddings, and metadata to index
                    to_upsert = list(zip(batch_ids, batch_embeds, batch_meta))
                    index.upsert(to_upsert)
                except:
                    continue

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Downloading file:  23.03.10-Stirling-Rose-Submission-to-Token-Mapping-Consultation.pdf
Finished downloading file:  23.03.10-Stirling-Rose-Submission-to-Token-Mapping-Consultation.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Downloading file:  FINAL-Stirling-Rose-Submission-Automated-Decision-Making-and-AI-Regulation-2022.pdf
Finished downloading file:  FINAL-Stirling-Rose-Submission-Automated-Decision-Making-and-AI-Regulation-2022.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Downloading file:  FINAL-Stirling-Rose-Submission-Automated-Decision-Making-and-AI-Regulation-2022.pdf
Finished downloading file:  FINAL-Stirling-Rose-Submission-Automated-Decision-Making-and-AI-Regulation-2022.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Downloading file:  25.19.01-UK-Law-Commision-DAO-Submissionx.pdf
Finished downloading file:  25.19.01-UK-Law-Commision-DAO-Submissionx.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Downloading file:  05.12.22-Digital-Assets-UK-Submission.pdf
Finished downloading file:  05.12.22-Digital-Assets-UK-Submission.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Downloading file:  22.10.24_Stirling-Rose_Submission_Digital-Assets-Market-Regulation-Bill_Consultation.pdf
Finished downloading file:  22.10.24_Stirling-Rose_Submission_Digital-Assets-Market-Regulation-Bill_Consultation.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Downloading file:  Stirling-Rose-CASSPr-Submission-June-2022-1.pdf
Finished downloading file:  Stirling-Rose-CASSPr-Submission-June-2022-1.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Downloading file:  SR-Digital-Assets-Primer-Summer-23.pdf
Finished downloading file:  SR-Digital-Assets-Primer-Summer-23.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Downloading file:  SLC-Primer.pdf
Finished downloading file:  SLC-Primer.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Downloading file:  Dig-Primer-v3.11.pdf
Finished downloading file:  Dig-Primer-v3.11.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Downloading file:  Digital-Assets-Primer-Spring-2022.pdf
Finished downloading file:  Digital-Assets-Primer-Spring-2022.pdf
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1


## Index the PDF files

In [58]:
channel_name = 'Sterling and Rose'
for pdf_data in pdf_files:
    # set window (length of text chunk) and stride
    window = 1
    stride = 1  # smaller stride creates overlap
    
    data = []
    results = []
    
    new_data = []
    
    window = 6  # number of sentences to combine
    stride = 3  # number of sentences to 'stride' over, used to create overlap
    
    text = tokenize_sentences(extract_text_tika(pdf_data[1]))
    if text[0].startswith("PDF"):
        print('skipping PDF')
    else:
        new_data.append({
            'text': text,
            'id': pdf_data[0],
            'url': pdf_data[0],
            "name":channel_name,
            "title":channel_name,
        })
        
    # loop through in batches of 64
    index = pinecone.Index(VECTOR_INDEX_NAME)
    
    for j in range(0, len(new_data), batch_size):
        if len(new_data) > 0:
            # find end position of batch (for when we hit end of data)
            j_end = min(len(new_data), j+batch_size)
            print(j, j_end, len(new_data))
            try:
                # extract the metadata like text, start/end positions, etc
                batch_meta = [{
                    "text": new_data[x]["text"],
                    "url": new_data[x]["url"],
                    "name": new_data[x]["name"],
                    "title": new_data[x]["title"]
                } for x in range(j, j_end)]
                # extract only text to be encoded by embedding model
                batch_text = [row['text'] for row in new_data[j:j_end]]
                # create the embedding vectors
                batch_embeds = retriever.encode(batch_text).tolist()
                # extract IDs to be attached to each embedding and metadata
                batch_ids = [row['id'] for row in new_data[j:j_end]]
                # 'upsert' (eg insert) IDs, embeddings, and metadata to index
                to_upsert = list(zip(batch_ids, batch_embeds, batch_meta))
                index.upsert(to_upsert)
            except:
                continue

0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1


## Query pinecone index for answer to question

In [60]:
# Define the query or question to ask
query = "How can Sterling and Rose help organizations setup DAOs?"
# Create and embedding representing the question
xq = retriever.encode(query).tolist()
# Search the index for the top (k) answers 
results = vector_index.query(xq, top_k=10, include_metadata=True)
print(results)

{'matches': [{'id': 'https://stirlingandrose.com/category/web3-metaverse/daos/',
              'metadata': {'name': 'Sterling and Rose',
                           'text': ['DAOs  Stirling  Rose Skip to content Home '
                                    'Expertise Fellowship People Insights Menu '
                                    'Home Expertise Fellowship People Insights '
                                    'Category DAOs DAOs, Web 3.0  Metaverse '
                                    'Bodies Without Organs Can a DAO that is '
                                    'run entirely on smart contracts raise the '
                                    'possibility that data structures could do '
                                    'the functions of companies?',
                                    'Dr Jason Allen31 August 2022 About Us '
                                    'Terms of Use Privacy Policy Contact Us '
                                    'Menu About Us Terms of Use Privacy Policy 

In [61]:
context = ""
for result in results['matches']:
    for line in result['metadata']['text']:
        context += line+" "
openai_answser = openai_question(query, context)

In [62]:
openai_answser

'Sterling and Rose can help organizations setup DAOs by providing legal advice and guidance on the complex regulations and legal frameworks surrounding DAOs. They can also provide support with the identification of potential risks and liabilities, the creation of legal contracts, and the management of data rights. Additionally, they can help organizations with the development of AOs (autonomous organisations) and their integration into existing legal frameworks.'

In [66]:
# Define the query or question to ask
query = "What are the benefits of Sterling and Rose helping your organization setup a DAO?"
# Create and embedding representing the question
xq = retriever.encode(query).tolist()
# Search the index for the top (k) answers 
results = vector_index.query(xq, top_k=10, include_metadata=True)
context = ""
for result in results['matches']:
    for line in result['metadata']['text']:
        context += line+" "
openai_answer = openai_question(query, context)
openai_answer

'The benefits of Sterling and Rose helping your organization set up a DAO include cost reduction, access to a broader user base for feedback, increased positive feedback from outside groups, legal accountability for AOs, legal personality to protect internal and external stakeholders, and a balance between recognizing the separate functionality of AOs and ensuring humans are held accountable for their actions and decisions related to the AO.'

In [67]:
previous_context = openai_summary(query+" "+context+" "+openai_answer)

In [69]:
# Define the conversational query or question to ask
query = "Explain the detailed benefits for an organizations to have a DAO in partnership with Sterling and Rose, \
    with emphasis on why Sterling and Rose would be a good partner"
# Create and embedding representing the question
xq = retriever.encode(query).tolist()
# Search the index for the top (k) answers 
results = vector_index.query(xq, top_k=10, include_metadata=True)
context = ""
for result in results['matches']:
    for line in result['metadata']['text']:
        context += line+" "
openai_answer = openai_question_conv(query, context, previous_context)
openai_answer

'Partnering with Sterling and Rose can provide organizations with a number of detailed benefits. Sterling and Rose is a law firm that specializes in legal issues related to Artificial Intelligence, data structures and Decentralized Autonomous Organizations (DAOs). As such, they can provide organizations with legal expertise on how to use DAOs to operate efficiently and securely. Additionally, Sterling and Rose can provide organizations with valuable insight into the laws and regulations surrounding DAOs and the data structures that power them. By partnering with Sterling and Rose, organizations can ensure they are compliant with the relevant laws and regulations and can access the legal advice and guidance necessary to ensure the success of their DAOs. Furthermore, partnering with Sterling and Rose can help organizations build relationships with other organizations and stakeholders in the DAO space and provide them with access to a network of potential partners and collaborators. Final

In [70]:
previous_context = openai_summary(query+" "+context+" "+openai_answer)
# Define the conversational query or question to ask
query = "How do I get started with Sterling and Rose to setup a DAO, who do I contact"
# Create and embedding representing the question
xq = retriever.encode(query).tolist()
# Search the index for the top (k) answers 
results = vector_index.query(xq, top_k=10, include_metadata=True)
context = ""
for result in results['matches']:
    for line in result['metadata']['text']:
        context += line+" "
openai_answer = openai_question_conv(query, context, previous_context)
openai_answer

'To get started with Sterling and Rose to setup a DAO, you can contact their team at infostirlingandrose.com or 1800 178 218. You can also follow them on LinkedIn to stay up to date with their services.'