# Tutorial for building a graph from a document for Q/A and Similarity

## Import Libraries

In [1]:
# for parsing documents with structure
# sudo apt install pandoc
# pip install pandoc
import pandoc 
# for embeddings DB with meta data
import pinecone
# for large language model use (davinci-003)
# this is a paid API
# pip install openai
import openai
# hugging face transformers for efficient access to trained embeddings models
from transformers import pipeline
from sentence_transformers import SentenceTransformer, CrossEncoder

# for parsing web-pages
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import re
import tqdm

import PyPDF2
import nltk
nltk.download('punkt')

2023-03-25 23:58:08.152795: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-25 23:58:09.136710: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-03-25 23:58:09.136849: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory
[nltk_data] Downloading package punkt to /home/titan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Define Inputs

In [2]:
VECTOR_INDEX_NAME = 'audio'
# we encode and insert in batches of 64
batch_size = 64

OPENAI_COMPLETION_ENGINE = 'text-davinci-003'
OPENAI_CONTEXT_LENGTH_LIMIT = 4000
EMBED_MODEL = 'multi-qa-mpnet-base-dot-v1'
EMBED_CROSS_MODEL = 'cross-encoder/ms-marco-MiniLM-L-6-v2'
READER_MODEL = 'deepset/electra-base-squad2'

openai_api_key = "sk-KrOiHnlQW3dDLnU7aE8PooCP5MGhvYzP9XX2uSEG"
pinecone_api_key = 'c65fa925-08e1-4af0-b08b-1104c6ffba25' # https://app.pinecone.io/projects
google_api_key = 'AIzaSyAIIY6OsTws8dTfoyxNmJLmnfmH2f859Fw' # https://console.cloud.google.com/apis/dashboard

## Open AI functions

In [3]:
def openai_complete(context, prompt, max_answer_len=256) -> str:
        text_context = (context[:OPENAI_CONTEXT_LENGTH_LIMIT] + '...') \
            if len(context) > OPENAI_CONTEXT_LENGTH_LIMIT else context

        openai_prompt = f"{text_context}\n\n{prompt}"

        response = openai.Completion.create(
            engine=OPENAI_COMPLETION_ENGINE,
            prompt=openai_prompt,
            temperature=0.7,
            max_tokens=max_answer_len,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0
        )
        return response.choices[0].text

def openai_summary(text, max_answer_len=255) -> str:
        if len(text) < max_answer_len:
            return text
        return openai_complete(text, "TLDR:", max_answer_len)[0]
    
def openai_question(question, context, max_answer_len=512) -> str:
        context = context[0:9000]
        openai_prompt = f"Answer the question: {question}\n\nContext: {context}\n\nAnswer:"

        response = openai.Completion.create(
            engine=OPENAI_COMPLETION_ENGINE,
            prompt=openai_prompt,
            temperature=0.7,
            max_tokens=max_answer_len,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0
        )
        return response.choices[0].text.strip()

## Embedding definitions

In [4]:
retriever = SentenceTransformer(EMBED_MODEL)
EMBED_DIMENSIONS = retriever.get_sentence_embedding_dimension()

cross_encoder = CrossEncoder(EMBED_CROSS_MODEL)
reader = pipeline(tokenizer=READER_MODEL, model=READER_MODEL, task='question-answering')

openai.api_key = openai_api_key

pinecone.init(
    api_key=pinecone_api_key
)

if VECTOR_INDEX_NAME not in pinecone.list_indexes():
    pinecone.create_index(
        name=VECTOR_INDEX_NAME,
        dimension=EMBED_DIMENSIONS,
        metric='dotproduct',
    )
vector_index = pinecone.Index(VECTOR_INDEX_NAME)

## Webpage parsing

In [75]:
def clean_html(raw_html):
    # Remove extra white space
    clean_text = re.sub('\s+', ' ', raw_html).strip()
    
    # Remove special characters and symbols except for punctuation
    clean_text = re.sub('[^A-Za-z0-9\s.,?!]+', '', clean_text)
    
    return clean_text

def crawl_domain(url):
    # Parse the base URL
    base_url = urlparse(url).scheme + '://' + urlparse(url).hostname
    
    # Initialize a set to store visited URLs
    visited_urls = set()
    
    # Initialize a list to store text information
    text_info = []
    
    # Define a recursive function to crawl links within the domain
    def crawl(url):
        # Check if the URL has already been visited
        if url in visited_urls:
            return
        
        # Add the URL to the visited set
        visited_urls.add(url)
        
        # Define headers to make the request look more like a legitimate request from a web browser
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
        }
        
        # Make a request to the URL with headers
        response = requests.get(url, headers=headers)
        
        # Parse the HTML content with BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Extract the text information from the page
        text = soup.get_text()
        
        # Clean the raw html text
        text = clean_html(text)
        
        # Append the text information to the list
        text_info.append((url, text))
        
        # Find all links on the page
        links = soup.find_all('a')
        
        # Recursively crawl links within the domain
        for link in links:
            href = link.get('href')
            if href and base_url in href:
                crawl(href)
    
    # Start crawling from the base URL
    crawl(base_url)
    
    # Return the text information
    return text_info

def tokenize_sentences(text):
    """
    Tokenize sentences in the given text using the nltk library.
    
    Args:
        text (str): The input text.
    
    Returns:
        list: A list of sentence strings.
    """
    # Use the nltk library to tokenize sentences
    return nltk.sent_tokenize(text)

def crawl_website_pdfs(url):
    # Get HTML content of website
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    
    # Find all links in the HTML content
    links = soup.find_all('a')
    
    # Filter out only the links that end with ".pdf"
    pdf_links = [link.get('href') for link in links if link.get('href') and link.get('href').endswith('.pdf')]
    
    # Download each PDF file to local system
    for pdf_link in pdf_links:
        file_name = pdf_link.split("/")[-1]
        try:
            urllib.request.urlretrieve(pdf_link, file_name)
            print(f"Downloaded {file_name}")
        except:
            print(f"Failed to download {file_name}")
            
def extract_text(pdf_file):
    # Open the PDF file in read-binary mode
    with open(pdf_file, 'rb') as f:
        # Create a PDF reader object
        pdf_reader = PyPDF2.PdfFileReader(f)
        
        # Initialize an empty string to store the extracted text
        text = ""
        
        # Iterate through each page of the PDF file
        for page_num in range(pdf_reader.numPages):
            # Extract the text from each page
            page = pdf_reader.getPage(page_num)
            page_text = page.extractText()
            
            # Concatenate the extracted text to the final string
            text += page_text
        
        # Close the PDF file
        f.close()
        
        # Return the extracted text
        return text

In [74]:
domains = [('https://www.lawyersweekly.com.au/sme-law/36853-why-your-firm-needs-an-ai-use-policy', 'Lawyers Weekly'),
                ('https://stirlingandrose.com/insights/', 'Sterling and Rose'),
                ('https://stirlingandrose.com/', 'Sterling and Rose')]

for domain in domains:
    channel_name = domain[1]
    text_info = crawl_domain(domain[0])
    for scrubed_data in text_info:
        # set window (length of text chunk) and stride
        window = 1
        stride = 1  # smaller stride creates overlap
        
        data = []
        results = []
        
        new_data = []
        
        window = 6  # number of sentences to combine
        stride = 3  # number of sentences to 'stride' over, used to create overlap
        
        text = tokenize_sentences(scrubed_data[1])
        if text[0].startswith("PDF"):
            print('skipping PDF')
        else:
            new_data.append({
                'text': text,
                'id': scrubed_data[0],
                'url': scrubed_data[0],
                "name":channel_name,
                "title":"channel_name",
            })
            
        # loop through in batches of 64
        index = pinecone.Index(VECTOR_INDEX_NAME)
        
        for j in range(0, len(new_data), batch_size):
            if len(new_data) > 0:
                # find end position of batch (for when we hit end of data)
                j_end = min(len(new_data), j+batch_size)
                print(j, j_end, len(new_data))
                try:
                    # extract the metadata like text, start/end positions, etc
                    batch_meta = [{
                        "text": new_data[x]["text"],
                        "url": new_data[x]["url"],
                        "name": new_data[x]["name"],
                        "title": new_data[x]["title"]
                    } for x in range(j, j_end)]
                    # extract only text to be encoded by embedding model
                    batch_text = [row['text'] for row in new_data[j:j_end]]
                    # create the embedding vectors
                    batch_embeds = retriever.encode(batch_text).tolist()
                    # extract IDs to be attached to each embedding and metadata
                    batch_ids = [row['id'] for row in new_data[j:j_end]]
                    # 'upsert' (eg insert) IDs, embeddings, and metadata to index
                    to_upsert = list(zip(batch_ids, batch_embeds, batch_meta))
                    index.upsert(to_upsert)
                except:
                    continue

0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
skipping PDF
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
skipping PDF
skipping PDF
0 1 1
skipping PDF
0 1 1
0 1 1
0 1 1
skipping PDF
0 1 1
0 1 1
0 1 1
0 1 1
skipping PDF
0 1 1
0 1 1
0 1 1
0 1 1
skipping PDF
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
skipping PDF
skipping PDF
skipping PDF
0 1 1
0 1 1
0 1 1
0 1 1
skipping PDF


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
skipping PDF
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
skipping PDF
skipping PDF
0 1 1
skipping PDF
0 1 1
0 1 1
0 1 1
skipping PDF
0 1 1
0 1 1
0 1 1
0 1 1
skipping PDF
0 1 1
0 1 1
0 1 1
0 1 1
skipping PDF
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
0 1 1
skipping PDF
skipping PDF
skipping PDF
0 1 1
0 1 1
0 1 1
0 1 1
skipping PDF


## Query pinecone index for answer to question

In [6]:
# Define the query or question to ask
query = "Who the hell is the lawyer SJ Price?"
# Create and embedding representing the question
xq = retriever.encode(query).tolist()
# Search the index for the top (k) answers 
results = vector_index.query(xq, top_k=5, include_metadata=True)
print(results)

{'matches': [{'id': '8LK4wSGC5bQ-t46.44',
              'metadata': {'end': 76.72,
                           'name': '8LK4wSGC5bQ',
                           'start': 46.44,
                           'text': 'done by other means than expensive '
                                   'lawyers. Alright, today I would like to '
                                   'welcome Shelly Jane Price who is a partner '
                                   'with the Sterling and Rose Law Firm here '
                                   'in Western Australia where she leads the '
                                   'artificial intelligence practice. SJ has '
                                   'also started lecturing in technology and '
                                   'the law at Murdoch University also here in '
                                   'Western Australia. ',
                           'title': 'Alex Jenkins and SJ Price talk all things '
                                    'Artificial Intellig

In [7]:
context = ""
for result in results['matches']:
    for line in result['metadata']['text']:
        context += line+" "
openai_answser = openai_question(query, context)
openai_answser

'Shelley Jane Price is a partner with the Sterling and Rose Law Firm in Western Australia, where she leads the artificial intelligence practice. She has also started lecturing in technology and the law at Murdoch University in Western Australia. She is a lawyer and is involved in cases such as wire fraud, conspiracy to commit wire fraud, and campaign finance violations.'

In [8]:
# Define the query or question to ask
query = "What is Sterling and Rose?"
# Create and embedding representing the question
xq = retriever.encode(query).tolist()
# Search the index for the top (k) answers 
results = vector_index.query(xq, top_k=5, include_metadata=True)
context = ""
for result in results['matches']:
    for line in result['metadata']['text']:
        context += line+" "
openai_answser = openai_question(query, context)
openai_answser

'Sterling and Rose is an incorporated legal practice in Western Australia that specializes in emerging technology, including crypto and digital assets, web 3.0, metaverse, smart legal contracts, and data rights. They serve investors, platform providers, entrepreneurs, financial institutions, and governments and are uniquely positioned to advise on FinTech, RegTech and LegalTech holistically.'