# Document processing (a few ways)

In [16]:
# Load environment variables
import os
import re
import pinecone
import glob
import json, jsonlines
import uuid
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

from tqdm.auto import tqdm
from tkinter import N
from typing import Tuple

from canopy.tokenizer import Tokenizer
from canopy.knowledge_base import KnowledgeBase
from canopy.context_engine import ContextEngine
from canopy.chat_engine import ChatEngine
from canopy.llm.openai import OpenAILLM
from canopy.llm.models import ModelParams
from canopy.models.data_models import Document, Messages, UserMessage, AssistantMessage
from canopy.models.api_models import ChatResponse

import openai

from IPython.display import display, Markdown

from dotenv import load_dotenv,find_dotenv,dotenv_values
load_dotenv(find_dotenv(),override=True)

openai.api_key = os.getenv('OPENAI_API_KEY')

In [None]:
# Check api keys, load them
# print(os.getenv('OPENAI_API_KEY'))
# print(os.getenv('PINECONE_ENVIRONMENT'))
# print(os.getenv('PINECONE_API_KEY'))

In [7]:
# data_folder='../data/FEA/'
data_folder='../data/AMS/'
docs = glob.glob(data_folder+'*.pdf')   # Only get the PDFs in the directory

text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size = 5000,
    chunk_overlap  = 0,
    length_function = len,
    is_separator_regex = False,
)

# print(docs[-1])
# loader=PyPDFLoader(docs[-1])
# doc_pages=loader.load_and_split(text_splitter)

pages=[]
for doc in docs:
    loader=PyPDFLoader(doc)
    doc_pages=loader.load_and_split(text_splitter)
    pages.extend(doc_pages)
    print('Processed: '+doc)

In [4]:
# Tidy text up
# TODO: I now have this in a function within data_import, can remove this.
pages_dict=[]
docs_canopy=[]
for page in pages:
    page.metadata['source']=os.path.basename(page.metadata['source'])   # Strip path
    page.metadata['page']=int(page.metadata['page'])+1   # Pages are 0 based, update
    # Merge hyphenated words
    page.page_content=re.sub(r"(\w+)-\n(\w+)", r"\1\2", page.page_content)
    # Fix newlines in the middle of sentences
    page.page_content = re.sub(r"(?<!\n\s)\n(?!\s\n)", " ", page.page_content.strip())
    # Remove multiple newlines
    page.page_content = re.sub(r"\n\s*\n", "\n\n", page.page_content)

    # Format into canopy format
    dict_temp=dict(page)
    dict_temp_out={'id':page.metadata['source']+"_"+str(page.metadata['page'])+str(uuid.uuid4()),
                   'text':dict_temp.pop('page_content'),
                   'source':dict_temp['metadata']['source'],
                   'metadata':dict_temp['metadata']}
    dict_temp_out['metadata']['page']=str(dict_temp_out['metadata']['page'])
    pages_dict.append(dict_temp_out)


    doc_temp=Document(id=page.metadata['source']+"_"+str(page.metadata['page'])+str(uuid.uuid4()),
                        text=page.page_content,
                        source=page.metadata['source'],
                        metadata={'page':str(page.metadata['page'])})
    docs_canopy.append(doc_temp)

In [5]:
print(pages[4])

page_content='PREFACE  The Aerospace Mechanisms Symposium (AMS) provides a unique forum for those  active in the design, production and use of aerospace mechanisms. A major focus is  the reporting of problems and solutions associated with the development and flight  certification of new mechanisms. Organized by the Mechanisms Education Association,  the National Aeronautics and Space Administration and Lockheed Martin Space  Systems Company (LMSSC) share the responsibility for hosting the AMs. Now in its  38fh symposium, the AMS continues to be well attended, attracting participants from  both the US. and abroad.  The 38fh AMs, hosted by the Langley Research Center (LaRC) in Williamsburg,  Virginia, was held May 17, 18 and 19, 2006. During these three days, 34 papers were  presented. Topics included gimbals, tribology, actuators, aircraft mechanisms,  deployment mechanisms, release mechanisms, and test equipment. Hardware displays  during the supplier exhibit gave attendees an opportun

In [6]:
print(pages_dict[0])

{'id': 'AMS_2006.pdf_16a7516b2-c332-4350-b24b-48e8f88fed68', 'text': 'NASNCP-2006-2 14290  3 sth Aerospace Mechanisms Symposium  Compiled by  Edward A. Boesiger  Lockheed Martin Space Systems Company, Sunnyvale, California  Proceedings of a symposium hosted by  the NASA Langley Research Center and  Lockheed Martin Space Systems Company and  organized by the Mechanisms Education Association  held at the Williamsburg Maniott Hotel  Williamsburg, Virginia  May 17- 19,2006  May 2006', 'source': 'AMS_2006.pdf', 'metadata': {'source': 'AMS_2006.pdf', 'page': '1'}}


In [36]:
import tiktoken
tiktoken.encoding_for_model("gpt-3.5-turbo")

def num_tokens_from_string(string: str, model_name: str) -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.encoding_for_model(model_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

print(docs_canopy[0])
print(num_tokens_from_string(docs_canopy[0].text, model_name))

id='AMS_2006.pdf_12da44f16-e0da-4128-8f34-3f003b549cb8' text='NASNCP-2006-2 14290  3 sth Aerospace Mechanisms Symposium  Compiled by  Edward A. Boesiger  Lockheed Martin Space Systems Company, Sunnyvale, California  Proceedings of a symposium hosted by  the NASA Langley Research Center and  Lockheed Martin Space Systems Company and  organized by the Mechanisms Education Association  held at the Williamsburg Maniott Hotel  Williamsburg, Virginia  May 17- 19,2006  May 2006' source='AMS_2006.pdf' metadata={'page': '1'}
102


In [8]:
with jsonlines.open(data_folder+'ams_data.jsonl', mode='w') as writer:
    writer.write_all(pages_dict)

# Upserting docs for canopy

## Command line interface

From here we can switch across to Canopy CLI (or other method) and run:

```
canopy
canopy upsert ./ai_arxiv.jsonl
```

## Canopy library for upsert

In [59]:
Tokenizer.initialize()

index_name='canopy--ams'
k=15

kb = KnowledgeBase(index_name=index_name,
                   default_top_k=k)
kb.connect()

In [10]:
batch_size = 10

for i in tqdm(range(0, len(pages_dict), batch_size)):
    kb.upsert(docs_canopy[i: i+batch_size])

  0%|          | 0/718 [00:00<?, ?it/s]

# Chatting

Then we begin chatting by first starting the Canopy Server:

```
canopy start
```

Then begin chatting with:

```
canopy chat
```

_(we can also add the `--no-rag` flag to see how our RAG vs. non-RAG results compare!)_

In [67]:
# Model parameters: # https://platform.openai.com/docs/api-reference/chat/create
from math import inf


model_name='gpt-3.5-turbo'# gpt-3.5-turbo-16k, gpt-3.5-turbo, gpt-3.5-turbo-1106, gpt-4, gpt-4-32k
model_list={'gpt-3.5-turbo':4096,
            'gpt-3.5-turbo-16k':16385,
            'gpt-3.5-turbo-1106':16385, 
            'gpt-4':8192,
            'gpt-4-32k':32768}

temperature=0   # Varies between 0-2
n=None  # How many chat completion choices to generate for each input message.
top_p=None  # where the model considers the results of the tokens with top_p probability mass. So 0.1 means only the tokens comprising the top 10% probability mass are considered.
max_generated_tokens=None
max_prompt_tokens=model_list[model_name]    # Use max allowed for model

def chat(new_message: str, history: Messages) -> Tuple[str, Messages, ChatResponse]:
    messages = history + [UserMessage(content=new_message)]
    response = chat_engine.chat(messages,model_params=model_params)
    assistant_response = response.choices[0].message.content
    return assistant_response, messages + [AssistantMessage(content=assistant_response)], response

context_engine = ContextEngine(kb)
llm=OpenAILLM(model_name=model_name)
chat_engine = ChatEngine(context_engine,
                         llm=llm,
                         max_generated_tokens=max_generated_tokens,
                         max_prompt_tokens=max_prompt_tokens)
model_params=ModelParams(temperature=temperature,
                         n=n,  # number of completions to generate
                         top_p=top_p)

In [68]:
history = []
response, history, chat_response = chat("What types of lubricants are to be avoided when designing space mechanisms?", history)
display(Markdown(response))

InvalidRequestError: We could not parse the JSON body of your request. (HINT: This likely means you aren't using your HTTP library correctly. The OpenAI API expects a JSON payload, but what was sent was not valid JSON. If you have trouble figuring out how to fix this, please contact us through our help center at help.openai.com.)

In [57]:
response, history, chat_response = chat("Can you speak to what failures have occurred when using mineral oil lubricants?", history)
display(Markdown(response))


InvalidRequestError: We could not parse the JSON body of your request. (HINT: This likely means you aren't using your HTTP library correctly. The OpenAI API expects a JSON payload, but what was sent was not valid JSON. If you have trouble figuring out how to fix this, please contact us through our help center at help.openai.com.)