# Document processing (a few ways)

In [39]:
# Load environment variables
import os
import re
import pinecone
import glob
import json, jsonlines
import uuid
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

from canopy.tokenizer import Tokenizer
from canopy.knowledge_base import KnowledgeBase
from canopy.models.data_models import Document
from tqdm.auto import tqdm

from dotenv import load_dotenv,find_dotenv
load_dotenv(find_dotenv(),override=True)

import openai
openai.api_key = os.getenv('OPENAI_API_KEY')

In [None]:
# Check api keys
# print(os.getenv('OPENAI_API_KEY'))
# print(os.getenv('PINECONE_ENVIRONMENT'))
# print(os.getenv('PINECONE_API_KEY'))

In [10]:
# data_folder='../data/FEA/'
data_folder='../data/AMS/'
docs = glob.glob(data_folder+'*.pdf')   # Only get the PDFs in the directory

text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size = 1000,
    chunk_overlap  = 20,
    length_function = len,
    is_separator_regex = False,
)

# print(docs[-1])
# loader=PyPDFLoader(docs[-1])
# doc_pages=loader.load_and_split(text_splitter)

pages=[]
for doc in docs[-3:]:
    loader=PyPDFLoader(doc)
    doc_pages=loader.load_and_split(text_splitter)
    pages.extend(doc_pages)
    print('Processed: '+doc)

Processed: ../data/AMS\AMS_2018.pdf
Processed: ../data/AMS\AMS_2020.pdf
Processed: ../data/AMS\AMS_2022.pdf


In [33]:
# Tidy text up
pages_dict=[]
docs_canopy=[]
for page in pages:
    page.metadata['source']=os.path.basename(page.metadata['source'])   # Strip path
    page.metadata['page']=int(page.metadata['page'])+1   # Pages are 0 based, update
    # Merge hyphenated words
    page.page_content=re.sub(r"(\w+)-\n(\w+)", r"\1\2", page.page_content)
    # Fix newlines in the middle of sentences
    page.page_content = re.sub(r"(?<!\n\s)\n(?!\s\n)", " ", page.page_content.strip())
    # Remove multiple newlines
    page.page_content = re.sub(r"\n\s*\n", "\n\n", page.page_content)

    # Format into canopy format
    dict_temp=dict(page)
    dict_temp_out={'id':page.metadata['source']+"_"+str(page.metadata['page'])+str(uuid.uuid4()),
                   'text':dict_temp.pop('page_content'),
                   'source':dict_temp['metadata']['source'],
                   'metadata':dict_temp['metadata']}
    dict_temp_out['metadata']['page']=str(dict_temp_out['metadata']['page'])
    pages_dict.append(dict_temp_out)


    doc_temp=Document(id=page.metadata['source']+"_"+str(page.metadata['page'])+str(uuid.uuid4()),
                        text=page.page_content,
                        source=page.metadata['source'],
                        metadata={'page':str(page.metadata['page'])})
    docs_canopy.append(doc_temp)

In [28]:
print(pages[4])

page_content='44th Aerospace Mechanisms SymposiumNASA/CP—2018-219887 May 2018Edward A. Boesiger, Compiler Lockheed Martin Space Systems Company, Sunnyvale, California National Aeronautics and Space Administration Glenn Research Center Cleveland, Ohio 44135Proceedings of a conference held at Hilton Cleveland Downtown Hosted by NASA Glenn Research Center and Lockheed Martin Space Systems CompanySponsored and organized by Mechanisms Education AssociationCleveland, OhioMay 16–18, 2018' metadata={'source': 'AMS_2018.pdf', 'page': '5'}


In [12]:
print(pages_dict[0])

{'id': 'AMS_2018.pdf_1bfb03e76-1d88-4ded-a9c1-0f891f878ba1', 'text': '44th Aerospace Mechanisms SymposiumNASA/CP—2018-219887 May 2018Edward A. Boesiger, Compiler Lockheed Martin Space Systems Company, Sunnyvale, California', 'source': 'AMS_2018.pdf', 'metadata': {'source': 'AMS_2018.pdf', 'page': '1'}}


In [34]:
print(docs_canopy[0])

id='AMS_2018.pdf_6b85047d9-8331-4c41-a99f-d6c0122c807d' text='44th Aerospace Mechanisms SymposiumNASA/CP—2018-219887 May 2018Edward A. Boesiger, Compiler Lockheed Martin Space Systems Company, Sunnyvale, California' source='AMS_2018.pdf' metadata={'page': '6'}


In [30]:
with jsonlines.open(data_folder+'ams_data.jsonl', mode='w') as writer:
    writer.write_all(pages_dict)

# Upserting docs for canopy

## Command line interface

From here we can switch across to Canopy CLI (or other method) and run:

```
canopy
canopy upsert ./ai_arxiv.jsonl
```

## Canopy library for upsert

In [32]:
Tokenizer.initialize()

index_name='canopy--ams'
kb = KnowledgeBase(index_name=index_name)
kb.connect()

In [40]:
batch_size = 10

for i in tqdm(range(0, len(pages_dict), batch_size)):
    kb.upsert(docs_canopy[i: i+batch_size])

100%|██████████| 445/445 [08:55<00:00,  1.20s/it]


# Chatting

Then we begin chatting by first starting the Canopy Server:

```
canopy start
```

Then begin chatting with:

```
canopy chat
```

_(we can also add the `--no-rag` flag to see how our RAG vs. non-RAG results compare!)_

In [41]:
from canopy.context_engine import ContextEngine
context_engine = ContextEngine(kb)

In [42]:
from canopy.chat_engine import ChatEngine
chat_engine = ChatEngine(context_engine)

In [43]:
from typing import Tuple
from canopy.models.data_models import Messages, UserMessage, AssistantMessage

def chat(new_message: str, history: Messages) -> Tuple[str, Messages]:
    messages = history + [UserMessage(content=new_message)]
    response = chat_engine.chat(messages)
    assistant_response = response.choices[0].message.content
    return assistant_response, messages + [AssistantMessage(content=assistant_response)]

In [45]:
from IPython.display import display, Markdown

history = []
response, history = chat("What types of lubricants are to be avoided when designing space mechanisms?", history)
display(Markdown(response))

The types of lubricants to be avoided when designing space mechanisms include Perf luoropolyethers (PFPE) and multiply alkylated cyclopentanes (MAC). These lubricants have issues with their tribofilm forming properties, which can lead to seizure and limited resupply of lubricant in vacuum conditions. It is also important to note that wet and binder-based lubricants are not suitable for space mechanisms due to their high outgassing properties. 

Source: AMS_2020.pdf, AMS_2018.pdf, AMS_2022.pdf

In [46]:
response, history = chat("Can you speak to what failures have occurred when using Perf luoropolyethers (PFPE)?", history)
display(Markdown(response))

When using Perf luoropolyethers (PFPE), failures have occurred due to degradation of the lubricant, resulting in increased friction coefficients, material wear, and component failure. This degradation is often referred to as the "Brown Sugar" effect, characterized by the residue resembling the color and texture of brown sugar. PFPE lubricants, although highly chemically inert, are subject to breakdown under high stress environments, especially in the presence of Lewis acids. This breakdown can lead to lubricant starvation, increased wear, and the generation of friction polymers and metallic fluorides, which may initially improve performance but ultimately contribute to further degradation.

Source: AMS_2022.pdf, AMS_2018.pdf