In [13]:
# Load environment variables
import os
import re
import pinecone
import glob
import data_import
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import pprint
from dotenv import load_dotenv,find_dotenv
load_dotenv(find_dotenv(),override=True)

True

In [2]:
# Check api keys
# print(os.getenv('OPENAI_API_KEY'))
# print(os.getenv('PINECONE_ENVIRONMENT'))
# print(os.getenv('PINECONE_API_KEY'))

In [3]:
# Instantiate OpenAI embeddings
embeddings_model = OpenAIEmbeddings(model="text-embedding-ada-002")

In [4]:
# Initialize Pinecone client, test if it works
# pinecone.init(
#     api_key=os.getenv('PINECONE_API_KEY'),
#     environment=os.getenv('PINECONE_ENVIRONMENT') 
# )
# pinecone.whoami()


In [5]:
# Find the existing index, clear for new start
# index=pinecone.Index(index_name)

In [6]:
# Pinecone database: https://app.pinecone.io/organizations/-Nam3zmbSmzuXKeH8EWl/projects/us-west1-gcp-free:32467cc/indexes/langchain-quickstart
index_name = "ams"

# data_folder='../data/FEA/'
data_folder='../data/AMS/'
docs = glob.glob(data_folder+'*.pdf')   # Only get the PDFs in the directory

# data_import.load_docs(index_name=index_name,
#                       embeddings_model=embeddings_model,
#                       docs=docs,
#                       PINECONE_API_KEY=os.getenv('PINECONE_API_KEY'),
#                       PINECONE_ENVIRONMENT=os.getenv('PINECONE_ENVIRONMENT'),
#                       chunk_size=5000,
#                       chunk_overlap=0)

In [7]:
import json

from llama_index import SimpleDirectoryReader
from llama_index.node_parser import SentenceSplitter
from llama_index.schema import MetadataMode

def load_corpus(files, verbose=False):
    if verbose:
        print(f"Loading files {files}")

    reader = SimpleDirectoryReader(input_files=files)
    docs = reader.load_data()
    if verbose:
        print(f"Loaded {len(docs)} docs")

    parser = SentenceSplitter(chunk_size=5000)
    nodes = parser.get_nodes_from_documents(docs, show_progress=verbose)

    for node in nodes:
        # Merge hyphenated words
        node.text=re.sub(r"(\w+)-\n(\w+)", r"\1\2", node.text)
        # Fix newlines in the middle of sentences
        node.text = re.sub(r"(?<!\n\s)\n(?!\s\n)", " ", node.text.strip())
        # Remove multiple newlines
        node.text = re.sub(r"\n\s*\n", "\n\n", node.text)

    if verbose:
        print(f"Parsed {len(nodes)} nodes")

    return nodes

In [8]:
nodes = load_corpus(docs[-3:], verbose=True)
# val_nodes = load_corpus(VAL_FILES, verbose=True)

Loading files ['../data/AMS/AMS_2018.pdf', '../data/AMS/AMS_2008.pdf', '../data/AMS/AMS_2020.pdf']
Loaded 1616 docs


Parsing nodes:   0%|          | 0/1616 [00:00<?, ?it/s]

Parsed 1616 nodes


In [9]:
nodes[100].text

'Dynamic Behavior of Ball Bearings under Axial Vibration  Virgil Hinque* and René Seiler* Abstract  The paper addresses the dynamics of ball bearings when exposed to vibration loads along their axis of  rotation. Following common practice in space mechanisms design, the bearings are mounted in either hard  preloaded or soft preloaded pairs. A computer-based model has been developed for the analysis and  prediction of the load-deflection characteristics in bearing systems. Furthermore, the model may be used  to quantify the maximum loads applied on the bearings and the resulting stresses during a vibration test or  a spacecraft launch.  In parallel to the model development, an experimental test program has been carried out in order to get  sufficient data for model correlation. In this context, the paper also elaborates on the post-processing of the  acquired test signals and discusses specific effects, for instance nonlinearities due to the use of snubbers,  in the time domain as well 

# OpenAI, ChatGPT-3.5

In [28]:
from llama_index.llms import OpenAI

# model_name='gpt-3.5-turbo-instruct'
model_name='gpt-3.5-turbo-1106' # 16,385 tokens
# model_name='gpt-4-0613'
llm=OpenAI(model_name=model_name)



In [29]:
from llama_index.finetuning import generate_qa_embedding_pairs


qa_pairs=generate_qa_embedding_pairs(nodes[100:110],
                                     llm=llm,
                                     num_questions_per_chunk=1)

100%|██████████| 10/10 [03:51<00:00, 23.17s/it]


In [61]:
queries=qa_pairs.queries.values()
pprint.pprint(list(queries),width=500)

['Explain the purpose and significance of the computer-based model developed in this study for analyzing and predicting the load-deflection characteristics in ball bearing systems. How can this model be used to quantify maximum loads and resulting stresses during vibration tests or spacecraft launches?',
 'Discuss the challenges faced during the equipment design process when assessing the performance of ball bearings for rotation functions in space mechanisms. How does the use of quasi-static equivalent loads and stresses help in reducing the effect of a sine and random vibration environment? Provide examples of the structural behavior of bearing assemblies and the influence of preload and other parameters on their performance.',
 'Explain the difference between the hard preload and soft preload methods used in ball bearing systems in space mechanisms. What are the advantages and disadvantages of each method?',
 'How does gapping occur in ball bearing systems? Compare and contrast the 

In [62]:
qa_pairs.save_json(data_folder+'ams_dataset_qa_pairs_'+model_name+'.json')

# Hugging Face Models

In [30]:
from llama_index.llms import HuggingFaceInferenceAPI

# model_name='google/flan-t5-xxl'
# model_name='WizardLM/WizardLM-70B-V1.0'
model='mistralai/'
name='Mistral-7B-Instruct-v0.1'
llm=HuggingFaceInferenceAPI(model_name=model+name,token=os.getenv('HUGGINGFACEHUB_API_TOKEN'))

In [24]:
from llama_index.finetuning import generate_qa_embedding_pairs


qa_pairs=generate_qa_embedding_pairs(nodes[100:110],
                                     llm=llm,
                                     num_questions_per_chunk=1)

100%|██████████| 10/10 [00:01<00:00,  6.70it/s]


In [25]:
queries=qa_pairs.queries.values()
pprint.pprint(list(queries),width=500)

['What is the main sizing criterion for ball bearings?',
 'What is the difference between hard and soft preload methods in ball bearing systems in space mechanisms?',
 'What is the difference between hard and soft preloaded bearings?',
 'What is the purpose of preloading bearings in general?',
 'What is the purpose of the bearing cartridges in the test units?',
 'How were the test units mounted on the shakers during the test program?',
 'What was the role of the accelerometers in the test set-ups?',
 'What were the preload parameters for the three test units?',
 'What were the test set-ups used for the sine sweep tests?',
 'What were the test set-ups used for the random vibration tests?',
 'What were the test set-ups used for the constant frequency sine tests?',
 'What was the purpose of the load cells in the test set-ups?',
 'How were the time-domain signals of all sensors recorded during the tests?',
 'What was the comparison made during the sine sweep tests before and after the high

In [27]:
qa_pairs.save_json(data_folder+'ams_dataset_qa_pairs_'+name+'.json')