In [58]:
!pip install langchain



In [1]:
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the tokenizer
tokenizer_path = "D:\Projects\LLM\\tokenizer" # Directory with files
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

In [3]:
# Constants
stdout_padding = "#" * 20

# Confirm vocabulary size
print(f"{stdout_padding} Llama2 Tokenizer Details {stdout_padding}\n")
print(f"Llama2 tokenizer overview: {tokenizer}")
print(f"Llama2 Vocabulary Size: {len(tokenizer.get_vocab().keys())}\n")
print(f"{stdout_padding} End of Llama2 Tokenizer Details {stdout_padding}\n")

#################### Llama2 Tokenizer Details ####################

Llama2 tokenizer overview: LlamaTokenizerFast(name_or_path='D:\Projects\LLM\tokenizer', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
Llama2 Vocabulary Size: 32000

#################### End of Llama2 Tokenizer Details ####################



In [4]:
# Verify token IDs for Llama2 special tokens
print(f"{stdout_padding} Llama2 Special Tokens {stdout_padding}\n")

UNK = "<unk>" # Unknown token
BOS, EOS = "<s>", "</s>" # Begin of sequnece and end of sequence tokens

special_tokens = [UNK, BOS, EOS]

for token in special_tokens:
    print(f'Token ID for the special token {token}: {tokenizer.get_vocab()[token]}')
    print(f'Encoded {token} becomes: {tokenizer.encode(token)}\n')

print(f"{stdout_padding} End of Llama2 Special Tokens {stdout_padding}\n")

#################### Llama2 Special Tokens ####################

Token ID for the special token <unk>: 0
Encoded <unk> becomes: [1, 0]

Token ID for the special token <s>: 1
Encoded <s> becomes: [1, 1]

Token ID for the special token </s>: 2
Encoded </s> becomes: [1, 2]

#################### End of Llama2 Special Tokens ####################



In [5]:
# Verify token IDs for Llama2 prompt symbols
print(f"{stdout_padding} Llama2 Prompt Symbols {stdout_padding}\n")

B_INST, E_INST = "[INST]", "[/INST]" # Begin of instruction and end of instruction symbols
B_SYS, E_SYS = "<<SYS>>\n", "\n<<SYS>>\n\n" # Begin of system message and end of system message symbols

prompt_symbols = [B_INST, E_INST, B_SYS, E_SYS]

for symbol in prompt_symbols:
    encoded_symbol = tokenizer.encode(symbol)
    print(f'Encoded {repr(symbol)} becomes: {encoded_symbol}')
    
    for token in encoded_symbol:
        print(f"\tToken ID {token} --> {repr(tokenizer.decode(token))}")

print(f"\n{stdout_padding} End of Llama2 Prompt Symbols {stdout_padding}\n")

#################### Llama2 Prompt Symbols ####################

Encoded '[INST]' becomes: [1, 518, 25580, 29962]
	Token ID 1 --> '<s>'
	Token ID 518 --> '['
	Token ID 25580 --> 'INST'
	Token ID 29962 --> ']'
Encoded '[/INST]' becomes: [1, 518, 29914, 25580, 29962]
	Token ID 1 --> '<s>'
	Token ID 518 --> '['
	Token ID 29914 --> '/'
	Token ID 25580 --> 'INST'
	Token ID 29962 --> ']'
Encoded '<<SYS>>\n' becomes: [1, 3532, 14816, 29903, 6778, 13]
	Token ID 1 --> '<s>'
	Token ID 3532 --> '<<'
	Token ID 14816 --> 'SY'
	Token ID 29903 --> 'S'
	Token ID 6778 --> '>>'
	Token ID 13 --> '\n'
Encoded '\n<<SYS>>\n\n' becomes: [1, 29871, 13, 9314, 14816, 29903, 6778, 13, 13]
	Token ID 1 --> '<s>'
	Token ID 29871 --> ''
	Token ID 13 --> '\n'
	Token ID 9314 --> '<<'
	Token ID 14816 --> 'SY'
	Token ID 29903 --> 'S'
	Token ID 6778 --> '>>'
	Token ID 13 --> '\n'
	Token ID 13 --> '\n'

#################### End of Llama2 Prompt Symbols ####################



In [6]:
# Test tokenizer on a sentence
print(f"{stdout_padding} Llama2 Tokenizer Sentence Example {stdout_padding}\n")
sentence = "RHEL subscription manager let's you manage packages on RedHat."
encoded_output = tokenizer.encode(sentence)
print(f"Original sentence: {sentence}")
print(f"Encoded sentence: {encoded_output}")

# Verify what each token ID correlates to
for token in encoded_output:
    print(f"Token ID {token} --> {tokenizer.decode(token)}")

print(f"{stdout_padding} End of Llama2 Tokenizer Sentence Example {stdout_padding}\n")

#################### Llama2 Tokenizer Sentence Example ####################

Original sentence: RHEL subscription manager let's you manage packages on RedHat.
Encoded sentence: [1, 390, 29950, 6670, 25691, 8455, 1235, 29915, 29879, 366, 10933, 9741, 373, 4367, 29950, 271, 29889]
Token ID 1 --> <s>
Token ID 390 --> R
Token ID 29950 --> H
Token ID 6670 --> EL
Token ID 25691 --> subscription
Token ID 8455 --> manager
Token ID 1235 --> let
Token ID 29915 --> '
Token ID 29879 --> s
Token ID 366 --> you
Token ID 10933 --> manage
Token ID 9741 --> packages
Token ID 373 --> on
Token ID 4367 --> Red
Token ID 29950 --> H
Token ID 271 --> at
Token ID 29889 --> .
#################### End of Llama2 Tokenizer Sentence Example ####################



In [7]:
from langchain_community.document_loaders import DirectoryLoader
loader = DirectoryLoader('D:\Projects\LLM\Createdb\docs', glob="*.pdf",show_progress=True,use_multithreading=True)
docs = loader.load()
len(docs)

  0%|          | 0/2 [00:00<?, ?it/s]Error loading file D:\Projects\LLM\Createdb\docs\10.1.1.66.5347.pdf
 50%|█████     | 1/2 [00:04<00:04,  4.16s/it]Error loading file D:\Projects\LLM\Createdb\docs\A Recent trend in SHM.pdf
100%|██████████| 2/2 [00:04<00:00,  2.08s/it]


0

In [8]:
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader("D:\Projects\LLM\Createdb\docs\\10.1.1.66.5347.pdf")
pages = loader.load_and_split()

In [9]:
import re


def _clean_text(text):
    text = re.sub(r'-\n', '', text)
    return text

In [10]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
def tiktoken_splitter(page: str,chunk_size=300, chunk_overlap=40):
    # create length function
    def len_fn(page):
        tokens = tokenizer.encode(
            page
        )
        return len(tokens)
    # initialize the text splitter
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len_fn,
        separators=["\n\n", "\n", " ", ""]
    )
    return text_splitter

In [15]:
!pip install PyPDF2
import PyPDF2


def _convert_pdf_to_text(path):
    """Convert the PDF to text and store it in the self.content
    attribute.
    """
    text = []
    with open(path, 'rb') as f:
        # create a PDF object
        pdf = PyPDF2.PdfReader(f)
        # iterate over every page in the PDF
        for page in range(len(pdf.pages)):
            # get the page object
            page_obj = pdf.pages[page]
            # extract text from the page
            text.append(page_obj.extract_text())
    text = "\n".join(text)
    return text




In [27]:
text = _convert_pdf_to_text("D:\Projects\LLM\Createdb\docs\\10.1.1.66.5347.pdf")

In [29]:
splitter = tiktoken_splitter(text,300,40)
clean = _clean_text(text)
chunks = splitter.split_text(clean)
print(chunks,'\n')

['Structural damage identiﬁcation in laminated structures\nusing FRF data\nJ.V. Arau ´jo dos Santos, C.M. Mota Soares*, C.A. Mota Soares, N.M.M. Maia\nIDMEC/IST,InstitutoSuperiorTe ´cnico,Av.RoviscoPais, 1049-001Lisbon,Portugal\nAvailable online 5November 2004\nAbstract\nAdamageidentiﬁcationtechniquebasedonfrequencyresponsefunctions(FRF)sensitivities ispresented.Thistechniqueleadsto\na set of linear equations, which is solved using an algorithm that constrains the solution to be physically admissible. Damage simulation and identiﬁcation on a laminated rectangular plate is performed. The inﬂuence of the number of natural frequencies and\nmode shapes used on the FRF computation, as well as the frequency range, the excitation location and the number of measureddegrees of freedom (m-DOF) is studied. Numerical tests show that the best accuracy is obtained when using the dynamic expansion', 'of the m-DOF. It is also demonstrated that for small damage the errors are the main inﬂuence, whereas

In [63]:
chunks

['Structural damage identiﬁcation in laminated structures\nusing FRF data\nJ.V. Arau ´jo dos Santos, C.M. Mota Soares*, C.A. Mota Soares, N.M.M. Maia\nIDMEC/IST,InstitutoSuperiorTe ´cnico,Av.RoviscoPais, 1049-001Lisbon,Portugal\nAvailable online 5November 2004\nAbstract\nAdamageidentiﬁcationtechniquebasedonfrequencyresponsefunctions(FRF)sensitivities ispresented.Thistechniqueleadsto\na set of linear equations, which is solved using an algorithm that constrains the solution to be physically admissible. Damage simulation and identiﬁcation on a laminated rectangular plate is performed. The inﬂuence of the number of natural frequencies and\nmode shapes used on the FRF computation, as well as the frequency range, the excitation location and the number of measureddegrees of freedom (m-DOF) is studied. Numerical tests show that the best accuracy is obtained when using the dynamic expansion',
 'of the m-DOF. It is also demonstrated that for small damage the errors are the main inﬂuence, wherea

In [89]:
from langchain.document_loaders import PyPDFLoader
loader = PyPDFLoader('D:\Projects\LLM\Createdb\docs\A Recent trend in SHM.pdf')
docs = loader.load_and_split()
text_splitter = tiktoken_splitter(docs)
chunks = text_splitter.split_documents(docs)

In [90]:
chunks

[Document(page_content='http://ammtiac.alionscience.com TheAMMTIACQuarterly,Volume3,Number4 3\nINTRODUCTION\nThesubjectofstructuralhealthmonitoring(SHM)isemergingasanincreasinglyimportantcomponentofoverallNondestructiveEvaluation (NDE) programs and is now being considered forimplementationinavarietyofapplicationsincludingspacecraftcomponents, bridges, and aircraft. Over the past several years,\ntherehavebeenanumberoflimiteddemonstrationsofSHMinactualfieldapplications.InadditiontotheArmy,NavyandAirForce,otherfederalagenciesinterestedinthistopicincludethe\nFederalAviationAdministration,NASAandtheDepartmentofTransportation.Thedetectionofdamageatanylocationisacom-\nmonchallengeinmostlargestructures,andthusSHMhaspoten-tial applications across a range of industrial sectors. Examplesincludethedetectionofimpactinduceddelaminationsincompos-\nite aerospace structures, localized corrosion in petrochemical\nplantsandunauthorizedpenetrationsofshippingcontainers.', metadata={'source': 'D:\\Projects\

In [78]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.8.0-cp311-cp311-win_amd64.whl.metadata (3.8 kB)
Downloading faiss_cpu-1.8.0-cp311-cp311-win_amd64.whl (14.5 MB)
   ---------------------------------------- 0.0/14.5 MB ? eta -:--:--
   ---------------------------------------- 0.1/14.5 MB 2.3 MB/s eta 0:00:07
    --------------------------------------- 0.2/14.5 MB 2.5 MB/s eta 0:00:06
   - -------------------------------------- 0.5/14.5 MB 3.9 MB/s eta 0:00:04
   - -------------------------------------- 0.7/14.5 MB 3.8 MB/s eta 0:00:04
   -- ------------------------------------- 0.9/14.5 MB 3.7 MB/s eta 0:00:04
   -- ------------------------------------- 1.0/14.5 MB 3.8 MB/s eta 0:00:04
   --- ------------------------------------ 1.2/14.5 MB 3.7 MB/s eta 0:00:04
   --- ------------------------------------ 1.4/14.5 MB 3.6 MB/s eta 0:00:04
   --- ------------------------------------ 1.4/14.5 MB 3.4 MB/s eta 0:00:04
   ---- ----------------------------------- 1.5/14.5 MB 3.4 MB/s eta 0:00:04
 

In [91]:
from sentence_transformers import SentenceTransformer
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
# model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
faiss_index = FAISS.from_string(chunks,embeddings)
faiss_index_name = 'faiss-index-incremental'
faiss_index.save_local(faiss_index_name)

In [92]:
query = "What are active Ultrasonic Sensors"
docs = faiss_index.similarity_search(query)

In [94]:
print(docs[0].page_content)

levels as well as acoustic emission signals. This resulted in a
tremendousreductioninthenumberofdataacquisitionchan-nelsneededforSHMoflargestructures,anditgreatlyincreases
thechanceofhavingacousticemissionsensorsnearanacoustic
emission source. Since ultrasonic waves generated by acoustic
emissioneventsattenuaterapidly,itisessentialtohaveasufficient
distributionofsensorsincompositestructures.Theresponseof
AFCsensors(alsoknownaspiezoceramicactivefibersensors)was
modeled,andtheresearchersdescribedhowtomanufacturethese
sensors.Figure2showsthedesignofa2x2sensorarrayusedby
theresearchers.
TheauthorsconcludedthatAFCsensorswereabletodetect
actual damage propagation in a composite structure tested tofailurelongbeforethestructurefailed.Whencomparedtocon-ventionalacousticemissionsensorsorpiezoelectricpatchtrans-
