In [7]:
import fitz
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain_community.llms import LlamaCpp
from langchain.callbacks.manager import CallbackManager
import pandas as pd
from langchain import hub
from langchain_chroma import Chroma
from langchain import PromptTemplate
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain.embeddings import LlamaCppEmbeddings
from langchain_community.embeddings.sentence_transformer import (SentenceTransformerEmbeddings)


In [8]:
PROMPT = """You are an assistant tasked with summarizing tables and text. 
Give a concise summary of the given table and text.
{content_of_pdf} """

In [17]:
#BOOK_PDF = '/home/iai/sb7059/git/llm_test/data/Book/industrial-cybersecurity-efficiently-monitor-the-cybersecurity-posture-of-your-ics-environment_compress.pdf'
PATH = '/home/iai/sb7059/git/llm_test/data/Book/Images'
#BOOK_PDF = '/home/iai/sb7059/git/llm_test/data/Book/fdgth-06-1321485.pdf'
BOOK_PDF = '/home/iai/sb7059/git/llm_test/data/Book/smeggitt.pdf'
WORKSPACE_DIC = "/hkfs/work/workspace_haic/scratch/sb7059-llm_models_jeremy"

MODEL_PATH = { #"Mixtral-8x-7b": WORKSPACE_DIC + "/Mixtral/mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf",
               #"Phi-2": WORKSPACE_DIC + "/Phi/Phi2/phi-2.Q4_K_M.gguf",
               #"Llama2-70b": WORKSPACE_DIC + "/Llama/Llama2/llama-2-70b.Q5_K_M.gguf",
                "Phi-3-medium-128k": WORKSPACE_DIC + "/Phi/Phi3/Phi-3-mini-4k-instruct-q4.gguf",
               #"LLama-3-70b": WORKSPACE_DIC + "/Llama/LLama3/Meta-Llama-3-70B-Instruct-v2.Q4_K_M.gguf",
               #"Mixtral-8x22b": WORKSPACE_DIC + "/Mixtral/Mixtral-8x22b-Instruct",
               #"Mixtral-8x-22b": WORKSPACE_DIC + "/Mixtral/Mixtral-8x22B-Instruct-v0.1.Q4_K_M-00001-of-00002.gguf",
              }

In [20]:
# Create a document object
doc = fitz.open(BOOK_PDF)

content_of_pdf = ""

#Iterate over all pages in the documents
#for i in range(doc.page_count):
for i in range(0, 3):
  page = doc.load_page(i)

  # read text and print it
  text = page.get_text()

  #Add the text to a string to be used in the prompt
  content_of_pdf = content_of_pdf + text


  # Extract all the images on the page and save the images
  for i in page.get_images(full=True):
    xref = i[0]
    base_image = doc.extract_image(xref)
    image_bytes = base_image["image"]
    image = fitz.Pixmap(doc, xref)
    with open(f'{PATH}/image_{xref}.png', 'wb') as f:
      f.write(image.tobytes())

  # Extract all the tables on the page and save the tables
  tabs = page.find_tables()  # detect the tables
  for i,tab in enumerate(tabs):  # iterate over all tables
      print(f"Table {i} column names: {tab.header.names}, external: {tab.header.external}")
      tab = tabs[i]
      df = tab.to_pandas()
      #Add the table to a string to be used in the prompt
      content_of_pdf += df.to_string()


In [26]:
content_of_pdf = content_of_pdf.replace('\n', ' ')
print(content_of_pdf)

          MEDJACK Attacks:   The Scariest Part of the Hospital                                            Sinclair Meggitt  Comp 116  Tufts University  December 12th, 2018            Table of Contents      Abstract 2  Introduction 2  To the Community 2  Medical Device Vulnerabilities 3  I. The Internet of Things 3  II. A Black Hole 3  MEDJACK Attack 3  I. History 3  II. Anatomy of Attack 4  III. Malware 4  MEDJACK Defense 5  I. Remediation 5  II. Recommendations and Best Practices 5  Conclusion 6  Works Cited 7                                  Abstract  As of 2015, the healthcare industry became the most attacked industry, experiencing 32.7% of all  known breaches nationwide. (TrapX, 2015) The increased targeting is due to three main reasons:  patient records are extremely valuable. the healthcare industry is notoriously slow to evolve  making it an easy target, and hospitals will pay ransom for life or death information. (James,  Simon, 2017) One form of attack, known as a MEDJACK or 

In [34]:
model_path = WORKSPACE_DIC + "/Phi/Phi3/Phi-3-mini-4k-instruct-q4.gguf"
prompt_template = PromptTemplate.from_template(PROMPT)

llm = LlamaCpp(
    model_path= model_path,
    n_gpu_layers=-1,
    n_batch=512,
    n_ctx=1100,
    temperature=1,
    top_p=1,
    max_tokens = 3000,
    #callback_manager=callback_manager,
    verbose=True,  # Verbose is required to pass to the callback manager
)

#Create text splitter to split the text into chunks
text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size=1000,
    chunk_overlap=200,
)

# Split the text into chunks
docs = text_splitter.split_text(content_of_pdf)
embedding_function = SentenceTransformerEmbeddings(model_name= "sentence-transformers/all-MiniLM-L6-v2")
db = Chroma.from_texts(docs, embedding_function)



#Retrieve and generate using the relevant snippets of the blog
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 20})

retrieved_docs = retriever.invoke("What is the most important part of a MEDJACK?")

for doc in retrieved_docs:
    print(doc.page_content)


# prompt = hub.pull("rlm/rag-prompt")

# def format_docs(docs):
#     return "\n\n".join(doc.page_content for doc in docs)

# chain = prompt_template | llm

# rag_chain = (
#     {"context": retriever | format_docs, "question": RunnablePassthrough()}
#     | prompt
#     | llm
#     | StrOutputParser()
# )

# #Print the complete chain which is given to the llm
# print(rag_chain)

# rag_chain.invoke("When was this paper MEDJACK Attacks: The Scariest Part of the Hospital published?") 

llama_model_loader: loaded meta data with 24 key-value pairs and 195 tensors from /hkfs/work/workspace_haic/scratch/sb7059-llm_models_jeremy/Phi/Phi3/Phi-3-mini-4k-instruct-q4.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = phi3
llama_model_loader: - kv   1:                               general.name str              = Phi3
llama_model_loader: - kv   2:                        phi3.context_length u32              = 4096
llama_model_loader: - kv   3:                      phi3.embedding_length u32              = 3072
llama_model_loader: - kv   4:                   phi3.feed_forward_length u32              = 8192
llama_model_loader: - kv   5:                           phi3.block_count u32              = 32
llama_model_loader: - kv   6:                  phi3.attention.head_count u32              = 32
llama_model_loader:

MEDJACK Attacks:   The Scariest Part of the Hospital
MEDJACK Attacks:   The Scariest Part of the Hospital
MEDJACK Attacks:   The Scariest Part of the Hospital
MEDJACK Attacks:   The Scariest Part of the Hospital
MEDJACK Attacks:   The Scariest Part of the Hospital
this  paper will be to outline why MEDJACK attacks are so effective and what actions need to be
this  paper will be to outline why MEDJACK attacks are so effective and what actions need to be
this  paper will be to outline why MEDJACK attacks are so effective and what actions need to be
this  paper will be to outline why MEDJACK attacks are so effective and what actions need to be
this  paper will be to outline why MEDJACK attacks are so effective and what actions need to be
MEDJACK attacks are difficult to prevent, detect, and remediate thus making them the  perfect
MEDJACK attacks are difficult to prevent, detect, and remediate thus making them the  perfect
MEDJACK attacks are difficult to prevent, detect, and remediate thu