In [None]:
import fitz
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain_community.llms import LlamaCpp
from langchain.callbacks.manager import CallbackManager
import pandas as pd
from langchain import hub
from langchain_chroma import Chroma
from langchain import PromptTemplate
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain.embeddings import LlamaCppEmbeddings
from langchain_community.embeddings.sentence_transformer import (SentenceTransformerEmbeddings)
import re
from unidecode import unidecode

In [None]:
# Prompt template for summarising extracted PDF content (tables and text).
# `{content_of_pdf}` is its only placeholder; the summarisation cell turns this
# string into `prompt_template` via `PromptTemplate.from_template`.
PROMPT = """You are an assistant tasked with summarizing tables and text. 
Give a concise summary of the given table and text.
{content_of_pdf} """

In [60]:
# --- Locations (absolute, machine-specific paths) ---------------------------
# Root folder that holds the local GGUF model weights.
WORKSPACE_DIC = "/hkfs/work/workspace_haic/scratch/sb7059-llm_models_jeremy"

# PDF that is parsed by the extraction cell.
# Alternative inputs used during experimentation:
#   '/home/iai/sb7059/git/llm_test/data/Book/fdgth-06-1321485.pdf'
#   '/home/iai/sb7059/git/llm_test/data/Book/smeggitt.pdf'
BOOK_PDF = '/home/iai/sb7059/git/llm_test/data/Book/industrial-cybersecurity-efficiently-monitor-the-cybersecurity-posture-of-your-ics-environment_compress.pdf'

# Directory that receives the images extracted from the PDF pages.
PATH = '/home/iai/sb7059/git/llm_test/data/Book/Images'

# Display name -> weight file below WORKSPACE_DIC.
# NOTE(review): the key "Phi-3-medium-128k" points at a Phi-3-mini-4k file --
# confirm which model is actually intended.
# Currently disabled entries (relative to WORKSPACE_DIC):
#   "Mixtral-8x-7b":  "/Mixtral/mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf"
#   "Phi-2":          "/Phi/Phi2/phi-2.Q4_K_M.gguf"
#   "LLama-3-70b":    "/Llama/LLama3/Meta-Llama-3-70B-Instruct-v2.Q4_K_M.gguf"
#   "Mixtral-8x22b":  "/Mixtral/Mixtral-8x22b-Instruct"
#   "Mixtral-8x-22b": "/Mixtral/Mixtral-8x22B-Instruct-v0.1.Q4_K_M-00001-of-00002.gguf"
MODEL_PATH = {
    "Llama2-70b": f"{WORKSPACE_DIC}/Llama/Llama2/llama-2-70b.Q5_K_M.gguf",
    "Phi-3-medium-128k": f"{WORKSPACE_DIC}/Phi/Phi3/Phi-3-mini-4k-instruct-q4.gguf",
}

In [68]:
# Open the source PDF with PyMuPDF.
doc = fitz.open(BOOK_PDF)

# Layout blocks of every processed page, keyed by 0-based page index.
# The span-analysis cell walks this structure to recover font information.
block_dict = {}

# Text and table fragments in reading order; joined into `content_of_pdf`
# once after the loop instead of re-concatenating a growing string.
content_parts = []

# Only the first 10 pages are processed. The limit is clamped to the real page
# count so that documents shorter than that do not fail in `load_page`.
page_limit = min(10, doc.page_count)

for page_index in range(page_limit):
    page = doc.load_page(page_index)

    # Structured ("dict") view of the page; only the block list is kept.
    block_dict[page_index] = page.get_text('dict')['blocks']

    # Plain text of the page, later used as prompt material.
    content_parts.append(page.get_text())

    # Save every image referenced on the page as PNG, named by its xref.
    # A dedicated loop variable is used so the page index is never shadowed.
    for image_info in page.get_images(full=True):
        xref = image_info[0]
        image = fitz.Pixmap(doc, xref)
        # PNG output supports only grayscale/RGB; convert anything with more
        # colour components (e.g. CMYK) to RGB before serialising.
        if image.n - image.alpha > 3:
            image = fitz.Pixmap(fitz.csRGB, image)
        with open(f'{PATH}/image_{xref}.png', 'wb') as f:
            f.write(image.tobytes())

    # Detect the tables on the page and add their text rendering to the content.
    tabs = page.find_tables()
    for table_index, tab in enumerate(tabs):
        print(f"Table {table_index} column names: {tab.header.names}, external: {tab.header.external}")
        df = tab.to_pandas()
        # Trailing newline keeps the table from running into the next fragment.
        content_parts.append(df.to_string() + "\n")
        print(df)

# Full extracted content (page text followed by that page's tables).
content_of_pdf = "".join(content_parts)


In [81]:
# NOTE(review): `spans` is not read by any visible cell; kept so that cells
# outside this view which may reference it keep working.
spans = pd.DataFrame(columns=['xmin', 'ymin', 'xmax', 'ymax', 'text', 'tag'])

# One tuple per non-blank text span:
# (xmin, ymin, xmax, ymax, text, is_bold, span_font, font_size)
rows = []
for blocks in block_dict.values():
    for block in blocks:
        # Only blocks whose 'type' is 0 carry 'lines'/'spans' here
        # (PyMuPDF documents type 0 as a text block).
        if block['type'] != 0:
            continue
        for line in block['lines']:
            for span in line['spans']:
                # Transliterate to ASCII, then skip spans that are only spaces.
                text = unidecode(span['text'])
                if text.replace(" ", "") == "":
                    continue
                xmin, ymin, xmax, ymax = span['bbox']
                span_font = span['font']
                # Substring match on the font name; names such as
                # "OpenSans-Semibold" therefore also count as bold.
                is_bold = "bold" in span_font.lower()
                rows.append((xmin, ymin, xmax, ymax, text, is_bold, span_font, span['size']))

# Build the frame once, after all rows are collected. This avoids re-creating
# it for every span and guarantees `span_df` exists even when nothing matched.
span_df = pd.DataFrame(
    rows,
    columns=['xmin', 'ymin', 'xmax', 'ymax', 'text', 'is_bold', 'span_font', 'font_size'],
)

In [83]:
# Show the per-span table (bounding box, text, bold flag, font name, font size).
display(span_df)

Unnamed: 0,xmin,ymin,xmax,ymax,text,is_bold,span_font,font_size
0,63.000000,98.383789,258.197113,150.132812,Industrial,True,OpenSans-Bold,38.0
1,63.000000,147.383789,335.261108,199.132812,Cybersecurity,True,OpenSans-Bold,38.0
2,63.000000,203.383789,309.169647,255.132812,Second Edition,False,OpenSans-Italic,38.0
3,63.000000,317.898438,455.428436,339.687500,Efficiently monitor the cybersecurity posture ...,False,OpenSans,16.0
4,63.000000,338.898438,186.712448,360.687500,ICS environment,False,OpenSans,16.0
...,...,...,...,...,...,...,...,...
507,270.212585,531.557495,431.220245,543.813843,Find the beaconing process - netstat,True,OpenSans-Semibold,9.0
508,444.299591,531.557495,459.714813,543.813843,440,True,OpenSans-Semibold,9.0
509,270.212585,544.974854,411.231293,557.231201,Upload executable to VirusTotal,True,OpenSans-Semibold,9.0
510,444.299591,544.974854,459.714813,557.231201,454,True,OpenSans-Semibold,9.0


In [84]:
# Distinct font names and font sizes seen across all extracted spans.
unique_span_fonts = span_df['span_font'].unique()
unique_font_sizes = span_df['font_size'].unique()

print(unique_font_sizes)

# Title heuristic: concatenate, in table order and without any separator, the
# text of every span that is bold AND set in the largest font size.
# The maximum is computed once and the rows are selected with a boolean mask
# instead of re-evaluating max() for each row inside an iterrows() loop.
largest_font_size = span_df['font_size'].max()
is_title_span = span_df['is_bold'].astype(bool) & (span_df['font_size'] == largest_font_size)
title = "".join(span_df.loc[is_title_span, 'text'])
print(title)

[38.  16.  14.  11.  17.  10.  10.5 13.  18.  20.   9.  30. ]
Industrial Cybersecurity Contributors


In [57]:
# Sanity-check the extraction with a bounded preview; printing the complete
# document text would flood the notebook output.
print(content_of_pdf[:2000])

 
 
 
 
 
MEDJACK Attacks:  
The Scariest Part of the Hospital 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Sinclair Meggitt 
Comp 116 
Tufts University 
December 12th, 2018 
 
 
 
 
 
Table of Contents 
 
 
Abstract
2 
Introduction
2 
To the Community
2 
Medical Device Vulnerabilities
3 
I. The Internet of Things
3 
II. A Black Hole
3 
MEDJACK Attack
3 
I. History
3 
II. Anatomy of Attack
4 
III. Malware
4 
MEDJACK Defense
5 
I. Remediation
5 
II. Recommendations and Best Practices
5 
Conclusion
6 
Works Cited
7 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Abstract 
As of 2015, the healthcare industry became the most attacked industry, experiencing 32.7% of all 
known breaches nationwide. (TrapX, 2015) The increased targeting is due to three main reasons: 
patient records are extremely valuable. the healthcare industry is notoriously slow to evolve 
making it an easy target, and hospitals will pay ransom for life or death information. (James, 
Simon, 2017) One form of attack, known as a MEDJACK or 

In [None]:
# Prompt for the per-chunk summarisation chain; `{element}` receives one text chunk.
# Built from adjacent literals so the template is a single line: the former
# "\ " (backslash followed by a space) was not a line continuation and left a
# literal backslash plus a newline inside the prompt.
prompt_text = (
    "You are an assistant tasked with summarizing texts. "
    "Give a concise summary of the text. Text chunk: {element} "
)

In [None]:
# Weight file of the model used for summarisation and question generation.
model_path = f"{WORKSPACE_DIC}/Phi/Phi3/Phi-3-mini-4k-instruct-q4.gguf"

# llama.cpp-backed LLM wrapper. No callback manager is attached here.
# NOTE(review): max_tokens (5000) is larger than the context size n_ctx (4096)
# -- confirm this is intended.
llm = LlamaCpp(
    model_path=model_path,
    # Context window and batching
    n_ctx=4096,
    n_batch=4096,
    n_gpu_layers=-1,
    # Sampling / generation limits
    temperature=1,
    top_p=1,
    max_tokens=5000,
    verbose=True,
)

In [None]:
# Template objects: one built from PROMPT, one from the per-chunk prompt_text.
prompt_template = PromptTemplate.from_template(PROMPT)
prompt = PromptTemplate.from_template(prompt_text)

# Cut the extracted text into chunks of at most 512 characters, with 128
# characters of overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=128)
texts = text_splitter.split_text(content_of_pdf)

# Each chunk is passed through unchanged as `element`, rendered into the
# prompt, sent to the LLM, and the reply is reduced to a plain string.
summarize_chain = (
    {"element": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

# Summarise all chunks in one batch call.
text_summaries = summarize_chain.batch(texts, config={"max_concurrency": 5})


In [None]:
# Persist the generated summaries: every summary is written followed by "\n".
# NOTE(review): a summary that itself contains line breaks will occupy several
# lines in the file.
with open('summaries.txt', 'w') as out_file:
    out_file.writelines(f"{summary}\n" for summary in text_summaries)

In [None]:
# Read the saved file back as a list of lines; each entry keeps its trailing
# newline character.
with open('summaries.txt', 'r') as in_file:
    summaries = list(in_file)

In [None]:
# Embed the generated summaries with a sentence-transformers model and index
# them in a Chroma vector store.
embedding_function = SentenceTransformerEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2"
)
db = Chroma.from_texts(texts=text_summaries, embedding=embedding_function)

# Similarity retriever configured to return 6 entries per query.
retriever = db.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 6},
)

print(retriever)

# Question-answering prompt with `{context}` and `{question}` placeholders.
# NOTE(review): not referenced by the chain built in this cell.
template2 = """Answer the question based only on the following context, which can include text and tables:
{context}
Question: {question}
"""

# Few-shot prompt showing two example multiple-choice questions, followed by a
# `{context}` placeholder for the source material.
# NOTE(review): not referenced by the chain built in this cell, which uses the
# prompt pulled from the hub instead -- confirm whether it should be wired in.
template = """Create multiple choice question in one of the following format:
Question: Which two options are the best reasons to use an IPV4 private IP space? (Choose two.)
A. to enable intra-enterprise communication
B. to implement NAT
C. to connect applications
D. to conserve global address space
E. to manage routing overhead
Answer: AD

Question: The corporate security policy requires multiple elements to be matched in an authorization policy. Which elements can be combined to meet the requirement?
A. Device registration status and device activation status
B. Network access device and time condition
C. User credentials and server certificate
D. Built-in profile and custom profile
Answer: B

using the following context:
{context}
"""

# Prompt for the RAG chain, pulled from the LangChain hub.
# The retriever configured earlier in this cell (similarity search, k=6) is
# reused as-is; it is deliberately not re-created with `db.as_retriever()`,
# which would silently drop that configuration.
prompt = hub.pull("rlm/rag-prompt")


def format_docs(docs):
    """Merge retrieved documents into a single context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        The ``page_content`` values in input order, separated by one blank
        line; an empty string when ``docs`` is empty.
    """
    contents = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(contents)


# RAG pipeline: the incoming string is used both as the retrieval query (whose
# hits are merged by `format_docs` into `context`) and, unchanged, as
# `question`; the filled prompt goes to the LLM and the reply is returned as a
# plain string.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

# Keep the answer in `llm_answer` so that the following cell can display it;
# previously the result was discarded and `llm_answer` was never defined.
llm_answer = rag_chain.invoke("""Create multiple choice question in one in the following format out of the provided context""")
llm_answer

In [None]:
# Render the generated answer; expects `llm_answer` to exist in the kernel namespace.
display(llm_answer)