In [None]:
import os
import re

import fitz
import pandas as pd
from langchain import PromptTemplate
from langchain import hub
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.embeddings import LlamaCppEmbeddings
from langchain_chroma import Chroma
from langchain_community.embeddings.sentence_transformer import (SentenceTransformerEmbeddings)
from langchain_community.llms import LlamaCpp
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from unidecode import unidecode

In [None]:
# Summarisation prompt template for tables and text; `{content_of_pdf}` is the
# single placeholder that receives the content to summarise.
PROMPT = """You are an assistant tasked with summarizing tables and text. 
Give a concise summary of the given table and text.
{content_of_pdf} """

In [None]:
# PDF that is processed by the cells below.
BOOK_PDF = '/home/iai/sb7059/git/llm_test/data/Book/industrial-cybersecurity-efficiently-monitor-the-cybersecurity-posture-of-your-ics-environment_compress.pdf'
# Directory into which the images extracted from the PDF are written.
PATH = '/home/iai/sb7059/git/llm_test/data/Book/Images'
#BOOK_PDF = '/home/iai/sb7059/git/llm_test/data/Book/fdgth-06-1321485.pdf'
#BOOK_PDF = '/home/iai/sb7059/git/llm_test/data/Book/smeggitt.pdf'
# Root directory of the locally stored model files.
WORKSPACE_DIC = "/hkfs/work/workspace_haic/scratch/sb7059-llm_models_jeremy"

# Maps a model label to the path of its GGUF file below WORKSPACE_DIC.
MODEL_PATH = { #"Mixtral-8x-7b": WORKSPACE_DIC + "/Mixtral/mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf",
               #"Phi-2": WORKSPACE_DIC + "/Phi/Phi2/phi-2.Q4_K_M.gguf",
               "Llama2-70b": WORKSPACE_DIC + "/Llama/Llama2/llama-2-70b.Q5_K_M.gguf",
               # NOTE(review): the label says "medium-128k" but the file name is the
               # Phi-3 *mini 4k* model - confirm which of the two is intended.
                "Phi-3-medium-128k": WORKSPACE_DIC + "/Phi/Phi3/Phi-3-mini-4k-instruct-q4.gguf",
               #"LLama-3-70b": WORKSPACE_DIC + "/Llama/LLama3/Meta-Llama-3-70B-Instruct-v2.Q4_K_M.gguf",
               #"Mixtral-8x22b": WORKSPACE_DIC + "/Mixtral/Mixtral-8x22b-Instruct",
               #"Mixtral-8x-22b": WORKSPACE_DIC + "/Mixtral/Mixtral-8x22B-Instruct-v0.1.Q4_K_M-00001-of-00002.gguf",
              }

In [162]:
def extract_tables_to_string(page):
    """
    Render every table detected on a page as text and join the results.

    Parameters:
    - page: The page object to inspect; it must provide ``find_tables()``.

    Returns:
    - One string made of each table's ``DataFrame.to_string()`` output, in
      detection order, with nothing inserted between consecutive tables.
      The string is empty when the page contains no table.
    """
    detected_tables = page.find_tables()  # detect the tables
    # Every table is converted to a DataFrame first and then to plain text
    rendered_tables = [table.to_pandas().to_string() for table in detected_tables]
    return "".join(rendered_tables)

def extract_and_save_images(page, doc, PATH):
    """
    Extracts all images from a given page and saves them as PNG files.

    Every image is written to ``<PATH>/image_<xref>.png``; a file that already
    exists under that name is overwritten.

    Parameters:
    - page: The page object from which to extract images.
    - doc: The document object containing the page.
    - PATH: The directory where images will be saved. It is created if it
      does not exist yet.
    """
    # Create the target directory up front so open() cannot fail on a missing folder
    os.makedirs(PATH, exist_ok=True)

    for image_info in page.get_images(full=True):
        xref = image_info[0]  # the first entry of each image tuple is its xref
        pixmap = fitz.Pixmap(doc, xref)
        # PNG can only hold grayscale or RGB data: pixmaps with more than three
        # colour components (e.g. CMYK) must be converted before serialising.
        if pixmap.n - pixmap.alpha > 3:
            pixmap = fitz.Pixmap(fitz.csRGB, pixmap)
        with open(os.path.join(PATH, f'image_{xref}.png'), 'wb') as f:
            f.write(pixmap.tobytes())

def extract_spans_from_blocks(block_dict):
    """
    Flatten the text spans of several pages into one DataFrame.

    Parameters:
    - block_dict: Mapping of page number to that page's list of block
      dictionaries. Blocks whose 'type' is 0 are text blocks and are expected
      to carry 'lines', each of which carries 'spans'.

    Returns:
    - A DataFrame with one row per span that contains more than spaces, with
      the columns xmin, ymin, xmax, ymax, text, is_bold, span_font, font_size
      and page_num. The text is transliterated with ``unidecode`` and a span
      counts as bold when its font name contains "bold" (case-insensitive).
    """
    column_names = ['xmin', 'ymin', 'xmax', 'ymax', 'text', 'is_bold', 'span_font', 'font_size', 'page_num']
    records = []

    for page_num, page_blocks in block_dict.items():
        for blk in page_blocks:
            # Only blocks of type 0 (text) are inspected
            if blk['type'] != 0:
                continue
            for text_line in blk['lines']:
                for piece in text_line['spans']:
                    left, top, right, bottom = list(piece['bbox'])
                    size = piece['size']
                    plain_text = unidecode(piece['text'])
                    font_name = piece['font']
                    bold = "bold" in font_name.lower()

                    # Spans consisting solely of spaces are not recorded
                    if not plain_text.replace(" ", ""):
                        continue
                    records.append((left, top, right, bottom, plain_text, bold, font_name, size, page_num))

    return pd.DataFrame(records, columns=column_names)

def get_title(span_df):
    """
    Build the title from the largest text found on page number 1.

    Parameters:
    - span_df: DataFrame of spans with the columns 'page_num', 'font_size'
      and 'text'.

    Returns:
    - The texts of all spans with ``page_num == 1`` whose font size equals the
      largest font size on that page, concatenated in row order without a
      separator. An empty string when there is no span for that page.
    """
    first_page_spans = span_df[span_df['page_num'] == 1]
    if first_page_spans.empty:
        return ""

    # Boldness is not considered here; only the font size selects title spans
    biggest_size = max(first_page_spans['font_size'].unique())
    title_parts = first_page_spans.loc[first_page_spans['font_size'] == biggest_size, 'text']
    return "".join(title_parts)

def extract_toc_as_df(pdf_path):
    """
    Read the table of contents of a PDF into a DataFrame.

    Parameters:
    - pdf_path: Path of the PDF file to open.

    Returns:
    - A DataFrame with the columns 'Level', 'Title' and 'Page', one row per
      entry returned by ``doc.get_toc()``.
    """
    # The context manager closes the document even if reading the outline raises
    with fitz.open(pdf_path) as doc:
        toc = doc.get_toc()

    # Convert TOC to DataFrame
    return pd.DataFrame(toc, columns=['Level', 'Title', 'Page'])

def extract_pdf_content(pdf_path, start_page, end_page):
    """
    Collect text and tables from a page range and display a layout analysis.

    For every page index in ``range(start_page, end_page)`` the page text and
    the text rendering of its tables are appended to the result (two entries
    per page), and the images of the page are saved into the module-level
    ``PATH`` directory. Afterwards the categorised spans of the whole range
    are shown with ``display``.

    Parameters:
    - pdf_path: Path of the PDF file to open.
    - start_page: Index of the first page to process (inclusive).
    - end_page: Index at which processing stops (exclusive).

    Returns:
    - A list of strings alternating between page text and table text. The list
      is empty when the page range is empty.
    """
    pdf_content = []
    block_dict = {}

    # The context manager closes the document even when a page fails to process
    with fitz.open(pdf_path) as pdf_document:
        page = None
        for page_number in range(start_page, end_page):
            page = pdf_document[page_number]
            # Block information of the page dictionary, used for the span analysis
            block_dict[page_number] = page.get_text('dict')['blocks']

            pdf_content.append(page.get_text())

            # Extract images from the page
            extract_and_save_images(page, pdf_document, PATH)

            # Extract tables from the page
            pdf_content.append(extract_tables_to_string(page))

        # The analysis reads the geometry of the last processed page, so it is
        # skipped for an empty range and has to run while the document is open.
        if page is not None:
            display(analyse_content(extract_spans_from_blocks(block_dict), page))

    return pdf_content


def analyse_content(span_df, page):
    """
    Sort text spans into layout categories by position, font size and boldness.

    Parameters:
    - span_df: DataFrame of spans with the columns 'text', 'font_size',
      'is_bold' and 'ymin'.
    - page: Page object; only ``page.rect.height`` is read, to derive the
      height of the running-header zone.

    Returns:
    - A dictionary of lists of span texts (a span may appear in several lists):
      * "site_title": spans whose ymin lies in the top 7 % of the page height
      * "headers": bold spans in the largest font size
      * "subheader": bold spans in the second largest font size
      * "subsubheader": other bold spans whose size differs from the most
        frequent font size
      * "important_texts": bold spans in the most frequent font size,
        independent of the header categories above
      * "picture_descriptions": spans containing "figure", "fig." or "image"
        (case-insensitive)
      * "page_content": every span
    """
    # Initialize lists to hold categorized content
    site_header = []
    headers = []
    subheader = []
    subsubheader = []
    important_texts = []
    picture_descriptions = []
    page_content = []

    # Font size statistics, computed once instead of once per row
    font_size_counts = span_df['font_size'].value_counts()
    most_common_size = font_size_counts.idxmax() if not font_size_counts.empty else None
    unique_font_sizes = sorted(span_df['font_size'].unique(), reverse=True)
    largest_size = unique_font_sizes[0] if unique_font_sizes else None
    second_largest_size = unique_font_sizes[1] if len(unique_font_sizes) > 1 else None

    # Spans starting above this y coordinate are treated as the running header
    header_threshold = page.rect.height * 0.07

    relevant_columns = span_df[['text', 'font_size', 'is_bold', 'ymin']]
    for text, font_size, is_bold, ymin in relevant_columns.itertuples(index=False, name=None):
        # Site Header - typically at the top of the page
        if ymin < header_threshold:
            site_header.append(text)

        # Header levels are mutually exclusive and only apply to bold text
        if is_bold and font_size == largest_size:
            headers.append(text)
        elif is_bold and font_size == second_largest_size:
            subheader.append(text)
        elif is_bold and font_size != most_common_size:
            subsubheader.append(text)

        # Bold text in the most frequent size; checked separately, so it can
        # overlap with the header categories above
        if is_bold and font_size == most_common_size:
            important_texts.append(text)

        # Check for figure descriptions
        lowered = text.lower()
        if "figure" in lowered or "fig." in lowered or "image" in lowered:
            picture_descriptions.append(text)

        # Default page content
        page_content.append(text)

    # Store results in a dictionary
    return {
        "site_title": site_header,
        "headers": headers,
        "subheader": subheader,
        "subsubheader": subsubheader,
        "important_texts": important_texts,
        "picture_descriptions": picture_descriptions,
        "page_content": page_content
    }

In [163]:
# Smoke test: run the extraction on the single page with index 43 and show the result
test = extract_pdf_content(pdf_path=BOOK_PDF, start_page=43, end_page=44)
print(test)

{'site_title': ['What is an ICS?     23'],
 'headers': ['The opportunities', 'The risk'],
 'subheader': ['Internet Protocol',
  'IP',
  'Transport Control Protocol',
  'TCP',
  'User Datagram Protocol',
  'UDP'],
 'subsubheader': [],
 'important_texts': ['Internet Protocol',
  'IP',
  'Transport Control Protocol',
  'TCP',
  'User Datagram Protocol',
  'UDP'],
 'picture_descriptions': [],
 'page_content': ['What is an ICS?     23',
  'Knowing that some of the processes an OT/ICS system controls are expected to run ',
  'flawlessly and uninterrupted for weeks--or even months--at a time, it is easy to see ',
  'why here, the availability requirement is king. Integrity is a close second, as we want ',
  'to make sure that data that the OT systems and operators are making decisions on is ',
  'free from error or manipulation. Confidentiality is hardly a major concern in a typical ',
  'OT environment, other than maybe with some historical data stored in a database or ',
  'in log files on 

["What is an ICS?     23\nKnowing that some of the processes an OT/ICS system controls are expected to run \nflawlessly and uninterrupted for weeks—or even months—at a time, it is easy to see \nwhy here, the availability requirement is king. Integrity is a close second, as we want \nto make sure that data that the OT systems and operators are making decisions on is \nfree from error or manipulation. Confidentiality is hardly a major concern in a typical \nOT environment, other than maybe with some historical data stored in a database or \nin log files on a server or PC. This is because by the time the data is used, it is pretty \nmuch worthless already; though as I said, stored production data, recipes, and—in some \ncases—the control applications that are stored on storage media do have some value, and \ntherefore the C in the CIA triad should not be completely ignored.\nThe opportunities\nWhy did IT and OT convergence occur? Wouldn't it make much more sense to keep the \ntwo separate

In [None]:
# Create a document object
doc = fitz.open(BOOK_PDF)

# Block information per page index, consumed by the span analysis cell
block_dict = {}
# Page texts and table renderings are collected here and joined once at the
# end, which avoids rebuilding one ever-growing string on every page.
content_parts = []

# Make sure the image directory exists before the first image is written
os.makedirs(PATH, exist_ok=True)

# Iterate over all pages in the document
# (limit the range, e.g. range(0, 100), to process only the first pages)
for page_index in range(doc.page_count):
    page = doc.load_page(page_index)
    # Block information of the page dictionary
    block_dict[page_index] = page.get_text('dict')['blocks']
    # Add the page text to the content used in the prompt
    content_parts.append(page.get_text())

    ### IMAGES ###
    # Extract all the images on the page and save them as PNG files
    for image_info in page.get_images(full=True):
        xref = image_info[0]
        image = fitz.Pixmap(doc, xref)
        # PNG can only hold grayscale or RGB data: convert e.g. CMYK first
        if image.n - image.alpha > 3:
            image = fitz.Pixmap(fitz.csRGB, image)
        with open(f'{PATH}/image_{xref}.png', 'wb') as f:
            f.write(image.tobytes())

    ## TABLES ##
    # Extract all the tables on the page and add them to the content
    for table_index, tab in enumerate(page.find_tables()):
        print(f"Table {table_index} column names: {tab.header.names}, external: {tab.header.external}")
        content_parts.append(tab.to_pandas().to_string())

content_of_pdf = "".join(content_parts)

In [None]:
# Flatten the text spans of every page into one DataFrame
# (same logic as extract_spans_from_blocks, applied to the whole document)
spans = pd.DataFrame(columns=['xmin', 'ymin', 'xmax', 'ymax', 'text', 'tag'])
rows = []
for page_num, blocks in block_dict.items():
    for block in blocks:
        # Only blocks of type 0 (text) are inspected
        if block['type'] != 0:
            continue
        for line in block['lines']:
            for span in line['spans']:
                xmin, ymin, xmax, ymax = list(span['bbox'])
                font_size = span['size']
                text = unidecode(span['text'])
                span_font = span['font']
                is_bold = "bold" in span_font.lower()
                # Spans consisting solely of spaces are not recorded
                if text.replace(" ", ""):
                    rows.append((xmin, ymin, xmax, ymax, text, is_bold, span_font, font_size, page_num))

# Build the DataFrame once, after all rows are collected: this keeps the cell
# linear in the number of spans and defines span_df even when no span was found.
span_df = pd.DataFrame(rows, columns=['xmin','ymin','xmax','ymax', 'text','is_bold','span_font', 'font_size', 'page_num'])

In [None]:
# Select the spans whose text mentions "table of content", ignoring case
toc_mask = span_df['text'].str.contains("table of content", case=False)
toc = span_df[toc_mask]


In [None]:
# Print the complete extracted content (page texts and table renderings) for inspection
print(content_of_pdf)

In [None]:
# Prompt for summarising one text chunk; `{element}` receives the chunk.
# Adjacent literals are used instead of a backslash continuation: a trailing
# space after the backslash left a literal "\" and a newline in the prompt.
prompt_text = (
    "You are an assistant tasked with summarizing texts. "
    "Give a concise summary of the text. Text chunk: {element} "
)

In [None]:
# Path of the Phi-3 mini GGUF model file that is loaded through LlamaCpp below.
model_path = WORKSPACE_DIC + "/Phi/Phi3/Phi-3-mini-4k-instruct-q4.gguf"
llm = LlamaCpp(
    model_path= model_path,
    n_gpu_layers=-1,
    n_batch=4096,
    n_ctx=4096,
    temperature=1,
    top_p=1,
    # NOTE(review): max_tokens (5000) is larger than n_ctx (4096) - confirm the
    # intended generation limit.
    max_tokens = 5000,
    #callback_manager=callback_manager,
    verbose=True,  # Verbose is required to pass to the callback manager
)

In [None]:
# NOTE(review): prompt_template is not referenced by any other visible cell;
# the chain below is built from `prompt` (based on prompt_text) instead.
prompt_template = PromptTemplate.from_template(PROMPT)

#Create text splitter to split the text into chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1024,
    chunk_overlap=128,
)

# Split the text into chunks
texts = text_splitter.split_text(content_of_pdf)

prompt = PromptTemplate.from_template(prompt_text)

# Each chunk is passed through unchanged as the `element` template variable,
# sent to the LLM, and the model output is parsed into a plain string.
summarize_chain = {"element": lambda x: x} | prompt | llm | StrOutputParser()

# NOTE(review): max_concurrency=50 allows parallel calls against the single
# local `llm` instance - confirm the backend tolerates concurrent use.
text_summaries = summarize_chain.batch(texts, {"max_concurrency": 50})

In [None]:
# Save the summaries to a file, one summary per line
with open('summaries.txt', 'w', encoding='utf-8') as f:
    for item in text_summaries:
        # A summary may span several lines; flatten it so that the line-based
        # loader reads back exactly one entry per summary.
        f.write(" ".join(str(item).splitlines()) + "\n")

In [None]:
# Load the summaries from the file
with open('summaries.txt', 'r', encoding='utf-8') as f:
    # Drop the trailing newline of every entry and skip blank lines so that no
    # empty documents end up in the vector store.
    text_summaries = [line.strip() for line in f if line.strip()]

In [None]:
embedding_function = SentenceTransformerEmbeddings(model_name= "sentence-transformers/all-mpnet-base-v2")

# Index the summaries in a Chroma vector store
db = Chroma.from_texts(text_summaries, embedding_function)

# Similarity retriever configured with k=6. It was previously replaced further
# down by a default `db.as_retriever()`, which silently discarded this setting.
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 6})

print(retriever)

# NOTE(review): template2 and template are not wired into the chain below,
# which uses the prompt pulled from the hub - confirm which prompt is intended.
template2 = """Answer the question based only on the following context, which can include text and tables:
{context}
Question: {question}
"""

template = """Create multiple choice question in one of the following format:
Question: Which two options are the best reasons to use an IPV4 private IP space? (Choose two.)
A. to enable intra-enterprise communication
B. to implement NAT
C. to connect applications
D. to conserve global address space
E. to manage routing overhead
Answer: AD

Question: The corporate security policy requires multiple elements to be matched in an authorization policy. Which elements can be combined to meet the requirement?
A. Device registration status and device activation status
B. Network access device and time condition
C. User credentials and server certificate
D. Built-in profile and custom profile
Answer: B

using the following context:
{context}
"""

# Prompt with `context` and `question` inputs, pulled from the LangChain hub
prompt = hub.pull("rlm/rag-prompt")


def format_docs(docs):
    """Join the page_content of the given documents, separated by blank lines."""
    return "\n\n".join(doc.page_content for doc in docs)


# Retrieve the matching summaries, format them as context and pass them
# together with the unchanged question to the prompt and the LLM.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

# Keep the result: the following cell displays `llm_answer`
llm_answer = rag_chain.invoke("""Create multiple choice question in one in the following format out of the provided context""")

In [None]:
# Show the generated answer; `llm_answer` must have been assigned by an earlier cell
display(llm_answer)