References:
* https://medium.com/@nageshmashette32/building-a-document-based-question-answering-system-with-langchain-using-open-source-llm-model-3b49c0d4a8b8
* https://iamholumeedey007.medium.com/building-a-pdf-summarizer-with-langchain-a1dea8d2cd3a

In [1]:
!pip install langchain
!pip install chromadb
!pip install pypdf
!pip install faiss-cpu
!pip install git+https://github.com/huggingface/transformers
!pip install --upgrade huggingface_hub
!pip install --upgrade gradio
!export COMMANDLINE_ARGS="--no-gradio-queue"

Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/user/1012/pip-req-build-_pxp0c3w
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/user/1012/pip-req-build-_pxp0c3w
  Resolved https://github.com/huggingface/transformers to commit 2788f8d8d5f9cee2fe33a9292b0f3570bd566a6d
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone


In [2]:
from langchain.chains.summarize import load_summarize_chain
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from transformers import AutoTokenizer, pipeline, AutoModelForCausalLM
from langchain import HuggingFacePipeline

from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.text_splitter import CharacterTextSplitter
from huggingface_hub import login
import gradio as gr

import torch
import textwrap
import os
import glob


import api_tokens

  from .autonotebook import tqdm as notebook_tqdm
`AnnotionFormat` is deprecated and will be removed in v4.38. Please use `transformers.image_utils.AnnotationFormat` instead.
2023-12-14 11:06:21.943179: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-12-14 11:06:21.994148: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-14 11:06:21.994187: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-14 11:06:21.994214: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to regi

### Query PDF documents (Q&A)

* I tried several models from HuggingFace, such as Google Research's T5 and LongT5, but obtaining errors and truncated output with many configurations.
* I didn't want to use OpenAI's ChatGPT due to being paid for.
* The only one that worked without truncating the output was https://huggingface.co/meta-llama/Llama-2-7b-chat-hf.

**Note:** Requires asking for permission to the model's authors, but took only a few minutes.

##### Storing embeddings in a FAISS vector store

In [3]:
def store_embeddings_in_FAISS(base_dir: str) -> FAISS:
    loader = PyPDFDirectoryLoader(base_dir)
    docs = loader.load()

    text_splitter = CharacterTextSplitter(chunk_size=650, chunk_overlap=150)
    texts = text_splitter.split_documents(docs)
    embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
    db = FAISS.from_documents(texts, embeddings)

    return db

In [4]:
# def build_summarization_langchain_pipeline(model_name: str) -> HuggingFacePipeline:
#     tokenizer = AutoTokenizer.from_pretrained(model_name)
#     model = AutoModelForCausalLM.from_pretrained(model_name,
#                                                 device_map='auto',
#                                                 torch_dtype=torch.float16,
#                                                 use_auth_token=True,
#                                                 load_in_8bit=True,
#                                                 )

#     pipe = pipeline("text-generation",
#                     model=model,
#                     tokenizer= tokenizer,
#                     torch_dtype=torch.bfloat16,
#                     trust_remote_code=True,
#                     device_map="auto",
#                     max_length=3000, # Prevent issues when the document is too long
#                     do_sample=True,
#                     top_k=10,
#                     num_return_sequences=1,
#                     eos_token_id=tokenizer.eos_token_id
#                     )

#     llm = HuggingFacePipeline(pipeline=pipe,
#                                 model_kwargs={
#                                     'temperature':0 # Gives direct summaries and prevent randomness
#                                 },
#                             )
    
#     return llm

In [5]:
def build_langchain_pipeline(model_name: str) -> HuggingFacePipeline:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name,
                                                device_map='auto',
                                                torch_dtype=torch.float16,
                                                use_auth_token=True,
                                                load_in_8bit=True,
                                                )

    pipe = pipeline("text-generation",
                    model=model,
                    tokenizer= tokenizer,
                    torch_dtype=torch.bfloat16,
                    device_map="auto",
                    max_new_tokens = 1024,
                    do_sample=True,
                    top_k=10,
                    num_return_sequences=1,
                    eos_token_id=tokenizer.eos_token_id
                    )

    llm = HuggingFacePipeline(pipeline=pipe,
                                model_kwargs={
                                    'temperature':0
                                },
                            )
    
    return llm

In [6]:
embeddings_name = 'sentence-transformers/all-MiniLM-L6-v2'
model_name = 'meta-llama/Llama-2-7b-chat-hf'

login(token=api_tokens.HUGGINGFACEHUB_API_TOKEN)
#llm_summarization = build_summarization_langchain_pipeline(model_name)
llm = build_langchain_pipeline(model_name)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/dlopes/.cache/huggingface/token
Login successful


Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.50s/it]


##### Summarizing all pdfs in pdfs folder

In [7]:
def summarize_pdfs_from_folder(pdfs_folder, llm):
    summaries = []
    for pdf_file in glob.glob(pdfs_folder + "/*.pdf"):
        loader = PyPDFLoader(pdf_file)
        docs = loader.load_and_split()

        chain = load_summarize_chain(llm, chain_type="map_reduce")
        # template = """
        #       Write a summary of the following text delimited by triple backticks.
        #       Return your response which covers the key points of the text.
        #       ```{text}```
        #       SUMMARY:
        #    """

        # prompt = PromptTemplate(template=template, input_variables=["text"])
        # chain = LLMChain(prompt=prompt, llm=llm)

        summary = chain.run(docs)
        print("Summary for: ", pdf_file)
        print(summary)
        print("\n")
        summaries.append(summary)
    
    return summaries

In [8]:
pdf_folder = "./pdfs"
if not os.path.exists(pdf_folder):
    os.makedirs(pdf_folder)
#summaries = summarize_pdfs_from_folder(pdf_folder, llm_summarization)
summaries = summarize_pdfs_from_folder(pdf_folder, llm)

with open("summaries.txt", "w") as f:
    for summary in summaries:
        f.write(summary + "\n"*3)

Token indices sequence length is longer than the specified maximum sequence length for this model (3270 > 1024). Running this sequence through the model will result in indexing errors


Summary for:  ./pdfs/AI-IDS_Application_of_Deep_Learning_to_Real-Time_Web_Intrusion_Detection.pdf


The article discusses the application of deep learning to real-time web intrusion detection using AI-IDS. The system combines CNNs and LSTM networks to analyze HTTP traffic and detect malicious patterns. The system was evaluated on a real-world dataset and achieved high accuracy and precision. The authors also discuss the importance of scalability and precision in addition to accuracy when evaluating deep learning-based IDSs. The paper provides a comprehensive overview of current state of IDSs based on deep learning techniques and proposes a new approach that combines traditional IDSs with deep learning techniques.






Summary for:  ./pdfs/separating.pdf


This paper discusses the issue of privacy in encrypted tunnel traffic and proposes a novel deep neural network architecture to detect individual anomalous packets in a flow. The approach is tested on synthetic flows and real-world network traces, showing high accuracy in separating flows. The paper demonstrates that common packet-level encryption may not be sufficient to achieve high levels of privacy and highlights the relevance of the proposed approach in real-world scenarios. The authors propose an approach for analyzing encrypted tunnel traffic and evaluate it on several publicly available real-world network traces. They show that their proposed approach can separate packets from different flows with good accuracy, laying the groundwork for implementing intrusion detection on encrypted tunnel traffic.






Summary for:  ./pdfs/Comparison of machine-learning algorithms for classification of VPN network traffic flow using time-related features.pdf



Six machine-learning models (Random Forest, Gradient Boosting Tree, Naive Bayes, Decision Trees, Support Vector Machines, and Linear Regression) were compared for classifying VPN and non-VPN network traffic flow data based on time-related features from various network flow categories. The optimized Random Forest and Gradient Boosting Tree models were found to outperform the other models in terms of accuracy and computational efficiency. The study also showed that a small number of time-related features could achieve over 90% accuracy for each network flow category. The results can be employed in the development of fast rule-based classifiers.






Summary for:  ./pdfs/15008-Article Text-18527-1-2-20201228.pdf

The paper investigates the illegal streaming cyberlocker ecosystem, focusing on the characteristics of cyberlockers, their hosting providers, and the actions of copyright enforcers. The authors find that the system is highly centralized with a few networks, countries, and cyberlockers providing most of the content, and that copyright enforcers tend to target small subsets of the ecosystem with some success in removing content. They also observe that individual pirates operate multiple websites, and that there are third-party domains that facilitate the distribution of infringing content. The study provides insights into the structure and monetization strategies of cyberlockers and highlights the challenges of identifying and analyzing them.






Summary for:  ./pdfs/Multilayer_Perceptron_Neural_Network_fo...ction_of_Encrypted_VPN_Network_Traffic.pdf


The paper proposes a novel approach to detecting encrypted VPN traffic using a multilayer perceptron neural network. The approach is evaluated using a real-world VPN network dataset and shows high accuracy in detecting encrypted VPN traffic. The paper was published in the IEEE International Conference on Cyber Situational Awareness, Data Analytics and Assessment in 2018.




This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (4096). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.


Summary for:  ./pdfs/Accepted version.pdf
MSMS MSMSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS MSMS M

##### Querying pdfs in pdfs folder

In [9]:
def query_pdfs(query: str, qa: RetrievalQA) -> str:
    result = qa({"query": query})
    wrapped_text = textwrap.fill(result['result'], width=500)
    print(wrapped_text)
    return wrapped_text

In [10]:
db = store_embeddings_in_FAISS(pdf_folder)
qa = RetrievalQA.from_chain_type(llm=llm, 
                                 chain_type="stuff", 
                                 retriever=db.as_retriever(
                                     search_type="similarity", 
                                     search_kwargs={"k": 2}), # We are interested in the top 2 results
                                 return_source_documents=True)


In [11]:
query_pdfs("What is the core idea behind the Movie Pirates of the Caribbean paper?", qa)



 The paper explores the online video piracy (OVP) ecosystem, focusing on streaming cyberlockers, which are central to the distribution of pirated content. The authors characterize the content, streaming cyberlockers' individual attributes, and the actions of copyright enforcers. They find a centralized system with a few networks, countries, and cyberlockers dominating provisioning, and a remarkable success rate of copyright notices in removing content. Additional Information:  * The paper
performs the ﬁrst exploration of the new ecosystem of online video piracy. * It characterizes the content, as well as the streaming cyberlockers' individual attributes. * The authors investigate the actions of copyright enforcers and find a high success rate in removing content. * They identify a centralized system with a few networks, countries, and cyberlockers dominating provisioning. * The paper has implications for understanding modern copyright infringement and its impact on content
creators and

" The paper explores the online video piracy (OVP) ecosystem, focusing on streaming cyberlockers, which are central to the distribution of pirated content. The authors characterize the content, streaming cyberlockers' individual attributes, and the actions of copyright enforcers. They find a centralized system with a few networks, countries, and cyberlockers dominating provisioning, and a remarkable success rate of copyright notices in removing content. Additional Information:  * The paper\nperforms the ﬁrst exploration of the new ecosystem of online video piracy. * It characterizes the content, as well as the streaming cyberlockers' individual attributes. * The authors investigate the actions of copyright enforcers and find a high success rate in removing content. * They identify a centralized system with a few networks, countries, and cyberlockers dominating provisioning. * The paper has implications for understanding modern copyright infringement and its impact on content\ncreators 

### Build an interface

This uses Gradio to build a simple interface to query pdfs.

In [12]:
def query_pdfs_interface(query: str, pdfs_folder) -> str:
    db = store_embeddings_in_FAISS(pdf_folder)
    qa = RetrievalQA.from_chain_type(llm=llm, 
                                    chain_type="stuff", 
                                    retriever=db.as_retriever(
                                        search_type="similarity", 
                                        search_kwargs={"k": 2}), # We are interested in the top 2 results
                                    return_source_documents=True)

    result = qa({"query": query})
    wrapped_text = textwrap.fill(result['result'], width=500)
    return wrapped_text

In [13]:
def build_interface():
    interface = gr.Interface(
        fn = query_pdfs_interface,
        inputs = ["text", "text"],
        outputs = "text",
        title = "PDF Q&A",
        description = "This app allows you to query your PDF files.",
    ).launch(share=True)

In [14]:
build_interface()

Running on local URL:  http://127.0.0.1:7860


Exception in thread Thread-8:
Traceback (most recent call last):
  File "/home/dlopes/anaconda3/envs/pytorch2/lib/python3.9/site-packages/httpcore/_exceptions.py", line 10, in map_exceptions
    yield
  File "/home/dlopes/anaconda3/envs/pytorch2/lib/python3.9/site-packages/httpcore/_backends/sync.py", line 206, in connect_tcp
    sock = socket.create_connection(
  File "/home/dlopes/anaconda3/envs/pytorch2/lib/python3.9/socket.py", line 844, in create_connection
    raise err
  File "/home/dlopes/anaconda3/envs/pytorch2/lib/python3.9/socket.py", line 832, in create_connection
    sock.connect(sa)
socket.timeout: timed out

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/home/dlopes/anaconda3/envs/pytorch2/lib/python3.9/site-packages/httpx/_transports/default.py", line 66, in map_httpcore_exceptions
    yield
  File "/home/dlopes/anaconda3/envs/pytorch2/lib/python3.9/site-packages/httpx/_transports/default.py", line 228, 


Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.


2023/12/14 11:56:41 [W] [service.go:132] login to server failed: dial tcp 44.237.78.176:7000: i/o timeout
