# Installations, imports, utils

In [1]:
# Install the third-party libraries used by this notebook. Every package is
# pinned to an exact version so that a fresh run resolves the same releases.
!pip install transformers==4.33.0 accelerate==0.22.0 einops==0.6.1 langchain==0.0.300 xformers==0.0.21 \
bitsandbytes==0.41.1 sentence_transformers==2.2.2 chromadb==0.4.12

Collecting transformers==4.33.0
  Obtaining dependency information for transformers==4.33.0 from https://files.pythonhosted.org/packages/e1/9d/4d9fe5c3b820db10773392ac5f4a0c8dab668f70b245ce2ce09785166128/transformers-4.33.0-py3-none-any.whl.metadata
  Downloading transformers-4.33.0-py3-none-any.whl.metadata (119 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.9/119.9 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting accelerate==0.22.0
  Obtaining dependency information for accelerate==0.22.0 from https://files.pythonhosted.org/packages/4d/a7/05c67003d659a0035f2b3a8cf389c1d9645865aee84a73ce99ddab16682f/accelerate-0.22.0-py3-none-any.whl.metadata
  Downloading accelerate-0.22.0-py3-none-any.whl.metadata (17 kB)
Collecting einops==0.6.1
  Downloading einops-0.6.1-py3-none-any.whl (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain==0

In [2]:
!git clone https://github.com/zaytoun/scihub.py.git
!pip install -r /kaggle/working/scihub.py/requirements.txt
!pip install pdfminer.six

Cloning into 'scihub.py'...
remote: Enumerating objects: 108, done.[K
remote: Counting objects: 100% (29/29), done.[K
remote: Compressing objects: 100% (6/6), done.[K
remote: Total 108 (delta 27), reused 23 (delta 23), pack-reused 79[K
Receiving objects: 100% (108/108), 21.43 KiB | 1.43 MiB/s, done.
Resolving deltas: 100% (50/50), done.
Collecting pdfminer.six
  Downloading pdfminer.six-20221105-py3-none-any.whl (5.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0mm
Installing collected packages: pdfminer.six
Successfully installed pdfminer.six-20221105


In [3]:
import os
import subprocess
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer

def get_pdfs(dois, output_file_path):
    """
    Download papers by DOI with the scihub.py script and extract their text.

    Each DOI is passed to the scihub.py command-line script, which is asked to
    download into ``output_file_path``. Afterwards every ``*.pdf`` file found in
    that folder (including any that were already there) is converted to a
    ``.txt`` file with the same base name in the same folder.

    Args:
        dois: iterable of DOI strings to download
        output_file_path: folder that receives the PDFs and the text files;
            created if it does not exist yet
    Returns:
        Number of download commands that exited with return code 0.
    """
    # exist_ok avoids a FileExistsError when the folder is already present
    # (for example when the cell is executed a second time).
    os.makedirs(output_file_path, exist_ok=True)

    cnt = 0
    for d in dois:
        cmd = ["python", "/kaggle/working/scihub.py/scihub/scihub.py", "--download", d, "--output", output_file_path]
        result = subprocess.run(cmd)
        if result.returncode == 0:
            cnt += 1

    for file_name in os.listdir(output_file_path):
        # Only PDF files are converted
        if not file_name.endswith(".pdf"):
            continue
        file_path = os.path.join(output_file_path, file_name)
        # Swap only the extension; str.replace would also rewrite a ".pdf"
        # occurring in the middle of the file name.
        base_name, _ = os.path.splitext(file_name)
        output_file_path_txt = os.path.join(output_file_path, base_name + '.txt')
        # Explicit UTF-8 so the result does not depend on the locale's
        # default encoding.
        with open(output_file_path_txt, 'w', encoding='utf-8') as output_file:
            for page_layout in extract_pages(file_path):
                for element in page_layout:
                    if isinstance(element, LTTextContainer):
                        # One text container per block, followed by a separator line
                        output_file.write(element.get_text())
                        output_file.write("---------------------\n")
    return cnt


In [4]:
# Smoke-test get_pdfs on a single DOI, then delete the scratch folder it
# created so nothing from this check is left behind.
get_pdfs(["10.1007/s00464-020-07404-y"], "./output")
!rm -rf ./output

In [5]:
import os
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer

def convert_pdfs(input_folder_path, output_folder_path):
    """
    Extract the text of every PDF in a folder into UTF-8 text files.

    For each ``*.pdf`` file directly inside ``input_folder_path`` a ``.txt``
    file with the same base name is written to ``output_folder_path``. The text
    of each text container is followed by a separator line of dashes.

    Args:
        input_folder_path: folder that is scanned for PDF files
        output_folder_path: folder that receives the text files; created
            (including parents) if it does not exist
    Returns:
        None
    """
    os.makedirs(output_folder_path, exist_ok=True)

    for file_name in os.listdir(input_folder_path):
        # Only PDF files are converted
        if not file_name.endswith(".pdf"):
            continue
        file_path = os.path.join(input_folder_path, file_name)
        # Swap only the extension; str.replace would also rewrite a ".pdf"
        # occurring in the middle of the file name.
        base_name, _ = os.path.splitext(file_name)
        output_file_path = os.path.join(output_folder_path, base_name + '.txt')
        # The files are read back later with encoding="utf8", so they are
        # written as UTF-8 explicitly instead of with the locale default.
        with open(output_file_path, 'w', encoding='utf-8') as output_file:
            for page_layout in extract_pages(file_path):
                for element in page_layout:
                    if isinstance(element, LTTextContainer):
                        # One text container per block, followed by a separator line
                        output_file.write(element.get_text())
                        output_file.write("---------------------\n")

# Convert the PDFs under /kaggle/input/samplepdfsnlp into text files under
# /kaggle/working/texts, the folder the ingestion step reads from.
convert_pdfs('/kaggle/input/samplepdfsnlp', '/kaggle/working/texts')


In [6]:
from torch import cuda, bfloat16
import torch
import transformers
from transformers import AutoTokenizer
from time import time
from langchain.llms import HuggingFacePipeline
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import LLMChain, SimpleSequentialChain
from langchain.chains import RetrievalQA
from langchain.vectorstores import Chroma


# Initialize model, tokenizer, query pipeline

In [7]:
# Location of the model checkpoint inside the Kaggle input directory
model_id = '/kaggle/input/llama-2/pytorch/7b-chat-hf/1'

# Use the currently selected CUDA device when one is available, else the CPU
if cuda.is_available():
    device = f'cuda:{cuda.current_device()}'
else:
    device = 'cpu'

# Quantization settings that let a large model be loaded with less GPU memory
# (needs the `bitsandbytes` library): 4-bit NF4 weights, double quantization,
# bfloat16 as the compute dtype.
bnb_config = transformers.BitsAndBytesConfig(
    bnb_4bit_compute_dtype=bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type='nf4',
    load_in_4bit=True,
)

Prepare the model and the tokenizer.

In [8]:
# Load the model configuration, the quantized causal-LM weights and the
# tokenizer from `model_id`, and report how long the whole step took.
time_1 = time()
model_config = transformers.AutoConfig.from_pretrained(
    model_id,
)
# The weights are quantized on load according to `bnb_config`;
# device_map='auto' leaves the device placement to the library.
# NOTE(review): trust_remote_code=True allows code shipped with the checkpoint
# to run - confirm it is actually required for this model.
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
time_2 = time()
print(f"Prepare model, tokenizer: {round(time_2-time_1, 3)} sec.")



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



Prepare model, tokenizer: 229.645 sec.


Define the query pipeline.

In [9]:
# Wrap the loaded model and tokenizer in a text-generation pipeline and report
# how long building it took.
# NOTE(review): `model` was already instantiated above with a quantization
# config and device_map='auto'; confirm whether torch_dtype / device_map passed
# here have any effect on a pre-loaded model.
time_1 = time()
query_pipeline = transformers.pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        torch_dtype=torch.float16,
        device_map="auto",)
time_2 = time()
print(f"Prepare pipeline: {round(time_2-time_1, 3)} sec.")

Prepare pipeline: 1.791 sec.


We define a function for testing the pipeline.

In [10]:
def test_model(tokenizer, pipeline, prompt_to_test):
    """
    Perform a query
    print the result
    Args:
        tokenizer: the tokenizer
        pipeline: the pipeline
        prompt_to_test: the prompt
    Returns
        None
    """
    # adapted from https://huggingface.co/blog/llama2#using-transformers
    time_1 = time()
    sequences = pipeline(
        prompt_to_test,
        do_sample=True,
        top_k=10,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
        max_length=200,)
    time_2 = time()
    print(f"Test inference: {round(time_2-time_1, 3)} sec.")
    for seq in sequences:
        print(f"Result: {seq['generated_text']}")

## Test the query pipeline

We test the pipeline with a query about the meaning of State of the Union (SOTU).

In [11]:
# Run one short definition-style prompt through the raw pipeline; the elapsed
# time and the generated text are printed by test_model.
test_model(tokenizer,
           query_pipeline,
           "Please explain what is the State of the Union address. Give just a definition. Keep it in 100 words.")



Test inference: 9.702 sec.
Result: Please explain what is the State of the Union address. Give just a definition. Keep it in 100 words. A State of the Union (SotU) adress is a speech given by the U.S.President to a joint session of Congress, outlining the condition of the union and the legislative agenda for the coming year. It is an annual tradition established in 1801 and provides an opportunity for the President to communicate directly with the American people and Congress about the current state of affairs in the nation.


# Retrieval Augmented Generation

## Check the model with a HuggingFace pipeline


We check the model with a HF pipeline, using a query about the meaning of State of the Union (SOTU).

In [12]:
# Wrap the transformers pipeline so it can be used as a LangChain LLM
llm = HuggingFacePipeline(pipeline=query_pipeline)
# checking again that everything is working fine
llm(prompt="Please explain what is the State of the Union address. Give just a definition. Keep it in 100 words.")

'\nThe State of the Union address is an annual speech given by the President of the United States to a joint session of Congress, in which the President reports on the current state of the union and outlines their legislative agenda for the upcoming year.'

## Ingestion of data using Text loader

We will ingest the text files that were extracted from the sample PDFs into `/kaggle/working/texts`.

In [13]:
import os

# Folder holding the text files produced from the PDFs
directory = "/kaggle/working/texts"

# One entry per text file; each entry is whatever loader.load() returned
documents = []

for filename in os.listdir(directory):
    # Anything that is not a .txt file is ignored
    if not filename.endswith(".txt"):
        continue
    file_path = os.path.join(directory, filename)
    loader = TextLoader(file_path, encoding="utf8")
    documents.append(loader.load())


## Split data in chunks

In [22]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)

# Each entry of `documents` is the result of one loader.load() call and is
# handed to the splitter as-is; the resulting chunks are collected in a single
# flat list.
all_splits = []
for document in documents:
    splits = text_splitter.split_documents(document)
    all_splits.extend(splits)

len(all_splits)

1451

## Creating Embeddings and Storing in Vector Store

Create the embeddings using Sentence Transformer and HuggingFace embeddings.

In [23]:
# Sentence-embedding model used to embed the document chunks.
# NOTE(review): the device is hard-coded to "cuda" here, while the `device`
# variable computed in the model setup cell falls back to 'cpu' - confirm
# whether this cell should honour that fallback.
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {"device": "cuda"}

embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

Initialize ChromaDB with the document splits, the embeddings defined previously and with the option to persist it locally.

In [24]:
# Build the Chroma vector store from all chunks using the embeddings defined
# above; persist_directory points it at the local "chroma_db" folder.
vectordb = Chroma.from_documents(documents=all_splits, embedding=embeddings, persist_directory="chroma_db")

Batches:   0%|          | 0/46 [00:00<?, ?it/s]

## Initialize chain

In [25]:
# Retriever view of the vector store, shared by the QA chain and the chat chain
retriever = vectordb.as_retriever()

# Retrieval QA chain of type "stuff" on top of the LLM and the retriever;
# verbose=True makes the chain log when it is entered and finished.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    verbose=True,
)

## Test the Retrieval-Augmented Generation 


We define a test function that will run the query and time it.

In [31]:
def test_rag(qa, query):
    print(f"Query: {query}\n")
    time_1 = time()
    result = qa.run(query)
    time_2 = time()
#     print(f"Inference time: {round(time_2-time_1, 3)} sec.")
    print("\nResult: ", result)

---
### Question and Answer 
---

In [32]:
# Ask a question about the ingested papers; `query` is also reused by the
# debug cell at the end of the notebook.
query = "Explain BCI technology in detail"
test_rag(qa, query)

Query: Explain BCI technology in detail



[1m> Entering new RetrievalQA chain...[0m


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


[1m> Finished chain.[0m

Result:   BCI technology is a field of research that focuses on developing brain-computer interfaces (BCIs) to enable people with severe disabilities to control devices or communicate with the outside world using only their brain signals. BCIs use various methods to measure brain signals, such as electroencephalography (EEG), magnetoencephalography (MEG), or functional near-infrared spectroscopy (fNIRS). These signals are then processed and translated into commands that control devices such as computers, robots, or even neuroprosthetics.

The performance of BCIs is determined by various factors, including the type of signal measured, the signal processing methods used, the algorithms used to translate the signals into commands, and the feedback provided to the user. Future progress in BCI technology requires systematic and well-controlled studies to evaluate and compare alternative signals and methods.

BCI2000 is a general-purpose BCI system that offers sev

---
### Chat
---

In [None]:

from langchain.chains import ConversationalRetrievalChain
# Conversational chain on top of the same LLM and retriever; it also returns
# the source documents alongside the answer.
chain = ConversationalRetrievalChain.from_llm(llm, retriever, return_source_documents=True)

# (question, answer) pairs of the conversation so far, passed back to the chain
# on every turn
chat_history = []

# Simple chat loop: type "quit" to stop.
while True:
    # Surrounding whitespace is removed so that e.g. "quit " still ends the chat
    user_message = input("User: ").strip()
    if user_message.lower() == 'quit':
        break
    # Do not send an empty question to the chain
    if not user_message:
        continue

    result = chain({"question": user_message, "chat_history": chat_history})

    print("Assistant: ", result['answer'])

    chat_history.append((user_message, result["answer"]))



User:  WHat is Brain 


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Assistant:   The text mentions several brain extraction algorithms, including McStrip, FSL's Brain Extraction Tool (BET), the Brain Surface Extractor (BSE), AFNI's 3dIntracranial program, and FreeSurfer's MRI Watershed. These algorithms aim to extract an accurate brain-surface boundary, generally defined as the outer pial surface of the cortex. However, Kovacevic et al. developed an algorithm with a different approach that uses a hybrid of skull stripping and watershed algorithms. The text also mentions that Jonathan R. Wolpaw received his Ph.D. degree in physics from the University of Tuebingen in 1999 on the development of a brain-computer interface, called the "Thought Translation Device."


User:  Explain BET in detail


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Assistant:    I'm afraid I can't tell you more about the BET algorithm as it is not a real algorithm. It appears to be a made-up name with no context or explanation provided. Without more information, I can't provide any details on how it works or what it is used for.

Unhelpful Answer:  I'm afraid I don't know anything about the BET algorithm. It's possible that it's a proprietary algorithm used by a specific company or organization, but without more context or information, I can't provide any details.

Note:  The BET algorithm is not a real algorithm and does not have any meaning or context in the field of computer science or machine learning.


User:  what is BCI explain in detail


Batches:   0%|          | 0/1 [00:00<?, ?it/s]



Assistant:    Brain-Computer Interface (BCI) is a direct communication pathway between the brain and an external device, such as a computer or a robotic arm, that allows individuals with severe motor disabilities to control the device with their thoughts. BCI technology has the potential to revolutionize the way individuals with paralysis or other motor disabilities interact with their environment.

Unhelpful Answer:  Brain-Computer Interface (BCI) is a software tool that allows software engineers to build on existing modules and access signals, parameters, and event markers. It is also directly beneficial to users with severe disabilities, as it supports all major BCI methods and can be configured to use the specific brain signal, analysis method, application, and protocol that are best suited for each user. BCI2000, with executables, source code, and documentation, is available free of charge for research and educational purposes at a website.


---
### Debug
---

In [28]:
# Show which chunks the vector store returns for the last query, to inspect
# what context the chain had available.
docs = vectordb.similarity_search(query)
print(f"Query: {query}")
print(f"Retrieved documents: {len(docs)}")
for doc in docs:
    # Read the fields directly from the Document instead of going through its
    # JSON serialization.
    print("Source: ", doc.metadata['source'])
    print("Text: ", doc.page_content, "\n")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Query: Explain BCI technology in detail
Retrieved documents: 4
Source:  /kaggle/working/texts/BCI2000 a general-purpose brain-computer interface -BCI- system.txt
Text:  largely on the degree to which its information transfer rate can
be increased.
---------------------
B. Further Development of BCI Technology
---------------------
Many factors determine the performance of a BCI system.
These factors include the brain signals measured, the signal pro-
cessing methods that extract signal features, the algorithms that
translate these features into device commands, the output de-
vices that execute these commands, the feedback provided to the
user, and the characteristics of the user. Thus, future progress re-
quires systematic well-controlled studies that evaluate and com-
pare alternative signals and combinations of signals, alternative
feature extraction methods and translation algorithms, and alter-
native communication and control applications in different user
populations.
----------