## 1. Read PDF files with research papers using Llama-Index

In [1]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings, SimpleDirectoryReader, VectorStoreIndex
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor

In [17]:
import os
import time
import pandas as pd

<b>1.1 Setup model for embeddings</b>

In [3]:
# import any embedding model on HF hub (https://huggingface.co/spaces/mteb/leaderboard)
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
# Settings.embed_model = HuggingFaceEmbedding(model_name="thenlper/gte-large") # alternative model

Settings.llm = None
Settings.chunk_size = 256
Settings.chunk_overlap = 25

LLM is explicitly disabled. Using MockLLM.


<b>1.2 Read files one by one from the directory</b>

In [4]:
reader = SimpleDirectoryReader(input_dir="stnets", recursive=True)
reader.__dict__

{'fs': <fsspec.implementations.local.LocalFileSystem at 0x1f3d6bc0580>,
 'errors': 'ignore',
 'encoding': 'utf-8',
 'exclude': None,
 'recursive': True,
 'exclude_hidden': True,
 'required_exts': None,
 'num_files_limit': None,
 'raise_on_error': False,
 'input_dir': WindowsPath('stnets'),
 'input_files': [WindowsPath('C:/Users/18623/Desktop/PhiAi/Jupyter/stnets/Applications_of_the_Streaming_Networks.pdf'),
  WindowsPath('C:/Users/18623/Desktop/PhiAi/Jupyter/stnets/arxiv_streaming_networks_ext.pdf'),
  WindowsPath('C:/Users/18623/Desktop/PhiAi/Jupyter/stnets/cvpr2020.pdf'),
  WindowsPath('C:/Users/18623/Desktop/PhiAi/Jupyter/stnets/Streaming_Networks_Enable_A_Robust_Classification_.pdf')],
 'file_extractor': {},
 'file_metadata': <llama_index.core.readers.file.base._DefaultFileMetadataFunc at 0x1f3b11d22e0>,
 'filename_as_id': False}

<b>1.3 Store Text chunks in a dict to trace chunk-paper mapping</b>

In [5]:
pdf_files = reader.__dict__['input_files']
all_docs = []
doc_dict = {}
for i,docs in enumerate(reader.iter_data()):
    pdf_name = os.path.basename(pdf_files[i])
    print(f'Extract from file {pdf_name}')
    if pdf_name in doc_dict:
        doc_dict[pdf_name] += [docs]
    else:
        doc_dict[pdf_name] = [docs]
    all_docs.extend(docs)

Extract from file Applications_of_the_Streaming_Networks.pdf
Extract from file arxiv_streaming_networks_ext.pdf
Extract from file cvpr2020.pdf
Extract from file Streaming_Networks_Enable_A_Robust_Classification_.pdf


<b>1.4 Inspect Llama-Index document class</b>

In [6]:
doc_dict['Applications_of_the_Streaming_Networks.pdf'][0][0].__dict__

{'id_': '2f5ab9e9-5989-44dc-b555-f6898eaa1ffb',
 'embedding': None,
 'metadata': {'page_label': '1',
  'file_name': 'Applications_of_the_Streaming_Networks.pdf',
  'file_path': 'C:\\Users\\18623\\Desktop\\PhiAi\\Jupyter\\stnets\\Applications_of_the_Streaming_Networks.pdf',
  'file_type': 'application/pdf',
  'file_size': 1175055,
  'creation_date': '2024-12-03',
  'last_modified_date': '2024-12-03'},
 'excluded_embed_metadata_keys': ['file_name',
  'file_type',
  'file_size',
  'creation_date',
  'last_modified_date',
  'last_accessed_date'],
 'excluded_llm_metadata_keys': ['file_name',
  'file_type',
  'file_size',
  'creation_date',
  'last_modified_date',
  'last_accessed_date'],
 'relationships': {},
 'text': 'See discussions, stats, and author profiles for this publication at: https://www.researchgate.net/publication/340225121\nApplications of the Streaming Networks\nConference Paper · March 2020\nCITATIONS\n2\nREADS\n211\n2 authors:\nSergey Tarasenko\n34 PUBLICATIONS\xa0\xa0\xa03

<b>1.5 Get total number of collected chunks</b>

In [7]:
len(all_docs)

37

<b>1.6 Preprocess chunk to drop first page and  Reshape a way to store the Llama-index docs</b>

In [8]:
articles = list(doc_dict.keys())

for key in articles:
    clean_docs = []
    for doc in doc_dict[key][0]:
        if 'See discussions, stats' in doc.text:
            continue
        clean_docs.append(doc)
    doc_dict[key] = clean_docs
    

<b>1.7 Get chuck distribution accross papers</b>

In [9]:
for key in doc_dict:
    num_chunks = len(doc_dict[key])
    print(f'{key} >>> {num_chunks} chunks')

Applications_of_the_Streaming_Networks.pdf >>> 4 chunks
arxiv_streaming_networks_ext.pdf >>> 17 chunks
cvpr2020.pdf >>> 3 chunks
Streaming_Networks_Enable_A_Robust_Classification_.pdf >>> 10 chunks


<b>1.8 Collect all chunks for for each paper in one text</b>

In [10]:
articles = list(doc_dict.keys())
papers = {} 

for paper_name in articles:
    papers[paper_name] = ''
    for doc in doc_dict[paper_name]:
        papers[paper_name] += ' ' + doc.text

In [11]:
for paper_name in articles:
    papers[paper_name] = papers[paper_name].replace('.\n','. ')
    papers[paper_name] = papers[paper_name].replace('\n',' ')    

In [12]:
#papers[articles[3]]

## 2. Get Research Paper Summary

In [13]:
from transformers import BigBirdPegasusForConditionalGeneration, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google/bigbird-pegasus-large-arxiv")

# by default encoder-attention is `block_sparse` with num_random_blocks=3, block_size=64
#model = BigBirdPegasusForConditionalGeneration.from_pretrained("google/bigbird-pegasus-large-arxiv")

# decoder attention type can't be changed & will be "original_full"
# you can change `attention_type` (encoder only) to full attention like this:
model = BigBirdPegasusForConditionalGeneration.from_pretrained("google/bigbird-pegasus-large-arxiv", 
                                                               attention_type="original_full")

# you can change `block_size` & `num_random_blocks` like this:
#model = BigBirdPegasusForConditionalGeneration.from_pretrained("google/bigbird-pegasus-large-arxiv", block_size=16, num_random_blocks=2)

<b>2.1 Summarize Papers</b>

In [14]:
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#pipe = pipeline(task="image-to-image", model="caidas/swin2SR-lightweight-x2-64", device=device)

In [15]:
summary_list = []
processed_papers = []
for i,paper in enumerate(articles):
    try:
        print(f'{i+1}/{len(articles)} process paper {paper}')
        time_start = time.time()
        inputs = tokenizer(papers[paper], return_tensors='pt')
        prediction = model.generate(**inputs)
        prediction = tokenizer.batch_decode(prediction)
        end_start = time.time()
        #print()
        print(f'Time taken {end_start - time_start}')
        #print(f'SUMMARY of {articles[0]}: \n {prediction}')
        processed_papers.append(paper)
        summary_list.append(prediction)
    except:
        print(f' >> SKIP: paper is tooo long')

1/4 process paper Applications_of_the_Streaming_Networks.pdf


Token indices sequence length is longer than the specified maximum sequence length for this model (8810 > 4096). Running this sequence through the model will result in indexing errors


Time taken 72.80806946754456
2/4 process paper arxiv_streaming_networks_ext.pdf
 >> SKIP: paper is tooo long
3/4 process paper cvpr2020.pdf
Time taken 57.43613600730896
4/4 process paper Streaming_Networks_Enable_A_Robust_Classification_.pdf
 >> SKIP: paper is tooo long


In [18]:
df_summary = pd.DataFrame({'paper':processed_papers,
                           'summary':summary_list
                          })

In [19]:
df_summary['summary'].values

array([list(['<s> in this paper , we present a new method for the classification of images of arbitrary shape and texture .<n> the method is based on the decomposition of the image into a sum of two parts .<n> the first part is a linear combination of a wavelet transform of the image and the second part is a projection of the wavelet function onto the subspace spanned by the input and the output of the wavelet .<n> we demonstrate the performance of the method by applying it to the problem of image classification . <n> [ [ section ] ] in this paper , we present a new method for the classification of images of arbitrary shape and texture .<n> the method is based on the decomposition of the image into a sum of a wavelet transform of the image and the second part is a projection of the wavelet function onto the subspace spanned by the input and the output of the wavelet .<n> we demonstrate the performance of the method by applying it to the problem of image classification . <n> [ [ section

## 3. QA: Ask questions about the papers

<b>3.1 Roberta-base-sqaud2-distilled model</b>

In [20]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline

In [21]:
dfs = {}

In [22]:
model_name = "deepset/roberta-base-squad2-distilled"

# a) Get predictions
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
nlp = pipeline('question-answering', 
               model=model_name, 
               tokenizer=model_name,
               device = device
              )


In [23]:
answer_list = []
score_list = []
for paper in articles:
    question = 'How to describe STNet in short?'
    QA_input = {
        'question': question,
        'context': papers[paper]
    }
    res = nlp(QA_input)
    #print(f'from paper {paper} \n we learn that answer to {question} is \n {res} \n')
    answer_list.append(res['answer'])
    score_list.append(res['score'])

dfs[question + model_name] = pd.DataFrame({'question':len(articles)*[question],
                                           'model name':len(articles)*[model_name],
                                           'paper':articles,
                                           'answer': answer_list,
                                           'score':score_list
                             })
dfs[question + model_name]

  attn_output = torch.nn.functional.scaled_dot_product_attention(


Unnamed: 0,question,model name,paper,answer,score
0,How to describe STNet in short?,deepset/roberta-base-squad2-distilled,Applications_of_the_Streaming_Networks.pdf,Streaming net- works,0.581112
1,How to describe STNet in short?,deepset/roberta-base-squad2-distilled,arxiv_streaming_networks_ext.pdf,Streaming Network,0.114451
2,How to describe STNet in short?,deepset/roberta-base-squad2-distilled,cvpr2020.pdf,Streaming networks,0.267675
3,How to describe STNet in short?,deepset/roberta-base-squad2-distilled,Streaming_Networks_Enable_A_Robust_Classificat...,pseudo-conv net,0.539523


In [24]:
answer_list = []
score_list = []
for paper in articles:
    question = 'what is main advantage of STNets?'
    QA_input = {
        'question': question,
        'context': papers[paper]
    }
    res = nlp(QA_input)
    #print(f'from paper {paper} \n we learn that answer to {question} is \n {res} \n')
    answer_list.append(res['answer'])
    score_list.append(res['score'])

dfs[question + model_name] = pd.DataFrame({'question':len(articles)*[question],
                              'model name':len(articles)*[model_name],
                              'paper':articles,
                              'answer': answer_list,
                              'score':score_list
                             })
dfs[question + model_name]



Unnamed: 0,question,model name,paper,answer,score
0,what is main advantage of STNets?,deepset/roberta-base-squad2-distilled,Applications_of_the_Streaming_Networks.pdf,robust recognition of corrupted images under v...,0.382967
1,what is main advantage of STNets?,deepset/roberta-base-squad2-distilled,arxiv_streaming_networks_ext.pdf,noise robustness,0.874971
2,what is main advantage of STNets?,deepset/roberta-base-squad2-distilled,cvpr2020.pdf,recognizing zero noise- corrupted images with ...,0.342385
3,what is main advantage of STNets?,deepset/roberta-base-squad2-distilled,Streaming_Networks_Enable_A_Robust_Classificat...,state-of-the-art performance,0.315538


In [25]:
answer_list = []
score_list = []
for paper in articles:
    question = 'what are key features of STNets?'
    QA_input = {
        'question': question,
        'context': papers[paper]
    }
    res = nlp(QA_input)
    #print(f'from paper {paper} \n we learn that answer to {question} is \n {res} \n')
    answer_list.append(res['answer'])
    score_list.append(res['score'])

dfs[question + model_name] = pd.DataFrame({'question':len(articles)*[question],
                              'model name':len(articles)*[model_name],
                              'paper':articles,
                              'answer': answer_list,
                              'score':score_list
                             })
dfs[question + model_name]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Unnamed: 0,question,model name,paper,answer,score
0,what are key features of STNets?,deepset/roberta-base-squad2-distilled,Applications_of_the_Streaming_Networks.pdf,capable of recognition of zero noise- corrupte...,0.066198
1,what are key features of STNets?,deepset/roberta-base-squad2-distilled,arxiv_streaming_networks_ext.pdf,hard- wired and input-induced sparsity,0.669279
2,what are key features of STNets?,deepset/roberta-base-squad2-distilled,cvpr2020.pdf,capable of recognizing zero noise- corrupted i...,0.316081
3,what are key features of STNets?,deepset/roberta-base-squad2-distilled,Streaming_Networks_Enable_A_Robust_Classificat...,images without any noise,0.00924


<b>3.2 HuggingFace examplar pipeline</b>

In [26]:
qa_model = pipeline("question-answering", 
                    device=device)

No model was supplied, defaulted to distilbert/distilbert-base-cased-distilled-squad and revision 564e9b5 (https://huggingface.co/distilbert/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [27]:
answer_list = []
score_list = []
for paper in articles:
    question = 'what is main advantage of STNets?'
    res = qa_model(question = question, context = papers[paper])
    #print(f'from paper {paper} \n we learn that answer to {question} is \n {res} \n')
    answer_list.append(res['answer'])
    score_list.append(res['score'])

dfs[question+'_nomodel'] = pd.DataFrame({'question':len(articles)*[question],
                                         'model name': len(articles)*['default'],
                                         'paper':articles,
                                         'answer': answer_list,
                                         'score':score_list
                                        })
dfs[question+'_nomodel']

Unnamed: 0,question,model name,paper,answer,score
0,what is main advantage of STNets?,default,Applications_of_the_Streaming_Networks.pdf,recognition of low- light images,0.111861
1,what is main advantage of STNets?,default,arxiv_streaming_networks_ext.pdf,higher capacity,0.696631
2,what is main advantage of STNets?,default,cvpr2020.pdf,moderate accuracy,0.185213
3,what is main advantage of STNets?,default,Streaming_Networks_Enable_A_Robust_Classificat...,increase conv net robustness against the noise,0.166835


In [28]:
answer_list = []
score_list = []
for paper in articles:
    question = 'How to describe STNet in short?'
    res = qa_model(question = question, context = papers[paper])
    #print(f'from paper {paper} \n we learn that answer to {question} is \n {res} \n')
    answer_list.append(res['answer'])
    score_list.append(res['score'])

dfs[question+'_nomodel'] = pd.DataFrame({'question':len(articles)*[question],
                                         'model name': len(articles)*['default'],
                                         'paper':articles,
                                         'answer': answer_list,
                                         'score':score_list
                                        })
dfs[question+'_nomodel']

Unnamed: 0,question,model name,paper,answer,score
0,How to describe STNet in short?,default,Applications_of_the_Streaming_Networks.pdf,hybrid STnets,0.570117
1,How to describe STNet in short?,default,arxiv_streaming_networks_ext.pdf,Kullback–Leibler divergence,0.302576
2,How to describe STNet in short?,default,cvpr2020.pdf,5- stream,0.726802
3,How to describe STNet in short?,default,Streaming_Networks_Enable_A_Robust_Classificat...,Streaming Net,0.474929


In [29]:
answer_list = []
score_list = []
for paper in articles:
    question = 'what are key features of STNets?'
    res = qa_model(question = question, context = papers[paper])
    #print(f'from paper {paper} \n we learn that answer to {question} is \n {res} \n')
    answer_list.append(res['answer'])
    score_list.append(res['score'])

dfs[question+'_nomodel'] = pd.DataFrame({'question':len(articles)*[question],
                                         'model name': len(articles)*['default'],
                                         'paper':articles,
                                         'answer': answer_list,
                                         'score':score_list
                                        })
dfs[question+'_nomodel']

Unnamed: 0,question,model name,paper,answer,score
0,what are key features of STNets?,default,Applications_of_the_Streaming_Networks.pdf,"lighting or weather condition, speed, distance...",0.537666
1,what are key features of STNets?,default,arxiv_streaming_networks_ext.pdf,higher capacity,0.316742
2,what are key features of STNets?,default,cvpr2020.pdf,capable of recognizing zero noise- corrupted i...,0.059329
3,what are key features of STNets?,default,Streaming_Networks_Enable_A_Robust_Classificat...,Spike- based strategies for rapid processing,0.19297


<b>3.3 Summarize QA results in one DataFrame</b>

In [30]:
qa_keys = list(dfs.keys())

merged_df = pd.concat([dfs[key] for key in qa_keys])

In [31]:
merged_df

Unnamed: 0,question,model name,paper,answer,score
0,How to describe STNet in short?,deepset/roberta-base-squad2-distilled,Applications_of_the_Streaming_Networks.pdf,Streaming net- works,0.581112
1,How to describe STNet in short?,deepset/roberta-base-squad2-distilled,arxiv_streaming_networks_ext.pdf,Streaming Network,0.114451
2,How to describe STNet in short?,deepset/roberta-base-squad2-distilled,cvpr2020.pdf,Streaming networks,0.267675
3,How to describe STNet in short?,deepset/roberta-base-squad2-distilled,Streaming_Networks_Enable_A_Robust_Classificat...,pseudo-conv net,0.539523
0,what is main advantage of STNets?,deepset/roberta-base-squad2-distilled,Applications_of_the_Streaming_Networks.pdf,robust recognition of corrupted images under v...,0.382967
1,what is main advantage of STNets?,deepset/roberta-base-squad2-distilled,arxiv_streaming_networks_ext.pdf,noise robustness,0.874971
2,what is main advantage of STNets?,deepset/roberta-base-squad2-distilled,cvpr2020.pdf,recognizing zero noise- corrupted images with ...,0.342385
3,what is main advantage of STNets?,deepset/roberta-base-squad2-distilled,Streaming_Networks_Enable_A_Robust_Classificat...,state-of-the-art performance,0.315538
0,what are key features of STNets?,deepset/roberta-base-squad2-distilled,Applications_of_the_Streaming_Networks.pdf,capable of recognition of zero noise- corrupte...,0.066198
1,what are key features of STNets?,deepset/roberta-base-squad2-distilled,arxiv_streaming_networks_ext.pdf,hard- wired and input-induced sparsity,0.669279


In [32]:
merged_df = merged_df.reset_index()
cols = list(merged_df.columns)
print(cols)
cols.remove('index')
merged_df = merged_df.groupby(['question','model name'])[cols].apply(lambda x: x)

['index', 'question', 'model name', 'paper', 'answer', 'score']


In [33]:
merged_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,question,model name,paper,answer,score
question,model name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
How to describe STNet in short?,deepset/roberta-base-squad2-distilled,0,How to describe STNet in short?,deepset/roberta-base-squad2-distilled,Applications_of_the_Streaming_Networks.pdf,Streaming net- works,0.581112
How to describe STNet in short?,deepset/roberta-base-squad2-distilled,1,How to describe STNet in short?,deepset/roberta-base-squad2-distilled,arxiv_streaming_networks_ext.pdf,Streaming Network,0.114451
How to describe STNet in short?,deepset/roberta-base-squad2-distilled,2,How to describe STNet in short?,deepset/roberta-base-squad2-distilled,cvpr2020.pdf,Streaming networks,0.267675
How to describe STNet in short?,deepset/roberta-base-squad2-distilled,3,How to describe STNet in short?,deepset/roberta-base-squad2-distilled,Streaming_Networks_Enable_A_Robust_Classificat...,pseudo-conv net,0.539523
How to describe STNet in short?,default,16,How to describe STNet in short?,default,Applications_of_the_Streaming_Networks.pdf,hybrid STnets,0.570117
How to describe STNet in short?,default,17,How to describe STNet in short?,default,arxiv_streaming_networks_ext.pdf,Kullback–Leibler divergence,0.302576
How to describe STNet in short?,default,18,How to describe STNet in short?,default,cvpr2020.pdf,5- stream,0.726802
How to describe STNet in short?,default,19,How to describe STNet in short?,default,Streaming_Networks_Enable_A_Robust_Classificat...,Streaming Net,0.474929
what are key features of STNets?,deepset/roberta-base-squad2-distilled,8,what are key features of STNets?,deepset/roberta-base-squad2-distilled,Applications_of_the_Streaming_Networks.pdf,capable of recognition of zero noise- corrupte...,0.066198
what are key features of STNets?,deepset/roberta-base-squad2-distilled,9,what are key features of STNets?,deepset/roberta-base-squad2-distilled,arxiv_streaming_networks_ext.pdf,hard- wired and input-induced sparsity,0.669279
