In [41]:
import sys
sys.path.append("..")
import os
import json
import math
import uuid
from rag_eval.doc_loader import RandomQueriesPaperSearchGraph
from rag_eval.chunk_eval import ChunkEvalGraph
from fcgb.cfg.precompiled import get_llm, get_checkpointer

import asyncio
from tqdm.notebook import tqdm as notebook_tqdm

In [42]:
def get_filename(file):
    """
    Return the base name of a file path with its last extension removed.

    Only the final extension is stripped ('archive.tar.gz' -> 'archive.tar'),
    so dotted identifiers such as '2110.03478v2.json' keep their inner dots.
    Names without any extension are returned unchanged instead of collapsing
    to an empty string.
    """
    # splitext only treats the last dot as an extension separator and leaves
    # extension-less names (and leading-dot names) intact.
    return os.path.splitext(os.path.basename(file))[0]

def get_files(path):
    """
    List the names of the regular files located directly inside `path`.

    Sub-directories are skipped; the returned names carry no directory prefix.
    """
    found = []
    for entry in os.listdir(path):
        full_path = os.path.join(path, entry)
        if os.path.isfile(full_path):
            found.append(entry)
    return found

def get_filenames_list(path):
    """
    List the extension-less names of the files located directly inside `path`.
    """
    return list(map(get_filename, get_files(path)))

def get_files_without(path, filenames):
    """
    List the files inside `path` whose extension-less name is absent from `filenames`.

    The returned entries keep their extensions.
    """
    remaining = []
    for candidate in get_files(path):
        if get_filename(candidate) in filenames:
            continue
        remaining.append(candidate)
    return remaining

def get_filenames_without(path, filenames):
    """
    List the extension-less names of the files inside `path` that are absent from `filenames`.
    """
    kept_files = get_files_without(path, filenames)
    return list(map(get_filename, kept_files))

def remove_files(path, filenames):
    """
    Delete every file named in `filenames` from the folder `path`.

    Names are joined onto `path` one at a time; a missing file raises
    whatever `os.remove` raises.
    """
    for target in (os.path.join(path, name) for name in filenames):
        os.remove(target)

def load_json(path):
    """
    Load and return the parsed content of the JSON file at `path`.

    The file is read as UTF-8 explicitly, so the result does not depend on
    the platform's locale encoding.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

In [3]:
# Names (without extension) of the files already present in the evaluation folder.
# NOTE(review): this reads '../data/example_chunk_eval', while ChunkEvalBaseBuilder
# writes to '<output_path>/chunks_eval' -- confirm this cell still targets the right folder.
evaluated_files = get_filenames_list('../data/example_chunk_eval')
print(f"Evaluated files: {evaluated_files}")

# Metadata files whose extension-less name is not among the evaluated ones.
files_to_evaluate = get_files_without('../data/docs_metadata', evaluated_files)
print(f"Files to evaluate: {files_to_evaluate}")
# Bare last expression: the notebook displays the count as the cell output.
len(files_to_evaluate)

Evaluated files: ['2502.09625v1_4', '2502.09625v1_0', '2502.09625v1_2', '2312.15235v1', '2502.09625v1_1', '2502.09625v1_7', '2502.09625v1_3', '2502.09625v1_5', '2502.09625v1_8', '2502.09625v1_6', '2502.09625v1_9', '2502.09625v1']
Files to evaluate: ['2110.03478v2.json', '2206.12415v1.json', '2009.08294v1.json', '1811.08212v1.json', '2108.02501v3.json', '2406.11389v1.json', '2506.01945v1.json', '2406.05517v1.json', '2406.15962v1.json', '2306.17794v1.json', '2409.13406v1.json', '2408.12408v1.json', '2310.07427v3.json']


13

In [54]:
class ChunkEvalBaseBuilder:
    """
    Builds a chunk-evaluation dataset under a single output folder.

    The builder wires together two graphs that share the same LLM, prompts and
    checkpointer:

    * `doc_loader` (`RandomQueriesPaperSearchGraph`) - run once per "turn" by
      `extend_docs`; presumably searches for papers and stores them under
      `<output_path>/docs` with metadata under `<output_path>/docs_metadata`
      (TODO confirm against the graph implementation).
    * `chunk_eval` (`ChunkEvalGraph`) - run once per metadata file by
      `evaluate_docs`, saving its results under `<output_path>/chunks_eval`.

    Progress is tracked purely through the file system: a document counts as
    evaluated when a file with the same extension-less name exists in the
    evaluation folder.

    Args:
        llm: Language model handed to both graphs.
        output_path: Root folder holding `docs`, `docs_metadata` and `chunks_eval`.
        builder_config: Builder options. `max_pages` (optional) is the page limit
            above which not-yet-evaluated docs are deleted; `eval_batch_size` is
            the number of documents evaluated concurrently.
        prompts_config: Prompt names/paths forwarded to both graphs.
        doc_search_config: Keyword arguments forwarded to the doc loader graph;
            `main_queries_num` and `max_results` are also used to estimate how
            many docs one loading turn yields.
        chunk_eval_config: Keyword arguments forwarded to the chunk evaluation graph.
        memory: Checkpointer forwarded to both graphs.
        prompt_manager_spec: Extra prompt-manager settings forwarded to both graphs.

    Any config left as `None` falls back to a fresh copy of the class default,
    so instances never share (and cannot accidentally mutate) a common dict.
    """

    # Default configurations. They are templates only: __init__ copies them per
    # instance instead of using them as mutable default arguments.
    _DEFAULT_BUILDER_CONFIG = {
        'max_pages': 15,
        'eval_batch_size': 5
    }
    _DEFAULT_PROMPTS_CONFIG = {
        'path': '../prompts',
        'random_queries': 'random_queries',
        'paper_queries': 'paper_queries',
        'chunk_eval_system': 'chunk_eval_system',
        'chunk_eval_task': 'chunk_eval_task',
        'doc_context_system': 'doc_context_system',
        'doc_context_update': 'doc_context_update',
        'doc_context_aggregation': 'doc_context_aggregation'
    }
    _DEFAULT_DOC_SEARCH_CONFIG = {
        'main_queries_num': 4,
        'paper_queries_num': 5,
        'max_results': 5,
    }
    _DEFAULT_CHUNK_EVAL_CONFIG = {
        'chunk_size': 600,
        'chunk_overlap': 0,
        'max_queries': 5,
        'context_agg_interval': 5
    }

    def __init__(
            self,
            llm,
            output_path: str,
            builder_config: dict = None,
            prompts_config: dict = None,
            doc_search_config: dict = None,
            chunk_eval_config: dict = None,
            memory=None,
            prompt_manager_spec: dict = None
        ):

        self.llm = llm
        self.output_path = output_path
        # Caller-supplied dicts are stored as-is (no merging with defaults, so
        # e.g. omitting 'max_pages' still disables the page limit).
        self.builder_config = (
            builder_config if builder_config is not None
            else dict(self._DEFAULT_BUILDER_CONFIG)
        )
        self.prompts_config = (
            prompts_config if prompts_config is not None
            else dict(self._DEFAULT_PROMPTS_CONFIG)
        )
        self.doc_search_config = (
            doc_search_config if doc_search_config is not None
            else dict(self._DEFAULT_DOC_SEARCH_CONFIG)
        )
        self.chunk_eval_config = (
            chunk_eval_config if chunk_eval_config is not None
            else dict(self._DEFAULT_CHUNK_EVAL_CONFIG)
        )
        self.memory = memory
        self.prompt_manager_spec = prompt_manager_spec if prompt_manager_spec is not None else {}

        self.build()

    def build(self):
        """
        Derive the working folders from `output_path` and instantiate both graphs.
        """
        self.docs_metadata_path = os.path.join(self.output_path, 'docs_metadata')
        self.docs_path = os.path.join(self.output_path, 'docs')
        self.eval_path = os.path.join(self.output_path, 'chunks_eval')

        self.doc_loader = RandomQueriesPaperSearchGraph(
            llm=self.llm,
            prompts_config=self.prompts_config,
            docs_path=self.docs_path,
            docs_metadata_path=self.docs_metadata_path,
            memory=self.memory,
            prompt_manager_spec=self.prompt_manager_spec,
            **self.doc_search_config
        )

        self.chunk_eval = ChunkEvalGraph(
            llm=self.llm,
            prompts_config=self.prompts_config,
            docs_metadata_path=self.docs_metadata_path,
            saving_path=self.eval_path,
            memory=self.memory,
            prompt_manager_spec=self.prompt_manager_spec,
            **self.chunk_eval_config
        )

    def _evaluated_docs(self):
        """
        Returns a list of evaluated documents (extension-less file names in the evaluation folder).
        """
        return get_filenames_list(self.eval_path)

    def _docs_without_evaluation(self):
        """
        Returns a list of documents (extension-less metadata file names) that have not been evaluated yet.
        """
        return get_filenames_without(self.docs_metadata_path, self._evaluated_docs())

    def _all_docs(self):
        """
        Returns a list of all documents (extension-less metadata file names).
        """
        return get_filenames_list(self.docs_metadata_path)

    @property
    def docs_to_evaluate(self):
        """
        Returns a number of documents that need to be evaluated.
        """
        return len(self._docs_without_evaluation())

    @property
    def docs_evaluated(self):
        """
        Returns a number of documents that have been evaluated.
        """
        return len(self._evaluated_docs())

    @property
    def all_docs(self):
        """
        Returns a number of all documents.
        """
        return len(self._all_docs())

    @property
    def new_docs_per_turn(self):
        """
        Returns the estimated number of documents one loading turn yields
        (`main_queries_num * max_results`); used only to plan the number of turns.
        """
        return self.doc_search_config['main_queries_num'] * self.doc_search_config['max_results']

    @property
    def evaluations_per_turn(self):
        """
        Returns the configured evaluation batch size (number of concurrent evaluations).
        """
        return self.builder_config['eval_batch_size']

    @staticmethod
    def _is_doc_oversized(path, pages_limit):
        """
        Returns True when the metadata file at `path` reports more pages than `pages_limit`.
        """
        doc_metadata = load_json(path)
        return doc_metadata['pages_count'] > pages_limit

    @staticmethod
    def _add_json_extension(paths):
        """
        Adds '.json' extension to a list of paths.
        """
        return [f + '.json' for f in paths]

    @staticmethod
    def _add_pdf_extension(paths):
        """
        Adds '.pdf' extension to a list of paths.
        """
        return [f + '.pdf' for f in paths]

    def _remove_oversized_docs(self):
        """
        Removes oversized documents based on the max_pages limit specified in the builder_config.

        Only documents that have not been evaluated yet are considered. Both the
        metadata file and the PDF are deleted. Does nothing when `max_pages` is
        missing or falsy.
        """
        max_pages = self.builder_config.get('max_pages')
        if not max_pages:
            return

        oversized_docs = [
            f for f in self._docs_without_evaluation()
            if self._is_doc_oversized(
                os.path.join(self.docs_metadata_path, f + '.json'),
                max_pages
            )
        ]
        print(f"Removing oversized docs: {oversized_docs}")
        docs_before = self.all_docs
        remove_files(self.docs_metadata_path, self._add_json_extension(oversized_docs))
        # A metadata file may exist without its PDF; skip missing PDFs so the
        # cleanup does not abort halfway with the metadata already deleted.
        existing_pdfs = [
            f for f in self._add_pdf_extension(oversized_docs)
            if os.path.isfile(os.path.join(self.docs_path, f))
        ]
        remove_files(self.docs_path, existing_pdfs)
        docs_after = self.all_docs
        docs_removed = docs_before - docs_after
        print(f"Removed {docs_removed} oversized docs. Remaining docs: {docs_after}")

    def extend_docs(self, target_docs: int):
        """
        Runs the doc loader graph for as many turns as estimated to reach `target_docs` documents.

        The number of turns is planned up front from `new_docs_per_turn`; the
        actual number of documents collected may be lower or higher than the
        target. Errors raised by a turn are printed and the loop continues.
        Returns immediately when the target is already met.
        """
        current_docs_num = self.all_docs
        docs_per_turn = self.new_docs_per_turn

        # Clamp at zero: the target may already be reached.
        turns_needed = max(0, math.ceil((target_docs - current_docs_num) / docs_per_turn))

        print(f"Current docs: {current_docs_num}, Target docs: {target_docs}, Turns needed: {turns_needed}")

        if turns_needed == 0:
            return

        # The context manager closes the progress bar even if a turn is interrupted.
        with notebook_tqdm(
            total=turns_needed,
            desc="Collecting docs",
            postfix={'All docs': current_docs_num, 'New docs': 0}
        ) as pbar:

            for _ in range(turns_needed):

                try:
                    thread_id = uuid.uuid4().hex
                    self.doc_loader.run(thread_id=thread_id)
                except Exception as e:
                    print(f"Error during doc loading: {e}")

                all_docs = self.all_docs
                # Cumulative gain since this call started, not the gain of the last turn.
                new_docs = all_docs - current_docs_num

                pbar.update(1)
                pbar.set_postfix({'All docs': all_docs, 'New docs': new_docs})

    async def evaluate_docs(self, target_docs: int):
        """
        Evaluates not-yet-evaluated documents concurrently until `target_docs` evaluations exist.

        Oversized documents are removed first. At most
        `target_docs - docs_evaluated` documents are processed, by up to
        `eval_batch_size` concurrent workers pulling from a shared queue.
        A failure on one document is printed and does not stop the others.
        """
        self._remove_oversized_docs()

        batch_size = self.evaluations_per_turn
        docs_needed = max(target_docs - self.docs_evaluated, 0)
        docs_paths = self._add_json_extension(self._docs_without_evaluation())[:docs_needed]
        total_docs = len(docs_paths)

        # Progress bar
        process_pbar = notebook_tqdm(total=total_docs, desc="Evaluating docs", postfix={'Target Docs': target_docs, 'Evaluated': self.docs_evaluated})

        async def worker(queue):
            # Runs until cancelled; each loop iteration handles one metadata file.
            while True:
                metadata_file = await queue.get()
                try:
                    thread_id = uuid.uuid4().hex
                    await self.chunk_eval.run_with_progress_async(metadata_file=metadata_file, thread_id=thread_id)
                except Exception as e:
                    print(f"Error processing {metadata_file}: {e}")
                finally:
                    process_pbar.update(1)
                    process_pbar.set_postfix({'Target Docs': target_docs, 'Evaluated': self.docs_evaluated})
                    queue.task_done()

        # Create a queue and populate it with files to process
        queue = asyncio.Queue()
        for metadata_file in docs_paths:
            queue.put_nowait(metadata_file)

        # At least one worker whenever there is work (a batch size <= 0 would
        # otherwise leave queue.join() waiting forever), and never more workers
        # than documents.
        num_workers = max(1, min(batch_size, total_docs)) if total_docs else 0
        workers = [asyncio.create_task(worker(queue)) for _ in range(num_workers)]

        try:
            # Wait for all queued files to be processed
            await queue.join()
        finally:
            # Stop workers (also when the cell is interrupted) and release the bar.
            for task in workers:
                task.cancel()
            await asyncio.gather(*workers, return_exceptions=True)
            process_pbar.close()

In [61]:
# Instantiate the builder on '../data'. All settings repeat the class defaults
# except 'eval_batch_size', which is raised from 5 to 8.
# NOTE(review): get_llm / get_checkpointer are project helpers; presumably they
# select a Google-hosted model and a local checkpointer -- confirm in fcgb.cfg.precompiled.
base_builder = ChunkEvalBaseBuilder(
    llm=get_llm(llm_model='google'),
    output_path='../data',
    builder_config={
        'max_pages': 15,  # not-yet-evaluated docs with more pages are deleted before evaluation
        'eval_batch_size': 8  # number of concurrent evaluation workers
    },
    prompts_config={
        'path': '../prompts',
        'random_queries': 'random_queries',
        'paper_queries': 'paper_queries',
        'chunk_eval_system': 'chunk_eval_system',
        'chunk_eval_task': 'chunk_eval_task',
        'doc_context_system': 'doc_context_system',
        'doc_context_update': 'doc_context_update',
        'doc_context_aggregation': 'doc_context_aggregation'
    },
    doc_search_config={
        # main_queries_num * max_results is the builder's estimate of docs gained per loading turn
        'main_queries_num': 4,
        'paper_queries_num': 5,
        'max_results': 5,
    },
    chunk_eval_config={
        'chunk_size': 600,
        'chunk_overlap': 0,
        'max_queries': 5,
        'context_agg_interval': 5
    },
    memory=get_checkpointer(checkpointer_mode='local')
)

In [30]:
# Run the number of loading turns estimated to reach 25 documents; the final
# count can differ from the target because the per-turn yield is only an estimate.
base_builder.extend_docs(target_docs=25)

Current docs: 0, Target docs: 25, Turns needed: 3


Collecting docs:   0%|          | 0/3 [00:00<?, ?it/s, All docs=0, New docs=0]

Searching for papers with query: exoplanet atmospheric characterization JWST NIRSpec
Searching for papers with query: high-redshift galaxy formation simulations feedback
Searching for papers with query: CRISPR-Cas gene editing therapy ethical implications
Searching for papers with query: topological insulator quantum computing Majorana fermions
Searching for papers with query: federated learning privacy attacks defense mechanisms
Searching for papers with query: explainable AI methods for medical image diagnosis
Searching for papers with query: CRISPR gene editing ethical considerations review
Searching for papers with query: blockchain technology supply chain management applications
Searching for papers with query: graphene based photodetectors for infrared imaging
Searching for papers with query: explainable AI methods for fraud detection in finance
Searching for papers with query: federated learning privacy preserving techniques healthcare
Searching for papers with query: blockchain

In [35]:
# Delete not-yet-evaluated docs whose metadata reports more pages than builder_config['max_pages'].
base_builder._remove_oversized_docs()

Removing oversized docs: []
Removed 0 oversized docs. Remaining docs: 35


In [63]:
# Number of files currently present in the evaluation output folder.
# NOTE(review): this cell's execution count (63) is higher than the evaluation
# cell below it (62), so the displayed value reflects the state AFTER that run;
# on a fresh Restart & Run All it would show the pre-evaluation count.
base_builder.docs_evaluated

25

In [62]:
# Evaluate pending documents concurrently until 25 evaluations exist.
# Top-level `await` relies on the notebook kernel's autoawait support.
await base_builder.evaluate_docs(target_docs=25)

Removing oversized docs: []
Removed 0 oversized docs. Remaining docs: 35


Evaluating docs:   0%|          | 0/14 [00:00<?, ?it/s, Evaluated=11, Target Docs=25]

Ignoring wrong pointing object 11 0 (offset 0)
Ignoring wrong pointing object 19 0 (offset 0)
Ignoring wrong pointing object 21 0 (offset 0)
Ignoring wrong pointing object 23 0 (offset 0)
Ignoring wrong pointing object 25 0 (offset 0)
Ignoring wrong pointing object 27 0 (offset 0)
Ignoring wrong pointing object 29 0 (offset 0)
Ignoring wrong pointing object 31 0 (offset 0)
Ignoring wrong pointing object 33 0 (offset 0)
Ignoring wrong pointing object 35 0 (offset 0)
Ignoring wrong pointing object 37 0 (offset 0)
Ignoring wrong pointing object 39 0 (offset 0)
Ignoring wrong pointing object 41 0 (offset 0)
Ignoring wrong pointing object 47 0 (offset 0)
Ignoring wrong pointing object 49 0 (offset 0)
Ignoring wrong pointing object 51 0 (offset 0)
Ignoring wrong pointing object 53 0 (offset 0)
Ignoring wrong pointing object 55 0 (offset 0)
Ignoring wrong pointing object 57 0 (offset 0)
Ignoring wrong pointing object 63 0 (offset 0)
Ignoring wrong pointing object 77 0 (offset 0)
Ignoring wron

Evaluating Chunks:   0%|          | 0/39 [00:00<?, ?it/s, Negative=0, Positive=0]

Ignoring wrong pointing object 182 0 (offset 0)
Ignoring wrong pointing object 184 0 (offset 0)


Evaluating Chunks:   0%|          | 0/64 [00:00<?, ?it/s, Negative=0, Positive=0]

Ignoring wrong pointing object 190 0 (offset 0)


Evaluating Chunks:   0%|          | 0/59 [00:00<?, ?it/s, Negative=0, Positive=0]

Evaluating Chunks:   0%|          | 0/84 [00:00<?, ?it/s, Negative=0, Positive=0]

Evaluating Chunks:   0%|          | 0/117 [00:00<?, ?it/s, Negative=0, Positive=0]

Evaluating Chunks:   0%|          | 0/111 [00:00<?, ?it/s, Negative=0, Positive=0]

Evaluating Chunks:   0%|          | 0/78 [00:00<?, ?it/s, Negative=0, Positive=0]

Evaluating Chunks:   0%|          | 0/43 [00:00<?, ?it/s, Negative=0, Positive=0]

Saving evaluation results on document ../data/docs/2403.01927v1.pdf...


Evaluating Chunks:   0%|          | 0/77 [00:00<?, ?it/s, Negative=0, Positive=0]

Saving evaluation results on document ../data/docs/1507.02655v1.pdf...


Evaluating Chunks:   0%|          | 0/30 [00:00<?, ?it/s, Negative=0, Positive=0]

Saving evaluation results on document ../data/docs/2209.04917v1.pdf...


Evaluating Chunks:   0%|          | 0/100 [00:00<?, ?it/s, Negative=0, Positive=0]

Saving evaluation results on document ../data/docs/2501.06887v1.pdf...


Evaluating Chunks:   0%|          | 0/53 [00:00<?, ?it/s, Negative=0, Positive=0]

Saving evaluation results on document ../data/docs/0903.2196v1.pdf...


Evaluating Chunks:   0%|          | 0/62 [00:00<?, ?it/s, Negative=0, Positive=0]

Saving evaluation results on document ../data/docs/2206.00769v1.pdf...


Evaluating Chunks:   0%|          | 0/32 [00:00<?, ?it/s, Negative=0, Positive=0]

Saving evaluation results on document ../data/docs/1811.03230v1.pdf...
Saving evaluation results on document ../data/docs/1810.09203v1.pdf...
Saving evaluation results on document ../data/docs/2409.05938v1.pdf...
Saving evaluation results on document ../data/docs/1310.3528v1.pdf...
Saving evaluation results on document ../data/docs/2010.10572v1.pdf...
Saving evaluation results on document ../data/docs/2312.00586v1.pdf...
Saving evaluation results on document ../data/docs/2203.04173v1.pdf...
Saving evaluation results on document ../data/docs/1906.01831v1.pdf...
