In [1]:
import os
from pathlib import Path

import pandas as pd

from src.fixed_token_chunker import FixedTokenChunker
from src.recursive_token_chunker import RecursiveTokenChunker
from src.Embedding import SentenceTransformersEmbedding, GPTEmbedding
from src.EvaluationPipeline import EvaluationPipeline
from src.Utils import Utils

In [2]:
# Base directory = parent of the directory the kernel is currently in.
# Compute it only once per kernel session: because the cell chdir()s into the
# result, recomputing it on a re-run would climb one more level up each time
# and create a stray 'output' folder there.
if 'script_dir' not in globals():
    script_dir = os.path.dirname(Path().resolve())
os.chdir(script_dir)
# Make sure the results folder exists under the base directory.
os.makedirs(os.path.join(script_dir, 'output'), exist_ok=True)

In [3]:
class Experiments:
    """Grid of chunking/retrieval evaluations over a single corpus.

    For every (chunk size, overlap size) pair, a fixed-size and a recursive
    token chunker are built and evaluated once per entry in ``top_ks`` and once
    more with ``min_top_k=True``. All rows produced for one embedding are
    written to ``<output_dir>/<embedding label>_results.csv``.

    Failures are reported with ``print`` and skipped so that one broken
    configuration does not abort the rest of the grid.
    """

    # top_k handed to EvaluationPipeline for the ``min_top_k=True`` run.
    # NOTE(review): how the pipeline uses this value in that mode is not
    # visible here - confirm against EvaluationPipeline.
    _MIN_RUN_TOP_K = 5

    def __init__(self):
        # (chunk_size, chunk_overlap) pairs passed to both chunkers.
        self.chunk_sizes_overlap_sizes = [(800, 400),
                                          (800, 200),
                                          (800, 125),
                                          (400, 200),
                                          (400, 125),
                                          (400, 0),
                                          (250, 125),
                                          (250, 0),
                                          (200, 0)]
        self.top_ks = [5, 10]
        self.corpora = 'wikitexts'
        self.output_dir = 'output'

    @staticmethod
    def _build_chunkers(chunk_size, overlap_size):
        """Return the name -> chunker mapping for one size/overlap pair."""
        return {
            'fixed-size': FixedTokenChunker(
                chunk_size=chunk_size,
                chunk_overlap=overlap_size,
                length_function=Utils.cl100k_base_length
            ),
            'recursive': RecursiveTokenChunker(
                chunk_size=chunk_size,
                chunk_overlap=overlap_size,
                length_function=Utils.cl100k_base_length
            )
        }

    def _evaluate(self, chunker, embedding, top_k, info, **eval_kwargs):
        """Run one evaluation and return its row: ``info`` merged with the mean and std metrics."""
        pipeline = EvaluationPipeline(chunker, embedding, top_k, reranker=False)
        metrics_mean, metrics_std = pipeline.evaluate_retrievals(self.corpora, **eval_kwargs)
        return {**info, **metrics_mean, **metrics_std}

    def _evaluate_configuration(self, chunk_size, overlap_size, embedding, results):
        """Evaluate both chunkers for one size/overlap pair, appending rows to ``results``.

        Rows are appended in place (rather than returned) so that rows already
        produced survive an unexpected error later in the same configuration.
        """
        chunkers = self._build_chunkers(chunk_size, overlap_size)

        for name, chunker in chunkers.items():
            info = {
                'chunker': name,
                'chunk_size': chunk_size,
                'overlap_size': overlap_size
            }

            for top_k in self.top_ks:
                try:
                    print(f"Chunk size: {chunk_size}, Overlap size: {overlap_size} with {name} chunker and {top_k} top k")
                    results.append(self._evaluate(chunker, embedding, top_k, {**info, 'top_k': top_k}))
                except Exception as e:
                    print(f"Error evaluating configuration: {e}")

            try:
                print(f"Chunk size: {chunk_size}, Overlap size: {overlap_size} with {name} chunker and minimalized top k")
                results.append(
                    self._evaluate(chunker, embedding, self._MIN_RUN_TOP_K,
                                   {**info, 'top_k': 'Min'}, min_top_k=True)
                )
            except Exception as e:
                print(f"Error evaluating minimalized configuration: {e}")

    def run_with_embd(self, embd_func):
        """Run the whole grid for one embedding and save the rows as CSV.

        Args:
            embd_func: Indexable pair ``(label, embedding)``. ``label`` is used
                in log messages and in the CSV file name; ``embedding`` is
                passed through to ``EvaluationPipeline``.

        Returns:
            None. Errors are printed, never raised.
        """
        try:
            label = embd_func[0]
            print(f"Running with {label} embedding")
            embedding = embd_func[1]

            results = []
            for chunk_size, overlap_size in self.chunk_sizes_overlap_sizes:
                try:
                    self._evaluate_configuration(chunk_size, overlap_size, embedding, results)
                except Exception as e:
                    print(f"Error processing chunk configuration {chunk_size}, {overlap_size}: {e}")

            if results:
                results_df = pd.DataFrame(results)
                # Create the target folder on demand: without it a missing
                # directory makes to_csv fail and every finished evaluation
                # for this embedding would be thrown away.
                if self.output_dir:
                    os.makedirs(self.output_dir, exist_ok=True)
                output_path = os.path.join(self.output_dir, f'{label}_results.csv')
                results_df.to_csv(output_path, index=False)
            else:
                print("No results to save")

        except Exception as e:
            print(f"Error in run_with_embd: {e}")

    def run_experiments(self):
        """Run the grid for the sentence-transformers and the GPT embedding, in that order.

        Each embedding is constructed and run independently, so a failure while
        setting up one of them is reported and does not skip the other.
        """
        embeddings = (
            ('sentence-transformers', SentenceTransformersEmbedding),
            ('gpt', GPTEmbedding),
        )
        for label, make_embedding in embeddings:
            try:
                # Construct lazily, right before use, as the runs are sequential.
                self.run_with_embd((label, make_embedding()))
            except Exception as e:
                print(f"Error running experiments: {e}")

In [4]:
# Build the experiment grid and run it for each embedding in turn.
# Progress is printed per configuration; the rows for each embedding are saved
# to '<output_dir>/<embedding label>_results.csv'.
ex = Experiments()
ex.run_experiments()

Running with sentence-transformers embedding
Chunk size: 800, Overlap size: 400 with fixed-size chunker and 5 top k
Chunk size: 800, Overlap size: 400 with fixed-size chunker and 10 top k
Chunk size: 800, Overlap size: 400 with fixed-size chunker and minimalized top k
Chunk size: 800, Overlap size: 400 with recursive chunker and 5 top k
Chunk size: 800, Overlap size: 400 with recursive chunker and 10 top k
Chunk size: 800, Overlap size: 400 with recursive chunker and minimalized top k
Chunk size: 800, Overlap size: 200 with fixed-size chunker and 5 top k
Chunk size: 800, Overlap size: 200 with fixed-size chunker and 10 top k
Chunk size: 800, Overlap size: 200 with fixed-size chunker and minimalized top k
Chunk size: 800, Overlap size: 200 with recursive chunker and 5 top k
Chunk size: 800, Overlap size: 200 with recursive chunker and 10 top k
Chunk size: 800, Overlap size: 200 with recursive chunker and minimalized top k
Chunk size: 800, Overlap size: 125 with fixed-size chunker and 5 