In [1]:
import os
import docx
import fitz  # PyMuPDF

## Chunking

In [2]:
# Function to read .docx files
def read_docx(file_path):
    try:
        doc = docx.Document(file_path)
        full_text = []
        for para in doc.paragraphs:
            full_text.append(para.text)
        return '\n'.join(full_text)
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return ""
    

# Function to read .pdf files
def read_pdf(file_path):
    try:
        pdf_document = fitz.open(file_path)
        text = ""
        for page_num in range(len(pdf_document)):
            page = pdf_document.load_page(page_num)
            text += page.get_text()
        return text
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return ""

# Directory containing the files
directory = "../research"

In [3]:
# Dictionary to store file content
file_contents = {}

# Traverse the directory and read files
for root, dirs, files in os.walk(directory):
    for file in files:
        file_path = os.path.join(root, file)
        if file in ('MoE Notes FINAL.docx', 'MoE Notes.docx'):
            content = read_docx(file_path)
            file_contents[file] = {}
            file_contents[file]["content"] = content
        elif file.endswith('.pdf'):
            content = read_pdf(file_path)
            file_contents[file] = {}
            file_contents[file]["content"] = content

In [4]:
paper_names = {'MoE Notes.docx': ("MoE NOTES", "https://github.com/guiOsorio/Research-MoE/blob/master/research/MoE%20Notes.docx"),
 'MoE Notes FINAL.docx': ("MoE NOTES FINAL", "https://github.com/guiOsorio/Research-MoE/blob/master/research/MoE%20Notes%20FINAL.docx"),
 'Unified_Scaling_Laws_NOTES.pdf': ("Unified Scaling Laws for Routes Language Models NOTES", "https://github.com/guiOsorio/Research-MoE/blob/master/research/Scaling_and_Stability/Unified_Scaling_Laws_NOTES.pdf"),
 'Switch_Transformers.pdf': ("Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity", "https://arxiv.org/abs/2101.03961"),
 'ST_MoE.docx': ("ST-MoE: Designing Stable and Transferable Sparse Expert Models", "https://arxiv.org/abs/2202.08906"),
 'GLaM.pdf': ("GLaM: Efficient Scaling of Language Models with Mixture-of-Experts", "https://arxiv.org/abs/2112.06905"),
 'GShard.pdf': ("GShard: Scaling Giant Models with Conditional Computation and Automatic Sharding", "https://arxiv.org/abs/2006.16668"),
 'Unified_Scaling_Laws.pdf': ("Unified Scaling Laws for Routes Language Models", "https://arxiv.org/abs/2202.01169"),
 'ST_MoE.pdf': ("ST-MoE: Designing Stable and Transferable Sparse Expert Models", "https://arxiv.org/abs/2202.08906"),
 'GLaM_NOTES.pdf': ("GLaM: Efficient Scaling of Language Models with Mixture-of-Experts NOTES", "https://github.com/guiOsorio/Research-MoE/blob/master/research/Scaling_and_Stability/GLaM_NOTES.pdf"),
 'Efficient_Large_Scale_LM.docx': ("Efficient Large Scale Language Modeling with Mixtures of Experts", "https://arxiv.org/abs/2112.10684"),
 'Unified_Scaling_Laws.docx': ("Unified Scaling Laws for Routed Language Models", "https://arxiv.org/abs/2202.01169"),
 'GShard.docx': ("GShard: Scaling Giant Models with Conditional Computation and Automatic Sharding", "https://arxiv.org/abs/2006.16668"),
 'ST_MoE_NOTES.pdf': ("ST-MoE: Designing Stable and Transferable Sparse Expert Models NOTES", "https://github.com/guiOsorio/Research-MoE/blob/master/research/Scaling_and_Stability/ST_MoE_NOTES.pdf"),
 'Switch_Transformers.docx': ("Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity", "https://arxiv.org/abs/2101.03961"),
 'GLaM.docx': ("GLaM: Efficient Scaling of Language Models with Mixture-of-Experts", "https://arxiv.org/abs/2112.06905"),
 'Switch_Transformers_NOTES.pdf': ("Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity NOTES", "https://github.com/guiOsorio/Research-MoE/blob/master/research/Scaling_and_Stability/Switch_Transformers_NOTES.pdf"),
 'GShard_NOTES.pdf': ("GShard: Scaling Giant Models with Conditional Computation and Automatic Sharding NOTES", "https://github.com/guiOsorio/Research-MoE/blob/master/research/Scaling_and_Stability/GShard_NOTES.pdf"),
 'Efficient_Large_Scale_LM.pdf': ("Efficient Large Scale Language Modeling with Mixtures of Experts", "https://arxiv.org/abs/2112.10684"),
 'Efficient_Large_Scale_LM_NOTES.pdf': ("Efficient Large Scale Language Modeling with Mixtures of Experts NOTES", "https://github.com/guiOsorio/Research-MoE/blob/master/research/Scaling_and_Stability/Efficient_Large_Scale_LM_NOTES.pdf"),
 'Benefits_of_ELMs_NOTES.docx': ("Exploring the Benefits of Training Expert Language Models over Instruction Tuning NOTES", "https://github.com/guiOsorio/Research-MoE/blob/master/research/TaskDomain_Level_MoE/Benefits_of_ELMs_NOTES.pdf"),
 'BTM_NOTES.docx': ("Branch-Train-Merge: Embarrassingly Parallel Training of Expert Language Models NOTES", "https://github.com/guiOsorio/Research-MoE/blob/master/research/TaskDomain_Level_MoE/BTM_NOTES.pdf"),
 'BTM.pdf': ("Branch-Train-Merge: Embarrassingly Parallel Training of Expert Language Models", "https://arxiv.org/abs/2208.03306"),
 'Benefits_of_ELMs.pdf': ("Exploring the Benefits of Training Expert Language Models over Instruction Tuning NOTES", "https://arxiv.org/abs/2302.03202"),
 'Expert_Gate_NOTES.pdf': ("Expert Gate: Lifelong Learning with a Network of Experts NOTES", "https://github.com/guiOsorio/Research-MoE/blob/master/research/TaskDomain_Level_MoE/Expert_Gate_NOTES.pdf"),
 'BeyondDistillation_Task_Level_MoE.pdf': ("Beyond Distillation: Task-level Mixture-of-Experts for Efficient Inference", "https://arxiv.org/abs/2110.03742"),
 'cBTM_NOTES.docx': ("Scaling Expert Language Models with Unsupervised Domain Discovery NOTES", "https://github.com/guiOsorio/Research-MoE/blob/master/research/TaskDomain_Level_MoE/cBTM_NOTES.pdf"),
 'Expert_Gate_NOTES.docx': ("Expert Gate: Lifelong Learning with a Network of Experts NOTES", "https://github.com/guiOsorio/Research-MoE/blob/master/research/TaskDomain_Level_MoE/Expert_Gate_NOTES.pdf"),
 'BeyondDistillation_Task_Level_MoE_NOTES.docx': ("Beyond Distillation: Task-level Mixture-of-Experts for Efficient Inference NOTES", "https://github.com/guiOsorio/Research-MoE/blob/master/research/TaskDomain_Level_MoE/BeyondDistillation_Task_Level_MoE_NOTES.pdf"),
 'DEMix_NOTES.pdf': ("DEMix Layers: Disentangling Domains for Modular Language Modeling NOTES", "https://github.com/guiOsorio/Research-MoE/blob/master/research/TaskDomain_Level_MoE/DEMix_NOTES.pdf"),
 'BTM_NOTES.pdf': ("Branch-Train-Merge: Embarrassingly Parallel Training of Expert Language Models NOTES", "https://github.com/guiOsorio/Research-MoE/blob/master/research/TaskDomain_Level_MoE/BTM_NOTES.pdf"),
 'DEMix.pdf': ("DEMix Layers: Disentangling Domains for Modular Language Modeling NOTES", "https://arxiv.org/abs/2108.05036"),
 'cBTM_NOTES.pdf': ("Scaling Expert Language Models with Unsupervised Domain Discovery NOTES", "https://github.com/guiOsorio/Research-MoE/blob/master/research/TaskDomain_Level_MoE/cBTM_NOTES.pdf"),
 'BeyondDistillation_Task_Level_MoE_NOTES.pdf': ("Beyond Distillation: Task-level Mixture-of-Experts for Efficient Inference NOTES", "https://github.com/guiOsorio/Research-MoE/blob/master/research/TaskDomain_Level_MoE/BeyondDistillation_Task_Level_MoE_NOTES.pdf"),
 'Expert_Gate.pdf': ("Expert Gate: Lifelong Learning with a Network of Experts", "https://arxiv.org/abs/1611.06194"),
 'cBTM.pdf': ("Scaling Expert Language Models with Unsupervised Domain Discovery", "https://arxiv.org/abs/2303.14177"),
 'DEMix_NOTES.docx': ("DEMix Layers: Disentangling Domains for Modular Language Modeling NOTES", "https://github.com/guiOsorio/Research-MoE/blob/master/research/TaskDomain_Level_MoE/DEMix_NOTES.pdf"),
 'Benefits_of_ELMs_NOTES.pdf': ("Exploring the Benefits of Training Expert Language Models over Instruction Tuning NOTES", "https://github.com/guiOsorio/Research-MoE/blob/master/research/TaskDomain_Level_MoE/Benefits_of_ELMs_NOTES.pdf"),
 'MoE_Mamba.pdf': ("MoE-Mamba: Efficient Selective State Space Models with Mixture of Experts", "https://arxiv.org/abs/2401.04081"),
 'MoE_meets_instruction_tuning_NOTES.docx': ("Mixture-of-Experts Meets Instruction Tuning: A Winning Combination for Large Language Models NOTES", "https://github.com/guiOsorio/Research-MoE/blob/master/research/Hybrid_Approaches/MoE_meets_instruction_tuning_NOTES.pdf"),
 'BlackMamba.pdf': ("BlackMamba: Mixture of Experts for State-Space Models", "https://arxiv.org/abs/2402.01771"),
 'Soft_Merging_of_Experts_NOTES.docx': ("Soft Merging of Experts with Adaptive Routing NOTES", "https://github.com/guiOsorio/Research-MoE/blob/master/research/Hybrid_Approaches/Soft_Merging_of_Experts_NOTES.pdf"),
 'MoE_meets_instruction_tuning.pdf': ("Mixture-of-Experts Meets Instruction Tuning: A Winning Combination for Large Language Models", "https://arxiv.org/abs/2305.14705"),
 'EvoMoE_NOTES.docx': ("EvoMoE: An Evolutional Mixture-of-Experts Training Framework via Dense-To-Sparse Gate NOTES", "https://github.com/guiOsorio/Research-MoE/blob/master/research/Hybrid_Approaches/EvoMoE_NOTES.docx"),
 'MoE_Mamba_NOTES.pdf': ("MoE-Mamba: Efficient Selective State Space Models with Mixture of Experts NOTES", "https://github.com/guiOsorio/Research-MoE/blob/master/research/Hybrid_Approaches/MoE_Mamba_NOTES.pdf"),
 'Sparse_Upcycling.pdf': ("Sparse Upcycling: Training Mixture-of-Experts from Dense Checkpoints", "https://arxiv.org/abs/2212.05055"),
 'Soft_Merging_of_Experts.pdf': ("Soft Merging of Experts with Adaptive Routing", "https://arxiv.org/abs/2306.03745"),
 'Sparse_Upcycling_NOTES.pdf': ("Sparse Upcycling: Training Mixture-of-Experts from Dense Checkpoints", "https://github.com/guiOsorio/Research-MoE/blob/master/research/Hybrid_Approaches/Sparse_Upcycling_NOTES.pdf"),
 'MoE_meets_instruction_tuning_NOTES.pdf': ("Mixture-of-Experts Meets Instruction Tuning: A Winning Combination for Large Language Models", "https://github.com/guiOsorio/Research-MoE/blob/master/research/Hybrid_Approaches/MoE_meets_instruction_tuning_NOTES.pdf"),
 'Soft_Merging_of_Experts_NOTES.pdf': ("Soft Merging of Experts with Adaptive Routing NOTES", "https://github.com/guiOsorio/Research-MoE/blob/master/research/Hybrid_Approaches/Soft_Merging_of_Experts_NOTES.pdf"),
 'EvoMoE.pdf': ("EvoMoE: An Evolutional Mixture-of-Experts Training Framework via Dense-To-Sparse Gate", "https://arxiv.org/abs/2112.14397"),
 'BlackMamba_NOTES.pdf': ("BlackMamba: Mixture of Experts for State-Space Models", "https://github.com/guiOsorio/Research-MoE/blob/master/research/Hybrid_Approaches/BlackMamba_NOTES.pdf"),
 'MoE_Mamba_NOTES.docx': ("MoE-Mamba: Efficient Selective State Space Models with Mixture of Experts NOTES", "https://github.com/guiOsorio/Research-MoE/blob/master/research/Hybrid_Approaches/MoE_Mamba_NOTES.pdf"),
 'Sparse_Upcycling_NOTES.docx': ("Sparse Upcycling: Training Mixture-of-Experts from Dense Checkpoints", "https://github.com/guiOsorio/Research-MoE/blob/master/research/Hybrid_Approaches/Sparse_Upcycling_NOTES.pdf"),
 'EvoMoE_NOTES.pdf': ("EvoMoE: An Evolutional Mixture-of-Experts Training Framework via Dense-To-Sparse Gate NOTES", "https://github.com/guiOsorio/Research-MoE/blob/master/research/Hybrid_Approaches/EvoMoE_NOTES.docx"),
 'BlackMamba_NOTES.docx': ("BlackMamba: Mixture of Experts for State-Space Models", "https://github.com/guiOsorio/Research-MoE/blob/master/research/Hybrid_Approaches/BlackMamba_NOTES.pdf"),
 'PE_SparsityCrafting_NOTES.pdf': ("Parameter-Efficient Sparsity Crafting from Dense to Mixture-of-Experts for Instruction Tuning on General Tasks", "https://github.com/guiOsorio/Research-MoE/blob/master/research/MoE_Efficiency/PE_SparsityCrafting_NOTES.pdf"),
 'MegaBlocks.pdf': ("MegaBlocks: Efficient Sparse Training with Mixture-of-Experts", "https://arxiv.org/abs/2211.15841"),
 'QMoE.pdf': ("QMoE: Practical Sub-1-Bit Compression of Trillion-Parameter Models", "https://arxiv.org/abs/2310.16795"),
 'QMoE_NOTES.pdf': ("QMoE: Practical Sub-1-Bit Compression of Trillion-Parameter Models", "https://github.com/guiOsorio/Research-MoE/blob/master/research/MoE_Efficiency/QMoE_NOTES.pdf"),
 'PE_SparsityCrafting_NOTES.docx': ("Parameter-Efficient Sparsity Crafting from Dense to Mixture-of-Experts for Instruction Tuning on General Tasks", "https://github.com/guiOsorio/Research-MoE/blob/master/research/MoE_Efficiency/PE_SparsityCrafting_NOTES.pdf"),
 'PE_MoE_for_LMs_NOTES.docx': ("Parameter-Efficient Mixture-of-Experts Architecture for Pre-Trained Language Models", "https://github.com/guiOsorio/Research-MoE/blob/master/research/MoE_Efficiency/PE_MoE_for_LMs_NOTES.pdf"),
 'FastInferenceMoE.pdf': ("Fast-Inference of Mixture-of-Experts Language Models with Offloading", "https://arxiv.org/abs/2312.17238"),
 'ExtremelyPE_MoE_for_InstructionTuning_NOTES.docx': ("Pushing Mixture-of-Experts to the Limit: Extremely Parameter Efficient MoE for Instruction Tuning", "https://github.com/guiOsorio/Research-MoE/blob/master/research/MoE_Efficiency/ExtremelyPE_MoE_for_InstructionTuning_NOTES.docx"),
 'QMoE_NOTES.docx': ("QMoE: Practical Sub-1-Bit Compression of Trillion-Parameter Models", "https://github.com/guiOsorio/Research-MoE/blob/master/research/MoE_Efficiency/QMoE_NOTES.pdf"),
 'ExtremelyPE_MoE_for_InstructionTuning.pdf': ("Pushing Mixture-of-Experts to the Limit: Extremely Parameter Efficient MoE for Instruction Tuning", "https://arxiv.org/abs/2309.05444"),
 'FastInferenceMoE_NOTES.docx': ("Fast-Inference of Mixture-of-Experts Language Models with Offloading", "https://github.com/guiOsorio/Research-MoE/blob/master/research/MoE_Efficiency/FastInferenceMoE_NOTES.pdf"),
 'MegaBlocks_NOTES.pdf': ("MegaBlocks: Efficient Sparse Training with Mixture-of-Experts", "https://github.com/guiOsorio/Research-MoE/blob/master/research/MoE_Efficiency/MegaBlocks_NOTES.pdf"),
 'PE_SparsityCrafting.pdf': ("Parameter-Efficient Sparsity Crafting from Dense to Mixture-of-Experts for Instruction Tuning on General Tasks", "https://arxiv.org/abs/2401.02731"),
 'FastInferenceMoE_NOTES.pdf': ("Fast-Inference of Mixture-of-Experts Language Models with Offloading", "https://github.com/guiOsorio/Research-MoE/blob/master/research/MoE_Efficiency/FastInferenceMoE_NOTES.pdf"),
 'MegaBlocks_NOTES.docx': ("MegaBlocks: Efficient Sparse Training with Mixture-of-Experts", "https://github.com/guiOsorio/Research-MoE/blob/master/research/MoE_Efficiency/MegaBlocks_NOTES.pdf"),
 'PE_MoE_for_LMs.pdf': ("Parameter-Efficient Mixture-of-Experts Architecture for Pre-Trained Language Models", "https://arxiv.org/abs/2203.01104"),
 'PE_MoE_for_LMs_NOTES.pdf': ("Parameter-Efficient Mixture-of-Experts Architecture for Pre-Trained Language Models", "https://github.com/guiOsorio/Research-MoE/blob/master/research/MoE_Efficiency/PE_MoE_for_LMs_NOTES.pdf"),
 'FFFs_NOTES.docx': ("Fast Feedforward Networks", "https://github.com/guiOsorio/Research-MoE/blob/master/research/MoE_Efficiency/FFFs/FFFs_NOTES.pdf"),
 'FFFs_NOTES.pdf': ("Fast Feedforward Networks", "https://github.com/guiOsorio/Research-MoE/blob/master/research/MoE_Efficiency/FFFs/FFFs_NOTES.pdf"),
 'FFF.pdf': ("Fast Feedforward Networks", "https://arxiv.org/abs/2308.14711"),
 'FFF_to_language.pdf': ("Exponentially Faster Language Modeling", "https://arxiv.org/abs/2311.10770"),
 'MixtureOfTokens_NOTES.docx': ("Mixture-of-Experts Meets Instruction Tuning: A Winning Combination for Large Language Models", "https://github.com/guiOsorio/Research-MoE/blob/master/research/Hybrid_Approaches/MoE_meets_instruction_tuning_NOTES.pdf"),
 'BASE_layers_NOTES.docx': ("BASE Layers: Simplifying Training of Large, Sparse Models", "https://github.com/guiOsorio/Research-MoE/blob/master/research/Routing_and_Architecture/BASE_layers_NOTES.pdf"),
 'DSelect_k_NOTES.pdf': ("DSelect-k: Differentiable Selection in the Mixture of Experts with Applications to Multi-Task Learning", "https://github.com/guiOsorio/Research-MoE/blob/master/research/Routing_and_Architecture/DSelect_k_NOTES.pdf"),
 'StableMoE_NOTES.docx': ("StableMoE: Stable Routing Strategy for Mixture of Experts", "https://github.com/guiOsorio/Research-MoE/blob/master/research/Routing_and_Architecture/StableMoE_NOTES.pdf"),
 'Hash_Layers_NOTES.pdf': ("Hash Layers for Large Sparse Models", "https://github.com/guiOsorio/Research-MoE/blob/master/research/Routing_and_Architecture/Hash_Layers_NOTES.pdf"),
 'Expert_Choice_NOTES.docx': ("Mixture-of-Experts with Expert Choice Routing", "https://github.com/guiOsorio/Research-MoE/blob/master/research/Routing_and_Architecture/Expert_Choice_NOTES.pdf"),
 'Soft_MoE_NOTES.pdf': ("From Sparse to Soft Mixture of Experts", "https://github.com/guiOsorio/Research-MoE/blob/master/research/Routing_and_Architecture/Soft_MoE_NOTES.pdf"),
 'Soft_MoE_NOTES.docx': ("From Sparse to Soft Mixture of Experts", "https://github.com/guiOsorio/Research-MoE/blob/master/research/Routing_and_Architecture/Soft_MoE_NOTES.pdf"),
 'Expert_Choice_NOTES.pdf': ("Mixture-of-Experts with Expert Choice Routing", "https://github.com/guiOsorio/Research-MoE/blob/master/research/Routing_and_Architecture/Expert_Choice_NOTES.pdf"),
 'Expert_Choice.pdf': ("Mixture-of-Experts with Expert Choice Routing", "https://arxiv.org/abs/2202.09368"),
 'Soft_MoE.pdf': ("From Sparse to Soft Mixture of Experts", "https://arxiv.org/abs/2308.00951"),
 'BASE_layers.pdf': ("BASE Layers: Simplifying Training of Large, Sparse Models", "https://arxiv.org/abs/2103.16716"),
 'MixtureOfTokens.pdf': ("Mixture-of-Tokens: Efficient LLMs Through Cross-Example Aggregation", "https://arxiv.org/abs/2310.15961"),
 'DeepSeekMoE.pdf': ("DeepSeekMoE: Towards Ultimate Expert Specialization in Mixture-of-Experts Language Models", "https://arxiv.org/abs/2401.06066"),
 'StableMoE.pdf': ("StableMoE: Stable Routing Strategy for Mixture of Experts", "https://arxiv.org/abs/2204.08396"),
 'DeepSeekMoE_NOTES.pdf': ("DeepSeekMoE: Towards Ultimate Expert Specialization in Mixture-of-Experts Language Models", "https://github.com/guiOsorio/Research-MoE/blob/master/research/Routing_and_Architecture/DeepSeekMoE_NOTES.pdf"),
 'DSelect_k_NOTES.docx': ("DSelect-k: Differentiable Selection in the Mixture of Experts with Applications to Multi-Task Learning", "https://github.com/guiOsorio/Research-MoE/blob/master/research/Routing_and_Architecture/DSelect_k_NOTES.pdf"),
 'MixtureOfTokens_NOTES.pdf': ("Mixture-of-Experts Meets Instruction Tuning: A Winning Combination for Large Language Models", "https://github.com/guiOsorio/Research-MoE/blob/master/research/Hybrid_Approaches/MoE_meets_instruction_tuning_NOTES.pdf"),
 'Hash_Layers_NOTES.docx': ("Hash Layers for Large Sparse Models", "https://github.com/guiOsorio/Research-MoE/blob/master/research/Routing_and_Architecture/Hash_Layers_NOTES.pdf"),
 'StableMoE_NOTES.pdf': ("StableMoE: Stable Routing Strategy for Mixture of Experts", "https://github.com/guiOsorio/Research-MoE/blob/master/research/Routing_and_Architecture/StableMoE_NOTES.pdf"),
 'BASE_layers_NOTES.pdf': ("BASE Layers: Simplifying Training of Large, Sparse Models", "https://github.com/guiOsorio/Research-MoE/blob/master/research/Routing_and_Architecture/BASE_layers_NOTES.pdf"),
 'DeepSeekMoE_NOTES.docx': ("DeepSeekMoE: Towards Ultimate Expert Specialization in Mixture-of-Experts Language Models", "https://github.com/guiOsorio/Research-MoE/blob/master/research/Routing_and_Architecture/DeepSeekMoE_NOTES.pdf"),
 'DSelect_k.pdf': ("DSelect-k: Differentiable Selection in the Mixture of Experts with Applications to Multi-Task Learning", "https://arxiv.org/abs/2106.03760"),
 'Hash_Layers.pdf': ("Hash Layers for Large Sparse Models", "https://arxiv.org/abs/2106.04426"),
 'Mistral.pdf': ("Mistral 7B", "https://arxiv.org/abs/2310.06825"),
 'Mixtral.pdf': ("Mixtral of Experts", "https://arxiv.org/abs/2401.04088"),
 'Mixtral_NOTES.docx': ("Mixtral of Experts", "https://github.com/guiOsorio/Research-MoE/blob/master/research/Routing_and_Architecture/Mixtral/Mixtral_NOTES.pdf"),
 'Mixtral_NOTES.pdf': ("Mixtral of Experts", "https://github.com/guiOsorio/Research-MoE/blob/master/research/Routing_and_Architecture/Mixtral/Mixtral_NOTES.pdf"),
 'Towards_Understanding_MoE_NOTES.pdf': ("Towards Understanding MoE", "https://github.com/guiOsorio/Research-MoE/blob/master/research/Understanding_MoE/Towards_Understanding_MoE_NOTES.pdf"),
 'Towards_Understanding_MoE.pdf': ("Towards Understanding MoE", "https://arxiv.org/abs/2208.02813"),
 'Mixture of Experts Explained_HF_NOTES.docx': ("HuggingFace MoE Article", "https://github.com/guiOsorio/Research-MoE/blob/master/research/Understanding_MoE/Mixture%20of%20Experts%20Explained_HF_NOTES.pdf"),
 'Mixture of Experts Explained_HF_NOTES.pdf': ("HuggingFace MoE Article", "https://github.com/guiOsorio/Research-MoE/blob/master/research/Understanding_MoE/Mixture%20of%20Experts%20Explained_HF_NOTES.pdf"),
 'Sparsely_Gated_MoE_NOTES.pdf': ("Outrageously Large Neural Networks: The Sparsely-Gated Mixture-of-Experts Layer", "https://github.com/guiOsorio/Research-MoE/blob/master/research/Understanding_MoE/Sparsely_Gated_MoE_NOTES.pdf"),
 'Sparsely_Gated_MoE.pdf': ("Outrageously Large Neural Networks: The Sparsely-Gated Mixture-of-Experts Layer", "https://arxiv.org/abs/1701.06538"),
 'Towards_Understanding_MoE_NOTES.docx': ("Towards Understanding MoE", "https://github.com/guiOsorio/Research-MoE/blob/master/research/Understanding_MoE/Towards_Understanding_MoE_NOTES.pdf"),
 'Learning_Factored_Representations_NOTES.docx': ("Learning Factorized Representations in a Deep Mixture-of-Experts", "https://github.com/guiOsorio/Research-MoE/blob/master/research/Understanding_MoE/Learning_Factored_Representations_NOTES.pdf"),
 'Original_MoE.pdf': ("Adaptive Mixture of Local Experts", "https://www.cs.toronto.edu/~hinton/absps/jjnh91.pdf"),
 'Learning_Factored_Representations_NOTES.pdf': ("Learning Factorized Representations in a Deep Mixture-of-Experts", "https://github.com/guiOsorio/Research-MoE/blob/master/research/Understanding_MoE/Learning_Factored_Representations_NOTES.pdf"),
 'Mixture of Experts Explained_HF.pdf': ("HuggingFace MoE Article", "https://huggingface.co/blog/moe"),
 'Sparsely_Gated_MoE_NOTES.docx': ("Outrageously Large Neural Networks: The Sparsely-Gated Mixture-of-Experts Layer", "https://github.com/guiOsorio/Research-MoE/blob/master/research/Understanding_MoE/Sparsely_Gated_MoE_NOTES.pdf"),
 'OpenMoE.pdf': ("OpenMoE: An Early Effort on Open Mixture-of-Experts Language Models", "https://arxiv.org/abs/2402.01739"),
 'OpenMoE_NOTES.docx': ("OpenMoE: An Early Effort on Open Mixture-of-Experts Language Models", "https://github.com/guiOsorio/Research-MoE/blob/master/research/Understanding_MoE/OpenMoE_NOTES.pdf"),
 'MoESurvey.pdf': ("A Review of Sparse Expert Models in Deep Learning", "https://arxiv.org/abs/2209.01667"),
 'OpenMoE_NOTES.pdf': ("OpenMoE: An Early Effort on Open Mixture-of-Experts Language Models", "https://github.com/guiOsorio/Research-MoE/blob/master/research/Understanding_MoE/OpenMoE_NOTES.pdf"),
 'Learning_Factored_Representations.pdf': ("Learning Factorized Representations in a Deep Mixture-of-Experts", "https://github.com/guiOsorio/Research-MoE/blob/master/research/Understanding_MoE/Learning_Factored_Representations_NOTES.pdf"),
 }

In [6]:
for (file, content) in paper_names.items():
    if (file in ('MoE Notes FINAL.docx', 'MoE Notes.docx')) or (file.endswith('.pdf')):
        file_contents[file]['source_name'] = content[0]
        file_contents[file]['source_url'] = content[1]

In [7]:
# Display the contents of the files
for i, (file, content) in enumerate(file_contents.items()):
    if file[-5:] == ".docx":
        print(f"{file} #{i+1}")
        print(content)  # Print the first 500 characters for brevity
        print("\n" + "="*80 + "\n")

MoE Notes.docx #1
{'content': "MOE PAPER REVIEWS\nEarly Days of MoE\n\nLearning Factored Representations in a Deep Mixture-of-Experts\n\nMain Idea:\nTo apply stacked layers of mixture-of-experts, so to have multiple sets of (gating, experts). This allows multiple combinations of experts to be called while keeping a modest model size.\nThe problem they are trying to solve for is that deep neural networks are expensive to compute at inference time since all the neurons are used.\nThe solution proposed is to implement stacked MoE layers, where multiple expert combinations are possible, and the gating mechanism ensures only useful neurons for that input are used (experts on the specific input space). This gives better computational efficiency at inference, allowing for a model that is both large and efficient.\n\nApproach:\nThe input is first passed through the first MoE layer (represented by z1):\nwhere  and represent the gating probability and expert output for expert i at layer 1, respe

In [8]:
# Function to chunk text into chunks of specified size
def chunk_text(text, chunk_size=1000, overlap=250):
    start = 0
    chunks = []
    while start < len(text):
        end = start + chunk_size
        chunks.append(text[start:end])
        start += chunk_size - overlap
    return chunks

In [9]:
# Chunk file contents
chunked_contents = {
    key: {
        'chunks': [],
        'source_name': '',
        'source_url': ''
    }
    for key in file_contents.keys()
}
for file, content in file_contents.items():
    chunks = chunk_text(content['content'])
    chunked_contents[file]['chunks'] = chunks
    chunked_contents[file]['source_name'] = file_contents[file]['source_name']
    chunked_contents[file]['source_url'] = file_contents[file]['source_url']

In [10]:
for i, (file, content) in enumerate(chunked_contents.items()):
    print(file)
    break

MoE Notes.docx


In [11]:
len(chunked_contents["MoE Notes.docx"]['chunks'])

269

In [12]:
chunked_contents["MoE Notes.docx"]

{'chunks': ['MOE PAPER REVIEWS\nEarly Days of MoE\n\nLearning Factored Representations in a Deep Mixture-of-Experts\n\nMain Idea:\nTo apply stacked layers of mixture-of-experts, so to have multiple sets of (gating, experts). This allows multiple combinations of experts to be called while keeping a modest model size.\nThe problem they are trying to solve for is that deep neural networks are expensive to compute at inference time since all the neurons are used.\nThe solution proposed is to implement stacked MoE layers, where multiple expert combinations are possible, and the gating mechanism ensures only useful neurons for that input are used (experts on the specific input space). This gives better computational efficiency at inference, allowing for a model that is both large and efficient.\n\nApproach:\nThe input is first passed through the first MoE layer (represented by z1):\nwhere  and represent the gating probability and expert output for expert i at layer 1, respectively.\nboth the

In [13]:
print(chunked_contents["MoE Notes.docx"]['chunks'][0])

MOE PAPER REVIEWS
Early Days of MoE

Learning Factored Representations in a Deep Mixture-of-Experts

Main Idea:
To apply stacked layers of mixture-of-experts, so to have multiple sets of (gating, experts). This allows multiple combinations of experts to be called while keeping a modest model size.
The problem they are trying to solve for is that deep neural networks are expensive to compute at inference time since all the neurons are used.
The solution proposed is to implement stacked MoE layers, where multiple expert combinations are possible, and the gating mechanism ensures only useful neurons for that input are used (experts on the specific input space). This gives better computational efficiency at inference, allowing for a model that is both large and efficient.

Approach:
The input is first passed through the first MoE layer (represented by z1):
where  and represent the gating probability and expert output for expert i at layer 1, respectively.
both the gating mechanism and the 

## Pinecone + SQLite Insert

In [14]:
from openai import OpenAI
from pinecone import Pinecone
from dotenv import load_dotenv
import os
import sqlite3

load_dotenv(dotenv_path="../.env")
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
client = OpenAI()

PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
PINECONE_INDEX_HOST = os.getenv('PINECONE_INDEX_HOST')
pc = Pinecone(api_key=PINECONE_API_KEY)
index = pc.Index(host=PINECONE_INDEX_HOST)

  from tqdm.autonotebook import tqdm


In [15]:
def get_embedding(text, model="text-embedding-3-small"):
   text = text.replace("\n", " ")
   return client.embeddings.create(input = [text], model=model).data[0].embedding

In [16]:
total_chunks = 0
for file in chunked_contents:
    for i, chunk in enumerate(chunked_contents[file]['chunks']):
        continue
    total_chunks += i
print(total_chunks)

4632


In [17]:
for k in chunked_contents.keys():
    print(k)

MoE Notes.docx
MoE Notes FINAL.docx
Unified_Scaling_Laws_NOTES.pdf
Switch_Transformers.pdf
GLaM.pdf
GShard.pdf
Unified_Scaling_Laws.pdf
ST_MoE.pdf
GLaM_NOTES.pdf
ST_MoE_NOTES.pdf
Switch_Transformers_NOTES.pdf
GShard_NOTES.pdf
Efficient_Large_Scale_LM.pdf
Efficient_Large_Scale_LM_NOTES.pdf
BTM.pdf
Benefits_of_ELMs.pdf
Expert_Gate_NOTES.pdf
BeyondDistillation_Task_Level_MoE.pdf
DEMix_NOTES.pdf
BTM_NOTES.pdf
DEMix.pdf
cBTM_NOTES.pdf
BeyondDistillation_Task_Level_MoE_NOTES.pdf
Expert_Gate.pdf
cBTM.pdf
Benefits_of_ELMs_NOTES.pdf
MoE_Mamba.pdf
BlackMamba.pdf
MoE_meets_instruction_tuning.pdf
MoE_Mamba_NOTES.pdf
Sparse_Upcycling.pdf
Soft_Merging_of_Experts.pdf
Sparse_Upcycling_NOTES.pdf
MoE_meets_instruction_tuning_NOTES.pdf
Soft_Merging_of_Experts_NOTES.pdf
EvoMoE.pdf
BlackMamba_NOTES.pdf
EvoMoE_NOTES.pdf
PE_SparsityCrafting_NOTES.pdf
MegaBlocks.pdf
QMoE.pdf
QMoE_NOTES.pdf
FastInferenceMoE.pdf
ExtremelyPE_MoE_for_InstructionTuning.pdf
MegaBlocks_NOTES.pdf
PE_SparsityCrafting.pdf
FastInference

In [19]:
# Connect to SQLite database (it will create the database file if it doesn't exist)
conn = sqlite3.connect('../chunks.db')
cursor = conn.cursor()

# Create a table to store chunks
cursor.execute('''
CREATE TABLE IF NOT EXISTS chunks (
    chunk_id TEXT PRIMARY KEY,
    content TEXT,
    source_name TEXT,
    source_url TEXT
)
''')

# Commit and close the connection
conn.commit()
conn.close()

In [25]:
# Function to insert chunk data into the database
def insert_chunk(chunk_id, content, source_name, source_url):
    conn = sqlite3.connect('chunks.db')
    cursor = conn.cursor()
    cursor.execute('''
    INSERT INTO chunks (chunk_id, content, source_name, source_url) VALUES (?, ?, ?, ?)
    ''', (chunk_id, content, source_name, source_url))
    conn.commit()
    conn.close()

In [1]:
for file, contents in chunked_contents.items():
    source_name = contents['source_name']
    source_url = contents['source_url']
    for i, chunk in enumerate(contents['chunks']):
        chunk_id = f"{file}_chunk_{i}"
        # SQLite3 insert
        insert_chunk(chunk_id, chunk, source_name, source_url)
        # Pinecone insert
        metadata = {"file_name": file, "source_name": source_name, "source_url": source_url}
        embed = get_embedding(chunk)
        upsert_response = index.upsert(
            vectors=[
                (chunk_id, embed, metadata),
            ]
        )

In [27]:
for key in chunked_contents.keys():
    print(chunked_contents[key])
    break

{'chunks': ['MOE PAPER REVIEWS\nEarly Days of MoE\n\nLearning Factored Representations in a Deep Mixture-of-Experts\n\nMain Idea:\nTo apply stacked layers of mixture-of-experts, so to have multiple sets of (gating, experts). This allows multiple combinations of experts to be called while keeping a modest model size.\nThe problem they are trying to solve for is that deep neural networks are expensive to compute at inference time since all the neurons are used.\nThe solution proposed is to implement stacked MoE layers, where multiple expert combinations are possible, and the gating mechanism ensures only useful neurons for that input are used (experts on the specific input space). This gives better computational efficiency at inference, allowing for a model that is both large and efficient.\n\nApproach:\nThe input is first passed through the first MoE layer (represented by z1):\nwhere  and represent the gating probability and expert output for expert i at layer 1, respectively.\nboth the