In [2]:
import os
import docx
import fitz  # PyMuPDF

## Chunking

In [8]:
# Function to read .docx files
def read_docx(file_path):
    try:
        doc = docx.Document(file_path)
        full_text = []
        for para in doc.paragraphs:
            full_text.append(para.text)
        return '\n'.join(full_text)
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return ""
    

# Function to read .pdf files
def read_pdf(file_path):
    try:
        pdf_document = fitz.open(file_path)
        text = ""
        for page_num in range(len(pdf_document)):
            page = pdf_document.load_page(page_num)
            text += page.get_text()
        return text
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return ""

# Directory containing the files
directory = "../research"

In [12]:
# Dictionary to store file content
file_contents = {}

# Traverse the directory and read files
for root, dirs, files in os.walk(directory):
    for file in files:
        file_path = os.path.join(root, file)
        if file.endswith('.docx'):
            content = read_docx(file_path)
            file_contents[file] = content
        elif file.endswith('.pdf'):
            content = read_pdf(file_path)
            file_contents[file] = content


Error reading ../research/MoE_Efficiency/ExtremelyPE_MoE_for_InstructionTuning_NOTES.docx: Package not found at '../research/MoE_Efficiency/ExtremelyPE_MoE_for_InstructionTuning_NOTES.docx'


In [22]:
# Display the contents of the files
for i, (file, content) in enumerate(file_contents.items()):
    if file[-5:] == ".docx":
        print(f"{file} #{i+1}")
        print(content)  # Print the first 500 characters for brevity
        print("\n" + "="*80 + "\n")
        break

MoE Notes.docx #1
MOE PAPER REVIEWS
Early Days of MoE

Learning Factored Representations in a Deep Mixture-of-Experts

Main Idea:
To apply stacked layers of mixture-of-experts, so to have multiple sets of (gating, experts). This allows multiple combinations of experts to be called while keeping a modest model size.
The problem they are trying to solve for is that deep neural networks are expensive to compute at inference time since all the neurons are used.
The solution proposed is to implement stacked MoE layers, where multiple expert combinations are possible, and the gating mechanism ensures only useful neurons for that input are used (experts on the specific input space). This gives better computational efficiency at inference, allowing for a model that is both large and efficient.

Approach:
The input is first passed through the first MoE layer (represented by z1):
where  and represent the gating probability and expert output for expert i at layer 1, respectively.
both the gating 

In [32]:
# Function to chunk text into chunks of specified size
def chunk_text(text, chunk_size=1000, overlap=250):
    start = 0
    chunks = []
    while start < len(text):
        end = start + chunk_size
        chunks.append(text[start:end])
        start += chunk_size - overlap
    return chunks

In [33]:
# Chunk file contents
chunked_contents = {}
for file, content in file_contents.items():
    chunks = chunk_text(content)
    chunked_contents[file] = chunks

In [34]:
for i, (file, content) in enumerate(chunked_contents.items()):
    print(file)
    break

MoE Notes.docx


In [35]:
len(chunked_contents["MoE Notes.docx"])

269

In [36]:
print(chunked_contents["MoE Notes.docx"][0])

MOE PAPER REVIEWS
Early Days of MoE

Learning Factored Representations in a Deep Mixture-of-Experts

Main Idea:
To apply stacked layers of mixture-of-experts, so to have multiple sets of (gating, experts). This allows multiple combinations of experts to be called while keeping a modest model size.
The problem they are trying to solve for is that deep neural networks are expensive to compute at inference time since all the neurons are used.
The solution proposed is to implement stacked MoE layers, where multiple expert combinations are possible, and the gating mechanism ensures only useful neurons for that input are used (experts on the specific input space). This gives better computational efficiency at inference, allowing for a model that is both large and efficient.

Approach:
The input is first passed through the first MoE layer (represented by z1):
where  and represent the gating probability and expert output for expert i at layer 1, respectively.
both the gating mechanism and the 

In [37]:
print(chunked_contents["MoE Notes.docx"][1])

is both large and efficient.

Approach:
The input is first passed through the first MoE layer (represented by z1):
where  and represent the gating probability and expert output for expert i at layer 1, respectively.
both the gating mechanism and the expert function use a non-linearity (ReLU)
The outputs of the first layer (z1) are then passed as an input to the next MoE layer z2, which replaces x with z1.
z2 is then passed through a final layer (f3) and a softmax is applied (in the context of classification)

The network is trained with SGD with a caveat to help balance the training through the experts:
The mean of all experts’ total assignment is compared to each expert’s running total assignment. If an expert is found to have a running total assignment significantly higher than the mean, its training is paused temporarily to allow for the training of other experts.
This strategy is found to mostly be useful in early stages of training, where the experts have not yet specialized signi

## Pinecone + SQLite Insert

In [78]:
from openai import OpenAI
from pinecone import Pinecone
from dotenv import load_dotenv
import os
import sqlite3

load_dotenv(dotenv_path="../.env")
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
client = OpenAI()

PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
PINECONE_INDEX_HOST = os.getenv('PINECONE_INDEX_HOST')
pc = Pinecone(api_key=PINECONE_API_KEY)
index = pc.Index(host=PINECONE_INDEX_HOST)

In [72]:
def get_embedding(text, model="text-embedding-3-small"):
   text = text.replace("\n", " ")
   return client.embeddings.create(input = [text], model=model).data[0].embedding

In [73]:
total_chunks = 0
for file in chunked_contents:
    for i, chunk in enumerate(chunked_contents[file]):
        continue
    total_chunks += i
print(total_chunks)

4856


In [80]:
# Connect to SQLite database (it will create the database file if it doesn't exist)
conn = sqlite3.connect('../chunks.db')
cursor = conn.cursor()

# Create a table to store chunks
cursor.execute('''
CREATE TABLE IF NOT EXISTS chunks (
    chunk_id TEXT PRIMARY KEY,
    content TEXT
)
''')

# Commit and close the connection
conn.commit()
conn.close()

In [None]:
# Function to insert chunk data into the database
def insert_chunk(chunk_id, content):
    conn = sqlite3.connect('chunks.db')
    cursor = conn.cursor()
    cursor.execute('''
    INSERT INTO chunks (chunk_id, content) VALUES (?, ?)
    ''', (chunk_id, content))
    conn.commit()
    conn.close()

In [1]:
for file, chunks in chunked_contents.items():
    for i, chunk in enumerate(chunks):
        chunk_id = f"{file}_chunk_{i}"
        # SQLite3 insert
        insert_chunk(chunk_id, chunk)
        # Pinecone insert
        metadata = {"file_name": file}
        embed = get_embedding(chunk)
        upsert_response = index.upsert(
            vectors=[
                (chunk_id, embed, metadata),
            ]
        )
    