## Step 1: Import Required Libraries


In [2]:
## URL Format : https://github.com/<owner>/<repository>/archive/refs/heads/<branch_name>.zip
import io
import zipfile
import requests
import frontmatter

## Step 2: Download the Repository
- GitHub's ZIP URL format:
https://codeload.github.com/{owner}/{repo}/zip/refs/heads/{branch}


In [3]:
url = 'https://codeload.github.com/fsamura01/taREDACTED_OPENAI_KEY-app/zip/refs/heads/main'
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on a 4xx/5xx reply instead of letting later
# cells try to unzip an error page.
resp = requests.get(url, timeout=30)
resp.raise_for_status()
resp

<Response [200]>

## Step 3: Process the ZIP File in Memory


In [4]:
repository_data = []

# Create a ZipFile object from the downloaded content. The context manager
# closes the archive even when parsing one of the files raises.
with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
    for file_info in zf.infolist():
        # NOTE: the path is lowercased here and stored in that form below.
        filename = file_info.filename.lower()

        # Only process markdown files
        if not filename.endswith('.md'):
            continue

        # Read the raw bytes, then release the member handle before parsing
        with zf.open(file_info) as f_in:
            content = f_in.read()

        # Split optional frontmatter metadata from the markdown body
        post = frontmatter.loads(content)
        data = post.to_dict()
        data['filename'] = filename
        repository_data.append(data)

In [5]:
print(f"Total documents extracted: {len(repository_data)}")

Total documents extracted: 3


In [7]:
# Preview at most the first five parsed documents and stop at the first one
# whose metadata contains a 'question' key.
for i in range(min(5, len(repository_data))):
    doc = repository_data[i]
    print(f"\n--- Document {i} ---")
    print(f"Filename: {doc.get('filename')}")
    print(f"Keys: {list(doc.keys())}")
    if 'question' not in doc:
        continue
    print(f"Question: {doc.get('question')}")
    break


--- Document 0 ---
Filename: taREDACTED_OPENAI_KEY-app-main/readme.md
Keys: ['content', 'filename']

--- Document 1 ---
Filename: taREDACTED_OPENAI_KEY-app-main/client/readme.md
Keys: ['content', 'filename']

--- Document 2 ---
Filename: taREDACTED_OPENAI_KEY-app-main/server/readme.md
Keys: ['content', 'filename']


In [13]:
# Take the first parsed document for a closer look. Despite the variable name,
# none of the documents previewed above exposed a 'question' key.
faq_doc = repository_data[0]
print(faq_doc)

{'content': '# Task Manager App\n\nA React-based task management application that allows users to create, edit, delete, and track tasks with due dates and completion status.\n\n## Features\n\n### Task Management\n- **Create Tasks**: Add new tasks with title, description, and due date\n- **Edit Tasks**: Modify existing tasks with inline editing\n- **Delete Tasks**: Remove tasks with confirmation dialog\n- **Toggle Completion**: Mark tasks as complete/incomplete with one click\n\n### User Experience\n- **Task Statistics**: View total, incomplete, and completed task counts\n- **Visual Feedback**: Different styling for completed vs incomplete tasks\n- **Loading States**: Clear feedback during API operations\n- **Error Handling**: Comprehensive error messages and validation\n\n### Form Validation\n- **Title**: Required, minimum 3 characters\n- **Description**: Required\n- **Due Date**: Required, cannot be in the past (for incomplete tasks)\n- **Real-time Validation**: Clear errors as user t

## Step 5: Support Multiple Markdown Types
- To also include `.mdx` files (MDX: Markdown with embedded JSX, common in React documentation), widen the extension check:



In [25]:
# Extension filter only: this snippet shows the widened check and does not
# read or parse anything; entries that pass the check have no further work here.
for file_info in zf.infolist():
    filename = file_info.filename.lower()

    # Skip every entry that is neither .md nor .mdx
    if not (filename.endswith('.md') or filename.endswith('.mdx')):
        continue


## Step 6: Complete Reusable Function
- Here's the production-ready version with error handling:

In [17]:
import io
import zipfile
import requests
import frontmatter

def read_repo_data(repo_owner, repo_name, branch='main', timeout=30):
    """
    Download and parse all markdown files from a GitHub repository.

    The branch archive is fetched from codeload.github.com and processed
    entirely in memory; nothing is written to disk.

    Args:
        repo_owner: GitHub username or organization
        repo_name: Repository name
        branch: Branch whose archive is downloaded (defaults to 'main')
        timeout: Seconds to wait for the HTTP request before giving up

    Returns:
        List of dictionaries, one per .md/.mdx file, holding the parsed
        frontmatter fields, the 'content' body and the archive 'filename'

    Raises:
        Exception: If the download does not return HTTP 200
    """
    prefix = 'https://codeload.github.com'
    url = f'{prefix}/{repo_owner}/{repo_name}/zip/refs/heads/{branch}'
    # Without a timeout a stalled connection would block the notebook forever.
    resp = requests.get(url, timeout=timeout)

    if resp.status_code != 200:
        raise Exception(f"Failed to download repository: {resp.status_code}")

    repository_data = []
    markdown_suffixes = ('.md', '.mdx')

    # The context manager closes the archive even if an unexpected error
    # escapes the per-file handler below.
    with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
        for file_info in zf.infolist():
            filename = file_info.filename

            # Case-insensitive extension match; the stored name keeps its case.
            if not filename.lower().endswith(markdown_suffixes):
                continue

            try:
                with zf.open(file_info) as f_in:
                    # Undecodable bytes are dropped rather than failing the file.
                    content = f_in.read().decode('utf-8', errors='ignore')
                post = frontmatter.loads(content)
                data = post.to_dict()
                data['filename'] = filename
                repository_data.append(data)
            except Exception as e:
                # Best effort: report the broken file and keep going.
                print(f"Error processing {filename}: {e}")
                continue

    return repository_data

## Step 7: Use the Function

In [26]:
# Download and parse the markdown files of one repository, then print the
# first parsed document as a quick sanity check
task_manager_app_docs = read_repo_data('fsamura01', 'taREDACTED_OPENAI_KEY-app')
print(task_manager_app_docs[0])

{'content': '# Task Manager App\n\nA React-based task management application that allows users to create, edit, delete, and track tasks with due dates and completion status.\n\n## Features\n\n### Task Management\n- **Create Tasks**: Add new tasks with title, description, and due date\n- **Edit Tasks**: Modify existing tasks with inline editing\n- **Delete Tasks**: Remove tasks with confirmation dialog\n- **Toggle Completion**: Mark tasks as complete/incomplete with one click\n\n### User Experience\n- **Task Statistics**: View total, incomplete, and completed task counts\n- **Visual Feedback**: Different styling for completed vs incomplete tasks\n- **Loading States**: Clear feedback during API operations\n- **Error Handling**: Comprehensive error messages and validation\n\n### Form Validation\n- **Title**: Required, minimum 3 characters\n- **Description**: Required\n- **Due Date**: Required, cannot be in the past (for incomplete tasks)\n- **Real-time Validation**: Clear errors as user t

## Step 8: Inspect the Data

In [27]:
# Look at the first document
print(task_manager_app_docs[0])

{'content': '# Task Manager App\n\nA React-based task management application that allows users to create, edit, delete, and track tasks with due dates and completion status.\n\n## Features\n\n### Task Management\n- **Create Tasks**: Add new tasks with title, description, and due date\n- **Edit Tasks**: Modify existing tasks with inline editing\n- **Delete Tasks**: Remove tasks with confirmation dialog\n- **Toggle Completion**: Mark tasks as complete/incomplete with one click\n\n### User Experience\n- **Task Statistics**: View total, incomplete, and completed task counts\n- **Visual Feedback**: Different styling for completed vs incomplete tasks\n- **Loading States**: Clear feedback during API operations\n- **Error Handling**: Comprehensive error messages and validation\n\n### Form Validation\n- **Title**: Required, minimum 3 characters\n- **Description**: Required\n- **Due Date**: Required, cannot be in the past (for incomplete tasks)\n- **Real-time Validation**: Clear errors as user t

## Today’s Tasks (Day 2)

### 1. Simple Chunking

In [24]:
def sliding_window(seq, size, step):
    """
    Cut a sequence into fixed-size windows taken every `step` positions.

    Args:
        seq: Sliceable sequence (string, list, ...) to cut up
        size: Maximum length of each window; must be positive
        step: Distance between consecutive window starts; must be positive

    Returns:
        List of {'start': offset, 'chunk': slice} dictionaries. Generation
        stops with the first window that reaches the end of the sequence,
        so the last chunk may be shorter than `size`.

    Raises:
        ValueError: If `size` or `step` is not positive
    """
    if size <= 0 or step <= 0:
        raise ValueError("size and step must be positive")

    length = len(seq)
    windows = []
    for offset in range(0, length, step):
        windows.append(dict(start=offset, chunk=seq[offset:offset + size]))
        # This window already covers the tail; later ones would only repeat it.
        if offset + size >= length:
            return windows

    return windows

In [30]:
# Window geometry for the character-based baseline: chunks of 2000 characters
# that start every 1000 characters, so neighbouring chunks overlap by half.
CHUNK_SIZE = 2000
CHUNK_STEP = 1000

task_manager_app_chunks = []

for doc in task_manager_app_docs:
    # Work on a copy so the parsed documents keep their 'content' field.
    doc_copy = doc.copy()
    doc_content = doc_copy.pop('content')
    chunks = sliding_window(doc_content, CHUNK_SIZE, CHUNK_STEP)
    for chunk in chunks:
        # Attach the document's remaining fields (e.g. filename) to every chunk.
        chunk.update(doc_copy)
    task_manager_app_chunks.extend(chunks)

In [31]:
task_manager_app_chunks

[{'start': 0,
  'chunk': '# Task Manager App\n\nA React-based task management application that allows users to create, edit, delete, and track tasks with due dates and completion status.\n\n## Features\n\n### Task Management\n- **Create Tasks**: Add new tasks with title, description, and due date\n- **Edit Tasks**: Modify existing tasks with inline editing\n- **Delete Tasks**: Remove tasks with confirmation dialog\n- **Toggle Completion**: Mark tasks as complete/incomplete with one click\n\n### User Experience\n- **Task Statistics**: View total, incomplete, and completed task counts\n- **Visual Feedback**: Different styling for completed vs incomplete tasks\n- **Loading States**: Clear feedback during API operations\n- **Error Handling**: Comprehensive error messages and validation\n\n### Form Validation\n- **Title**: Required, minimum 3 characters\n- **Description**: Required\n- **Due Date**: Required, cannot be in the past (for incomplete tasks)\n- **Real-time Validation**: Clear err

In [36]:
!uv add groq

[2mResolved [1m20 packages[0m [2min 834ms[0m[0m
[2mPrepared [1m2 packages[0m [2min 782ms[0m[0m
         If the cache and target directories are on different filesystems, hardlinking may not be supported.
[2mInstalled [1m7 packages[0m [2min 563ms[0m[0m
 [32m+[39m [1mannotated-types[0m[2m==0.7.0[0m
 [32m+[39m [1mdistro[0m[2m==1.9.0[0m
 [32m+[39m [1mgroq[0m[2m==1.0.0[0m
 [32m+[39m [1mpydantic[0m[2m==2.12.5[0m
 [32m+[39m [1mpydantic-core[0m[2m==2.41.5[0m
 [32m+[39m [1msniffio[0m[2m==1.3.1[0m
 [32m+[39m [1mtyping-inspection[0m[2m==0.4.2[0m


In [None]:
import os

# Read the key from the environment so a real secret never has to be typed
# into (and saved with) the notebook. "api-key" is only a placeholder that is
# used when the variable is unset or empty.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY") or "api-key"
#OPENAI_API_KEY="api-key"

In [None]:
#from openai import OpenAI
from groq import Groq

# Use the key configured in the previous cell instead of repeating a literal,
# so there is exactly one place where the credential is set.
groq_client = Groq(api_key=GROQ_API_KEY)

def llm(prompt, model='llama-3.1-8b-instant'): # Updated to a currently supported Groq model
    """
    Send a single-turn prompt to the Groq chat completions endpoint.

    Args:
        prompt: Text sent as the only user message
        model: Identifier of the Groq model to query

    Returns:
        The message content of the first choice in the response
    """
    completion = groq_client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [40]:
# Prompt used for LLM-based chunking. `{document}` is filled in with
# str.format by intelligent_chunking, and the `---` separators requested in
# the output format are what that function splits the reply on.
prompt_template = """
Split the provided document into logical sections
that make sense for a Q&A system.

Each section should be self-contained and cover
a specific topic or concept.

<DOCUMENT>
{document}
</DOCUMENT>

Use this format:

## Section Name

Section content with all relevant details

---

## Another Section Name

Another section content

---
""".strip()

In [41]:
def intelligent_chunking(text):
    """
    Ask the LLM to split a document into self-contained sections.

    Args:
        text: Document content inserted into `prompt_template`

    Returns:
        List of non-empty section strings with surrounding whitespace removed
    """
    import re

    prompt = prompt_template.format(document=text)
    # Treat a missing reply as "no sections" rather than crashing on None.
    response = llm(prompt) or ''
    # Split only on lines made up solely of dashes. A plain split('---') would
    # also cut inside markdown table rules such as |---|---| and damage sections.
    sections = re.split(r'^[ \t]*-{3,}[ \t]*$', response, flags=re.MULTILINE)
    return [s.strip() for s in sections if s.strip()]

In [43]:
!uv add tqdm

[2mResolved [1m22 packages[0m [2min 643ms[0m[0m
[2mPrepared [1m1 package[0m [2min 264ms[0m[0m
         If the cache and target directories are on different filesystems, hardlinking may not be supported.
[2mInstalled [1m1 package[0m [2min 479ms[0m[0m
 [32m+[39m [1mtqdm[0m[2m==4.67.1[0m


In [45]:
from tqdm.auto import tqdm

task_manager_app_chunks = []

# Upper bound on the characters of document content sent in one LLM call.
# Heuristic guard against 'Payload Too Large' errors (HTTP 413): 20000
# characters is a conservative estimate meant to keep the whole prompt
# (document + instructions) well within API limits.
MAX_DOC_CHARS_FOR_LLM = 20000

for doc in tqdm(task_manager_app_docs):
    # Separate the body from the remaining fields without touching `doc`.
    doc_copy = {key: value for key, value in doc.items() if key != 'content'}
    doc_content = doc['content']

    # Blank documents would only produce an empty prompt.
    if not doc_content.strip():
        continue

    # Oversized documents are reported and skipped rather than sent.
    if len(doc_content) > MAX_DOC_CHARS_FOR_LLM:
        skip_notice = (
            f"Skipping intelligent chunking for '{doc_copy.get('filename', 'Unknown')}' "
            f"because its content is too large ({len(doc_content)} chars > {MAX_DOC_CHARS_FOR_LLM} chars). "
            "Consider pre-chunking large documents with a simpler method before LLM processing."
        )
        print(skip_notice)
        continue

    sections = intelligent_chunking(doc_content)
    for section in sections:
        # One record per section: the document's other fields plus the section text.
        section_doc = dict(doc_copy, section=section)
        task_manager_app_chunks.append(section_doc)

  0%|          | 0/3 [00:00<?, ?it/s]

In [46]:
task_manager_app_chunks

[{'filename': 'taREDACTED_OPENAI_KEY-app-main/README.md',
  'section': '## Task Manager App Features\n\nThe Task Manager App offers various features for task management, user experience, form validation, and data management.\n\n### Task Management\n\n* Allows users to create, edit, delete, and track tasks with due dates and completion status\n* Includes features such as creating tasks, editing tasks, deleting tasks, and toggling task completion status\n\n### User Experience\n\n* Provides task statistics, including total, incomplete, and completed task counts\n* Offers visual feedback through different styling for completed and incomplete tasks\n* Manages loading states and error handling for a more comprehensive user experience\n\n### Form Validation\n\n* Validates task title, description, and due date fields\n* Ensures title has a minimum of 3 characters, description is required, and due date is not in the past for incomplete tasks\n* Offers real-time validation as the user types\n\n#