In [1]:
!uv add groq

[2mResolved [1m110 packages[0m [2min 0.88ms[0m[0m
[2mAudited [1m19 packages[0m [2min 0.50ms[0m[0m


In [2]:
import io
import zipfile
import requests
import frontmatter

def read_repo_data(repo_owner, repo_name, branch='main', timeout=30):
    """
    Download and parse all markdown files from a GitHub repository.

    The repository is fetched as a single zip archive from codeload.github.com
    and read entirely in memory; nothing is written to disk.

    Args:
        repo_owner: GitHub username or organization
        repo_name: Repository name
        branch: Branch whose archive is downloaded (defaults to 'main')
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot hang the notebook forever

    Returns:
        List of dictionaries containing file content and metadata. Each
        dictionary is the ``to_dict()`` of the parsed frontmatter post plus a
        'filename' key holding the path of the file inside the archive.

    Raises:
        Exception: If the archive download does not return HTTP 200.
    """
    prefix = 'https://codeload.github.com'
    url = f'{prefix}/{repo_owner}/{repo_name}/zip/refs/heads/{branch}'
    resp = requests.get(url, timeout=timeout)

    if resp.status_code != 200:
        raise Exception(f"Failed to download repository: {resp.status_code}")

    repository_data = []

    # The context manager closes the archive even if iteration fails midway.
    with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
        for file_info in zf.infolist():
            filename = file_info.filename

            # Case-insensitive match on markdown extensions only.
            if not filename.lower().endswith(('.md', '.mdx')):
                continue

            try:
                with zf.open(file_info) as f_in:
                    content = f_in.read().decode('utf-8', errors='ignore')
                post = frontmatter.loads(content)
                data = post.to_dict()
                data['filename'] = filename
                repository_data.append(data)
            except Exception as e:
                # Best effort: one unreadable file must not abort the whole load.
                print(f"Error processing {filename}: {e}")
                continue

    return repository_data


In [3]:
# Download the markdown/MDX files of the two source repositories.
brdt_mcp = read_repo_data('brightdata', 'brightdata-mcp')
ai_docs = read_repo_data('patchy631', 'ai-engineering-hub')

# Labels name the repositories that were actually downloaded above.
print(f"Bright Data MCP documents: {len(brdt_mcp)}")
print(f"AI Engineering Hub documents: {len(ai_docs)}")


FAQ documents: 4
Evidently documents: 93


In [6]:
from groq import Groq

# Shared Groq client used by `llm`; no API key is passed explicitly here.
# NOTE(review): presumably the SDK reads the key from the GROQ_API_KEY
# environment variable when none is given — confirm before running.
groq_client = Groq()  # Or use os.environ.get("GROQ_API_KEY")

# Prompt asking the model to split a document into self-contained sections.
# `{document}` is filled in with str.format; the requested output format
# separates sections with a `---` line.
prompt_template = """
Split the provided document into logical sections
that make sense for a Q&A system.

Each section should be self-contained and cover
a specific topic or concept.

<DOCUMENT>
{document}
</DOCUMENT>

Use this format:

## Section Name

Section content with all relevant details

---

## Another Section Name

Another section content

---
""".strip()

def llm(document, model='llama-3.1-8b-instant'):
    """Send ``document`` to the Groq chat model wrapped in ``prompt_template``.

    Args:
        document: Raw document text substituted into ``prompt_template``.
        model: Name of the chat model to query.

    Returns:
        The message content of the first choice in the completion.

    NOTE(review): another cell in this notebook defines ``llm(prompt, ...)``,
    which expects an already formatted prompt. Whichever cell ran last wins,
    and ``intelligent_chunking`` formats the template itself before calling
    ``llm`` — with this variant active the template is applied twice.
    """
    system_message = {
        "role": "system",
        "content": "You are a helpful assistant that organizes documents for Q&A systems.",
    }
    user_message = {
        "role": "user",
        "content": prompt_template.format(document=document),
    }

    completion = groq_client.chat.completions.create(
        model=model,
        messages=[system_message, user_message],
    )

    first_choice = completion.choices[0]
    return first_choice.message.content


In [5]:
from groq import Groq

# NOTE(review): this recreates the `groq_client` already built in an earlier cell.
groq_client = Groq()  # Pass api_key="..." explicitly, or rely on the environment variable

def llm(prompt, model='llama-3.1-8b-instant'):
    """Send an already formatted ``prompt`` to the Groq chat model.

    Args:
        prompt: Full text of the single user message.
        model: Name of the chat model to query.

    Returns:
        The message content of the first choice in the completion.
    """
    completion = groq_client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [None]:
# Prompt asking the model to split a document into self-contained sections;
# `{document}` is filled in with str.format.
# NOTE(review): this is a verbatim duplicate of the `prompt_template` assigned
# in an earlier cell — keep a single definition to avoid the two drifting apart.
prompt_template = """
Split the provided document into logical sections
that make sense for a Q&A system.

Each section should be self-contained and cover
a specific topic or concept.

<DOCUMENT>
{document}
</DOCUMENT>

Use this format:

## Section Name

Section content with all relevant details

---

## Another Section Name

Another section content

---
""".strip()

In [7]:
def intelligent_chunking(text):
    """Split ``text`` into sections by asking the LLM to restructure it.

    The document is inserted into ``prompt_template``, sent through ``llm``,
    and the reply is cut at separator lines consisting only of dashes.

    Args:
        text: Document content to chunk.

    Returns:
        List of non-empty section strings with surrounding whitespace removed.
        Empty if the model returns nothing.

    NOTE(review): this function formats ``prompt_template`` itself, so it
    expects the ``llm`` variant that takes a ready-made prompt.
    """
    import re  # local import keeps this cell runnable on its own

    prompt = prompt_template.format(document=text)
    # Guard against a missing reply so the split below never sees None.
    response = llm(prompt) or ""

    # Split only on lines that are nothing but dashes. A plain
    # str.split('---') would also cut inside markdown table separator rows
    # such as `|---|---|`.
    parts = re.split(r'^[ \t]*-{3,}[ \t]*$', response, flags=re.MULTILINE)
    return [part.strip() for part in parts if part.strip()]

In [9]:
!uv add tqdm

[2K[2mResolved [1m111 packages[0m [2min 53ms[0m[0m                                        [0m
         If the cache and target directories are on different filesystems, hardlinking may not be supported.
[2K[2mInstalled [1m1 package[0m [2min 67ms[0m[0m                                 [0m
 [32m+[39m [1mtqdm[0m[2m==4.67.1[0m


In [10]:
from tqdm.auto import tqdm

# One output record per LLM-produced section: every key of the source
# document except 'content' is carried over, and the section text is stored
# under 'section'.
evidently_chunks = []

for doc in tqdm(brdt_mcp):
    metadata = {key: value for key, value in doc.items() if key != 'content'}

    for section in intelligent_chunking(doc['content']):
        evidently_chunks.append({**metadata, 'section': section})

  0%|          | 0/4 [00:00<?, ?it/s]

In [24]:
# Preview only the first few chunks; displaying the whole list floods the notebook output.
evidently_chunks[:3]

[{'filename': 'brightdata-mcp-main/CHANGELOG.md',
  'section': '## Project Changelogs\n\nThe changelog document contains records of all the notable changes made to the project, including updates, bug fixes, and new features.'},
 {'filename': 'brightdata-mcp-main/CHANGELOG.md',
  'section': '## Release Notes for Version 1.9.x Series\n\nThe 1.9.x series focuses on expanding web data collection capabilities and improving authentication mechanisms. Key highlights include the addition of 23 new web data tools.\n\n### Changed\n\n- Updated browser authentication to use API_TOKEN instead of previous authentication method\n- BROWSER_ZONE is now an optional parameter, the default zone is `mcp_browser`\n- Removed duplicate web_data_ tools\n- Updated coding conventions and file formatting\n- Enhanced web data API endpoints integration\n\n### Fixed\n\n- Fixed spelling errors and improved coding conventions\n- Converted files back to Unix line endings for consistency'},
 {'filename': 'brightdata-mcp

In [14]:
import csv

def save_chunks_to_csv(data, filename='evidently_chunks.csv'):
    """Write a list of chunk dictionaries to a UTF-8 CSV file.

    The header is the union of keys over all rows, in first-seen order, so
    the column layout is identical on every run. Rows that lack a key get an
    empty cell for that column.

    Args:
        data: List of dictionaries to write; nothing is written when empty.
        filename: Destination path of the CSV file.

    Returns:
        None. A short status message is printed.
    """
    if not data:
        print("No data to save.")
        return

    # dict.fromkeys de-duplicates while keeping insertion order; a set would
    # yield a different column order from one interpreter run to the next.
    fieldnames = list(dict.fromkeys(key for row in data for key in row))

    with open(filename, mode='w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data)

    print(f"Saved {len(data)} chunks to {filename}")



In [15]:
# Export all chunks to the default CSV path ('evidently_chunks.csv').
save_chunks_to_csv(evidently_chunks)

Saved 48 chunks to evidently_chunks.csv


In [25]:
def save_chunks_to_markdown(data, filename='evidently_chunks.md'):
    """Write chunk dictionaries to a markdown file, separated by `---` rules.

    The body of each entry is taken from the 'content' key, falling back to
    'section' (the key used by the chunking loop in this notebook). A
    `## <title>` heading is written before the body, except when the chunk
    has no 'title' key and its body already starts with a markdown heading.

    Args:
        data: List of chunk dictionaries; nothing is written when empty.
        filename: Destination path of the markdown file.

    Returns:
        None. A short status message is printed.
    """
    if not data:
        print("No data to save.")
        return

    with open(filename, 'w', encoding='utf-8') as f:
        for chunk in data:
            # Reading 'content' alone yields empty bodies for chunks that
            # store their text under 'section'.
            section_content = (chunk.get("content") or chunk.get("section") or "").strip()

            # Avoid stacking a generic "Untitled Section" heading on top of a
            # body that already brings its own heading.
            has_own_heading = "title" not in chunk and section_content.startswith("#")
            if not has_own_heading:
                section_title = chunk.get("title", "Untitled Section")
                f.write(f"## {section_title}\n\n")

            f.write(f"{section_content}\n\n")
            f.write("---\n\n")

    print(f"Saved {len(data)} sections to {filename}")


In [26]:
# Export all chunks to the default markdown path of the definition above.
save_chunks_to_markdown(evidently_chunks)

Saved 48 sections to evidently_chunks.md


In [28]:
def save_chunks_to_markdown(data, filename='evidently_chunks2.md'):
    """Dump the 'section' text of every chunk to a markdown file.

    Each section is stripped of surrounding whitespace and followed by a
    `---` separator. No extra heading is added. Chunks without a 'section'
    key contribute an empty body.

    Args:
        data: List of chunk dictionaries; nothing is written when empty.
        filename: Destination path of the markdown file.

    Returns:
        None. A short status message is printed.
    """
    if data:
        with open(filename, 'w', encoding='utf-8') as out:
            for record in data:
                body = record.get("section", "").strip()
                out.write(body + "\n\n")
                out.write("---\n\n")

        print(f"Saved {len(data)} sections to {filename}")
    else:
        print("No data to save.")


In [29]:
# Resolves to the most recent `save_chunks_to_markdown` definition, whose
# default target is 'evidently_chunks2.md'.
save_chunks_to_markdown(evidently_chunks)

Saved 48 sections to evidently_chunks2.md
