In [1]:
import os
import subprocess
import datetime

In [2]:
# repo info

repo_url = "https://github.com/huggingface/smolagents"
# repo_url = "https://github.com/urchade/GLiNER"

# The repository name is the last path segment of the URL. Trailing slashes and a
# ".git" suffix are stripped first so that URLs copied from a browser or from
# `git remote -v` still yield a clean directory name.
repo_name = repo_url.rstrip("/").split("/")[-1]
if repo_name.endswith(".git"):
    repo_name = repo_name[: -len(".git")]

In [3]:
# set parameters
# Base locations for the cloned repositories and for the generated context files.
raw_directory = "../data/raw"
processed_directory = "../data/processed"

# Per-repository paths derived from the base locations and the repository name.
source_directory = raw_directory + "/" + repo_name
output_markdown_file = processed_directory + "/" + repo_name + "-context.md"

# File name suffixes to include in the combined file.
# Alternative that includes every file: file_extensions = ["all"]
file_extensions = [".py", ".txt", ".md", ".ipynb"]

In [4]:
# set github_token
# os.environ only accepts string values, so assigning the result of os.getenv()
# directly raises TypeError when GITHUB_TOKEN is not set. Re-export the token only
# when it is present and report its absence otherwise.
_github_token = os.getenv('GITHUB_TOKEN')
if _github_token is not None:
    os.environ['GITHUB_TOKEN'] = _github_token
else:
    print("GITHUB_TOKEN is not set; continuing without it.")

In [5]:
# Clone required repositories to source_directory
if not os.path.exists(source_directory):
    print(f"Cloning {repo_url} to {source_directory}")
    # An argument list (no shell) passes the URL and path verbatim even if they
    # contain spaces or shell metacharacters, and check=True raises
    # CalledProcessError when git fails instead of letting later cells run
    # against a missing directory.
    subprocess.run(["git", "clone", repo_url, source_directory], check=True)

In [6]:
def convert_and_cleanup_notebooks(directory):
    """
    Convert all `.ipynb` files in a directory (and subdirectories) to scripts,
    and delete each `.ipynb` file after its conversion succeeded.

    Conversion runs `jupyter nbconvert --to script` once per notebook. Failures
    are reported with `print` and do not stop the remaining notebooks from being
    processed; a notebook whose conversion failed is left in place.

    Args:
        directory (str): Path to the directory to search for `.ipynb` files.
    """
    # Walk through the directory and subdirectories
    for root, _, files in os.walk(directory):
        for file_name in files:
            if not file_name.endswith(".ipynb"):
                continue

            file_path = os.path.join(root, file_name)
            print(f"Converting {file_path} to script...")

            # Conversion and deletion are guarded separately: a missing `jupyter`
            # executable raises FileNotFoundError (an OSError subclass) and must be
            # reported as a conversion failure, not as a deletion failure.
            try:
                subprocess.run(["jupyter", "nbconvert", "--to", "script", file_path], check=True)
            except (subprocess.CalledProcessError, OSError) as e:
                print(f"Error converting {file_path}: {e}")
                continue

            # Remove the original .ipynb file only after a successful conversion
            try:
                os.remove(file_path)
            except OSError as e:
                print(f"Error deleting {file_path}: {e}")
            else:
                print(f"Removed {file_path} after conversion.")

In [7]:
import os
from datetime import datetime
from collections import defaultdict
import mimetypes

def get_description(file_path):
    """
    Return a one-sentence description chosen by the file name's suffix.

    Recognized suffixes are `.py`, `.json`, `.csv` and `.txt`; any other path
    gets a generic fallback sentence.
    """
    known_descriptions = (
        ('.py', "This is a Python script."),
        ('.json', "This JSON file contains structured data."),
        ('.csv', "This CSV file contains tabular data."),
        ('.txt', "This text file contains general information."),
    )
    for suffix, sentence in known_descriptions:
        if file_path.endswith(suffix):
            return sentence
    return "No specific description available."

def is_binary(file_path):
    """
    Decide whether a file should be treated as binary.

    The MIME type guessed from the file name is checked first; a known text type
    means "not binary" without opening the file. Otherwise the first 1 KB is read
    and decoded as UTF-8: a decoding failure, or a file that cannot be opened,
    means "binary".

    :param file_path: Path of the file to inspect.
    :return: True if the file is considered binary, False if it looks like text.
    """
    # Local import: this cell only imports os, datetime, defaultdict and mimetypes.
    import codecs

    mime_type, _ = mimetypes.guess_type(file_path)
    # Common MIME types for text files, including Markdown
    text_mime_types = [
        "text/plain",
        "application/json",
        "application/xml",
        "application/csv",
        "text/markdown"
    ]
    if mime_type in text_mime_types:
        return False

    sample_size = 1024
    try:
        with open(file_path, 'rb') as file:
            sample = file.read(sample_size)
    except (OSError, ValueError):
        # Unreadable files are reported as binary so callers skip their content.
        return True

    # A full-size sample may end in the middle of a multi-byte UTF-8 character.
    # Decoding incrementally with final=False tolerates such a cut-off tail, while
    # a sample that contains the whole file is validated strictly.
    decoder = codecs.getincrementaldecoder('utf-8')()
    try:
        decoder.decode(sample, final=len(sample) < sample_size)
    except UnicodeDecodeError:
        return True
    return False

def get_code_block_marker(file_path):
    """
    Pick the language tag used after the opening fence of a Markdown code block.

    `.py`, `.json` and `.csv` paths map to their own tag; everything else is
    tagged as plain text.
    """
    suffix_to_marker = {
        '.py': "python",
        '.json': "json",
        '.csv': "csv",
    }
    matches = (
        marker
        for suffix, marker in suffix_to_marker.items()
        if file_path.endswith(suffix)
    )
    return next(matches, "plaintext")

def summarize_directory(src_dir):
    """
    Count the files below `src_dir` (recursively), grouped by file extension.

    Returns a defaultdict mapping each extension (including the leading dot, or
    an empty string for files without one) to the number of files found.
    """
    counts_by_extension = defaultdict(int)
    for _, _, file_names in os.walk(src_dir):
        for name in file_names:
            _, extension = os.path.splitext(name)
            counts_by_extension[extension] += 1
    return counts_by_extension

In [8]:
def _select_fence(text, minimum=3):
    """Return a backtick fence that is longer than any backtick run inside `text`."""
    longest_run = current_run = 0
    for char in text:
        current_run = current_run + 1 if char == "`" else 0
        if current_run > longest_run:
            longest_run = current_run
    return "`" * max(minimum, longest_run + 1)


def create_combined_markdown(src_dir, output_file, file_extensions=["all"]):
    """
    Combine all files in the source directory into a single Markdown file.

    The output contains a header with the source directory and generation time,
    a per-extension file count, a table of contents, and one section per selected
    file. Binary files are listed but not embedded; text files are embedded in a
    fenced code block and truncated after 512 lines. Files that cannot be read are
    noted in the output and additionally logged to `error_log.txt` in the current
    working directory.

    :param src_dir: Source directory to process.
    :param output_file: Path to the output Markdown file. Missing parent
        directories are created.
    :param file_extensions: List of file extensions to include. Use ["all"] to include all files.
    """
    # Imported locally because the notebook imports both the `datetime` module and
    # the `datetime` class under the same name; which one is bound globally depends
    # on the order in which the cells were run.
    from datetime import datetime

    max_lines = 512  # files longer than this are truncated in the output
    include_all = "all" in file_extensions
    suffixes = tuple(file_extensions)
    error_log = []

    def anchor_for(relative_path):
        # Replace the OS separator as well as "/" so anchors are flat on every platform.
        return "file-" + relative_path.replace(os.sep, "-").replace("/", "-")

    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Collect the selected files once so that the table of contents and the content
    # sections are guaranteed to list the same files in the same order.
    output_abspath = os.path.abspath(output_file)
    selected = []
    for root, _, files in os.walk(src_dir):
        for file in files:
            if not (include_all or file.endswith(suffixes)):
                continue
            file_path = os.path.join(root, file)
            if os.path.abspath(file_path) == output_abspath:
                continue  # never embed the output file into itself
            selected.append((file_path, os.path.relpath(file_path, src_dir)))

    # Explicit UTF-8 keeps the result independent of the platform's default encoding.
    with open(output_file, 'w', encoding='utf-8') as markdown_file:
        # Global metadata section
        markdown_file.write("# Combined Context for LLM\n\n")
        markdown_file.write(f"Source Directory: {src_dir}\n")
        markdown_file.write(f"Generated On: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")

        # Directory summary
        markdown_file.write("## Directory Summary\n\n")
        summary = summarize_directory(src_dir)
        for ext, count in summary.items():
            markdown_file.write(f"* {ext if ext else 'No extension'}: {count} files\n")
        markdown_file.write("\n")

        # Table of Contents
        markdown_file.write("## Table of Contents\n\n")
        for _, relative_path in selected:
            markdown_file.write(f"* [{relative_path}](#{anchor_for(relative_path)})\n")
        markdown_file.write("\n")

        # File contents
        for file_path, relative_path in selected:
            description = get_description(file_path)
            code_block_marker = get_code_block_marker(file_path)

            markdown_file.write(f"## File: {relative_path}\n\n")
            markdown_file.write(f"<a name='{anchor_for(relative_path)}'></a>\n")
            markdown_file.write(f"*Description*: {description}\n\n")

            if is_binary(file_path):
                markdown_file.write("*This file is binary and cannot be displayed as text.*\n")
                continue

            try:
                # errors='replace' keeps files readable when they contain invalid
                # bytes beyond the sample inspected by is_binary.
                with open(file_path, 'r', encoding='utf-8', errors='replace') as input_file:
                    kept_lines = []
                    truncated = False
                    # Stop as soon as one line beyond the limit is seen instead of
                    # loading the whole file.
                    for line in input_file:
                        if len(kept_lines) == max_lines:
                            truncated = True
                            break
                        kept_lines.append(line)
                body = "".join(kept_lines)
                if truncated:
                    body += "\n*Content truncated for brevity.*\n"
            except (OSError, UnicodeError) as e:
                error_log.append(f"Error reading file {file_path}: {e}")
                body = f"Error reading file: {e}\n"

            # The fence must be longer than any backtick run in the content,
            # otherwise embedded Markdown fences would close the block early.
            fence = _select_fence(body)
            markdown_file.write(f"{fence}{code_block_marker}\n")
            markdown_file.write(body)
            markdown_file.write(f"\n{fence}\n\n")

    if error_log:
        with open("error_log.txt", "w", encoding='utf-8') as log_file:
            log_file.write("\n".join(error_log))
        print("Errors logged to error_log.txt")

    print(f"Combined Markdown file created: {output_file}")


In [9]:
import os
import shutil

def cleanup_repository(repo_path):
    """
    Remove unwanted directories and files from the repository.

    The entries `.git`, `.github`, `.gitignore`, `public` and `data` directly
    below `repo_path` are deleted if present. Real directories are removed
    recursively; regular files and symbolic links are unlinked, so the target
    of a symlink is never touched.

    Args:
        repo_path (str): Path to the cloned repository.
    """
    unwanted_names = (".git", ".github", ".gitignore", "public", "data")

    for name in unwanted_names:
        path = os.path.join(repo_path, name)
        # lexists also reports dangling symlinks, which os.path.exists would skip.
        if not os.path.lexists(path):
            continue
        # shutil.rmtree refuses to operate on a symlink, so a link that points
        # to a directory is handled by the unlink branch instead.
        if os.path.isdir(path) and not os.path.islink(path):
            shutil.rmtree(path)  # Remove directories
            print(f"Removed directory: {path}")
        else:
            os.remove(path)  # Remove files and symlinks
            print(f"Removed file: {path}")


## Process the repository

In [10]:
# Remove the unwanted files and directories from the cloned repository
cleanup_repository(repo_path=source_directory)

In [11]:
# Turn every notebook in the clone into a script and delete the original .ipynb
convert_and_cleanup_notebooks(directory=source_directory)

In [12]:
# Write the repository's selected files into one combined Markdown file
create_combined_markdown(
    src_dir=source_directory,
    output_file=output_markdown_file,
    file_extensions=file_extensions,
)

Combined Markdown file created: ../data/processed/smolagents-context.md
