In [1]:
#repo combiner

import os
import tempfile
import shutil
import mimetypes
import pandas as pd
import requests
import subprocess
from IPython.display import display, HTML, clear_output
import ipywidgets as widgets
from pathlib import Path
import io
import re
import urllib.parse

class GitHubRepoFileCombiner:
    def __init__(self):
        self.non_informative_extensions = {
            'jpg', 'jpeg', 'png', 'gif', 'bmp', 'webp',
            'tiff', 'svg', 'ico', 'heic', 'avif',
            'exe', 'bin', 'dll',
            'zip', 'tar', 'gz', 'rar',
            'mp3', 'wav', 'aac',
            'mp4', 'avi', 'mov', 'mkv',
            'log', 'md', 'lock',
            'tmp', 'bak',
            'ttf', 'otf', 'woff', 'woff2', 'eot',
            'doc', 'docx', 'odt', 'xlsx',
            'css', 'scss', 'less'
        }
        
        self.exclude_files = {
            'dataTables.js',
            'jquery-3.6.0.min.js'
        }
        
        self.exclude_directories = {
            '_locales',
            '_metadata',
            'sample_csv',
            '.git',
            '.github'
        }
        
        self.max_bytes = 6000
        self.file_data = []
        self.repo_name = ""
        self.setup_ui()

    def setup_ui(self):
        """Create every widget of the notebook interface and display it.

        Builds the URL field, the process button, the output area, the
        instructions panel and the three settings editors (extensions,
        files, directories), hooks up their callbacks, and renders the
        assembled layout.
        """
        # --- main controls -------------------------------------------------
        self.url_input = widgets.Text(
            value='',
            placeholder='Enter GitHub repository URL (can include subdirectory path)',
            description='Repo URL:',
            layout=widgets.Layout(width='600px'),
        )

        self.process_button = widgets.Button(
            description='Clone and Process Repository',
            button_style='primary',
        )
        self.process_button.on_click(self.on_process_button_clicked)

        self.output_area = widgets.Output()

        # --- static instructions panel --------------------------------------
        instructions_markup = """<div style="background-color:#f8f9fa; padding:10px; border-radius:5px; margin-bottom:10px;">
            <p><strong>Instructions:</strong></p>
            <ul>
                <li>Enter a GitHub repository URL (e.g. <code>https://github.com/username/repo</code>)</li>
                <li>You can also specify a subdirectory with a URL like: <code>https://github.com/username/repo/tree/branch/subdir</code></li>
                <li>Click "Clone and Process Repository" to start analysis</li>
                <li>Results will be saved to <code>reponame.txt</code> and <code>reponame.csv</code></li>
                <li>Use the settings below to customize which files to process</li>
            </ul>
            </div>"""
        self.help_text = widgets.HTML(value=instructions_markup)

        # --- settings editors -----------------------------------------------
        self.settings_header = widgets.HTML(value="<h2>Settings</h2>")

        def build_editor(header_html, hint, initial_values, on_add, on_remove):
            # One editor = header, text entry, "Add" button, selectable list
            # of current values, and a "Remove Selected" button.
            header = widgets.HTML(value=header_html)
            entry = widgets.Text(
                placeholder=hint,
                layout=widgets.Layout(width='250px'),
            )
            add_button = widgets.Button(description='Add')
            add_button.on_click(on_add)
            listing = widgets.Select(
                options=sorted(initial_values),
                layout=widgets.Layout(width='300px', height='150px'),
            )
            remove_button = widgets.Button(description='Remove Selected')
            remove_button.on_click(on_remove)
            return header, entry, add_button, listing, remove_button

        (
            self.non_info_header,
            self.non_info_input,
            self.non_info_add_button,
            self.non_info_list,
            self.non_info_remove_button,
        ) = build_editor(
            "<h3>Non-Informative Extensions</h3>",
            'Add extension e.g., txt',
            self.non_informative_extensions,
            self.add_non_informative,
            self.remove_non_informative,
        )

        (
            self.exclude_files_header,
            self.exclude_files_input,
            self.exclude_files_add_button,
            self.exclude_files_list,
            self.exclude_files_remove_button,
        ) = build_editor(
            "<h3>Exclude Files</h3>",
            'Add file e.g., example.txt',
            self.exclude_files,
            self.add_exclude_file,
            self.remove_exclude_file,
        )

        (
            self.exclude_dirs_header,
            self.exclude_dirs_input,
            self.exclude_dirs_add_button,
            self.exclude_dirs_list,
            self.exclude_dirs_remove_button,
        ) = build_editor(
            "<h3>Exclude Directories</h3>",
            'Add directory e.g., temp',
            self.exclude_directories,
            self.add_exclude_dir,
            self.remove_exclude_dir,
        )

        # --- assemble and render ---------------------------------------------
        title = widgets.HTML(value="<h1>GitHub Repository File Combiner</h1>")
        extensions_column = widgets.VBox([
            self.non_info_header,
            self.non_info_input,
            self.non_info_add_button,
            self.non_info_list,
            self.non_info_remove_button,
        ])
        files_column = widgets.VBox([
            self.exclude_files_header,
            self.exclude_files_input,
            self.exclude_files_add_button,
            self.exclude_files_list,
            self.exclude_files_remove_button,
        ])
        directories_column = widgets.VBox([
            self.exclude_dirs_header,
            self.exclude_dirs_input,
            self.exclude_dirs_add_button,
            self.exclude_dirs_list,
            self.exclude_dirs_remove_button,
        ])
        settings_row = widgets.HBox([
            extensions_column,
            files_column,
            directories_column,
        ])

        display(widgets.VBox([
            title,
            self.help_text,
            self.url_input,
            self.process_button,
            self.output_area,
            self.settings_header,
            settings_row,
        ]))

    def add_non_informative(self, _):
        """Add a new extension to non-informative list"""
        value = self.non_info_input.value.strip().lower()
        if value and value not in self.non_informative_extensions:
            self.non_informative_extensions.add(value)
            self.non_info_list.options = sorted(list(self.non_informative_extensions))
            self.non_info_input.value = ''
    
    def remove_non_informative(self, _):
        """Remove selected extension from non-informative list"""
        if self.non_info_list.value:
            self.non_informative_extensions.remove(self.non_info_list.value)
            self.non_info_list.options = sorted(list(self.non_informative_extensions))
    
    def add_exclude_file(self, _):
        """Add a new file to exclude list"""
        value = self.exclude_files_input.value.strip()
        if value and value not in self.exclude_files:
            self.exclude_files.add(value)
            self.exclude_files_list.options = sorted(list(self.exclude_files))
            self.exclude_files_input.value = ''
    
    def remove_exclude_file(self, _):
        """Remove selected file from exclude list"""
        if self.exclude_files_list.value:
            self.exclude_files.remove(self.exclude_files_list.value)
            self.exclude_files_list.options = sorted(list(self.exclude_files))
    
    def add_exclude_dir(self, _):
        """Add a new directory to exclude list"""
        value = self.exclude_dirs_input.value.strip()
        if value and value not in self.exclude_directories:
            self.exclude_directories.add(value)
            self.exclude_dirs_list.options = sorted(list(self.exclude_directories))
            self.exclude_dirs_input.value = ''
    
    def remove_exclude_dir(self, _):
        """Remove selected directory from exclude list"""
        if self.exclude_dirs_list.value:
            self.exclude_directories.remove(self.exclude_dirs_list.value)
            self.exclude_dirs_list.options = sorted(list(self.exclude_directories))

    def on_process_button_clicked(self, _):
        """Run the clone-and-process pipeline for the URL in the text box.

        All output, including any error message, is captured by the
        output-area widget, which is cleared first.

        Args:
            _: The button that fired the click (unused).
        """
        with self.output_area:
            clear_output()
            repo_url = self.url_input.value.strip()

            if repo_url:
                try:
                    print(f"Processing repository: {repo_url}")
                    self.process_repository(repo_url)
                except Exception as exc:
                    # UI boundary: report instead of letting the callback die.
                    print("Error: " + str(exc))
            else:
                print("Please enter a valid GitHub repository URL.")

    def extract_repo_info(self, repo_url):
        """Extract repository name and subdirectory path from URL"""
        # Handle both HTTPS and SSH GitHub URLs
        if repo_url.endswith('.git'):
            repo_url = repo_url[:-4]
            
        # Parse the URL
        parsed_url = urllib.parse.urlparse(repo_url)
        path = parsed_url.path.strip('/')
        
        # Handle SSH URLs (git@github.com:username/repo.git)
        if not parsed_url.scheme and ':' in parsed_url.path:
            path = parsed_url.path.split(':', 1)[1]
            
        # Extract the repo name and subdirectory from the path
        path_parts = path.split('/')
        
        # GitHub URLs typically have format: username/repo[/tree/branch/subdir]
        if len(path_parts) < 2:
            return "repository", "", ""
        
        repo_name = path_parts[1]
        subdirectory = ""
        branch = "main"  # Default branch
        
        # Check if URL points to a subdirectory
        if len(path_parts) > 3 and path_parts[2] == "tree":
            branch = path_parts[3]
            if len(path_parts) > 4:
                subdirectory = '/'.join(path_parts[4:])
        
        return repo_name, branch, subdirectory

    def process_repository(self, repo_url):
        """Shallow-clone a repository, collect its files and save the results.

        The repository is cloned with ``git clone --depth=1`` into a
        temporary directory. ``--branch`` is passed only when the URL names
        a branch through a ``/tree/<branch>`` part. Otherwise git checks out
        the remote's own default branch, so repositories whose default is
        not ``main`` still clone.

        After cloning, the whole tree (or only the subdirectory named in the
        URL) is scanned, the results are displayed, and the output files are
        written.

        Clone and processing failures are reported with ``print`` rather than
        raised.

        Args:
            repo_url: GitHub repository URL, optionally of the form
                ``.../tree/<branch>/<subdir>``.
        """
        self.file_data = []

        repo_name, branch, subdirectory = self.extract_repo_info(repo_url)
        self.repo_name = repo_name
        if subdirectory:
            # Keep output names distinct per subdirectory.
            self.repo_name = f"{repo_name}_{subdirectory.replace('/', '_')}"

        # Everything before "/tree/" is the clonable repository URL; the
        # marker's presence tells us the branch was given explicitly.
        main_repo_url, tree_marker, _ = repo_url.partition("/tree/")

        clone_command = ["git", "clone", "--depth=1"]
        if tree_marker and branch:
            clone_command += ["--branch", branch]

        with tempfile.TemporaryDirectory() as temp_dir:
            print("Cloning repository to temporary directory...")

            try:
                subprocess.run(
                    clone_command + [main_repo_url, temp_dir],
                    check=True,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE,
                )

                print("Repository cloned successfully.")
                print("Processing files...")

                process_path = temp_dir
                if subdirectory:
                    process_path = os.path.join(temp_dir, subdirectory)
                    print(f"Processing subdirectory: {subdirectory}")

                if not os.path.isdir(process_path):
                    # Also covers a URL that points at a file, not a directory.
                    print(f"Error: Subdirectory '{subdirectory}' not found in repository")
                else:
                    self.process_directory(process_path)
                    self.display_results()
                    self.save_to_file()

            except subprocess.CalledProcessError as e:
                # git reports the reason on stderr; decode defensively so an
                # odd byte cannot mask the real failure.
                error_msg = (
                    e.stderr.decode('utf-8', errors='replace')
                    if e.stderr else str(e)
                )
                print(f"Failed to clone repository: {error_msg}")
            except Exception as e:
                print(f"Error processing repository: {str(e)}")

        # Printed after the `with` block, once the directory is really gone.
        print("Temporary files have been cleaned up")
    def is_non_informative_file(self, filename):
        """Check if the file has a non-informative extension"""
        extension = filename.split('.')[-1].lower() if '.' in filename else ''
        return extension in self.non_informative_extensions

    def is_excluded_path(self, path):
        """Check if the path should be excluded"""
        segments = path.replace('\\', '/').split('/')
        filename = segments[-1]
        
        # Check if the file name is in the excluded files set
        if filename in self.exclude_files:
            return True
        
        # Check if any directory in the path is in the excluded directories set
        for segment in segments:
            if segment in self.exclude_directories:
                return True
                
        return False

    def get_first_bytes(self, text, max_bytes):
        """Get the first max_bytes characters of text"""
        return text[:max_bytes]

    def process_directory(self, directory_path, rel_path=''):
        """Process all files in a directory recursively"""
        for entry in os.scandir(directory_path):
            full_path = entry.path
            rel_entry_path = os.path.join(rel_path, entry.name)
            
            if entry.is_file():
                if self.is_excluded_path(rel_entry_path):
                    continue
                    
                if self.is_non_informative_file(entry.name):
                    self.file_data.append([
                        rel_entry_path,
                        'Excluded by file type',
                        '',
                        0,
                        ''
                    ])
                    continue
                    
                try:
                    # Get file size
                    size = entry.stat().st_size
                    
                    # Determine MIME type
                    mime_type, _ = mimetypes.guess_type(full_path)
                    mime_type = mime_type or 'application/octet-stream'
                    
                    # Read file content
                    with open(full_path, 'r', errors='replace') as file:
                        text = file.read()
                    
                    processed_text = text
                    comment = 'Included'
                    
                    # Truncate if needed
                    if size > self.max_bytes:
                        processed_text = self.get_first_bytes(text, self.max_bytes)
                        comment = f'Truncated to {self.max_bytes} bytes due to size > {self.max_bytes}'
                    
                    self.file_data.append([
                        rel_entry_path,
                        comment,
                        mime_type,
                        size,
                        processed_text
                    ])
                    
                except Exception as e:
                    self.file_data.append([
                        rel_entry_path,
                        f'Error: {str(e)}',
                        '',
                        0,
                        ''
                    ])
                    
            elif entry.is_dir():
                # Skip excluded directories
                if self.is_excluded_path(rel_entry_path):
                    continue
                    
                # Process subdirectory
                self.process_directory(full_path, rel_entry_path)

    def display_results(self):
        """Display the processed file data in a table"""
        if not self.file_data:
            print("No files processed.")
            return
            
        df = pd.DataFrame(self.file_data, columns=[
            'Full Path',
            'Comments',
            'Mime Type',
            'Size (bytes)',
            'Text Content'
        ])
        
        print(f"Processed {len(df)} files.")
        
        # Format the DataFrame for display
        display_df = df.copy()
        display_df['Text Content'] = display_df['Text Content'].apply(
            lambda x: (x[:100] + '...') if len(x) > 100 else x
        )
        
        # Display the formatted table
        display(HTML(display_df.to_html(
            classes='table table-striped table-bordered',
            escape=True,
            index=False,
            max_cols=5,
            max_rows=20
        )))
        
        return df

    def save_to_file(self):
        """Save the results to a text file"""
        if not self.file_data:
            print("No data to save.")
            return
            
        # Sanitize repo name for filename
        safe_repo_name = re.sub(r'[^\w\-_.]', '_', self.repo_name)
        filename = f"{safe_repo_name}.txt"
        
        df = pd.DataFrame(self.file_data, columns=[
            'Full Path',
            'Comments',
            'Mime Type',
            'Size (bytes)',
            'Text Content'
        ])
        
        with open(filename, 'w', encoding='utf-8') as f:
            for _, row in df.iterrows():
                f.write(f"File: {row['Full Path']}\n")
                f.write(f"Comments: {row['Comments']}\n")
                f.write(f"Mime Type: {row['Mime Type']}\n")
                f.write(f"Size: {row['Size (bytes)']} bytes\n")
                f.write("Content:\n")
                f.write(row['Text Content'])
                f.write("\n" + "-"*80 + "\n\n")
        
        print(f"Results saved to {filename}")
        
        # Also save as CSV for easier data manipulation
        csv_filename = f"{safe_repo_name}.csv"
        df.to_csv(csv_filename, index=False)
        print(f"Results also saved to CSV: {csv_filename}")

# Create the combiner. Its constructor calls setup_ui(), which builds and
# displays the widget interface, so running this line is what renders the tool.
combiner = GitHubRepoFileCombiner()


VBox(children=(HTML(value='<h1>GitHub Repository File Combiner</h1>'), HTML(value='<div style="background-colo…