In [1]:
import os
import pandas as pd
import mimetypes
import glob
from IPython.display import display, HTML
import ipywidgets as widgets
from pathlib import Path
import re
import base64

class FileExplorer:
    def __init__(self):
        # Configuration settings
        self.non_informative_extensions = {
            'jpg', 'jpeg', 'png', 'gif', 'bmp', 'webp',
            'tiff', 'svg', 'ico', 'heic', 'avif',
            'exe', 'bin', 'dll',
            'zip', 'tar', 'gz', 'rar',
            'mp3', 'wav', 'aac',
            'mp4', 'avi', 'mov', 'mkv',
            'log', 'md', 'lock',
            'tmp', 'bak',
            'ttf', 'otf', 'woff', 'woff2', 'eot',
            'doc', 'docx', 'odt', 'xlsx',
            'css', 'scss', 'less'
        }
        
        self.exclude_files = {
            'dataTables.js',
            'jquery-3.6.0.min.js'
        }
        
        self.exclude_directories = {
            '_locales',
            '_metadata',
            'sample_csv',
            '.git',
            '.github',
            '__pycache__',
            '.ipynb_checkpoints'
        }
        
        self.max_bytes = 6000
        self.df = None
        
        # Initialize UI components
        self.setup_ui()
    
    def setup_ui(self):
        """Create every widget the explorer uses and attach the button handlers."""

        def text_box(hint, label, width):
            # Empty single-line input with a placeholder and a fixed width.
            return widgets.Text(
                value='',
                placeholder=hint,
                description=label,
                layout=widgets.Layout(width=width)
            )

        def push_button(label, hover, handler, style=None):
            # Button wired to `handler`; `button_style` is only passed when set
            # so unstyled buttons keep the widget's own default.
            options = {'description': label, 'tooltip': hover}
            if style is not None:
                options['button_style'] = style
            button = widgets.Button(**options)
            button.on_click(handler)
            return button

        # Directory selection
        self.dir_path_input = text_box('Enter directory path', 'Directory:', '70%')
        self.scan_button = push_button(
            'Scan Directory', 'Click to scan the directory',
            self.on_scan_clicked, style='primary'
        )

        # Settings: one input + button pair per filter list
        self.extension_input = text_box('Add extension (e.g., txt)', 'Extension:', '50%')
        self.add_extension_button = push_button(
            'Add Non-Informative Extension', 'Add to excluded extensions',
            self.add_extension
        )

        self.exclude_file_input = text_box('Add filename to exclude', 'Filename:', '50%')
        self.add_exclude_file_button = push_button(
            'Add Excluded File', 'Add to excluded files',
            self.add_exclude_file
        )

        self.exclude_dir_input = text_box('Add directory to exclude', 'Directory:', '50%')
        self.add_exclude_dir_button = push_button(
            'Add Excluded Directory', 'Add to excluded directories',
            self.add_exclude_dir
        )

        # Area that receives status messages and scan/search results
        self.status_output = widgets.Output()

        # Content search
        self.search_input = text_box('Search in files...', 'Search:', '70%')
        self.search_button = push_button(
            'Search', 'Search in file contents',
            self.search_files, style='info'
        )

        # Panel listing the current filter settings, filled in immediately
        self.settings_output = widgets.Output()
        self.update_settings_display()
    
    def display_ui(self):
        """Render the whole explorer interface in the current output area."""
        directory_row = widgets.HBox([self.dir_path_input, self.scan_button])
        search_row = widgets.HBox([self.search_input, self.search_button])

        # One row per filter list: input box on the left, its button on the right.
        setting_pairs = (
            (self.extension_input, self.add_extension_button),
            (self.exclude_file_input, self.add_exclude_file_button),
            (self.exclude_dir_input, self.add_exclude_dir_button),
        )
        settings_rows = widgets.VBox([
            widgets.HBox([field, button]) for field, button in setting_pairs
        ])

        # Display order, top to bottom.
        page = (
            widgets.HTML("<h2>File Directory Explorer</h2>"),
            directory_row,
            search_row,
            widgets.HTML("<h3>Settings</h3>"),
            settings_rows,
            self.settings_output,
            self.status_output,
        )
        for element in page:
            display(element)
    
    def update_settings_display(self):
        """Redraw the settings panel from the three current filter sets."""
        self.settings_output.clear_output()
        with self.settings_output:
            # Each set is shown as a sorted, comma-separated list.
            extensions = ', '.join(sorted(self.non_informative_extensions))
            filenames = ', '.join(sorted(self.exclude_files))
            directories = ', '.join(sorted(self.exclude_directories))
            display(HTML(f"""
            <h4>Current Settings:</h4>
            <p><b>Non-Informative Extensions:</b> {extensions}</p>
            <p><b>Excluded Files:</b> {filenames}</p>
            <p><b>Excluded Directories:</b> {directories}</p>
            """))
    
    def add_extension(self, b):
        extension = self.extension_input.value.strip().lower()
        if extension:
            if extension.startswith('.'):
                extension = extension[1:]
            self.non_informative_extensions.add(extension)
            self.extension_input.value = ''
            self.update_settings_display()
    
    def add_exclude_file(self, b):
        filename = self.exclude_file_input.value.strip()
        if filename:
            self.exclude_files.add(filename)
            self.exclude_file_input.value = ''
            self.update_settings_display()
    
    def add_exclude_dir(self, b):
        dirname = self.exclude_dir_input.value.strip()
        if dirname:
            self.exclude_directories.add(dirname)
            self.exclude_dir_input.value = ''
            self.update_settings_display()
    
    def is_non_informative_file(self, filename):
        extension = filename.split('.')[-1].lower() if '.' in filename else ''
        return extension in self.non_informative_extensions
    
    def is_excluded_path(self, filepath):
        path_parts = filepath.split(os.sep)
        filename = path_parts[-1]
        
        # Check if the file is in the excluded files list
        if filename in self.exclude_files:
            return True
        
        # Check if any directory in the path is in the excluded directories list
        for part in path_parts:
            if part in self.exclude_directories:
                return True
        
        return False
    
    def process_directory(self, directory_path):
        file_data = []
        
        # Walk through all files in the directory
        for root, dirs, files in os.walk(directory_path):
            # Filter out excluded directories
            dirs[:] = [d for d in dirs if d not in self.exclude_directories]
            
            for file in files:
                # Create the full file path
                file_path = os.path.join(root, file)
                rel_path = os.path.relpath(file_path, directory_path)
                
                # Skip excluded paths
                if self.is_excluded_path(file_path):
                    continue
                
                # Process non-informative files differently
                if self.is_non_informative_file(file):
                    try:
                        file_size = os.path.getsize(file_path)
                        mime_type = mimetypes.guess_type(file_path)[0] or 'application/octet-stream'
                        file_data.append({
                            'full_path': rel_path,
                            'comments': 'Excluded by file type',
                            'mime_type': mime_type,
                            'size_bytes': file_size,
                            'text_content': ''
                        })
                    except Exception as e:
                        file_data.append({
                            'full_path': rel_path,
                            'comments': f'Error processing file: {str(e)}',
                            'mime_type': 'unknown',
                            'size_bytes': 0,
                            'text_content': ''
                        })
                    continue
                
                # Process regular text files
                try:
                    file_size = os.path.getsize(file_path)
                    mime_type = mimetypes.guess_type(file_path)[0] or 'application/octet-stream'
                    
                    # Check if the file is likely to be binary
                    if self.is_likely_binary(file_path):
                        file_data.append({
                            'full_path': rel_path,
                            'comments': 'Binary file (not shown)',
                            'mime_type': mime_type,
                            'size_bytes': file_size,
                            'text_content': ''
                        })
                        continue
                    
                    # Read text content with size limit
                    if file_size > self.max_bytes:
                        with open(file_path, 'r', errors='replace') as f:
                            content = f.read(self.max_bytes)
                        file_data.append({
                            'full_path': rel_path,
                            'comments': f'Truncated to {self.max_bytes} bytes due to size > {self.max_bytes}',
                            'mime_type': mime_type,
                            'size_bytes': file_size,
                            'text_content': content
                        })
                    else:
                        with open(file_path, 'r', errors='replace') as f:
                            content = f.read()
                        file_data.append({
                            'full_path': rel_path,
                            'comments': 'Included',
                            'mime_type': mime_type,
                            'size_bytes': file_size,
                            'text_content': content
                        })
                except Exception as e:
                    file_data.append({
                        'full_path': rel_path,
                        'comments': f'Error reading file: {str(e)}',
                        'mime_type': 'unknown',
                        'size_bytes': file_size if 'file_size' in locals() else 0,
                        'text_content': ''
                    })
        
        return pd.DataFrame(file_data)
    
    def is_likely_binary(self, filepath):
        """Check if a file is likely to be binary rather than text."""
        try:
            with open(filepath, 'rb') as f:
                chunk = f.read(1024)
                # Count null bytes and other non-text characters
                null_count = chunk.count(b'\x00')
                control_count = len([b for b in chunk if b < 9 and b != 0])
                
                # If there are nulls or too many control characters, likely binary
                if null_count > 0 or control_count > 0:
                    return True
                return False
        except:
            return True  # If we can't read it, assume it's binary
    
    def on_scan_clicked(self, b):
        """Handle a click on 'Scan Directory'.

        Validates the path typed in the directory box, scans it with
        ``process_directory``, stores the result in ``self.df`` and shows a
        summary table. All messages go to the status output area. ``b`` is the
        clicked button and is not used.
        """
        target = self.dir_path_input.value.strip()

        with self.status_output:
            self.status_output.clear_output()

            # Pick the first validation failure, if any.
            problem = None
            if not target:
                problem = "Error: Please enter a directory path."
            elif not os.path.exists(target):
                problem = f"Error: Directory '{target}' does not exist."
            elif not os.path.isdir(target):
                problem = f"Error: '{target}' is not a directory."

            if problem is not None:
                print(problem)
                return

            print(f"Scanning directory: {target}...")
            try:
                self.df = self.process_directory(target)
                print(f"Process complete! Found {len(self.df)} files.")

                if self.df.empty:
                    print("No files found matching the criteria.")
                else:
                    display(HTML('<h3>File Data</h3>'))

                    # Widen pandas' column display, then show every column
                    # except the (potentially long) text content.
                    pd.set_option('display.max_colwidth', 100)
                    summary_columns = ['full_path', 'comments', 'mime_type', 'size_bytes']
                    display(self.df[summary_columns])

                    print("\nUse `.query_files()` method to search or filter the data.")

            except Exception as e:
                print(f"Error processing directory: {str(e)}")
    
    def query_files(self, query=None):
        """Query the files using a pandas query string or display all files if no query."""
        if self.df is None or self.df.empty:
            print("No data available. Please scan a directory first.")
            return
        
        if query:
            try:
                result = self.df.query(query)
                if result.empty:
                    print(f"No results found for query: {query}")
                else:
                    print(f"Found {len(result)} matching files:")
                    return result
            except Exception as e:
                print(f"Error executing query: {str(e)}")
                print("Example queries: \"size_bytes > 1000\" or \"full_path.str.contains('py')\"")
        else:
            return self.df
    
    def search_files(self, b):
        """Handle a click on 'Search': find scanned files containing the term.

        The term typed in the search box is matched case-insensitively as a
        *literal* substring of each file's stored ``text_content``. Matching
        files are listed in the status output area, followed by an excerpt of
        the first matching file (up to 100 characters either side of the first
        hit) with every occurrence highlighted.

        The excerpt and the file path are HTML-escaped before being displayed,
        and highlighted text keeps the casing it has in the file.

        Args:
            b: The button that fired the click event (unused).
        """
        import html  # local import: used only for escaping below

        search_term = self.search_input.value.strip()

        with self.status_output:
            self.status_output.clear_output()

            if not search_term:
                print("Please enter a search term.")
                return

            if self.df is None or self.df.empty:
                print("No data available. Please scan a directory first.")
                return

            try:
                # regex=False keeps the listing consistent with the literal
                # (re.escape'd) pattern used for the excerpt, and stops terms
                # such as "foo(" from failing as invalid regular expressions.
                hits = self.df['text_content'].str.contains(
                    search_term, case=False, na=False, regex=False
                )
                matches = self.df[hits]

                if matches.empty:
                    print(f"No files containing '{search_term}' found.")
                    return

                print(f"Found {len(matches)} files containing '{search_term}':")
                display(matches[['full_path', 'comments', 'mime_type', 'size_bytes']])

                # Show the first match with context
                first_file = matches.iloc[0]
                file_content = first_file['text_content']
                file_path = first_file['full_path']

                pattern = re.compile(re.escape(search_term), re.IGNORECASE)
                found = pattern.search(file_content)
                if found is None:
                    # The two case-insensitive matchers disagreed on this
                    # text; the listing above is still valid, so just skip
                    # the excerpt.
                    return

                start = max(0, found.start() - 100)
                end = min(len(file_content), found.end() + 100)
                context = file_content[start:end]

                # Escape the plain text between hits and wrap each hit, using
                # the text actually found in the file, not the search term.
                pieces = []
                cursor = 0
                for hit in pattern.finditer(context):
                    pieces.append(html.escape(context[cursor:hit.start()]))
                    pieces.append(f"<mark><b>{html.escape(hit.group(0))}</b></mark>")
                    cursor = hit.end()
                pieces.append(html.escape(context[cursor:]))
                highlighted = ''.join(pieces)

                display(HTML(f"<h4>Match context in {html.escape(str(file_path))}:</h4>"))
                display(HTML(f"<pre>{highlighted}</pre>"))

            except Exception as e:
                print(f"Error during search: {str(e)}")
    
    def view_file_content(self, file_index):
        """Display the full content of a specific file by its index."""
        if self.df is None or self.df.empty:
            print("No data available. Please scan a directory first.")
            return
        
        if file_index < 0 or file_index >= len(self.df):
            print(f"Invalid file index. Must be between 0 and {len(self.df)-1}.")
            return
        
        file = self.df.iloc[file_index]
        display(HTML(f"<h3>File: {file['full_path']}</h3>"))
        display(HTML(f"<p><b>MIME Type:</b> {file['mime_type']}</p>"))
        display(HTML(f"<p><b>Size:</b> {file['size_bytes']} bytes</p>"))
        display(HTML(f"<p><b>Comments:</b> {file['comments']}</p>"))
        
        if file['text_content']:
            display(HTML("<h4>Content:</h4>"))
            display(HTML(f"<pre>{file['text_content']}</pre>"))
        else:
            display(HTML("<p>No text content available for this file.</p>"))

# Example usage: create the explorer (its constructor also builds the widgets)
# and render the interface in this cell's output area.
explorer = FileExplorer()
explorer.display_ui()

# How to use this tool:
# 1. Enter a directory path and click 'Scan Directory'.
# 2. Search the contents of the scanned files using the search box.
# 3. Query the scanned data programmatically with a pandas query string:
#    - explorer.query_files("size_bytes > 1000")
#    - explorer.query_files("full_path.str.contains('json')")
# 4. View the content of a specific file by its row position:
#    - explorer.view_file_content(0)  # View first file

HTML(value='<h2>File Directory Explorer</h2>')

HBox(children=(Text(value='', description='Directory:', layout=Layout(width='70%'), placeholder='Enter directo…

HBox(children=(Text(value='', description='Search:', layout=Layout(width='70%'), placeholder='Search in files.…

HTML(value='<h3>Settings</h3>')

VBox(children=(HBox(children=(Text(value='', description='Extension:', layout=Layout(width='50%'), placeholder…

Output()

Output()