# 🔍 Google AI Studio File Search Tool (Optimized)
Fast search across 1000+ files without an extension in your "/Google AI Studio" Drive folder.

In [None]:
# Mount Google Drive under /content/drive; DRIVE_FOLDER (configured below)
# is a path inside this mount point.
from google.colab import drive
drive.mount('/content/drive')

import os
import re
import json
from pathlib import Path
from IPython.display import HTML, display
from concurrent.futures import ThreadPoolExecutor, as_completed
import time

# Install tqdm for progress bars if not already installed
try:
    from tqdm import tqdm
except ImportError:
    import subprocess
    import sys

    # Invoke pip through the running interpreter so the package is installed
    # into this kernel's environment rather than whichever `pip` is first on
    # PATH. check=True makes a failed install raise here instead of surfacing
    # as a confusing second ImportError on the next line.
    subprocess.run(
        [sys.executable, "-m", "pip", "install", "-q", "tqdm"],
        check=True,
    )
    from tqdm import tqdm

# Configuration
# Root folder (inside the mounted Drive) that is scanned for files to search.
DRIVE_FOLDER = '/content/drive/My Drive/Google AI Studio'
# Text encoding used when opening files for searching.
ENCODING = 'utf-8'
# Default upper bound (in bytes) for files included by the scan.
MAX_FILE_SIZE = 50 * 1024 * 1024  # 50MB - skip larger files
# Default thread-pool size for the parallel search.
NUM_WORKERS = 4  # Parallel threads for searching

# Status output for the setup cell, including the configured worker count.
print("‚úÖ Google Drive mounted!")
print(f"‚öôÔ∏è  Using {NUM_WORKERS} parallel workers for fast searching")

In [None]:
def find_files_without_extension(folder_path, max_size=MAX_FILE_SIZE):
    """Recursively collect files without an extension under ``folder_path``.

    A file counts as "without extension" when its name contains no dot, or
    when the name starts with a dot (hidden files such as ``.gitignore``;
    note that this rule also admits names like ``.env.local``).

    Args:
        folder_path: Directory to scan, including all of its subdirectories.
        max_size: Largest file size in bytes to include; bigger files are
            skipped.

    Returns:
        A list of full file paths sorted by ascending file size. The list is
        empty when ``folder_path`` is not an existing directory or when no
        file matches.
    """
    if not os.path.isdir(folder_path):
        print(f"‚ùå Folder not found: {folder_path}")
        return []

    def report_walk_error(error):
        # os.walk() silently ignores directory-listing errors unless an
        # onerror callback is supplied, so a try/except around the loop
        # would never see them. Report unreadable directories here instead.
        if isinstance(error, PermissionError):
            print(f"‚ö†Ô∏è  Permission denied: {error}")

    sized_paths = []
    for root, _dirs, files in os.walk(folder_path, onerror=report_walk_error):
        for name in files:
            if '.' in name and not name.startswith('.'):
                continue  # regular "name.ext" file - not of interest
            full_path = os.path.join(root, name)
            try:
                file_size = os.path.getsize(full_path)
            except OSError:
                # Broken link, vanished file, or no permission: skip it.
                continue
            if file_size <= max_size:
                sized_paths.append((full_path, file_size))

    # Smallest files first so the search produces early results quickly.
    sized_paths.sort(key=lambda item: item[1])
    return [path for path, _size in sized_paths]

# Find files without extension
print("üìÇ Scanning for files without extension...")
start_time = time.time()
files_without_ext = find_files_without_extension(DRIVE_FOLDER)
scan_time = time.time() - start_time

if files_without_ext:
    # Stat each file exactly once and tolerate files that disappeared (or
    # became unreadable) after the scan, so the summary cannot crash on a
    # stale path. Such files are reported with a size of 0.
    file_sizes = {}
    for path in files_without_ext:
        try:
            file_sizes[path] = os.path.getsize(path)
        except OSError:
            pass

    total_size = sum(file_sizes.values()) / (1024*1024)
    print(f"‚úÖ Found {len(files_without_ext)} files ({total_size:.1f}MB) in {scan_time:.2f}s\n")
    print("Sample files:")
    for file in files_without_ext[:10]:
        size = file_sizes.get(file, 0) / 1024
        print(f"  ‚Ä¢ {file} ({size:.1f}KB)")
    if len(files_without_ext) > 10:
        print(f"  ... and {len(files_without_ext) - 10} more")
else:
    print("üì≠ No files without extension found")

print(f"\nTotal files: {len(files_without_ext)}")

In [None]:
def search_in_file(file_path, search_term, case_sensitive=False, flags=0):
    """Search a single file for a regular expression and report every match.

    Args:
        file_path: Path of the file to read. It is decoded with ``ENCODING``;
            bytes that cannot be decoded are dropped.
        search_term: Regular-expression pattern handed to ``re.finditer``.
        case_sensitive: When False, ``re.IGNORECASE`` is added to ``flags`` so
            the argument is honoured even if the caller passes no flags.
        flags: Additional ``re`` flags for the search.

    Returns:
        A list of ``(file_path, line_number, matched_text, line_text, offset)``
        tuples, where ``line_number`` is 1-based and ``offset`` is the match
        start within the decoded file content. The list is empty when nothing
        matches or the file cannot be read. If the pattern is not a valid
        regular expression, a single tuple is returned with line number 0 and
        the error message in place of the matched text.
    """
    from bisect import bisect_right

    if not case_sensitive:
        flags |= re.IGNORECASE

    # Only the read itself is best-effort: unreadable files are skipped
    # silently, while errors in the matching logic below stay visible.
    try:
        with open(file_path, 'r', encoding=ENCODING, errors='ignore') as f:
            content = f.read()
    except (OSError, ValueError):
        return []

    try:
        matches = list(re.finditer(search_term, content, flags))
    except re.error as e:
        return [(file_path, 0, f'‚ùå Regex error: {e}', '', 0)]

    if not matches:
        return []

    # Pre-compute where each line starts so every match is mapped to its
    # line with a binary search instead of re-counting newlines from the
    # beginning of the file for each match.
    lines = content.split('\n')
    line_starts = [0]
    for line in lines[:-1]:
        line_starts.append(line_starts[-1] + len(line) + 1)

    results = []
    for match in matches:
        pos = match.start()
        line_index = bisect_right(line_starts, pos) - 1
        results.append(
            (file_path, line_index + 1, match.group(), lines[line_index], pos)
        )
    return results

def search_in_files_parallel(files, search_term, case_sensitive=False, num_workers=NUM_WORKERS):
    """Search many files for a regular expression using a thread pool.

    Args:
        files: Iterable of file paths to search.
        search_term: Regular-expression pattern to look for.
        case_sensitive: When False the search ignores case.
        num_workers: Number of worker threads in the pool.

    Returns:
        A list of ``(file_path, line_number, matched_text, line_text, offset)``
        tuples gathered from ``search_in_file``, in the order the per-file
        searches complete. If ``search_term`` is not a valid regular
        expression, a single error tuple (empty path, line number 0, the
        error message as matched text) is returned and no file is read.
    """
    flags = 0 if case_sensitive else re.IGNORECASE

    # Validate the pattern once up front. Otherwise every file would be read
    # only to fail on the same pattern, producing one error row per file.
    try:
        re.compile(search_term, flags)
    except re.error as e:
        return [('', 0, f'‚ùå Regex error: {e}', '', 0)]

    results = []
    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        # Submit all tasks
        futures = {
            executor.submit(search_in_file, file_path, search_term, case_sensitive, flags): file_path
            for file_path in files
        }

        # Collect results with progress bar. len(futures) equals the number
        # of submitted files and also works when `files` has no len().
        with tqdm(total=len(futures), desc="üîç Searching", unit="file") as pbar:
            for future in as_completed(futures):
                results.extend(future.result())
                pbar.update(1)

    return results

def display_results_html(results, limit=50):
    """Render search results as an HTML list in the notebook output.

    Args:
        results: List of ``(file_path, line_number, matched_text, line_text,
            offset)`` tuples as produced by the search functions.
        limit: Maximum number of results to render; any remainder is
            summarised in a trailing "... and N more matches" line.

    Returns:
        None. The HTML is shown through ``display(HTML(...))``.
    """
    from html import escape

    if not results:
        display(HTML("<p><strong>üì≠ No matches found</strong></p>"))
        return

    parts = [
        f"<p><strong>‚úÖ Found {len(results)} matches</strong></p>",
        "<div style='border: 1px solid #ddd; padding: 10px; border-radius: 5px; max-height: 600px; overflow-y: auto;'>",
    ]

    for i, (file_path, line_num, match, line_content, _pos) in enumerate(results[:limit], 1):
        # Paths, matches and line text come from arbitrary files, so they are
        # escaped before being embedded; otherwise markup inside a searched
        # file would be interpreted by the browser instead of shown as text.
        file_name = escape(file_path.replace('/content/drive/My Drive/', ''))
        parts.append("<div style='margin-bottom: 15px; padding-bottom: 10px; border-bottom: 1px solid #eee;'>")
        parts.append(f"<p><strong>{i}. {file_name}</strong></p>")
        parts.append(f"<p style='color: #666; font-size: 12px;'>Line {line_num}</p>")
        parts.append("<p style='background-color: #f5f5f5; padding: 5px; border-radius: 3px; word-break: break-all;'>")
        parts.append(f"<code><strong style='color: #d9534f;'>{escape(match)}</strong></code></p>")

        if line_content:
            # Truncate before escaping so an HTML entity is never cut in half.
            context = line_content[:120]
            if len(line_content) > 120:
                context += "..."
            parts.append(f"<p style='color: #999; font-size: 12px;'><em>{escape(context)}</em></p>")
        parts.append("</div>")

    if len(results) > limit:
        parts.append(f"<p><em>... and {len(results) - limit} more matches</em></p>")

    parts.append("</div>")
    display(HTML("".join(parts)))

# Confirmation printed once the cell defining the search helpers has run.
print("‚ö° Optimized search functions ready!")

In [None]:
# Fast search - modify these values to search
# NOTE: search_term is interpreted as a regular expression; wrap it in
# re.escape(...) to search for literal text containing characters like . ( [ *
search_term = "TODO"  # Change this to your search term
case_sensitive = False  # Set to True for case-sensitive search

print(f"üîç Searching for '{search_term}'...\n")
start_time = time.time()
# Use the configured worker count rather than a hard-coded value so that
# changing NUM_WORKERS in the configuration cell actually takes effect here.
results = search_in_files_parallel(files_without_ext, search_term, case_sensitive, num_workers=NUM_WORKERS)
search_time = time.time() - start_time

print(f"\n‚úÖ Found {len(results)} matches in {search_time:.2f}s")
# Guard against a zero elapsed time (e.g. empty file list on a coarse clock).
files_per_second = len(files_without_ext) / search_time if search_time > 0 else 0
print(f"Searched {len(files_without_ext)} files at ~{files_per_second:.0f} files/second\n")

display_results_html(results, limit=50)

In [None]:
# Advanced regex search example
# The pattern is a raw string so the backslashes reach the regex engine
# unchanged; it is matched case-sensitively because case_sensitive is True.
search_term = r"def\s+\w+\("  # Find function definitions
case_sensitive = True

print(f"üîç Searching with regex: '{search_term}'...\n")
start_time = time.time()
# No num_workers argument: the function's default worker count is used.
results = search_in_files_parallel(files_without_ext, search_term, case_sensitive)
search_time = time.time() - start_time

print(f"\n‚úÖ Found {len(results)} matches in {search_time:.2f}s\n")
# Only the first 50 matches are rendered; the rest are summarised.
display_results_html(results, limit=50)

In [None]:
# Performance comparison: single vs parallel
# NOTE: the single-threaded pass runs first and may warm file caches, which
# can make the parallel pass look faster than it would be on cold files.
import time

test_search = "class"
test_files = files_without_ext[:50]  # Test with first 50 files

print(f"üìä Performance Test: Searching for '{test_search}' in {len(test_files)} files\n")

# Single threaded (slow)
print("‚è±Ô∏è  Single-threaded search...")
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# short intervals; time.time() can jump and has coarser resolution.
start = time.perf_counter()
single_results = []
flags = re.IGNORECASE
for f in test_files:
    single_results.extend(search_in_file(f, test_search, False, flags))
single_time = time.perf_counter() - start
print(f"   Time: {single_time:.2f}s")

# Multi-threaded (fast) - benchmark the configured worker count so the
# numbers reflect what the real searches use.
print(f"\n‚ö° Parallel search ({NUM_WORKERS} workers)...")
start = time.perf_counter()
parallel_results = search_in_files_parallel(test_files, test_search, False, num_workers=NUM_WORKERS)
parallel_time = time.perf_counter() - start
print(f"   Time: {parallel_time:.2f}s")

# Avoid ZeroDivisionError when the parallel run is too fast to measure
# (for example when there are no files to search).
if parallel_time > 0:
    print(f"\nüìà Speedup: {single_time/parallel_time:.1f}x faster with parallel processing!")
else:
    print("\nüìà Speedup: n/a (parallel run finished too quickly to measure)")
print(f"   Found {len(parallel_results)} matches")