In [33]:
import sys
import os
import requests

# Directory the notebook is run from. `__file__` is normally not defined in a
# notebook kernel, so the *string* "__file__" is resolved against the current
# working directory; its dirname is therefore that working directory.
current_dir = os.path.dirname(os.path.abspath("__file__"))

# config.py (which provides TXYZ_API_KEY) is expected in the
# "components-generator" folder inside the notebook directory.
# NOTE(review): this line previously joined on `notebook_dir`, a name no cell
# defines, so a fresh kernel failed with NameError. `current_dir` is assumed to
# be the intended base -- confirm against the repository layout.
config_path = os.path.join(current_dir, "components-generator")
if config_path not in sys.path:  # avoid piling up duplicates when the cell is re-run
    sys.path.append(config_path)
from config import TXYZ_API_KEY

import json

In [34]:
# Endpoint that process_file() posts each article query to.
API_URL = "https://api.txyz.ai/v1/search/scholar"

# Project ("vocab") root: two directory levels above the notebook directory.
ROOT_DIR = os.path.abspath(os.path.join(current_dir, os.pardir, os.pardir))

# Input markdown articles and the JSON file results are accumulated in,
# both resolved against ROOT_DIR.
ARTICLES_PATH = os.path.join(ROOT_DIR, "src/content/articles")
OUTPUT_PATH = os.path.join(ROOT_DIR, "src/data/txyz-papers.json")

In [35]:
def load_existing_results(output_path=None):
    """Load previously saved results and the set of slugs they cover.

    Args:
        output_path (str | None): JSON file to read. Defaults to the
            module-level ``OUTPUT_PATH``.

    Returns:
        tuple[list, set]: The list stored in the file and the set of ``slug``
        values found in its entries. ``([], set())`` is returned when the file
        does not exist, cannot be read or parsed, or does not hold a JSON list.
        The function never raises; problems are printed instead.
    """
    try:
        path = OUTPUT_PATH if output_path is None else output_path
        if not os.path.exists(path):
            return [], set()
        with open(path, 'r', encoding='utf-8') as f:
            existing_results = json.load(f)
    except Exception as e:
        print(f"Error loading existing results: {str(e)}")
        return [], set()

    if not isinstance(existing_results, list):
        print(
            "Error loading existing results: expected a JSON list, "
            f"got {type(existing_results).__name__}"
        )
        return [], set()

    # Skip malformed entries instead of failing on them: one entry without a
    # 'slug' used to discard every loaded result, after which the caller would
    # overwrite the whole file with only the new batch.
    processed_slugs = {
        item['slug']
        for item in existing_results
        if isinstance(item, dict) and isinstance(item.get('slug'), str)
    }
    return existing_results, processed_slugs

def get_next_unprocessed_files(processed_slugs, n=1000, articles_path=None):
    """
    Get the next n unprocessed markdown files. If n is greater than the number
    of remaining files, all remaining files are returned.

    A file counts as unprocessed when the slug read from it is not in
    ``processed_slugs``. Files without any line containing 'slug:' are ignored.
    Files are considered in sorted file-name order.

    Args:
        processed_slugs (set): Set of already processed slugs
        n (int): Maximum number of files to return, defaults to 1000.
            Values <= 0 return no files.
        articles_path (str | None): Directory to scan. Defaults to the
            module-level ``ARTICLES_PATH``.

    Returns:
        list: List of file paths to process
        int: Total number of remaining unprocessed files (before taking n)
    """
    try:
        directory = ARTICLES_PATH if articles_path is None else articles_path
        unprocessed = []

        # Single pass: every candidate is read once, then the batch is sliced
        # off the front of the full list of unprocessed files.
        for file in sorted(os.listdir(directory)):
            if not file.endswith('.md'):
                continue

            file_path = os.path.join(directory, file)
            try:
                slug = _read_slug(file_path)
                if slug is not None and slug not in processed_slugs:
                    unprocessed.append(file_path)
            except Exception as e:
                print(f"Error reading file {file}: {str(e)}")
                continue

        remaining_count = len(unprocessed)
        if remaining_count < n:
            print(f"Note: Requested {n} files but only {remaining_count} unprocessed files remain.")

        # Slicing (rather than stopping when the count *equals* n) also
        # behaves sensibly for n <= 0.
        files_to_process = unprocessed[:max(n, 0)]
        return files_to_process, remaining_count

    except Exception as e:
        print(f"Error accessing directory: {str(e)}")
        return [], 0


def _read_slug(file_path):
    """Return the text after 'slug:' on the first line containing it, or None."""
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            if 'slug:' in line:
                return line.split('slug:')[1].strip()
    return None

def get_title_and_slug(file_path):
    """Extract title and slug from a markdown file.

    Title: the first line whose stripped text starts with 'title:' wins, and
    everything after that first 'title:' is the value. If no such line exists,
    the first line that merely contains 'title:' is used as a fallback.
    This keeps keys such as 'subtitle:' from replacing the real title and
    keeps titles that themselves contain 'title:' intact.

    Slug: the text after 'slug:' on the first line containing it. This is
    deliberately the same rule get_next_unprocessed_files() applies, so the
    slug stored with a result matches the slug used to detect processed files.

    Args:
        file_path (str): Path of the markdown file to read.

    Returns:
        tuple[str, str]: ``(title, slug)``; either may be '' when not found.
        ``('', '')`` is returned (and the error printed) if the file cannot
        be read.
    """
    try:
        title = ''
        loose_title = ''  # fallback: 'title:' found somewhere inside a line
        slug = ''
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                stripped = line.strip()
                if not title and stripped.startswith('title:'):
                    title = stripped[len('title:'):].strip()
                elif not loose_title and 'title:' in line:
                    loose_title = line.split('title:', 1)[1].strip()

                if not slug and 'slug:' in line:
                    slug = line.split('slug:')[1].strip()

                if title and slug:
                    break
        return title or loose_title, slug
    except Exception as e:
        print(f"Error reading file {os.path.basename(file_path)}: {str(e)}")
        return '', ''

def process_file(file_path, timeout=30):
    """Query the scholar-search API for one article and return its result.

    Args:
        file_path (str): Markdown file to read the title and slug from.
        timeout (float): Seconds to wait for the API before giving up, so a
            stalled connection cannot hang the whole batch.

    Returns:
        dict: ``{"slug": <slug>, "search_results": <decoded JSON>}`` on
        success. On any failure (missing title/slug, network error, timeout,
        non-2xx HTTP status, undecodable body) an error record
        ``{"slug": <file name>, "error": <message>}`` is returned instead.
        The function never raises.
    """
    try:
        title, slug = get_title_and_slug(file_path)
        if not title or not slug:
            return {"slug": os.path.basename(file_path), "error": "Could not extract title or slug"}

        headers = {"Authorization": f"Bearer {TXYZ_API_KEY}"}
        query = {"query": f"most prominent papers on {title} related to AI or machine learning"}

        response = requests.post(API_URL, headers=headers, params=query, timeout=timeout)
        # Without this, an error response with a JSON body (auth failure, rate
        # limit, ...) would be stored under the real slug as if it were search
        # results, and the file would never be retried.
        response.raise_for_status()
        return {"slug": slug, "search_results": response.json()}
    except Exception as e:
        print(f"Error processing {os.path.basename(file_path)}: {str(e)}")
        return {"slug": os.path.basename(file_path), "error": str(e)}

def ensure_output_directory():
    """Make sure the directory that will contain OUTPUT_PATH is present.

    Returns:
        bool: True when the directory path already exists or was created;
        False when something went wrong (the error is printed).
    """
    try:
        target_dir = os.path.dirname(OUTPUT_PATH)
        if os.path.exists(target_dir):
            return True
        # Missing: create it along with any missing parent directories.
        os.makedirs(target_dir, exist_ok=True)
    except Exception as e:
        print(f"Error creating output directory: {str(e)}")
        return False
    return True

def main(batch_size=1000):
    """Process the next batch of unprocessed articles and persist the results.

    Loads the results saved so far, finds up to ``batch_size`` articles whose
    slug is not among them, calls process_file() on each, and writes the
    combined list back to OUTPUT_PATH.

    The output is written to a temporary file and then moved into place, so a
    failed write cannot corrupt previously accumulated results. If the run is
    interrupted (KeyboardInterrupt), the results collected so far are saved
    before the interrupt is re-raised.

    Args:
        batch_size (int): Maximum number of files to process in this run.
    """
    # Load existing results
    existing_results, processed_slugs = load_existing_results()
    print(f"Found {len(existing_results)} previously processed files.")

    # Check output directory
    if not ensure_output_directory():
        print("Cannot create output directory. Exiting.")
        return

    # Get and process next batch of files
    files_to_process, remaining_count = get_next_unprocessed_files(processed_slugs, batch_size)

    if not files_to_process:
        print("No new files to process.")
        return

    print(f"Processing {len(files_to_process)} files. {remaining_count - len(files_to_process)} files will remain.")

    new_results = []
    interrupted = False
    try:
        for file_path in files_to_process:
            try:
                result = process_file(file_path)
                new_results.append(result)
                print(f"Processed: {os.path.basename(file_path)}")
            except Exception as e:
                print(f"Error on file {os.path.basename(file_path)}: {str(e)}")
    except KeyboardInterrupt:
        # Each result costs an API call; keep what has been collected.
        interrupted = True
        print("\nInterrupted; saving the results collected so far.")

    # Combine and save results
    all_results = existing_results + new_results
    tmp_path = OUTPUT_PATH + ".tmp"
    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(all_results, f, indent=2)
        # Swap the finished file into place in a single step.
        os.replace(tmp_path, OUTPUT_PATH)
        print(f"\nResults saved to {OUTPUT_PATH}")
        print(f"This batch: {len(new_results)} files")
        print(f"Total processed: {len(all_results)}")
        print(f"Remaining unprocessed: {remaining_count - len(new_results)}")
    except Exception as e:
        print(f"Error saving results: {str(e)}")
        # Best-effort cleanup of a partially written temporary file.
        try:
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
        except OSError:
            pass

    if interrupted:
        raise KeyboardInterrupt

In [36]:
# Entry point: run one batch when this cell is executed (or when the code is
# run as a script); importing the code as a module does not trigger a run.
if __name__ == "__main__":
    main()

Found 15 previously processed files.
Note: Requested 1000 files but only 920 unprocessed files remain.
Processing 920 files. 0 files will remain.
Processed: active-inference.md
Processed: actor-critic-models.md
Processed: adam-adaptive-moment-estimation.md
Processed: adapter-layer.md
Processed: adapter.md
Processed: adaptive-dual-scale-denoising.md
Processed: adaptive-problem-solving.md
Processed: adversarial-attacks.md
Processed: adversarial-debiasing.md
Processed: adversarial-instructions.md
Processed: aeo-answer-engine-optimization.md
Processed: affective-computation.md
Processed: agent-to-agent-interaction.md
Processed: agent.md
Processed: agentic-ai-systems.md
Processed: agglomerative-clustering.md
Processed: agi-artificial-general-intelligence.md
Processed: ai-auditing.md
Processed: ai-effect.md
Processed: ai-failure-modes.md
Processed: ai-governance.md
Processed: ai-safety.md
Processed: ai-watchdog.md
Processed: ai-winter.md
Processed: ait-algorithmic-information-theory.md
Proce