# ENV SETUP

In [1]:
import json
import os
from dotenv import load_dotenv
from tavily import TavilyClient

# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

# Shared Tavily client; the API key is looked up in the environment.
client = TavilyClient(os.environ.get("TAVILY_API_KEY"))

# EXTRACT BLOG POSTS

In [2]:
# Crawl the blog, skipping category and pagination listing pages.
response = client.crawl(
    url="https://interviewing.io/blog",
    instructions="Get all blog posts",
    exclude_paths=["/category/.*", "/page/.*"],
    include_images=True,
)

# Make sure the export directory exists; open(..., 'w') does not create
# missing parent directories and would fail on a fresh checkout.
os.makedirs('exports/blog_posts', exist_ok=True)

# Persist the raw crawl response so the cleaning step can be re-run
# without crawling again.
with open('exports/blog_posts/blog_posts.json', 'w', encoding='utf-8') as f:
    json.dump(response, f, indent=2)

# CLEAN BLOG POSTS

In [3]:
# Load the raw crawl output written by the extraction step.
with open('exports/blog_posts/blog_posts.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

results = data.get('results', [])


def _split_paragraphs(result):
    """Return the blank-line-separated paragraphs of one crawl result."""
    # 'raw_content' may be absent or null; treat that as an empty page
    # instead of failing on None.split().
    return (result.get('raw_content') or '').split('\n\n')


def _fingerprint(paragraph):
    """Return the paragraph with all whitespace removed, for comparison."""
    return ''.join(paragraph.split())


# Count how often each (whitespace-insensitive) paragraph occurs across all
# pages. Paragraphs that occur more than once are not written below.
paragraph_counts = {}
for result in results:
    for paragraph in _split_paragraphs(result):
        key = _fingerprint(paragraph)
        if key:
            paragraph_counts[key] = paragraph_counts.get(key, 0) + 1

# Explicit UTF-8 so non-ASCII article text does not depend on the platform's
# default encoding.
with open('exports/blog_posts/blog_posts.md', 'w', encoding='utf-8') as f:
    for result in results:
        url = result.get('url', '')

        first_h1 = True
        first_paragraph = True
        for paragraph in _split_paragraphs(result):
            key = _fingerprint(paragraph)
            # Skip empty paragraphs and anything that is not globally unique.
            if not key or paragraph_counts[key] != 1:
                continue

            # The first unique paragraph of every page is dropped.
            # NOTE(review): presumably this is page chrome preceding the
            # article; if it can be the H1 itself the title is lost - confirm.
            if first_paragraph:
                first_paragraph = False
                continue

            text = paragraph.strip()

            # Turn the first H1 of the page into a heading linked to its URL.
            if first_h1 and text.startswith('# '):
                # Strip only the leading marker; a title may itself contain
                # '# ' (e.g. "C# vs Java").
                title = text[len('# '):].strip()
                f.write(f'# [{title}]({url})\n\n')
                first_h1 = False
                continue

            # Paragraphs containing an image with a root-relative target get
            # the site domain prepended. Note this rewrites every '](/' in
            # the paragraph, including non-image links.
            if '![' in text and '](/' in text:
                text = text.replace('](/', '](https://interviewing.io/')
            f.write(text + '\n\n')

        f.write('\n')  # Add spacing between articles

In [None]:
# Crawl the per-company interview resources starting from the topics page.
# NOTE(review): the '#companies' fragment is a client-side anchor; confirm
# the crawler treats this the same as the bare /topics URL.
response = client.crawl(
    url="https://interviewing.io/topics#companies",
    instructions="Get all interview resources by company",
    include_images=True,
)

# Make sure the export directory exists; open(..., 'w') does not create
# missing parent directories and would fail on a fresh checkout.
os.makedirs('exports/company_guides', exist_ok=True)

# Persist the raw crawl response for later processing.
with open('exports/company_guides/company_guides.json', 'w', encoding='utf-8') as f:
    json.dump(response, f, indent=2)