# ENV SETUP

In [6]:
import json
import os
from dotenv import load_dotenv
from tavily import TavilyClient
import requests
from bs4 import BeautifulSoup

# Load settings from a local .env file (python-dotenv) before reading the key.
load_dotenv()

# Shared Tavily client; the API key is taken from the TAVILY_API_KEY
# environment variable (None is passed through if it is not set).
client = TavilyClient(api_key=os.environ.get("TAVILY_API_KEY"))

# EXTRACT BLOG POSTS

In [2]:
# Crawl the blog index; the exclude patterns skip category and pagination
# listing paths so that only the remaining pages are returned.
crawl_request = dict(
    url="https://interviewing.io/blog",
    instructions="Get all blog posts",
    exclude_paths=["/category/.*", "/page/.*"],
    include_images=True,
)
response = client.crawl(**crawl_request)

# Keep the raw crawl result on disk so the cleaning step can be re-run
# without calling the API again.
with open('exports/blog_posts/blog_posts.json', 'w') as json_file:
    json.dump(response, json_file, indent=2)

# CLEAN BLOG POSTS

In [3]:
# Convert the crawled blog JSON into one Markdown file.
#
# A paragraph is kept only when its whitespace-free text occurs exactly once
# across all pages (repeated text is presumably shared site chrome such as
# navigation or footers -- confirm). For each page the first kept paragraph is
# dropped and the first H1 becomes a heading that links back to the page URL.
with open('exports/blog_posts/blog_posts.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Single pass over the crawl results: split each page once, remember its
# non-blank paragraphs together with their comparison key, and count how often
# every key occurs across the whole crawl.
pages = []
paragraph_counts = {}
for result in data.get('results', []):
    # raw_content may be missing or null; treat such a page as empty instead
    # of failing on None.split().
    content = result.get('raw_content') or ''
    paragraphs = []
    for paragraph in content.split('\n\n'):
        # Remove all whitespace so formatting differences do not hide duplicates
        cleaned = ''.join(paragraph.split())
        if cleaned:
            paragraphs.append((paragraph, cleaned))
            paragraph_counts[cleaned] = paragraph_counts.get(cleaned, 0) + 1
    pages.append((result.get('url', ''), paragraphs))

# Explicit UTF-8 so non-ASCII article text does not depend on the platform's
# default encoding.
with open('exports/blog_posts/blog_posts.md', 'w', encoding='utf-8') as f:
    for url, paragraphs in pages:
        first_h1 = True
        first_paragraph = True
        for paragraph, cleaned in paragraphs:
            # Only paragraphs that are unique across the whole crawl are written
            if paragraph_counts[cleaned] != 1:
                continue

            # The first unique paragraph of every page is skipped
            if first_paragraph:
                first_paragraph = False
                continue

            text = paragraph.strip()

            # First H1 heading (single '#') becomes the linked article title
            if first_h1 and text.startswith('# '):
                # Drop only the leading marker; a "# " inside the title
                # (e.g. "C# ...") must be preserved.
                title = text[2:]
                f.write(f'# [{title}]({url})\n\n')
                first_h1 = False
                continue

            # Paragraphs containing an image with a relative path get the
            # interviewing.io domain prepended. NOTE: every '](/' target in
            # such a paragraph is rewritten, including non-image links.
            if '![' in paragraph and '](/' in paragraph:
                text = text.replace('](/', '](https://interviewing.io/')
            f.write(text + '\n\n')

        f.write('\n')  # Add spacing between articles

# COMPANY GUIDES

In [None]:
# Collect the company guide URLs linked from the topics page.
url = "https://interviewing.io/topics#companies"
# Without a timeout requests can wait indefinitely on a stalled connection;
# raise_for_status() stops us from parsing an HTTP error page as if it were
# the topics listing.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# Find all links containing "Interview process & questions"
interview_links = []
seen_links = set()
for link in soup.find_all('a'):
    if "Interview process & questions" not in link.text:
        continue
    href = link.get('href')
    if not href:
        continue
    # Prepend domain if href is relative path
    if href.startswith('/'):
        href = f"https://interviewing.io{href}"
    # Remove hash and everything after
    href = href.split('#')[0]
    # Skip fragment-only hrefs (empty after stripping) and repeated URLs.
    # A guide extracted twice would have each of its paragraphs counted twice
    # by the later "appears exactly once" filter and vanish from the output.
    if not href or href in seen_links:
        continue
    seen_links.add(href)
    interview_links.append(href)

print("Found interview process links:")
for link in interview_links:
    print(link)

Found interview process links:
https://interviewing.io/guides/hiring-process/google
https://interviewing.io/guides/hiring-process/meta-facebook
https://interviewing.io/guides/hiring-process/amazon
https://interviewing.io/guides/hiring-process/microsoft
https://interviewing.io/guides/hiring-process/netflix
https://interviewing.io/guides/hiring-process/apple
https://interviewing.io/affirm-interview-questions
https://interviewing.io/airbnb-interview-questions
https://interviewing.io/anduril-interview-questions
https://interviewing.io/anthropic-interview-questions
https://interviewing.io/atlassian-interview-questions
https://interviewing.io/block-interview-questions
https://interviewing.io/bloomberg-interview-questions
https://interviewing.io/capital-one-interview-questions
https://interviewing.io/coinbase-interview-questions
https://interviewing.io/databricks-interview-questions
https://interviewing.io/datadog-interview-questions
https://interviewing.io/doordash-interview-questions
https:

In [8]:
# Extract the guide pages in slices of 20 URLs per request (presumably the
# API's per-call limit -- confirm) and merge the per-batch results.
all_responses = []
batch_starts = range(0, len(interview_links), 20)
for start in batch_starts:
    extracted = client.extract(
        urls=interview_links[start:start + 20],
        extract_depth="advanced",
        include_images=True,
    )
    all_responses += extracted.get('results', [])

# Same top-level shape as a single response, so the cleaning step can read it
# through data.get('results', []).
final_response = {'results': all_responses}

with open('exports/company_guides/company_guides.json', 'w') as json_file:
    json.dump(final_response, json_file, indent=2)

In [14]:
# Convert the extracted company guides JSON into one Markdown file.
#
# A paragraph is kept only when its whitespace-free text occurs exactly once
# across all pages (repeated text is presumably shared site chrome such as
# navigation or footers -- confirm). For each page the first kept paragraph is
# dropped and the first H1 becomes a heading that links back to the page URL.
with open('exports/company_guides/company_guides.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Single pass over the results: split each page once, remember its non-blank
# paragraphs together with their comparison key, and count how often every key
# occurs across all pages.
pages = []
paragraph_counts = {}
for result in data.get('results', []):
    # raw_content may be missing or null; treat such a page as empty instead
    # of failing on None.split().
    content = result.get('raw_content') or ''
    paragraphs = []
    for paragraph in content.split('\n\n'):
        # Remove all whitespace so formatting differences do not hide duplicates
        cleaned = ''.join(paragraph.split())
        if cleaned:
            paragraphs.append((paragraph, cleaned))
            paragraph_counts[cleaned] = paragraph_counts.get(cleaned, 0) + 1
    pages.append((result.get('url', ''), paragraphs))

# Explicit UTF-8 so non-ASCII guide text does not depend on the platform's
# default encoding.
with open('exports/company_guides/company_guides.md', 'w', encoding='utf-8') as f:
    for url, paragraphs in pages:
        first_h1 = True
        first_paragraph = True
        for paragraph, cleaned in paragraphs:
            # Only paragraphs that are unique across all pages are written
            if paragraph_counts[cleaned] != 1:
                continue

            # The first unique paragraph of every page is skipped
            if first_paragraph:
                first_paragraph = False
                continue

            text = paragraph.strip()

            # First H1 heading (single '#') becomes the linked guide title
            if first_h1 and text.startswith('# '):
                # Drop only the leading marker; a "# " inside the title
                # (e.g. "C# ...") must be preserved.
                title = text[2:]
                f.write(f'# [{title}]({url})\n\n')
                first_h1 = False
                continue

            # Add interviewing.io domain to all relative link and image paths
            # (a no-op when the paragraph has no '](/' target).
            text = text.replace('](/', '](https://interviewing.io/')
            f.write(text + '\n\n')

        f.write('\n')  # Add spacing between articles