# ENV SETUP

In [2]:
import json
import os
from dotenv import load_dotenv
from tavily import TavilyClient
import requests
from bs4 import BeautifulSoup
import json
import base64
import shutil
from pathlib import Path
from mistralai import Mistral, DocumentURLChunk
from mistralai.models import OCRResponse

# Load variables from a local .env file into the process environment
# (python-dotenv), so the API keys below can be read with os.environ.
load_dotenv()

# Tavily client used by the crawl / extract cells that follow.
tavily_api_key = os.environ.get("TAVILY_API_KEY")
client = TavilyClient(tavily_api_key)

# EXTRACT BLOG POSTS

In [2]:
# Crawl the interviewing.io blog and keep the raw crawl result on disk.
# Category and pagination listing pages are excluded by path pattern.
response = client.crawl(
    url="https://interviewing.io/blog",
    instructions="Get all blog posts",
    include_images=True,
    exclude_paths=["/category/.*", "/page/.*"],
)

# Serialise first, then write in one go (same JSON text as json.dump).
Path('exports/blog_posts/blog_posts.json').write_text(
    json.dumps(response, indent=2)
)

# CLEAN BLOG POSTS

In [3]:
# Build exports/blog_posts/blog_posts.md from the crawled JSON.
#
# Paragraphs are compared with all whitespace removed. Any paragraph that
# occurs more than once across the whole crawl (within one article or across
# several) is dropped, which strips text repeated on every page.
from collections import Counter

with open('exports/blog_posts/blog_posts.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Count every whitespace-normalised paragraph BEFORE opening the output
# file, so a failure here cannot leave a truncated Markdown file behind.
paragraph_counts = Counter()
for result in data.get('results', []):
    # Guard against a missing / null raw_content instead of crashing on .split
    content = result.get('raw_content') or ''
    for paragraph in content.split('\n\n'):
        cleaned = ''.join(paragraph.split())
        if cleaned:
            paragraph_counts[cleaned] += 1

# utf-8 explicitly: the article text is not guaranteed to be ASCII and the
# platform default encoding may not be able to represent it.
with open('exports/blog_posts/blog_posts.md', 'w', encoding='utf-8') as f:
    for result in data.get('results', []):
        content = result.get('raw_content') or ''
        if not content:
            continue
        url = result.get('url', '')

        first_h1 = True
        first_paragraph = True
        for paragraph in content.split('\n\n'):
            cleaned = ''.join(paragraph.split())
            # Keep only non-empty paragraphs that are unique in the crawl
            if not cleaned or paragraph_counts[cleaned] != 1:
                continue

            # The first unique paragraph of each article is always discarded
            # (presumably crawler preamble - TODO confirm this is intended).
            if first_paragraph:
                first_paragraph = False
                continue

            stripped = paragraph.strip()
            if first_h1 and stripped.startswith('# '):
                # Turn the first H1 into a heading that links to the article.
                # Only the leading '# ' marker is removed; str.replace would
                # also delete any '# ' inside the title (e.g. "C# vs ...").
                title = stripped[2:].strip()
                f.write(f'# [{title}]({url})\n\n')
                first_h1 = False
                continue

            # If the paragraph holds an image with a site-relative path,
            # make every site-relative target in it absolute.
            if '![' in stripped and '](/' in stripped:
                stripped = stripped.replace('](/', '](https://interviewing.io/')
            f.write(stripped + '\n\n')

        f.write('\n')  # Add spacing between articles

# COMPANY GUIDES

In [None]:
# Scrape the topics page for links to the per-company interview guides.
url = "https://interviewing.io/topics#companies"
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops us from parsing an HTTP error page as if it were
# the real listing.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# Find all links containing "Interview process & questions"
interview_links = []
for link in soup.find_all('a'):
    if "Interview process & questions" not in link.text:
        continue
    href = link.get('href')
    if not href:
        continue
    # Prepend domain if href is relative path
    if href.startswith('/'):
        href = f"https://interviewing.io{href}"
    # Remove hash and everything after
    href = href.split('#')[0]
    # Skip fragment-only links (now empty) and keep each URL once: a guide
    # that is extracted twice would have every paragraph counted twice and
    # then be removed entirely by the duplicate-paragraph filter downstream.
    if href and href not in interview_links:
        interview_links.append(href)

print("Found interview process links:")
for link in interview_links:
    print(link)

Found interview process links:
https://interviewing.io/guides/hiring-process/google
https://interviewing.io/guides/hiring-process/meta-facebook
https://interviewing.io/guides/hiring-process/amazon
https://interviewing.io/guides/hiring-process/microsoft
https://interviewing.io/guides/hiring-process/netflix
https://interviewing.io/guides/hiring-process/apple
https://interviewing.io/affirm-interview-questions
https://interviewing.io/airbnb-interview-questions
https://interviewing.io/anduril-interview-questions
https://interviewing.io/anthropic-interview-questions
https://interviewing.io/atlassian-interview-questions
https://interviewing.io/block-interview-questions
https://interviewing.io/bloomberg-interview-questions
https://interviewing.io/capital-one-interview-questions
https://interviewing.io/coinbase-interview-questions
https://interviewing.io/databricks-interview-questions
https://interviewing.io/datadog-interview-questions
https://interviewing.io/doordash-interview-questions
https:

In [8]:
# Extract the full content of every company guide through Tavily.
# URLs are sent in batches of 20 (presumably the per-request limit of the
# extract endpoint - TODO confirm against the Tavily documentation).
all_responses = []
failed_results = []
for i in range(0, len(interview_links), 20):
    batch = interview_links[i:i+20]
    response = client.extract(
        urls=batch,
        extract_depth="advanced",
        include_images=True
    )
    all_responses.extend(response.get('results', []))
    # Pages Tavily could not extract are reported separately; collect them
    # so that missing guides are visible instead of silently dropped.
    failed_results.extend(response.get('failed_results', []))

if failed_results:
    print(f"Tavily could not extract {len(failed_results)} URL(s):")
    for failure in failed_results:
        print(failure)

# Same top-level shape as a crawl response, so the cleaning cell can read it
final_response = {'results': all_responses}

with open('exports/company_guides/company_guides.json', 'w', encoding='utf-8') as f:
    json.dump(final_response, f, indent=2)

In [14]:
# Build exports/company_guides/company_guides.md from the extracted JSON.
#
# Paragraphs are compared with all whitespace removed. Any paragraph that
# occurs more than once across all guides (within one guide or across
# several) is dropped, which strips text repeated on every page.
from collections import Counter

with open('exports/company_guides/company_guides.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Count every whitespace-normalised paragraph BEFORE opening the output
# file, so a failure here cannot leave a truncated Markdown file behind.
paragraph_counts = Counter()
for result in data.get('results', []):
    # Guard against a missing / null raw_content instead of crashing on .split
    content = result.get('raw_content') or ''
    for paragraph in content.split('\n\n'):
        cleaned = ''.join(paragraph.split())
        if cleaned:
            paragraph_counts[cleaned] += 1

# utf-8 explicitly: the guide text is not guaranteed to be ASCII and the
# platform default encoding may not be able to represent it.
with open('exports/company_guides/company_guides.md', 'w', encoding='utf-8') as f:
    for result in data.get('results', []):
        content = result.get('raw_content') or ''
        if not content:
            continue
        url = result.get('url', '')

        first_h1 = True
        first_paragraph = True
        for paragraph in content.split('\n\n'):
            cleaned = ''.join(paragraph.split())
            # Keep only non-empty paragraphs that are unique across guides
            if not cleaned or paragraph_counts[cleaned] != 1:
                continue

            # The first unique paragraph of each guide is always discarded
            # (presumably extraction preamble - TODO confirm this is intended).
            if first_paragraph:
                first_paragraph = False
                continue

            stripped = paragraph.strip()
            if first_h1 and stripped.startswith('# '):
                # Turn the first H1 into a heading that links to the guide.
                # Only the leading '# ' marker is removed; str.replace would
                # also delete any '# ' inside the title (e.g. "C# vs ...").
                title = stripped[2:].strip()
                f.write(f'# [{title}]({url})\n\n')
                first_h1 = False
                continue

            # Make site-relative link and image targets absolute
            # (a no-op when the paragraph has none).
            stripped = stripped.replace('](/', '](https://interviewing.io/')
            f.write(stripped + '\n\n')

        f.write('\n')  # Add spacing between articles

# INTERVIEW GUIDES

In [10]:
# Scrape the "learn" page for links to the long-form interview guides.
url = "https://interviewing.io/learn#interview-guides"
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops us from parsing an HTTP error page as if it were
# the real listing.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")

# Exact class string of the grid that holds the guide cards. This is tied to
# the page's current styling classes, so it will stop matching if the site's
# markup changes.
target_class = "col-span-2 mb-14 mt-6 grid grid-cols-1 gap-2 sm:grid-cols-2 sm:gap-4 lg:grid-cols-2 lg:gap-6"

# Find the target div
target_div = soup.find("div", class_=target_class)

# Extract all hrefs within that div
hrefs = []
if target_div:
    for a_tag in target_div.find_all("a", href=True):
        href = a_tag["href"]
        if href.startswith('/'):
            href = 'https://interviewing.io' + href
        # Keep each URL once: a guide that is extracted twice would have
        # every paragraph counted twice and then be removed entirely by the
        # duplicate-paragraph filter downstream.
        if href and href not in hrefs:
            hrefs.append(href)

# Display the results
for link in hrefs:
    print(link)

https://interviewing.io/guides/amazon-leadership-principles
https://interviewing.io/guides/system-design-interview
https://interviewing.io/guides/hiring-process


In [12]:
# Extract the full content of every interview guide through Tavily.
# URLs are sent in batches of 20 (presumably the per-request limit of the
# extract endpoint - TODO confirm against the Tavily documentation).
all_responses = []
failed_results = []
for i in range(0, len(hrefs), 20):
    batch = hrefs[i:i+20]
    response = client.extract(
        urls=batch,
        extract_depth="advanced",
        include_images=True
    )
    all_responses.extend(response.get('results', []))
    # Pages Tavily could not extract are reported separately; collect them
    # so that missing guides are visible instead of silently dropped.
    failed_results.extend(response.get('failed_results', []))

if failed_results:
    print(f"Tavily could not extract {len(failed_results)} URL(s):")
    for failure in failed_results:
        print(failure)

# Same top-level shape as a crawl response, so the cleaning cell can read it
final_response = {'results': all_responses}

with open('exports/interview_guides/interview_guides.json', 'w', encoding='utf-8') as f:
    json.dump(final_response, f, indent=2)

In [13]:
# Build exports/interview_guides/interview_guides.md from the extracted JSON.
#
# Paragraphs are compared with all whitespace removed. Any paragraph that
# occurs more than once across all guides (within one guide or across
# several) is dropped, which strips text repeated on every page.
from collections import Counter

with open('exports/interview_guides/interview_guides.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Count every whitespace-normalised paragraph BEFORE opening the output
# file, so a failure here cannot leave a truncated Markdown file behind.
paragraph_counts = Counter()
for result in data.get('results', []):
    # Guard against a missing / null raw_content instead of crashing on .split
    content = result.get('raw_content') or ''
    for paragraph in content.split('\n\n'):
        cleaned = ''.join(paragraph.split())
        if cleaned:
            paragraph_counts[cleaned] += 1

# utf-8 explicitly: the guide text is not guaranteed to be ASCII and the
# platform default encoding may not be able to represent it.
with open('exports/interview_guides/interview_guides.md', 'w', encoding='utf-8') as f:
    for result in data.get('results', []):
        content = result.get('raw_content') or ''
        if not content:
            continue
        url = result.get('url', '')

        first_h1 = True
        first_paragraph = True
        for paragraph in content.split('\n\n'):
            cleaned = ''.join(paragraph.split())
            # Keep only non-empty paragraphs that are unique across guides
            if not cleaned or paragraph_counts[cleaned] != 1:
                continue

            # The first unique paragraph of each guide is always discarded
            # (presumably extraction preamble - TODO confirm this is intended).
            if first_paragraph:
                first_paragraph = False
                continue

            stripped = paragraph.strip()
            if first_h1 and stripped.startswith('# '):
                # Turn the first H1 into a heading that links to the guide.
                # Only the leading '# ' marker is removed; str.replace would
                # also delete any '# ' inside the title (e.g. "C# vs ...").
                title = stripped[2:].strip()
                f.write(f'# [{title}]({url})\n\n')
                first_h1 = False
                continue

            # Make site-relative link and image targets absolute
            # (a no-op when the paragraph has none).
            stripped = stripped.replace('](/', '](https://interviewing.io/')
            f.write(stripped + '\n\n')

        f.write('\n')  # Add spacing between articles

In [14]:
# Crawl the DSA category of nilmamano.com and keep the raw crawl result
# on disk as JSON.
response = client.crawl(
    url="https://nilmamano.com/blog/category/dsa",
    include_images=True,
    instructions="Get all blog posts",
)

# Serialise first, then write in one go (same JSON text as json.dump).
Path('exports/nilmamano/nilmamano.json').write_text(
    json.dumps(response, indent=2)
)

In [20]:
# Build exports/nilmamano/nilmamano.md from the crawled JSON.
#
# Code-fence markers (```) are stripped from every paragraph, and paragraphs
# are then compared with all whitespace removed. Any paragraph that occurs
# more than once across the whole crawl is dropped, which strips text
# repeated on every page.
from collections import Counter

with open('exports/nilmamano/nilmamano.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Count every normalised paragraph BEFORE opening the output file, so a
# failure here cannot leave a truncated Markdown file behind.
#
# The fence markers are removed here as well as in the write pass. Counting
# the raw text while writing the fence-stripped text produces different keys,
# which silently drops every paragraph that contained a fence (i.e. the first
# and last chunk of each code block).
paragraph_counts = Counter()
for result in data.get('results', []):
    # Guard against a missing / null raw_content instead of crashing on .split
    content = result.get('raw_content') or ''
    for paragraph in content.split('\n\n'):
        cleaned = ''.join(paragraph.replace('```', '').split())
        if cleaned:
            paragraph_counts[cleaned] += 1

# utf-8 explicitly: the article text is not guaranteed to be ASCII and the
# platform default encoding may not be able to represent it.
with open('exports/nilmamano/nilmamano.md', 'w', encoding='utf-8') as f:
    for result in data.get('results', []):
        content = result.get('raw_content') or ''
        if not content:
            continue
        url = result.get('url', '')

        # Split content into paragraphs and drop the code-fence markers
        paragraphs = [p.replace('```', '') for p in content.split('\n\n')]

        first_h1 = True
        first_paragraph = True
        for paragraph in paragraphs:
            cleaned = ''.join(paragraph.split())
            # Keep only non-empty paragraphs that are unique in the crawl
            if not cleaned or paragraph_counts[cleaned] != 1:
                continue

            # The first unique paragraph of each article is always discarded
            # (presumably crawler preamble - TODO confirm this is intended).
            if first_paragraph:
                first_paragraph = False
                continue

            stripped = paragraph.strip()
            if first_h1 and stripped.startswith('# '):
                # Turn the first H1 into a heading that links to the article.
                # Only the leading '# ' marker is removed; str.replace would
                # also delete any '# ' inside the title.
                title = stripped[2:].strip()
                f.write(f'# [{title}]({url})\n\n')
                first_h1 = False
                continue

            # Make site-relative link and image targets absolute by adding
            # the nilmamano.com domain (a no-op when the paragraph has none).
            stripped = stripped.replace('](/', '](https://nilmamano.com/')
            f.write(stripped + '\n\n')

        f.write('\n')  # Add spacing between articles

In [3]:
# Mistral client used for OCR.
# NOTE: this rebinds the name `client`, which the crawl/extract cells earlier
# in this file use for the Tavily client. Re-running one of those cells after
# this one will call Mistral instead - re-run the Tavily setup cell first.
client = Mistral(api_key=os.getenv("MISTRAL_API_KEY"))

# Path configuration.
# All three roles currently point at the same folder: PDFs are read from,
# "moved" to, and converted under exports/chapters.
INPUT_DIR = Path("exports/chapters")        # Folder the PDFs to process are read from
DONE_DIR = Path("exports/chapters")         # Folder processed PDFs are moved to
OUTPUT_ROOT_DIR = Path("exports/chapters")  # Root folder for conversion results

# Ensure directories exist. parents=True so this also works on a fresh
# checkout where the parent "exports" folder has not been created yet.
for directory in (INPUT_DIR, DONE_DIR, OUTPUT_ROOT_DIR):
    directory.mkdir(parents=True, exist_ok=True)

def replace_images_in_markdown(markdown_str: str, images_dict: dict) -> str:
    """
    Inline images into a markdown string.

    For every ``name -> data`` pair in ``images_dict``, each occurrence of the
    placeholder ``![name](name)`` is rewritten to ``![name](data)``, i.e. the
    link target is swapped for the mapped value (a base64 string when called
    with OCR image data). Text that contains no such placeholder is returned
    unchanged.
    """
    result = markdown_str
    for image_id in images_dict:
        placeholder = "![{0}]({0})".format(image_id)
        inlined = "![{}]({})".format(image_id, images_dict[image_id])
        result = result.replace(placeholder, inlined)
    return result

def get_combined_markdown(ocr_response: OCRResponse) -> str:
    """
    Merge the per-page markdown of an OCR response into one string.

    For each page, an ``image id -> base64 data`` mapping is built from the
    page's images and applied to the page markdown with
    ``replace_images_in_markdown``. The resulting pages are joined with a
    blank line between them.
    """
    rendered_pages = [
        replace_images_in_markdown(
            page.markdown,
            {image.id: image.image_base64 for image in page.images},
        )
        for page in ocr_response.pages
    ]
    return "\n\n".join(rendered_pages)

In [4]:
def process_pdf(pdf_path: Path):
    """
    Run Mistral OCR on a single PDF and write the results to disk.

    Under ``OUTPUT_ROOT_DIR / <pdf stem>`` this creates:
      - ``ocr_response.json``: the raw OCR response
      - ``images/``: every image of the document, decoded from base64
      - ``output.md``: the markdown of all pages, with each image reference
        rewritten to a ``![[file name]]`` wikilink

    Args:
        pdf_path: Path of the PDF to process.

    Any error from reading the file, the Mistral API, or writing the outputs
    propagates to the caller.
    """
    # PDF base name
    pdf_base = pdf_path.stem
    print(f"Processing {pdf_path.name} ...")

    # Output folders. parents=True creates output_dir (and OUTPUT_ROOT_DIR,
    # should it be missing) together with the images subfolder.
    output_dir = OUTPUT_ROOT_DIR / pdf_base
    images_dir = output_dir / "images"
    images_dir.mkdir(parents=True, exist_ok=True)

    # PDF -> OCR: upload the file, get a signed URL for it, run OCR on that URL
    uploaded_file = client.files.upload(
        file={
            "file_name": pdf_path.name,
            "content": pdf_path.read_bytes(),
        },
        purpose="ocr"
    )

    # NOTE(review): expiry=1 is presumably expressed in hours - confirm
    # against the Mistral files API.
    signed_url = client.files.get_signed_url(file_id=uploaded_file.id, expiry=1)

    ocr_response = client.ocr.process(
        document=DocumentURLChunk(document_url=signed_url.url),
        model="mistral-ocr-latest",
        include_image_base64=True
    )

    # Save OCR in JSON
    # (in case something fails it could be reused, but it is not used in the rest of the code)
    ocr_json_path = output_dir / "ocr_response.json"
    with open(ocr_json_path, "w", encoding="utf-8") as json_file:
        json.dump(ocr_response.model_dump(), json_file, indent=4, ensure_ascii=False)
    print(f"OCR response saved in {ocr_json_path}")

    # OCR -> Markdown prepared for Obsidian
    # - That is, the base64 encoded images are written out as real image
    #   files in a subfolder and the markdown links to them instead.

    # Images are numbered across the whole document, not per page
    global_counter = 1
    updated_markdown_pages = []

    for page in ocr_response.pages:
        updated_markdown = page.markdown
        for image_obj in page.images:

            # base64 to image
            base64_str = image_obj.image_base64
            if not base64_str:
                # No payload for this image: leave its markdown reference
                # untouched rather than crash on .startswith below.
                continue
            if base64_str.startswith("data:"):
                # Drop the "data:<mime>;base64," prefix of a data URI
                base64_str = base64_str.split(",", 1)[1]
            image_bytes = base64.b64decode(base64_str)

            # Keep the extension of the image id, defaulting to .png
            ext = Path(image_obj.id).suffix or ".png"
            new_image_name = f"{pdf_base}_img_{global_counter}{ext}"
            global_counter += 1

            # save in subfolder
            image_output_path = images_dir / new_image_name
            with open(image_output_path, "wb") as image_file:
                image_file.write(image_bytes)

            # Update markdown with wikilink: ![[image_name]]
            updated_markdown = updated_markdown.replace(
                f"![{image_obj.id}]({image_obj.id})",
                f"![[{new_image_name}]]"
            )
        updated_markdown_pages.append(updated_markdown)

    final_markdown = "\n\n".join(updated_markdown_pages)
    output_markdown_path = output_dir / "output.md"
    with open(output_markdown_path, "w", encoding="utf-8") as md_file:
        md_file.write(final_markdown)
    print(f"Markdown generated in {output_markdown_path}")

In [None]:
# Process all PDFs in INPUT_DIR
# - Important to be careful with the number of PDFs, as the Mistral API has a usage limit
#   and it could cause errors by exceeding the limit.

pdf_files = list(INPUT_DIR.glob("*.pdf"))      # Get all PDFs in INPUT_DIR. So make sure to place the PDFs there.
if not pdf_files:
    # Nothing in INPUT_DIR: offer to process a single PDF given by path
    response = input("No PDFs to process. Pick them manually? y/n: ")
    if response.strip().lower() == "y":
        pdf_files = [Path(input("Enter the path to the PDF: "))]
    else:
        print("Exiting...")
        exit()

for pdf_file in pdf_files:
    # Top-level boundary: report the failure and carry on with the next PDF
    try:
        process_pdf(pdf_file)
    except Exception as e:
        print(f"Error processing {pdf_file.name}: {e}")
        continue

    # A PDF that already sits in DONE_DIR (the case whenever INPUT_DIR and
    # DONE_DIR are the same folder) must not be moved: shutil.move raises
    # "Destination path ... already exists" when the target file is present.
    if pdf_file.resolve().parent == DONE_DIR.resolve():
        print(f"{pdf_file.name} is already in {DONE_DIR}")
        continue

    # Reported separately so a failed move is not mistaken for a failed OCR run
    try:
        shutil.move(str(pdf_file), DONE_DIR)
        print(f"{pdf_file.name} moved to {DONE_DIR}")
    except (shutil.Error, OSError) as e:
        print(f"Processed {pdf_file.name} but could not move it to {DONE_DIR}: {e}")

Processing Sneak Peek BCTCI - First 7 Chapters - What's Broken About Coding Interviews, What Recruiters Won't Tell You, How to Get In the Door, and more.pdf ...
OCR response saved in exports/chapters/Sneak Peek BCTCI - First 7 Chapters - What's Broken About Coding Interviews, What Recruiters Won't Tell You, How to Get In the Door, and more/ocr_response.json
Markdown generated in exports/chapters/Sneak Peek BCTCI - First 7 Chapters - What's Broken About Coding Interviews, What Recruiters Won't Tell You, How to Get In the Door, and more/output.md
Sneak Peek BCTCI - First 7 Chapters - What's Broken About Coding Interviews, What Recruiters Won't Tell You, How to Get In the Door, and more.pdf moved to exports/chapters
