In [45]:
import re
import pandas as pd


In [159]:
def remove_hyperlinks(chunk: str):
    """
    Flatten Markdown links in a chunk of text.

    Image links ``![alt](url)`` are rewritten as ``<img src="url" alt="alt">``
    and ordinary links ``[text](url)`` are reduced to their visible ``text``.

    Args:
        chunk (str): Markdown text to process.

    Returns:
        str: The text with image links converted to <img> tags and standard
        links replaced by their link text.
    """
    # ![alt](url) -- alt text may be empty.
    image_link_pattern = r'!\[([^\]]*)\]\(([^)]+)\)'
    # [text](url) -- text must be non-empty.
    standard_link_pattern = r'\[([^\]]+)\]\(([^)]+)\)'

    def image_to_tag(match):
        alt_text = match.group(1)
        image_url = match.group(2)
        return f'<img src="{image_url}" alt="{alt_text}">'

    # Images go first: the standard-link pattern would otherwise match the
    # "[alt](url)" tail of an image link and leave a stray "!" behind.
    with_image_tags = re.sub(image_link_pattern, image_to_tag, chunk)

    return re.sub(standard_link_pattern, lambda match: match.group(1), with_image_tags)


In [209]:

def extract_image_urls(text):
    """
    Extracts image URLs from <img src="..."> tags and splits the text around these tags.

    Only tags whose first attribute is a double-quoted ``src`` are recognised.

    Args:
        text (str): Input text containing image tags

    Returns:
        tuple: A tuple containing:
            - List of extracted image URLs, in order of appearance
            - List of non-empty text segments split around image tags
    """
    # Capturing form: findall returns just the src value of each tag.
    url_pattern = r'<img\s+src="([^"]+)"[^>]*>'
    urls = re.findall(url_pattern, text)

    # Non-capturing form for the split. With a capturing group re.split also
    # returns the captured URLs, and filtering them back out by value would
    # drop any real text segment that happens to equal one of the URLs.
    tag_pattern = r'<img\s+src="[^"]+"[^>]*>'
    text_segments = [segment for segment in re.split(tag_pattern, text) if segment]

    return urls, text_segments

def extract_image_urls_v2(text):
    """
    Extracts image URLs from <img src="..."> tags and splits the text around these tags.
    Also removes <div>...</div> sections, treating each one as a split point.

    Notes:
        - URLs are collected from the whole input, so an <img> that sits inside a
          removed <div> still contributes its URL.
        - A <div> is matched non-greedily up to the first ``</div>``; nested divs
          are therefore not matched as a single unit.

    Args:
        text (str): Input text containing image tags and div sections

    Returns:
        tuple: A tuple containing:
            - List of extracted image URLs
            - List of stripped, non-empty text segments split around image and div tags
    """
    # Find all URLs within img tags
    img_pattern = r'<img\s+src="([^"]+)"[^>]*>'
    urls = re.findall(img_pattern, text)

    # Delimiters: a whole <div>...</div> section, or a single <img> tag.
    # The pattern has no capturing group, so re.split returns only the text
    # between delimiters and nothing has to be filtered back out by value
    # (which would also discard text that merely equals a URL).
    delimiter_pattern = r'<div[^>]*>.*?</div>|<img\s+src="[^"]+?"[^>]*>'

    text_segments = []
    for raw_segment in re.split(delimiter_pattern, text, flags=re.DOTALL):
        # Strip before testing for emptiness so whitespace-only gaps between
        # two adjacent tags do not end up as '' entries.
        segment = raw_segment.strip()
        if not segment:
            continue
        # Kept from the original filter: drop leftovers that begin with a
        # <div or <img tag the delimiter pattern could not consume
        # (e.g. an unclosed <div>).
        if re.match(r'<(div|img)', segment):
            continue
        text_segments.append(segment)

    return urls, text_segments

import re

def extract_image_urls_v3(text):
    """
    Collect image URLs and the text that lies between the images.

    <div>...</div> handling (each div is matched non-greedily up to the first
    closing tag):
      - a div without any <img src="..."> is deleted outright;
      - a div with one or more <img src="..."> tags is collapsed to just those
        tags, so the images still act as split points while the remaining div
        content is discarded.

    Args:
        text (str): Input text containing image tags and div sections.

    Returns:
        tuple: (urls, segments) where
            - urls is the list of src values of every <img src="..."> tag in the
              original text, and
            - segments is the list of stripped, non-empty pieces of text found
              between image tags after the div rewrite.
    """
    img_tag = r'<img\s+src="[^"]+"[^>]*>'

    # URLs come from the untouched input, before any div is rewritten.
    urls = re.findall(r'<img\s+src="([^"]+)"[^>]*>', text, flags=re.DOTALL)

    def keep_only_images(match):
        block = match.group(0)
        if re.search(r'<img\s+src="[^"]+"', block) is None:
            # Image-free div: remove it without creating a split point.
            return ''
        # Otherwise reduce the div to its <img> tag(s).
        return ''.join(re.findall(img_tag, block, flags=re.DOTALL))

    without_divs = re.sub(r'<div[^>]*>.*?</div>', keep_only_images, text, flags=re.DOTALL)

    # Split on the remaining <img> tags and keep non-blank pieces.
    segments = []
    for piece in re.split(img_tag, without_divs, flags=re.DOTALL):
        trimmed = piece.strip()
        if trimmed:
            segments.append(trimmed)

    return urls, segments


def match_image_to_text(urls,input_text_segments,original_markdown_path):
    """
    Pair each image URL with the text segments surrounding it.

    Every segment is first passed through ``remove_yaml_header``. Then:
      - no URLs: an empty frame is returned;
      - exactly one URL: it is paired with all segments joined by newlines;
      - several URLs: URL ``i`` is paired with segment ``i`` and, when it exists,
        segment ``i + 1`` (joined by a newline). If there are fewer segments
        than URLs the pairing is impossible; diagnostics are printed and an
        empty frame is returned.

    Args:
        urls (list): Image URLs, in document order.
        input_text_segments (list): Text segments split around those images.
        original_markdown_path: Source file identifier, copied into every row.

    Returns:
        pandas.DataFrame: Columns "image url", "text" and "source markdown file",
        one row per URL (zero rows when nothing could be matched).
    """

    def empty_result():
        return pd.DataFrame({"image url" : [], "text" : [], "source markdown file" : []})

    text_segments = [remove_yaml_header(x) for x in input_text_segments]

    if len(urls) == 0:
        return empty_result()

    if len(urls) == 1:
        return pd.DataFrame({"image url": urls, "text" : ['\n'.join(text_segments)],"source markdown file" : [original_markdown_path]})

    # Explicit precondition instead of a blanket `except Exception`: running
    # out of segments is the only failure the pairing loop can hit, and a broad
    # handler would also hide unrelated errors.
    if len(text_segments) < len(urls):
        print(original_markdown_path)
        print(urls)
        print(text_segments)
        print(f'url length: {len(urls)} seg length: {len(text_segments)}')
        return empty_result()

    text_for_url = []
    for index in range(len(urls)):
        above_text = text_segments[index]
        if index + 1 < len(text_segments):
            below_text = text_segments[index + 1]
            text_for_url.append(f'{above_text}\n{below_text}')
        else:
            # Last segment: there is no text below this image.
            text_for_url.append(above_text)

    return pd.DataFrame({"image url" : urls, "text" : text_for_url, "source markdown file" : [original_markdown_path]*len(urls)})
            
def remove_yaml_header(text):
    """
    Strip a leading YAML front-matter block from the text.

    The block must begin at the very start of the string with a '---' line and
    runs up to the next '---' that follows a newline. Whitespace directly after
    that closing marker (including blank lines) is consumed as well.

    Args:
        text (str): Text that may start with a YAML header.

    Returns:
        str: The text without the header, or the text unchanged when it does
        not start with one.
    """
    # ^---\s*\n    opening '---' line at the start of the string
    # .*?          header body, as short as possible (DOTALL lets '.' span lines)
    # \n---\s*\n?  newline, closing '---', trailing whitespace, optional newline
    pattern = r'^---\s*\n.*?\n---\s*\n?'

    header = re.match(pattern, text, flags=re.DOTALL)
    if header is None:
        return text
    # Without MULTILINE the pattern can only match at offset 0, so dropping the
    # matched prefix is the whole job.
    return text[header.end():]

def remove_code(chunk : str):
    """
    Delete every span that starts at the text 'import' and ends at the nearest
    following '/>'.

    The match is purely textual: 'import' is not required to be a whole word,
    and the span may cross line boundaries.

    Args:
        chunk (str): Text to clean.

    Returns:
        str: The text with all such spans removed.
    """
    import_through_self_closing_tag = re.compile(r'import([\s\S]*?)/>')
    return import_through_self_closing_tag.sub('', chunk)


def remove_backtick_content(text):
    """
    Strip backtick-delimited code from the text.

    Both inline code (`code`) and fenced blocks (```code block```) are removed,
    delimiters included. A span is closed by the same run of backticks that
    opened it.

    Args:
        text (str): The input string.

    Returns:
        str: The input with every backtick-enclosed span deleted.
    """
    # (`+)  opening run of backticks, remembered as group 1
    # .*?   shortest possible content (DOTALL: may span several lines)
    # \1    the identical run of backticks closing the span
    backtick_span = re.compile(r'(`+).*?\1', flags=re.DOTALL)
    return backtick_span.sub('', text)

In [210]:
from pathlib import Path

In [213]:
# Build one table of (image url, surrounding text, source file) rows from every
# .mdx file below the current working directory.
df_list = []
for markdown_path in Path('').rglob('*.mdx'):

    with open(markdown_path, 'r', encoding='utf-8') as file:
        mdx_raw_string = file.read()

    # Cleaning steps currently applied: flatten links, then drop backtick code.
    # remove_code() is not applied.
    mdx_raw_string = remove_hyperlinks(mdx_raw_string)
    mdx_raw_string = remove_backtick_content(mdx_raw_string)

    # Chunk on every '#' character. NOTE(review): this splits on any '#'
    # (deeper headings, URL fragments, ...), not only on heading lines. The
    # heading-regex split that used to precede this line was dead code: its
    # result was overwritten here before ever being read.
    chunks = [x for x in mdx_raw_string.split('#') if len(x) > 0]
    for chunk in chunks:
        urls,segments = extract_image_urls_v3(chunk)
        df = match_image_to_text(urls,segments,markdown_path)
        df_list.append(df)

# Keep only chunks that produced at least one row.
filtered_df_list = [x for x in df_list if len(x) > 0]
if filtered_df_list:
    test_df = pd.concat(filtered_df_list,ignore_index=True)
else:
    # pd.concat raises ValueError on an empty list; fall back to an empty
    # frame with the same columns so later cells still find test_df.
    test_df = pd.DataFrame({"image url" : [], "text" : [], "source markdown file" : []})

platform-documentation/Annotate/annotate-projects/annotate-manual-qa-projects.mdx
['https://storage.googleapis.com/docs-media.encord.com/static/img/projects/performance-new-current-toggle.png', 'https://storage.googleapis.com/docs-media.encord.com/static/img/projects/project-dashboard/performance/performance-dashboard-date-filter-and-charts.png', 'https://storage.googleapis.com/docs-media.encord.com/static/img/projects/project-dashboard/performance/performance-dashboard-annotor-and-reviewer-table.png', 'https://storage.googleapis.com/docs-media.encord.com/static/img/projects/project-dashboard/performance/performance-dashboard-objects-and-classifications-table.png']
url length: 4 seg length: 1
platform-documentation/Annotate/annotate-projects/annotate-training-projects.mdx
['https://storage.googleapis.com/docs-media.encord.com/static/img/projects/training/working_flow/at_lifecycle4-adjusting_score.png', 'https://storage.googleapis.com/docs-media.encord.com/static/img/projects/training/w

In [214]:
# Print every matched row: source file, image URL, then the paired text,
# followed by a '---' separator.
for _row_label, record in test_df.iterrows():
    print(record['source markdown file'])
    print(record['image url'])
    paired_text = record["text"]
    print(f':\n\n{paired_text}')
    print('\n---\n')

platform-documentation/Annotate/general-sync-work-to-cloud-storage.mdx
https://storage.googleapis.com/docs-media.encord.com/static/img/aws-lambda-container.png
:

Automatically save labels to AWS S3

<Info>We strongly recommend that highly technical users (examples: IT professionals, software developers, or system administrators) are the ones who perform the steps outlined in this process. </Info>

Typically, after labeling your data with Encord, the labels are used to training your ML models. This process typically includes transferring your labels to cloud storage. To streamline this, follow the steps below to automatically save your labels to your cloud storage upon their creation:

1. Create an IAM policy for a Lambda function. 
2. Paste the following JSON into the JSON policy editor, replacing  with the name of the S3 bucket you want to export your labels to.



3. Create an IAM role for Lambda, and attach the policy you created in Step 1. 

4. Create a new directory on your compu

In [166]:
# Inspect how a single document is chunked: read it, flatten its links, split
# on '#', and print each non-empty chunk with its index.
markdown_path = '/Users/felixcohen/vlm_dataset_creation/GettingStarted/gettingstarted-labeling.mdx'
with open(markdown_path, 'r', encoding='utf-8') as file:
    mdx_raw_string = file.read()

mdx_raw_string = remove_hyperlinks(mdx_raw_string)

chunks = [piece for piece in mdx_raw_string.split('#') if len(piece) > 0]
for i, chunk in enumerate(chunks):
    print(f'\n----{i}----\n{chunk}')


----0----
---
title: "How to Label"
slug: "gettingstarted-labeling"
hidden: false
metadata: 
  title: "How to label"
  description: "Learn how to draw objects and create frame classifications in Encord."
  image: 
    0: "https://files.readme.io/2168cad-image_16.png"
---

Labeling your data is what Encord Annotate is about. Labeling is initiated in the _Queue_ tab of your Project. Annotators use the Project's Ontology to label data from Datasets that are attached to the Project. 

Three types of annotation can be created: 
- **Objects**: Label specific parts of the data unit using a specified shape. Objects are not supported for audio annotation.
- **Classifications**: Apply to a whole image or frame. They do not have a specific location. 
- **Attributes**: Can be nested into objects and classifications to add more details to the annotation.

<Tip>For more information on annotation types, see our documentation on Ontology structure.</Tip>



----1----
 Introduction to Image Annotation

In [169]:
# Spot-check extract_image_urls_v2 on one chunk. `chunks` is not defined in
# this cell; it must already exist in the kernel from an earlier cell.
urls,segs = extract_image_urls_v2(chunks[5])

In [170]:
# Number of text segments in `segs` (displayed as the cell's output).
len(segs)

4