In [2]:
import re
import pandas as pd
from pathlib import Path

In [3]:
def remove_hyperlinks(chunk: str):
    # Pattern to match standard links: [text](url)
    standard_link_pattern = r'\[([^\]]+)\]\(([^)]+)\)'
    
    # Pattern to match image links: ![alt](url)
    image_link_pattern = r'!\[([^\]]*)\]\(([^)]+)\)'
    
    def replace_links(match):
        text, url = match.groups()
        # Check if it's an image link
        if match.re.pattern == image_link_pattern:
            return f'<img src="{url}" alt="{text}">'
        # For standard links, keep the original text
        return text
    
    # First process image links
    result = re.sub(image_link_pattern, replace_links, chunk)
    
    # Then process standard links
    result = re.sub(standard_link_pattern, replace_links, result)
    
    return result


In [4]:

def extract_image_urls(text):
    """
    Extracts image URLs from <img src="..."> tags and splits the text around these tags.
    
    Args:
        text (str): Input text containing image tags
    
    Returns:
        tuple: A tuple containing:
            - List of extracted image URLs
            - List of text segments split around image tags
    """
    # Regular expression to match <img src="..."> tags
    pattern = r'<img\s+src="([^"]+)"[^>]*>'
    
    # Find all matches of image URLs
    urls = re.findall(pattern, text)
    
    # Split the text using the image tag pattern
    text_segments = [segment for segment in re.split(pattern, text) if segment and segment not in urls]
    
    
    return urls, text_segments

def extract_image_urls_v2(text):
    """
    Extracts image URLs from <img src="..."> tags and splits the text around these tags.
    Also removes <div> sections that may contain or surround these tags.
    
    Args:
        text (str): Input text containing image tags and div sections
    
    Returns:
        tuple: A tuple containing:
            - List of extracted image URLs
            - List of text segments split around image and div tags
    """
    # Combined regex pattern to match and remove:
    # 1. Entire <div> sections (including nested content)
    # 2. <img> tags
    pattern = r'(<div[^>]*>.*?</div>|<img\s+src="[^"]+?"[^>]*>)'
    
    # Find all URLs within img tags
    img_pattern = r'<img\s+src="([^"]+)"[^>]*>'
    urls = re.findall(img_pattern, text)
    
    # Split the text and filter out empty segments and urls
    text_segments = [
        segment.strip() 
        for segment in re.split(pattern, text, flags=re.DOTALL) 
        if segment and 
           segment not in urls and 
           not re.match(r'<(div|img)', segment.strip())
    ]
    
    return urls, text_segments

import re

def extract_image_urls_v3(text):
    """
    Extracts image URLs from <img src="..."> tags and splits the text around these tags.
    
    Processing of <div> sections:
      - If a <div> does NOT contain an <img src="...">, it is removed completely.
      - If a <div> DOES contain an <img src="...">, it is replaced with just the <img> tag(s).
        This preserves the image as a splitting delimiter while discarding any extra div content.
    
    Args:
        text (str): Input text containing image tags and div sections.
    
    Returns:
        tuple: A tuple containing:
            - List of extracted image URLs.
            - List of text segments split around image tags.
    """
    # Step 1: Extract all image URLs from the original text.
    img_pattern = r'<img\s+src="([^"]+)"[^>]*>'
    urls = re.findall(img_pattern, text, flags=re.DOTALL)
    
    # Step 2: Process all <div>...</div> blocks.
    def div_replacer(match):
        div_block = match.group(0)
        # If this <div> contains at least one <img src="...">, extract the image tags.
        if re.search(r'<img\s+src="[^"]+"', div_block):
            imgs = re.findall(r'<img\s+src="[^"]+"[^>]*>', div_block, flags=re.DOTALL)
            # Replace the entire div with only its <img> tag(s).
            return ''.join(imgs)
        else:
            # Otherwise, remove the <div> entirely (and do not split on it).
            return ''
    
    # Replace all <div> sections using the helper.
    text = re.sub(r'<div[^>]*>.*?</div>', div_replacer, text, flags=re.DOTALL)
    
    # Step 3: Split the text on <img> tags.
    segments = [seg.strip() for seg in re.split(r'<img\s+src="[^"]+"[^>]*>', text, flags=re.DOTALL) if seg.strip()]
    
    return urls, segments

#TODO move if text = .,'',| outside of loop to do at beginning / fix root issue causing this, also add remove import x regex func
def match_image_to_text(urls,input_text_segments,original_markdown_path):

    text_segments = [remove_yaml_header(x) for x in input_text_segments]

    if len(urls) == 0:
        return pd.DataFrame({"image url" : [], "above text" : [],"below text": [], "source markdown file" : []})
    
    
    elif len(urls) == 1:

        #Encord text upload doesnt like empty .txt files
        if text_segments[0] == '':
            text_segments[0] = '---'

        if len(text_segments) == 1:
            return pd.DataFrame({"image url" : urls, "above text" : [text_segments[0]],"below text": ['---'], "source markdown file" : [original_markdown_path]})
        
        elif len(text_segments) == 2:

            return pd.DataFrame({"image url" : urls, "above text" : [text_segments[0]],"below text": [text_segments[1]], "source markdown file" : [original_markdown_path]})

        else:
            raise ValueError
            
    else:
        

        try:
            upper_text_for_url = []
            lower_text_for_url = []

            pointer = 0

            #TODO this assumes text always precedes image in md file

            while pointer < len(urls):

                above_text = text_segments[pointer]

                if above_text == '' or above_text == '|' or above_text == '.':
                    above_text = '---'
                upper_text_for_url.append(above_text)

                if pointer + 1 < len(text_segments):
                    below_text = text_segments[pointer+1]

                    #encord txt upload doesnt like empty .txt
                    if below_text == '' or below_text == '|' or below_text == '.':
                        below_text = '---'
                    lower_text_for_url.append(below_text)
                else:
                    lower_text_for_url.append('---')
                
                pointer +=1
            
            return pd.DataFrame({"image url" : urls, "above text" : upper_text_for_url,"below text": lower_text_for_url, "source markdown file" : [original_markdown_path]*len(urls)})
        
        
        except Exception as e:
            print(original_markdown_path)
            print(urls)
            print(text_segments)
            print(f'url length: {len(urls)} seg length: {len(text_segments)}')
            return pd.DataFrame({"image url" : [], "above text" : [],"below text": [], "source markdown file" : []})
    
def remove_yaml_header(text):
    """
    Removes the YAML header from the input text.
    The YAML header is defined as the section at the beginning of the text 
    that starts with a line containing only '---' and ends with another such line.
    
    Args:
        text (str): The input text that may contain a YAML header.
    
    Returns:
        str: The text with the YAML header removed.
    """
    # Pattern explanation:
    # ^---\s*\n    => Matches a line that starts with '---', possibly followed by whitespace, then a newline.
    # .*?         => Non-greedily matches any characters (including newlines, due to DOTALL flag).
    # \n---\s*\n? => Matches a newline, then a line that starts with '---' (optionally followed by whitespace)
    #                and an optional newline after it.
    pattern = r'^---\s*\n.*?\n---\s*\n?'
    cleaned_text = re.sub(pattern, '', text, flags=re.DOTALL)
    return cleaned_text

def remove_code(chunk : str):
    code_pattern = r'import([\s\S]*?)/>'
    return re.sub(code_pattern,'',chunk)


def remove_backtick_content(text):
    """
    Removes all content enclosed within matching backticks from the text.
    This will remove both inline code (e.g. `code`) and multi-line code blocks 
    (e.g. ```code block```).

    Args:
        text (str): The input string.

    Returns:
        str: The text with all backtick-enclosed segments removed.
    """
    # This pattern captures one or more backticks, then any content (non-greedily),
    # until the same sequence of backticks appears again.
    pattern = r'(`+).*?\1'
    cleaned_text = re.sub(pattern, '', text, flags=re.DOTALL)
    return cleaned_text

In [5]:
df_list = []
for markdown_path in Path('').rglob('*.mdx'):

    with open(markdown_path, 'r', encoding='utf-8') as file:
        mdx_raw_string = file.read()
    
   # mdx_raw_string = clean_header(mdx_raw_string)
    mdx_raw_string = remove_hyperlinks(mdx_raw_string)
    mdx_raw_string = remove_backtick_content(mdx_raw_string)
    # mdx_raw_string = remove_code(mdx_raw_string)

    pattern = r'^(#{1,2})(?!#{1,})(.+)$'
    chunks = [x for x in re.split(pattern, mdx_raw_string, flags=re.MULTILINE) if len(x) > 0]
    
    chunks = [x for x in mdx_raw_string.split('#') if len(x) > 0]
    for chunk in chunks:
        urls,segments = extract_image_urls_v3(chunk)
        df = match_image_to_text(urls,segments,markdown_path)
        df_list.append(df)

filtered_df_list = [x for x in df_list if len(x) > 0]
test_df = pd.concat(filtered_df_list,ignore_index=True)

platform-documentation/Annotate/annotate-projects/annotate-manual-qa-projects.mdx
['https://storage.googleapis.com/docs-media.encord.com/static/img/projects/performance-new-current-toggle.png', 'https://storage.googleapis.com/docs-media.encord.com/static/img/projects/project-dashboard/performance/performance-dashboard-date-filter-and-charts.png', 'https://storage.googleapis.com/docs-media.encord.com/static/img/projects/project-dashboard/performance/performance-dashboard-annotor-and-reviewer-table.png', 'https://storage.googleapis.com/docs-media.encord.com/static/img/projects/project-dashboard/performance/performance-dashboard-objects-and-classifications-table.png']
url length: 4 seg length: 1
platform-documentation/Annotate/annotate-projects/annotate-training-projects.mdx
['https://storage.googleapis.com/docs-media.encord.com/static/img/projects/training/working_flow/at_lifecycle4-adjusting_score.png', 'https://storage.googleapis.com/docs-media.encord.com/static/img/projects/training/w

In [243]:
test_df.to_csv('raw_multimodal_data.csv')

In [None]:
test_df

In [230]:
test_df['image url'].apply(lambda x : 'gif' in x).sum()


181

In [259]:

for i,row in enumerate(test_df.iterrows()):

    atext = row[1]['above text']
    btext = row[1]['below text']

    with open(f'above_text/above_text_{i}.txt','w') as output:
        output.write(atext)
    with open(f'below_text/below_text_{i}.txt','w') as output:
        output.write(btext)

    


In [240]:
upload_json = { "images" : []}

for i,row in enumerate(test_df.iterrows()):

    img_url = row[1]['image url']

    img_obj = {
        "objectUrl": img_url,
        "title" : img_url.split('/')[-1],
        "clientMetadata" : {"id" : str(i), "Data_Type" : "Image"}
    }

    if 'gif' not in img_url:
        upload_json['images'].append(img_obj)

    

In [238]:
import json

In [242]:
with open('doc_images.json','w',encoding='utf-8') as f:
    json.dump(upload_json, f, ensure_ascii=False, indent=4)

In [244]:
from encord import EncordUserClient
import os

In [None]:
key_path = os.environ.get("ENCORD_SSH_KEY_FILE")

user_client: EncordUserClient = EncordUserClient.create_with_ssh_private_key(
    ssh_private_key_path=key_path
)


above_folder_name = 'Encord Documentation Above Text'
below_folder_name = 'Encord Documentation Below Text'

afolders = list(user_client.find_storage_folders(search=above_folder_name, page_size=1))
a_storage_folder = afolders[0] 
bfolders = list(user_client.find_storage_folders(search=below_folder_name, page_size=1))
b_storage_folder = bfolders[0] 


for i,row in enumerate(test_df.iterrows()):
    # if i < 752:
    #     print(f'skipped {i}')
    #     continue
    
    img_url = row[1]['image url']

    if 'gif' not in img_url:
        print(f'processing {i}')
        atext_path = f'above_text/above_text_{i}.txt'
        btext_path = f'below_text/below_text_{i}.txt'
        above_metadata = {"id" : str(i), "Data_Type" : "above_text"}
        below_metadata = {"id" : str(i), "Data_Type" : "below_text"}

        a_storage_folder.upload_text(atext_path,client_metadata=above_metadata)
        b_storage_folder.upload_text(btext_path,client_metadata=below_metadata)


skipped 0
skipped 1
skipped 2
skipped 3
skipped 4
skipped 5
skipped 6
skipped 7
skipped 8
skipped 9
skipped 10
skipped 11
skipped 12
skipped 13
skipped 14
skipped 15
skipped 16
skipped 17
skipped 18
skipped 19
skipped 20
skipped 21
skipped 22
skipped 23
skipped 24
skipped 25
skipped 26
skipped 27
skipped 28
skipped 29
skipped 30
skipped 31
skipped 32
skipped 33
skipped 34
skipped 35
skipped 36
skipped 37
skipped 38
skipped 39
skipped 40
skipped 41
skipped 42
skipped 43
skipped 44
skipped 45
skipped 46
skipped 47
skipped 48
skipped 49
skipped 50
skipped 51
skipped 52
skipped 53
skipped 54
skipped 55
skipped 56
skipped 57
skipped 58
skipped 59
skipped 60
skipped 61
skipped 62
skipped 63
skipped 64
skipped 65
skipped 66
skipped 67
skipped 68
skipped 69
skipped 70
skipped 71
skipped 72
skipped 73
skipped 74
skipped 75
skipped 76
skipped 77
skipped 78
skipped 79
skipped 80
skipped 81
skipped 82
skipped 83
skipped 84
skipped 85
skipped 86
skipped 87
skipped 88
skipped 89
skipped 90
skipped 9

In [266]:
from openai import OpenAI

In [268]:
openai_client = OpenAI(api_key = os.environ.get("OPENAI_API_KEY"))

In [1]:
'Help me create a question-answer dataset for my application. You have access to a screenshot of the application from the documentation and the surrounding text above and below it:\n\n    === documentation text above image ===\n    1. In the Encord platform, select _Projects_ under _Annotate_.\n2. Select the Project you want to Manage.\n\n    === documentation text below image ===\n    The dashboard is split into the following tabs:\n\n- **Project Overview**: High-level view of labeling and productivity statistics.\n- **Explore**: Explore the distribution of instances and labels across data assets in the project.\n- **Queue**: Shows all tasks in the Project by Workflow stage.\n- **Workflow**: Graphically displays the path tasks follow through the Project Workflow.\n- **Labels & Export**: For managing all the Project\'s labels, including exporting labels.\n- **Analytics**: Detailed Project analytics.\n- **Settings**: Manage your Project Settings, including copying Projects, managing Project tags, customizing editor layouts, and deleting Projects.\n\n    ==== instructions ===\n    Thoroughly review the screenshot, imagining you are a user interacting with the application in a context similar to what is shown. Based on that situation, propose three realistic questions that such a user might ask, and then provide accurate answers to each question based on the documentation text. If the documentation text is not informative enough, come up with question-answer pairs using the screenshot only. Ensure the questions are realistic but sufficiently diverse. The questions and answers ***must*** refer to the image.\n\n    Please follow the JSON Schema to indicate your response.\n    Don\'t respond with anything but valid json.\n\n    === JSON Schema ===\n    {"$defs": {"QuestionAnswerPair1RadioModel": {"properties": {"feature_node_hash": {"const": "D9oJlIXz", "description": "UUID for discrimination. Must be included in json as is.", "title": "Feature Node Hash", "type": "string"}, "choice": {"description": "Choose exactly one answer from the given options.", "discriminator": {"mapping": {"5qvmjZEN": "#/$defs/encord_agents__core__ontology__QaPairNestedRadioModel__1"}, "propertyName": "feature_node_hash"}, "oneOf": [{"$ref": "#/$defs/encord_agents__core__ontology__QaPairNestedRadioModel__1"}], "title": "Choice"}}, "required": ["feature_node_hash", "choice"], "title": "QuestionAnswerPair1RadioModel", "type": "object"}, "QuestionAnswerPair2RadioModel": {"properties": {"feature_node_hash": {"const": "jat0dizi", "description": "UUID for discrimination. Must be included in json as is.", "title": "Feature Node Hash", "type": "string"}, "choice": {"description": "Choose exactly one answer from the given options.", "discriminator": {"mapping": {"7ew0GaDG": "#/$defs/encord_agents__core__ontology__QaPairNestedRadioModel__2"}, "propertyName": "feature_node_hash"}, "oneOf": [{"$ref": "#/$defs/encord_agents__core__ontology__QaPairNestedRadioModel__2"}], "title": "Choice"}}, "required": ["feature_node_hash", "choice"], "title": "QuestionAnswerPair2RadioModel", "type": "object"}, "QuestionAnswerPair3RadioModel": {"properties": {"feature_node_hash": {"const": "cZsafTBg", "description": "UUID for discrimination. Must be included in json as is.", "title": "Feature Node Hash", "type": "string"}, "choice": {"description": "Choose exactly one answer from the given options.", "discriminator": {"mapping": {"FVgOeelB": "#/$defs/encord_agents__core__ontology__QaPairNestedRadioModel__3"}, "propertyName": "feature_node_hash"}, "oneOf": [{"$ref": "#/$defs/encord_agents__core__ontology__QaPairNestedRadioModel__3"}], "title": "Choice"}}, "required": ["feature_node_hash", "choice"], "title": "QuestionAnswerPair3RadioModel", "type": "object"}, "encord_agents__core__ontology__AnswerTextModel__1": {"properties": {"feature_node_hash": {"const": "4mvdzZ+c", "description": "UUID for discrimination. Must be included in json as is.", "title": "Feature Node Hash", "type": "string"}, "value": {"description": "Please describe the image as accurate as possible focusing on \'Answer\'", "maxLength": 1000, "minLength": 0, "title": "Value", "type": "string"}}, "required": ["feature_node_hash", "value"], "title": "AnswerTextModel", "type": "object"}, "encord_agents__core__ontology__AnswerTextModel__2": {"properties": {"feature_node_hash": {"const": "RtwPHjxE", "description": "UUID for discrimination. Must be included in json as is.", "title": "Feature Node Hash", "type": "string"}, "value": {"description": "Please describe the image as accurate as possible focusing on \'Answer\'", "maxLength": 1000, "minLength": 0, "title": "Value", "type": "string"}}, "required": ["feature_node_hash", "value"], "title": "AnswerTextModel", "type": "object"}, "encord_agents__core__ontology__AnswerTextModel__3": {"properties": {"feature_node_hash": {"const": "PV+lOe7L", "description": "UUID for discrimination. Must be included in json as is.", "title": "Feature Node Hash", "type": "string"}, "value": {"description": "Please describe the image as accurate as possible focusing on \'Answer\'", "maxLength": 1000, "minLength": 0, "title": "Value", "type": "string"}}, "required": ["feature_node_hash", "value"], "title": "AnswerTextModel", "type": "object"}, "encord_agents__core__ontology__QaPairNestedRadioModel__1": {"properties": {"feature_node_hash": {"const": "5qvmjZEN", "description": "UUID for discrimination. Must be included in json as is.", "title": "Feature Node Hash", "type": "string"}, "question": {"$ref": "#/$defs/encord_agents__core__ontology__QuestionTextModel__1", "description": "A text attribute with carefully crafted text to describe the property."}, "answer": {"$ref": "#/$defs/encord_agents__core__ontology__AnswerTextModel__1", "description": "A text attribute with carefully crafted text to describe the property."}}, "required": ["feature_node_hash", "question", "answer"], "title": "QaPairNestedRadioModel", "type": "object"}, "encord_agents__core__ontology__QaPairNestedRadioModel__2": {"properties": {"feature_node_hash": {"const": "7ew0GaDG", "description": "UUID for discrimination. Must be included in json as is.", "title": "Feature Node Hash", "type": "string"}, "question": {"$ref": "#/$defs/encord_agents__core__ontology__QuestionTextModel__2", "description": "A text attribute with carefully crafted text to describe the property."}, "answer": {"$ref": "#/$defs/encord_agents__core__ontology__AnswerTextModel__2", "description": "A text attribute with carefully crafted text to describe the property."}}, "required": ["feature_node_hash", "question", "answer"], "title": "QaPairNestedRadioModel", "type": "object"}, "encord_agents__core__ontology__QaPairNestedRadioModel__3": {"properties": {"feature_node_hash": {"const": "FVgOeelB", "description": "UUID for discrimination. Must be included in json as is.", "title": "Feature Node Hash", "type": "string"}, "question": {"$ref": "#/$defs/encord_agents__core__ontology__QuestionTextModel__3", "description": "A text attribute with carefully crafted text to describe the property."}, "answer": {"$ref": "#/$defs/encord_agents__core__ontology__AnswerTextModel__3", "description": "A text attribute with carefully crafted text to describe the property."}}, "required": ["feature_node_hash", "question", "answer"], "title": "QaPairNestedRadioModel", "type": "object"}, "encord_agents__core__ontology__QuestionTextModel__1": {"properties": {"feature_node_hash": {"const": "L27fSN3D", "description": "UUID for discrimination. Must be included in json as is.", "title": "Feature Node Hash", "type": "string"}, "value": {"description": "Please describe the image as accurate as possible focusing on \'Question\'", "maxLength": 1000, "minLength": 0, "title": "Value", "type": "string"}}, "required": ["feature_node_hash", "value"], "title": "QuestionTextModel", "type": "object"}, "encord_agents__core__ontology__QuestionTextModel__2": {"properties": {"feature_node_hash": {"const": "yZdu3EEi", "description": "UUID for discrimination. Must be included in json as is.", "title": "Feature Node Hash", "type": "string"}, "value": {"description": "Please describe the image as accurate as possible focusing on \'Question\'", "maxLength": 1000, "minLength": 0, "title": "Value", "type": "string"}}, "required": ["feature_node_hash", "value"], "title": "QuestionTextModel", "type": "object"}, "encord_agents__core__ontology__QuestionTextModel__3": {"properties": {"feature_node_hash": {"const": "8IpLiz/c", "description": "UUID for discrimination. Must be included in json as is.", "title": "Feature Node Hash", "type": "string"}, "value": {"description": "Please describe the image as accurate as possible focusing on \'Question\'", "maxLength": 1000, "minLength": 0, "title": "Value", "type": "string"}}, "required": ["feature_node_hash", "value"], "title": "QuestionTextModel", "type": "object"}}, "properties": {"question_answer_pair_1": {"$ref": "#/$defs/QuestionAnswerPair1RadioModel", "description": "A mutually exclusive radio attribute to choose exactly one option that best matches to the give visual input."}, "question_answer_pair_2": {"$ref": "#/$defs/QuestionAnswerPair2RadioModel", "description": "A mutually exclusive radio attribute to choose exactly one option that best matches to the give visual input."}, "question_answer_pair_3": {"$ref": "#/$defs/QuestionAnswerPair3RadioModel", "description": "A mutually exclusive radio attribute to choose exactly one option that best matches to the give visual input."}}, "required": ["question_answer_pair_1", "question_answer_pair_2", "question_answer_pair_3"], "title": "ClassificationModel", "type": "object"}\n    '

'Help me create a question-answer dataset for my application. You have access to a screenshot of the application from the documentation and the surrounding text above and below it:\n\n    === documentation text above image ===\n    1. In the Encord platform, select _Projects_ under _Annotate_.\n2. Select the Project you want to Manage.\n\n    === documentation text below image ===\n    The dashboard is split into the following tabs:\n\n- **Project Overview**: High-level view of labeling and productivity statistics.\n- **Explore**: Explore the distribution of instances and labels across data assets in the project.\n- **Queue**: Shows all tasks in the Project by Workflow stage.\n- **Workflow**: Graphically displays the path tasks follow through the Project Workflow.\n- **Labels & Export**: For managing all the Project\'s labels, including exporting labels.\n- **Analytics**: Detailed Project analytics.\n- **Settings**: Manage your Project Settings, including copying Projects, managing Pro

In [None]:
response = openai_client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {
            "role": "user",
            "content": [{"type": "text", "text": prompt},fr ],
        }
    ],
    response_format={"type": "json_object"},
)

model_response = response.choices[0].message.content or "Failed to get resp"

NameError: name 'b64_frame' is not defined