# Generalized sketch

**TODO:** Create a conversion tool for the classic Swipe layout (see the second tab in the Katrina story).

In [1]:
# Install the webcolors package if not already installed
!pip install webcolors



In [2]:
from bs4 import BeautifulSoup, NavigableString, Tag
from arcgis.apps.storymap import StoryMap, Themes, Image, Video, Audio, Embed, Map, Button, Text, Gallery, Timeline, Sidecar, Code, Table, TextStyles, Collection, CollectionNavigation
from arcgis.gis import GIS, Item
from IPython.display import display
import pandas as pd
import webcolors
import webbrowser
import re, json, requests, sys, time 

# Runtime switch read by the login cell: False -> log in with a credential
# retrieved through keyring; otherwise -> connect with GIS("home")
agoNotebook = False

# Set Pandas dataframe display options
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns',1000)

In [3]:
# Report the interpreter version and the ArcGIS API for Python version,
# because behavior can change between releases.
import sys

import arcgis

print("Python version: ", sys.version)
print("ArcGIS for Python API / StoryMap module version: ", arcgis.__version__)

Python version:  3.11.11 (main, Mar  3 2025, 15:29:37) [MSC v.1938 64 bit (AMD64)]
ArcGIS for Python API / StoryMap module version:  2.4.1


In [4]:
# Connect to ArcGIS Online
# Define the GIS
if not agoNotebook:
    import keyring

    service_name = "system"  # Use the default local credential store
    portal_url = 'https://www.arcgis.com'
    success = False  # Set initial state

    # Keep asking until a username with a stored credential is supplied
    while not success:
        # If you are using VS Code, the text input dialog box appears at the top of the window
        username_for_keyring = input("Enter your ArcGIS Online username:").strip()
        # Get the credential object (None when the username is not in the credential store)
        credential = keyring.get_credential(service_name, username_for_keyring)
        if credential is None:
            print(f"'{username_for_keyring}' is not in the local system's credential store. Try another username.")
            continue
        # The credential object already carries the password, so no second
        # keyring lookup (with a separately hard-coded service name) is needed
        password_from_keyring = credential.password
        gis = GIS(portal_url, username=username_for_keyring, password=password_from_keyring)
        success = True
        # Print a success message with username and user's organization role
        print("Successfully logged in as: " + gis.properties.user.username, "(role: " + gis.properties.user.role + ")")
else:
    # Connect with GIS("home") when the agoNotebook switch is set
    gis = GIS("home")

Successfully logged in as: dasbury_storymaps (role: org_admin)


In [5]:
# Define the Classic StoryMap item id
classic_storymap_id = '597d573e58514bdbbeb53ba2179d2359'
# Fetch the StoryMap Item from AGO
classic_item = Item(gis=gis, itemid=classic_storymap_id)
# Fetch the StoryMap data
classic_data = classic_item.get_data()
if isinstance(classic_data, dict):
    # Already parsed JSON; a dumps/loads round-trip would only copy it
    classic_item_data = classic_data
elif classic_data:
    # Otherwise treat the payload as JSON text
    classic_item_data = json.loads(classic_data)
else:
    # Fail with a clear message instead of an opaque error from json.loads(None)
    raise ValueError(f"Item {classic_storymap_id} returned no data; check the item id and its type.")

In [6]:
# Helper functions
def color_to_hex(color_value):
    """Convert a CSS color value to a bare, upper-case ``RRGGBB`` hex string.

    Accepted inputs:
      * ``rgb(r, g, b)`` / ``rgba(r, g, b, a)`` with integer channels, either in
        raw CSS syntax or in the sanitized ``rgb255-0-0`` form (parentheses and
        spaces removed, commas replaced by dashes). Any alpha value is ignored.
      * ``#RRGGBB`` and the ``#RGB`` shorthand.
      * Color names known to ``webcolors`` (e.g. ``red``).

    Every branch returns the same format - six upper-case hex digits without a
    leading ``#`` - so the result can be appended directly to a class-name
    prefix such as ``sm-text-color-``.

    Returns:
        str | None: the hex digits, or None when the value is not a string or
        cannot be interpreted as a color.
    """
    if not isinstance(color_value, str):
        return None
    color_value = color_value.strip()
    if not color_value:
        return None

    # rgb()/rgba(): a separator is required between channels so that a value
    # like "rgb123" is not split into the channels (1, 2, 3).
    rgb_match = re.match(
        r'rgba?\(?\s*(\d+)[\s,\-]+(\d+)[\s,\-]+(\d+)', color_value, re.IGNORECASE
    )
    if rgb_match:
        # Clamp to 255 so an out-of-range channel cannot yield a 7-digit result
        r, g, b = (min(int(channel), 255) for channel in rgb_match.groups())
        return '{:02X}{:02X}{:02X}'.format(r, g, b)

    # Hex notation, checked before the named lookup so it never reaches webcolors
    hex_match = re.fullmatch(r'#([0-9a-fA-F]{3}|[0-9a-fA-F]{6})', color_value)
    if hex_match:
        digits = hex_match.group(1)
        if len(digits) == 3:
            # Expand the #RGB shorthand to #RRGGBB
            digits = ''.join(digit * 2 for digit in digits)
        return digits.upper()

    # Named color: webcolors returns '#rrggbb'; normalize to the common format
    try:
        return webcolors.name_to_hex(color_value.lower()).lstrip('#').upper()
    except ValueError:
        return None

def convert_color_style_to_class(tag):
    """Replace an inline CSS text color on ``tag`` with a ``sm-text-color-RRGGBB`` class.

    Reads the tag's ``style`` attribute, looks for a ``color:`` declaration,
    converts its value to six upper-case hex digits, removes the declaration
    from ``style`` (deleting the attribute when nothing else remains) and
    appends the class name to the tag's ``class`` attribute.

    Only the ``color`` property itself is matched; ``background-color``,
    ``border-color`` and similar properties are left alone. When the color
    value cannot be converted, the tag is left unchanged so the original
    inline style is not lost.

    Args:
        tag: an object with the BeautifulSoup ``Tag`` attribute interface
            (``get`` plus item get/set/delete). It is modified in place.
    """
    style = tag.get('style', '')
    if not style:
        return

    # (?<![-\w]) rejects "color" when it is the tail of another property name
    color_declaration = r'(?<![-\w])color\s*:\s*([^;]+);?'
    match = re.search(color_declaration, style, re.IGNORECASE)
    if not match:
        return

    # Drop a trailing "!important" so it does not end up inside the color value
    color_value = re.sub(
        r'\s*!important\s*$', '', match.group(1).strip(), flags=re.IGNORECASE
    )

    hex_match = re.fullmatch(r'#([0-9a-fA-F]{3}|[0-9a-fA-F]{6})', color_value)
    if hex_match:
        hex_color = hex_match.group(1)
        if len(hex_color) == 3:
            # Expand the #RGB shorthand to RRGGBB
            hex_color = ''.join(digit * 2 for digit in hex_color)
    else:
        # For rgb or named color, sanitize to a usable string (remove spaces/parens)
        sanitized = re.sub(r'[\s\(\)]', '', color_value).replace(',', '-')
        hex_color = color_to_hex(sanitized)

    if not hex_color:
        # Unrecognized color: keep the inline style rather than emit "sm-text-color-None"
        return

    # Normalize so a '#'-prefixed or lower-case result still gives a valid class name
    class_color = f"sm-text-color-{hex_color.lstrip('#').upper()}"

    # Remove the color declaration from the style attribute
    new_style = re.sub(color_declaration, '', style, flags=re.IGNORECASE).strip()
    if new_style:
        tag['style'] = new_style
    else:
        del tag['style']

    # Add or append the class attribute
    existing_classes = tag.get('class')
    if existing_classes is None:
        tag['class'] = [class_color]
    elif isinstance(existing_classes, str):
        tag['class'] = existing_classes.split() + [class_color]
    elif class_color not in existing_classes:
        existing_classes.append(class_color)

def process_html_colors_preserve_html(html_text):
    """Run the inline-color conversion over every tag of an HTML fragment.

    Parses ``html_text`` with the ``html.parser`` backend, passes each tag in
    the tree to ``convert_color_style_to_class`` and returns the resulting
    markup serialized back to a string.
    """
    fragment = BeautifulSoup(html_text, "html.parser")
    every_tag = fragment.find_all(True)  # True matches all tags, at any depth
    for element in every_tag:
        convert_color_style_to_class(element)
    return str(fragment)

def _media_src(el):
    """Return the URL of a <video>/<audio> tag: its own ``src`` or that of its first <source> child."""
    src = el.get("src")
    if not src:
        source_tag = el.find("source")
        if source_tag is not None:
            src = source_tag.get("src")
    return src


def parse_content_element(el):
    """Convert one parsed HTML element into StoryMap content.

    Args:
        el: a BeautifulSoup tag.

    Returns:
        * ``(Image, figcaption, alt, link)`` when ``el`` is, or contains, an
          ``<img>`` with a ``src``. Caption, alt text and link are returned next
          to the image rather than set on it, so the caller can assign them.
        * a ``Video``, ``Audio``, ``Embed`` or ``Map`` object for the matching tag.
        * a ``Text`` paragraph for ``<p>`` and any other tag.
        * ``None`` when a media tag has no usable source URL.
    """
    # find() searches descendants only, so an element that is itself an <img>
    # must be checked explicitly.
    img_tag = el if el.name == "img" else el.find("img")
    if img_tag is not None:
        src = img_tag.get("src")
        if src:
            alt = img_tag.get("alt", "")
            link = ""  # TO DO handle occasions when image is intended to launch a link
            # Look for a caption in the enclosing <figure>, else in the enclosing <div>
            figcaption = ""
            caption_parent = img_tag.find_parent("figure") or img_tag.find_parent("div")
            if caption_parent is not None:
                caption_tag = caption_parent.find("figcaption")
                if caption_tag is not None:
                    figcaption = caption_tag.get_text(strip=True)
            # TO DO fix this for images hosted on AGO
            img = Image(path=src)
            return img, figcaption, alt, link
        if img_tag is el:
            # A bare <img> without src carries nothing to convert
            return None
        # An inner <img> without src: fall through and treat the element by its own tag

    tag_name = el.name

    if tag_name == "video":
        src = _media_src(el)
        if not src:
            return None
        vid = Video(path=src)
        vid.alt_text = el.get("alt", "")
        vid.caption = ""  # TO DO try to find Classic stories that have Videos with captions
        vid.video = src  # Assign video property. TO DO fix this for hosted videos
        return vid

    if tag_name == "audio":
        src = _media_src(el)
        if not src:
            return None
        aud = Audio(path=src)
        aud.alt_text = el.get("alt", "")
        aud.caption = ""  # TO DO try to find Classic stories that have Audio with captions
        aud.audio = src  # Assign Audio property. TO DO fix this for hosted audio
        return aud

    if tag_name in ("iframe", "embed"):
        src = el.get("src") or el.get("data-src")
        if not src:
            # Without a URL there is nothing to embed
            return None
        emb = Embed(path=src)
        emb.alt_text = el.get("alt", "")
        emb.caption = ""  # TO DO try to find Classic stories that have Embeds with captions
        emb.link = src
        return emb

    if tag_name == "map":
        # NOTE(review): an HTML <map> tag is an image-map element, not a web map;
        # this branch looks like a placeholder. Confirm that Map(item="") is valid
        # and that map / map_layers accept these values.
        src = el.get("src")
        layers = ""  # TO DO get map layers
        mp = Map(item="")
        mp.alt_text = el.get("alt", "")
        mp.caption = ""  # TO DO try to find Classic stories that have Maps in Sidecar panel with captions
        mp.map = src
        mp.map_layers = layers
        # TO DO get extent and apply it by calling mp.set_viewpoint(extent=...);
        # assigning to mp.set_viewpoint would overwrite the method itself.
        return mp

    # <p> and any unsupported or unknown tag: keep the inner HTML (including
    # inline markup) as a paragraph of text
    inner_html = ''.join(str(c) for c in el.contents)
    processed_html = process_html_colors_preserve_html(inner_html)
    return Text(text=processed_html, style=TextStyles.PARAGRAPH)


# def deduplicate_by_containment(elements):
#     # Create list of (element, outer_html_str) tuples
#     elems_and_html = [(el, ' '.join(str(el).split())) for el in elements]

#     keep = []
#     for i, (el_i, html_i) in enumerate(elems_and_html):
#         # Check if this element is contained within another (excluding itself)
#         contained = False
#         for j, (el_j, html_j) in enumerate(elems_and_html):
#             if i != j and html_i in html_j:
#                 contained = True
#                 break
#         if not contained:
#             keep.append(el_i)
#     return keep

# def parse_narrative_html(html_text):
#     soup = BeautifulSoup(html_text, "html.parser")
#     content_nodes = []
#     for child in soup.children:
#         if isinstance(child, str):
#             # Text node (likely whitespace) - skip or wrap in Text()
#             node = Text(child)
#             continue
#         node = parse_content_element(child)
#         print(type(node))
#         if node:
#             content_nodes.append(node)
#     #deduped_nodes = deduplicate_by_containment(content_nodes)
#     return content_nodes

def parse_html_elements(html_snippet):
    """Return the tags that sit one level below the top-level tags of an HTML snippet.

    The snippet is parsed with ``html.parser``. Top-level text nodes are
    ignored; for each top-level tag, its direct children that are tags are
    collected in document order.

    NOTE(review): a top-level tag that contains only text (no child tags)
    contributes nothing to the result - confirm that the input is always
    wrapped in a container element.
    """
    parsed = BeautifulSoup(html_snippet, "html.parser")
    top_level_tags = [node for node in parsed.contents if getattr(node, 'name', None)]
    return [
        child
        for parent in top_level_tags
        for child in parent
        if getattr(child, 'name', None)
    ]

def convert_html_elements_to_storymap_content(html_elements):
    """Turn parsed HTML elements into StoryMap content nodes.

    Each element is passed to ``parse_content_element``. A tuple result is
    unpacked as ``(img, caption, alt, link)``: the image goes into the node
    list and the full tuple into the image-metadata list. Any other truthy
    result is appended to the node list as is; falsy results are skipped.

    Returns:
        tuple: ``(content_nodes, image_metadata)``.
    """
    storymap_nodes = []
    image_records = []  # (img, caption, alt, link) tuples
    for element in html_elements:
        parsed = parse_content_element(element)
        if not isinstance(parsed, tuple):
            if parsed:
                storymap_nodes.append(parsed)
            continue
        image, caption, alt, link = parsed
        storymap_nodes.append(image)
        image_records.append((image, caption, alt, link))
    return storymap_nodes, image_records

# def extract_content_blocks(html):
#     soup = BeautifulSoup(html, "html.parser")
#     # Find the main container (if present)
#     main = soup.find(class_="description") or soup
#     # Get all direct children that are tags (not NavigableString)
#     blocks = [child for child in main.children if getattr(child, 'name', None)]
#     # For each block, get its full HTML string
#     elements_list = []
#     for block in blocks:
#         html_str = str(block).strip()
#         # Skip empty blocks
#         if html_str and html_str != "&nbsp;":
#             elements_list.append(html_str)
#     return elements_list

# def split_nested_blocks(html, target_tags=None):
#     if target_tags is None:
#         target_tags = [
#             {"name": "div", "class_": "image-container"},
#             {"name": "figure"}
#         ]
#     soup = BeautifulSoup(html, "html.parser")
#     results = []
#     # For each target tag, find all matching elements
#     for target in target_tags:
#         found = soup.find_all(target["name"], class_=target.get("class_"))
#         for el in found:
#             results.append(str(el).strip())
#     # If no target tags found, keep the original block
#     if not results:
#         results.append(html.strip())
#     return results

# def filter_parent_blocks(blocks, target_tags=None):
#     filtered = []
#     for block in blocks:
#         split_items = split_nested_blocks(block, target_tags)
#         # If split_items contains only the original block, keep it
#         if len(split_items) == 1 and split_items[0] == block.strip():
#             filtered.append(block.strip())
#         # If split_items contains only target elements, keep only those
#         elif len(split_items) > 1:
#             filtered.extend(split_items)
#     return filtered

# def filter_parent_blocks_strict(blocks, target_tags=None):
#     if target_tags is None:
#         target_tags = [
#             {"name": "div", "class_": "image-container"},
#             {"name": "figure"}
#         ]
#     filtered = []
#     for block in blocks:
#         split_items = split_nested_blocks(block, target_tags)
#         soup = BeautifulSoup(block, "html.parser")
#         # Gather all non-whitespace descendants
#         descendants = [el for el in soup.descendants if getattr(el, 'name', None)]
#         # Gather all target elements
#         target_elements = []
#         for target in target_tags:
#             target_elements.extend(soup.find_all(target["name"], class_=target.get("class_")))
#         # If all non-whitespace descendants are target elements, keep only split_items
#         if descendants and set(descendants) == set(target_elements):
#             filtered.extend(split_items)
#         else:
#             filtered.append(block.strip())
#     return filtered

In [None]:
# Extract story data
classic_story_settings = classic_item_data["values"]["settings"]
classic_story_theme = classic_story_settings["theme"]
classic_story_title = classic_item_data["values"]["title"]
classic_story_data = classic_item_data["values"]["story"]

# Extract tabs (entries list)
entries = classic_story_data["entries"]

# Map the classic theme group to a StoryMap theme
classic_theme_group = classic_story_theme["colors"]["group"]
if classic_theme_group == "dark":
    new_theme = Themes.OBSIDIAN
else:
    # "light" and any unrecognized group use SUMMIT, so new_theme is always defined
    new_theme = Themes.SUMMIT

created_storymaps = []
published_storymap_items = []  # Items of the saved stories; consumed by the Collection cell

#target_index = 0  # Change to the index of the entry you want to process (0-based)
for i, entry in enumerate(entries):
    # if i != target_index:
    #    continue # Skip all except the target index
    # Create a new StoryMap
    story = StoryMap()
    story.theme(new_theme)

    # Create Sidecar immersive section
    sidecar = Sidecar(style="docked-panel")

    # Add Sidecar to story
    story.add(sidecar)

    # Determine media content for main stage
    media_info = entry.get("media", {})
    media_type = media_info.get("type")

    media_content = None
    if media_type == "webmap":
        webmap_id = media_info.get('webmap', {}).get('id')
        if webmap_id:
            media_content = Map(webmap_id)
    elif media_type == "webpage":
        webpage_url = media_info.get("webpage", {}).get("url")
        if webpage_url:
            media_content = Embed(webpage_url)

    # Fetch content from description (HTML)
    description_html = entry.get("description", "")

    # Convert description HTML to StoryMap content nodes
    content_nodes, image_metadata = convert_html_elements_to_storymap_content(parse_html_elements(description_html))

    # Add a slide to the sidecar with the narrative content and the main media
    sidecar.add_slide(contents=content_nodes, media=media_content)

    # Assign metadata to each image after it has been added to the slide
    for img, caption, alt, link in image_metadata:
        if caption:
            img.caption = caption
        if alt:
            img.alt_text = alt
        if link:
            img.link = link

    # Set webmap properties. Map must be added to the story before setting viewpoint.
    # media_content stays None when the entry has no webmap id, so guard against that.
    if media_type == "webmap" and media_content is not None:
        webmap_info = media_info.get('webmap', {})
        # Set the extent for the map stage
        extent_json = webmap_info.get('extent')
        if extent_json:
            media_content.set_viewpoint(extent=extent_json)  # Extent dict per docs
        # Set layer visibility (if StoryMap Map object supports)
        old_layers = webmap_info.get('layers') or []
        if hasattr(media_content, "map_layers"):
            for new_lyr in media_content.map_layers:
                for old_lyr in old_layers:
                    if new_lyr['id'] == old_lyr.get('id') and 'visibility' in old_lyr:
                        new_lyr['visible'] = old_lyr['visibility']

    # Set Cover properties
    cover_properties = story.content_list[0]
    cover_properties.byline = ""
    cover_properties.date = "none"
    #cover_properties.media = createThumbnail() # figure out a way to create a thumbnail from the first Sidecar media item

    # As the Cover class does not include a setting to hide the cover, we hide it by adding the 'config' key
    # to the Cover json
    # NOTE(review): 'true' is written as a string here - confirm whether a boolean is expected.
    for node_properties in story.properties['nodes'].values():
        if node_properties['type'] == 'storycover':
            node_properties['config'] = {'isHidden': 'true'}

    # Save and publish storymap
    story_title = entry.get("title", "Untitled Story")
    story.save(title=story_title, tags=["auto-created"], publish=True)

    # TO DO add an AGO relationship so if an attempt is made to delete story from My Content a warning is issued that the story
    # is included in a Collection (and give the name/id of the Collection(s) where it is referenced)

    created_storymaps.append(story)
    # Get the item object
    published_story_item = getattr(story, '_item', None)
    if published_story_item is None:
        print("Could not find item for story:", story_title)
        continue
    # Record the item so the Collection cell has something to add
    published_storymap_items.append(published_story_item)

    print(f"Created replica of {story_title}")

    # Open a browser to launch the Story Checker and fully publish the story
    story_url = "https://storymaps.arcgis.com/stories/" + published_story_item.id
    print(f"Opening: {published_story_item.title} ({story_url})")
    webbrowser.open(story_url)

print(f"Created {len(created_storymaps)} StoryMaps")

# Pause for 30 seconds to give the stories time to publish
print("Waiting for 30 seconds for stories to publish...")
time.sleep(30)
print("Resuming code execution.")

In [None]:

# Create a Collection to hold the created StoryMaps
collection = Collection()
collection_title = classic_item.title

# Items to add: the items recorded by the conversion cell, falling back to the
# items attached to the created StoryMap objects when that list is empty.
collection_items = list(published_storymap_items)
if not collection_items:
    collection_items = [
        created_story._item
        for created_story in created_storymaps
        if getattr(created_story, '_item', None) is not None
    ]

for story_item in collection_items:
    collection.add(item=story_item, title=story_item.title)

# The collection title is supplied through save(title=...) below. The former
# statement `collection.content[0] = title.collection_title` referenced an
# undefined name (`title`) and raised NameError, so it has been removed.
collection.theme(new_theme)

collection.save(title=collection_title, tags=["auto-created"], publish=True)
published_collection_item = collection._item
collection_url = "https://storymaps.arcgis.com/collections/" + published_collection_item.id
print(f"Opening Collection: {collection_title} ({collection_url})")
webbrowser.open(collection_url)
print(f"Created Collection: {collection_title}")

   

In [14]:
# Check which theme the Collection reports; the value is shown as the cell output
collection.get_theme()

'obsidian'

In [None]:
# Pseudocode for traversing html tree
# get all leaf nodes
# if node is a leaf node (no children)
#     recursively check its parents until the parent is equal to the root node.
#     step down one level
#     check for img, video, audio, iframe, embed, map tags
#     if no other tags found, mark as text node
#     if img, video, audio, iframe, embed, map tag found mark as that type
#     traverse up tree until a <p>, <div> or <span> is found
#     once a <p> is found, check if its parent is a <div class=image-container>, if so mark as Image()
#     once a <div> is found, check if its parent is a <Figure class=caption>, if so process as Image() with caption
#     once a <span> is found, check if its parent is a <span>
#     otherwise, capture all descendant tags and process the node as its marked type (Text | Image | Video | Audio | Embed | Map | Button | Code | Table)

In [None]:
# Scratch cell: parse a description and list its top-level nodes.
# description_html is the value left over from the last iteration of the
# conversion loop, so that cell must have been run first.
soup = BeautifulSoup(description_html, "html.parser")
soup_content = list(soup.children)
# soup_content_collapsed = [child for child in soup_content if getattr(child, 'name', None)]
# for el in soup_content_collapsed[0]:
#     el_nodes = []
#     for tag in el.find_all(True):  # All tags, nested included
#         # A leaf node has no child tags
#         if not tag.find_all(True):
#             el_nodes.append(tag)
In [None]:
soup = BeautifulSoup(description_html, "html.parser")

# Collect every tag (at any depth) that has no descendant tags, i.e. the leaf elements
leaf_nodes = [
    candidate
    for candidate in soup.find_all(True)
    if not candidate.find_all(True)
]

# Report how many leaf elements were found
print(len(leaf_nodes))

In [None]:
# Debug aid: print the element three levels above one leaf node.
# The index 15 is hard-coded and raises IndexError when fewer than 16 leaf nodes exist.
print(leaf_nodes[15].parent.parent.parent)