In [1]:
from bs4 import BeautifulSoup, NavigableString, Tag
import pandas as pd
import os
import markdown

In [2]:
def parse_markdown_file(file_path):
    """Read a Markdown file and return its contents rendered as HTML.

    Args:
        file_path: Path of the Markdown file; it is opened as UTF-8 text.

    Returns:
        The HTML string produced by ``markdown.markdown`` with the
        ``fenced_code`` and ``tables`` extensions enabled.
    """
    with open(file_path, mode='r', encoding='utf-8') as source:
        raw_text = source.read()

    # Rendering happens after the file handle has been closed
    return markdown.markdown(raw_text, extensions=['fenced_code', 'tables'])

# Example usage: render one Markdown document to HTML.
# The path is relative, so it is resolved against the current working directory.
file_path = './markdown_file/firecralwer_get-started-japan.md'
# html_content is the input that the later parsing cells hand to BeautifulSoup
html_content = parse_markdown_file(file_path)

In [3]:
# Build a parse tree from the rendered HTML using Python's built-in 'html.parser' backend
soup = BeautifulSoup(html_content, 'html.parser')
    
# Find all h2 headings
# NOTE(review): a later cell repeats both of these assignments before using them
h2_headings = soup.find_all('h2')

In [4]:
# Show the parsed document to inspect its structure; this prints the whole tree
soup

<p><a href="#content">Skip to main content</a> <a href="#search">Skip to search</a></p>
<p>Powered by Zoomin Software. For more details please contact <a href="https://www.zoominsoftware.com">Zoomin</a></p>
<p><a href="https://www.here.com/docs/search?personalize=true&amp;q=how%20do%20I%20get%20an%20api%20key%3F">Try using our new AI features to get answers to your questions. To ask AI a question, add a question mark (?) to the end of a search query. Click here to try it out.</a></p>
<h2>HERE Maps API for JavaScript - Developer Guide</h2>
<p>Filters</p>
<h2>Filters</h2>
<p>Clear All Filters</p>
<h5>Product category</h5>
<ul>
<li>
<p>Dynamic map content</p>
</li>
<li>
<p>HERE Destination Weather API v3</p>
</li>
<li>
<p>HERE Destination Weather API v1</p>
</li>
<li>
<p>HERE Map Attributes API</p>
</li>
<li>
<p>HERE EV Charge Points API</p>
</li>
<li>
<p>HERE Fuel Prices API</p>
</li>
<li>
<p>HERE On-Street Parking API</p>
</li>
<li>
<p>HERE Off-Street Parking API</p>
</li>
<li>
<p>Map d

In [5]:
def extract_list_content(list_element):
    """Turn the ``<li>`` items of a list element into typed content entries.

    Args:
        list_element: Object exposing ``find_all('li')`` whose results offer
            ``find('pre')`` and ``get_text()`` (presumably a BeautifulSoup
            ``<ul>``/``<ol>`` tag -- confirm against the caller).

    Returns:
        A list of ``{'type': ..., 'content': ...}`` dicts in item order.
        An item that contains a ``<pre>`` is emitted as ``'code'`` with the
        item's full text; any other item is emitted as ``'text'`` unless its
        text is exactly ``'Explain this code'`` or ``'Copy'``.
    """
    # Exact-match strings to drop (presumably button labels from the scraped page)
    skipped_labels = ('Explain this code', 'Copy')

    entries = []
    for li in list_element.find_all('li'):
        body = li.get_text()

        # A <pre> anywhere inside the item marks the whole item as code
        if li.find('pre'):
            entries.append({'type': 'code', 'content': body})
            continue

        if body in skipped_labels:
            continue

        entries.append({'type': 'text', 'content': body})

    return entries



In [6]:
# Parse the rendered HTML into a fresh tree for this cell
soup = BeautifulSoup(html_content, 'html.parser')

# Each <h2> heading marks the start of a section
h2_headings = soup.find_all('h2')

# One {'section_title', 'content'} record per processed heading
extracted_data = []

# The slice leaves out the final <h2>, so that last section is never extracted
for section in h2_headings[:-1]:
    section_title = section.get_text().strip()
    section_content = []  # ordered {'type', 'content'} entries for this section

    # Walk the elements following the heading until the next <h2> or the end
    sibling = section.find_next_sibling()
    while sibling and sibling.name != 'h2':
        if sibling.name == 'pre':
            # Preformatted block: recorded as code
            section_content.append({'type': 'code', 'content': sibling.get_text()})
        elif sibling.name in ('ul', 'ol'):
            # Lists may hold both prose and code items; the helper classifies each
            list_items = extract_list_content(sibling)
            section_content.extend(list_items)
        elif sibling.name in ('p', 'code'):
            # Prose, except for the two exact strings that are filtered out
            if sibling.get_text() not in ('Explain this code', 'Copy'):
                section_content.append({'type': 'text', 'content': sibling.get_text()})
        # Any other tag is skipped without being recorded

        sibling = sibling.find_next_sibling()

    extracted_data.append(dict(section_title=section_title, content=section_content))


In [7]:
# Spot-check a single extracted section (index 5 is 'Initialize the map' in the output below)
extracted_data[5]

{'section_title': 'Initialize the map',
 'content': [{'type': 'text',
   'content': 'Create an instance of the Vector Tile Service object that is pre-configured to use the core endpoint that provides the Japan data.'},
  {'type': 'text',
   'content': 'Create a layer that uses the map style optimised for the display of the Japan data. The style can be adjusted with the help of Map Customization Tool.'},
  {'type': 'text', 'content': '\nInstantiate an H.Map object, specifying:\n'},
  {'type': 'text', 'content': '\nthe map container element\n'},
  {'type': 'text', 'content': 'the layer created in the previous step'},
  {'type': 'text', 'content': 'the zoom level at which to display the map'},
  {'type': 'text',
   'content': 'the geographic coordinates of the point on which to center the map'},
  {'type': 'code',
   'content': "\n// configure an OMV service to use the `core` enpoint\nvar omvService = platform.getOMVService({path:  'v2/vectortiles/core/mc'});\nvar baseUrl = 'https://js.ap

In [8]:
def create_text_to_code_mapping(extracted_data):
    """Pair each section's combined prose with its combined code.

    Args:
        extracted_data: Iterable of dicts with a ``'section_title'`` key and a
            ``'content'`` list of ``{'type': ..., 'content': ...}`` entries.

    Returns:
        A list of ``{'section_title', 'text', 'code'}`` dicts, one for every
        section that has at least one ``'text'`` entry and at least one
        ``'code'`` entry. Entries of each kind are joined with newlines in
        their original order and the result is stripped of surrounding
        whitespace. Entries of any other type are ignored.
    """
    mappings = []

    for record in extracted_data:
        title = record['section_title']
        entries = record['content']

        # Bucket the entry bodies by kind, keeping document order
        prose_parts = [entry['content'] for entry in entries if entry['type'] == 'text']
        code_parts = [entry['content'] for entry in entries if entry['type'] == 'code']

        # A section is only useful when it offers both an explanation and code
        if not (prose_parts and code_parts):
            continue

        mappings.append({
            'section_title': title,
            'text': "\n".join(prose_parts).strip(),
            'code': "\n".join(code_parts).strip()
        })

    return mappings


In [9]:
# Build the text-to-code pairs; sections lacking either text or code are left out
mappings = create_text_to_code_mapping(extracted_data)

In [10]:
# Display every generated mapping for inspection
mappings

[{'section_title': 'Load the API Code Libraries',
  'text': 'There are no changes in how the JavaScript API is included on the page, here is the complete <head> element that loads the core and service modules and ensures optimum performance on mobile devices.',
  'code': '<!DOCTYPE html>\n  <html>\n    <head>\n      ...\n      <meta name="viewport" content="initial-scale=1.0,\n        width=device-width" />\n      <script src="https://js.api.here.com/v3/3.1/mapsjs-core.js"\n        type="text/javascript" charset="utf-8"></script>\n      <script src="https://js.api.here.com/v3/3.1/mapsjs-service.js"\n        type="text/javascript" charset="utf-8"></script>\n      ...\n    </head>\n    <body>\n      <div style="width: 640px; height: 480px" id="mapContainer"></div>'},
 {'section_title': 'Initialize Communication with Back-End Services',
  'text': 'Initialize the Platform object normally by passing it the API Key:',
  'code': "var platform = new H.service.Platform({\n  'apikey': '{YOUR_API