In [None]:
import markdown
from weasyprint import HTML
from pathlib import Path
import sys

In [None]:
def md_to_pdf(md_file: str, pdf_file: str = None):
    """Render a Markdown file to a styled A4 PDF.

    The Markdown source is converted to HTML with Python-Markdown, wrapped in
    a small HTML document carrying the print stylesheet, and handed to
    WeasyPrint for PDF generation.

    Args:
        md_file: Path of the Markdown file to convert.
        pdf_file: Destination PDF path. When ``None`` the PDF is written next
            to ``md_file`` with the suffix replaced by ``.pdf``.

    Raises:
        SystemExit: With exit code 1 when ``md_file`` does not exist or when
            PDF generation fails (an error message is printed first).
    """

    # Setup paths
    md_path = Path(md_file)
    if not md_path.exists():
        print(f"❌ Error: File not found: {md_file}")
        sys.exit(1)

    if pdf_file is None:
        pdf_file = str(md_path.with_suffix('.pdf'))

    # Read markdown content
    print(f"📖 Reading {md_file}...")
    md_content = md_path.read_text(encoding='utf-8')

    # Convert markdown to HTML
    print("🔄 Converting to HTML...")
    html_content = markdown.markdown(
        md_content,
        extensions=['extra', 'codehilite', 'tables', 'toc', 'fenced_code']
    )

    # Create styled HTML document.
    # Literal CSS braces are doubled because this is an f-string.
    styled_html = f"""
    <!DOCTYPE html>
    <html>
    <head>
        <meta charset="utf-8">
        <style>
            @page {{
                size: A4;
                margin: 2.5cm;
            }}
            body {{
                font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Arial, sans-serif;
                line-height: 1.6;
                color: #333;
            }}
            h1 {{
                color: #2c3e50;
                border-bottom: 3px solid #3498db;
                padding-bottom: 10px;
                margin-top: 20px;
            }}
            h2 {{
                color: #34495e;
                border-bottom: 1px solid #ddd;
                padding-bottom: 5px;
                margin-top: 15px;
            }}
            h3 {{
                color: #555;
                margin-top: 12px;
            }}
            code {{
                background-color: #f4f4f4;
                padding: 2px 6px;
                border-radius: 3px;
                font-family: 'Courier New', monospace;
                font-size: 0.9em;
            }}
            pre {{
                background-color: #f4f4f4;
                padding: 15px;
                border-radius: 5px;
                overflow-x: auto;
                border-left: 4px solid #3498db;
            }}
            pre code {{
                background: none;
                padding: 0;
            }}
            blockquote {{
                border-left: 4px solid #ddd;
                padding-left: 15px;
                color: #666;
                margin: 15px 0;
                font-style: italic;
            }}
            table {{
                border-collapse: collapse;
                width: 100%;
                margin: 15px 0;
            }}
            th, td {{
                border: 1px solid #ddd;
                padding: 10px;
                text-align: left;
            }}
            th {{
                background-color: #f4f4f4;
                font-weight: bold;
            }}
            tr:nth-child(even) {{
                background-color: #f9f9f9;
            }}
            a {{
                color: #3498db;
                text-decoration: none;
            }}
            a:hover {{
                text-decoration: underline;
            }}
            ul, ol {{
                margin: 10px 0;
                padding-left: 30px;
            }}
            li {{
                margin: 5px 0;
            }}
        </style>
    </head>
    <body>
        {html_content}
    </body>
    </html>
    """

    # Convert HTML to PDF
    print("📄 Generating PDF...")
    try:
        # base_url lets relative image/link paths in the markdown resolve
        # against the markdown file's own location instead of failing.
        HTML(string=styled_html, base_url=md_path.resolve().as_uri()).write_pdf(pdf_file)
        print(f"✅ Success! PDF created: {pdf_file}")
    except Exception as e:
        print(f"❌ Error: {e}")
        sys.exit(1)

In [None]:
# Convert the report markdown to a PDF with an explicit output name; both
# relative paths are resolved against the notebook's current working directory.
md_to_pdf('researcher/output/report.md', 'researcher/output/report_vn.pdf')

In [None]:
from crewai import Agent, Crew, Process, Task
from crewai.project import CrewBase, agent, crew, task
from crewai.agents.agent_builder.base_agent import BaseAgent
from typing import List

@CrewBase
class EngineeringTeam():
    """Crew definition with three agents and three tasks run sequentially.

    Agents: product manager, engineering lead, backend engineer.
    Tasks: user-story creation, design, code.

    Each agent/task is built from an entry of ``self.agents_config`` /
    ``self.tasks_config``; those mappings are not defined in this class —
    presumably they are supplied by the ``@CrewBase`` decorator (TODO confirm
    where the configuration is loaded from).
    """

    # Populated by the @agent-decorated methods (see the note in `crew`).
    agents: List[BaseAgent]
    # Populated by the @task-decorated methods (see the note in `crew`).
    tasks: List[Task]

    @agent
    def product_manager(self) -> Agent:
        """Build the agent configured under the 'product_manager' key."""
        return Agent(
            config=self.agents_config['product_manager'], # type: ignore[index]
            max_execution_time=300, 
            max_retry_limit=3,
            verbose=True
        )

    @agent
    def engineering_lead(self) -> Agent:
        """Build the agent configured under the 'engineering_lead' key."""
        return Agent(
            config=self.agents_config['engineering_lead'], # type: ignore[index]
            max_execution_time=300, 
            max_retry_limit=3,
            verbose=True
        )

    @agent
    def backend_engineer(self) -> Agent:
        """Build the agent configured under the 'backend_engineer' key.

        Unlike the other two agents, this one is allowed to execute code
        (in "safe" mode) and is given a larger execution-time budget.
        """
        return Agent(
            config=self.agents_config['backend_engineer'], # type: ignore[index]
            allow_code_execution=True,
            code_execution_mode="safe", 
            max_execution_time=500, 
            max_retry_limit=3,
            verbose=True
        )    

    @task
    def create_userstories_task(self) -> Task:
        """Build the task configured under the 'create_userstories_task' key."""
        return Task(
            config=self.tasks_config['create_userstories_task'] # type: ignore[index]
        )

    @task
    def design_task(self) -> Task:
        """Build the task configured under the 'design_task' key."""
        return Task(
            config=self.tasks_config['design_task'] # type: ignore[index]
        )

    @task
    def code_task(self) -> Task:
        """Build the task configured under the 'code_task' key."""
        return Task(
            config=self.tasks_config['code_task'] # type: ignore[index]
        )

    @crew
    def crew(self) -> Crew:
        """Assemble the crew from the collected agents and tasks.

        Returns:
            Crew: A verbose crew using ``Process.sequential``.
        """
       
        return Crew(
            agents=self.agents, # Automatically created by the @agent decorator
            tasks=self.tasks, # Automatically created by the @task decorator
            process=Process.sequential,
            verbose=True,
            # process=Process.hierarchical, # In case you wanna use that instead https://docs.crewai.com/how-to/Hierarchical/
        )


In [None]:
import json
from typing import Dict, List, Optional

import requests
from bs4 import BeautifulSoup, Comment

In [None]:
def extract_dom_structure(url: str, max_depth: int = 10) -> Dict:
    """
    Download a page and describe its DOM as a nested dictionary.

    Args:
        url (str): Address of the page to fetch.
        max_depth (int): Deepest DOM level that is still expanded (default: 10).

    Returns:
        Dict: On success, a mapping with the keys 'url', 'title', 'meta' and
        'structure'. On failure, a mapping with a single 'error' key whose
        message distinguishes fetch failures from any other error.
    """
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }

    try:
        page = requests.get(url, headers=request_headers, timeout=10)
        page.raise_for_status()

        document = BeautifulSoup(page.content, 'html.parser')

        # Prefer <body> as the traversal root; fall back to the whole document.
        root = document.body if document.body else document
        page_title = document.title.string if document.title else None

        return {
            'url': url,
            'title': page_title,
            'meta': _extract_meta_tags(document),
            'structure': _parse_element(root, max_depth=max_depth),
        }
    except requests.RequestException as fetch_error:
        return {'error': f'Failed to fetch URL: {str(fetch_error)}'}
    except Exception as other_error:
        return {'error': f'Error parsing DOM: {str(other_error)}'}


def _extract_meta_tags(soup: BeautifulSoup) -> Dict:
    """Collect <meta> tags into a mapping of name/property -> content.

    A tag is keyed by its 'name' attribute when that is non-empty, otherwise
    by a non-empty 'property' attribute; tags with neither are ignored.
    A missing 'content' attribute is stored as an empty string, and a later
    tag with the same key overwrites an earlier one.
    """
    collected = {}

    for tag in soup.find_all('meta'):
        key = tag.get('name') or tag.get('property')
        if key:
            collected[key] = tag.get('content', '')

    return collected


def _parse_element(element, depth: int = 0, max_depth: int = 10) -> Dict:
    """
    Recursively parse a DOM element into a structured dictionary.
    
    Args:
        element: BeautifulSoup element to parse
        depth: Current depth in the DOM tree
        max_depth: Maximum depth to traverse
    
    Returns:
        Dict: Structured representation of the element
    """
    if depth > max_depth:
        return {'truncated': True}
    
    # Skip script, style, and comment nodes
    if element.name in ['script', 'style', 'noscript']:
        return None
    
    # Handle text nodes
    if isinstance(element, str):
        text = element.strip()
        return {'text': text} if text else None
    
    # Build element structure
    element_data = {
        'tag': element.name,
        'attributes': dict(element.attrs) if element.attrs else {},
        'children': []
    }
    
    # Clean up attributes (remove long data attributes and inline styles)
    if 'style' in element_data['attributes']:
        element_data['attributes']['style'] = _clean_style(element_data['attributes']['style'])
    
    # Remove base64 data from src/href
    for attr in ['src', 'href']:
        if attr in element_data['attributes']:
            val = element_data['attributes'][attr]
            if isinstance(val, str) and val.startswith('data:'):
                element_data['attributes'][attr] = '[data-url]'
    
    # Parse children
    for child in element.children:
        parsed_child = _parse_element(child, depth + 1, max_depth)
        if parsed_child:
            element_data['children'].append(parsed_child)
    
    # If element has no children, capture direct text
    if not element_data['children'] and element.string:
        text = element.string.strip()
        if text:
            element_data['text'] = text
    
    return element_data


def _clean_style(style_str: str) -> str:
    """Clean up inline styles to show only important parts."""
    if len(style_str) > 100:
        return style_str[:100] + '...'
    return style_str


def get_dom_summary(url: str) -> Dict:
    """
    Summarise a page's DOM as per-tag element counts.

    Args:
        url (str): The website URL

    Returns:
        Dict: The error mapping from ``extract_dom_structure`` when extraction
        failed; otherwise a mapping with 'url', 'title', 'total_elements',
        'element_counts' (ordered by descending count) and 'meta_tags_count'.
    """
    dom = extract_dom_structure(url)
    if 'error' in dom:
        return dom

    # Walk the tree with an explicit stack. Children are pushed in reverse so
    # nodes are visited in document (pre-order) sequence, which fixes the
    # first-seen order of tags and therefore the order of equal counts below.
    tag_totals = {}
    pending = [dom['structure']]
    while pending:
        node = pending.pop()
        if not isinstance(node, dict):
            continue
        if 'tag' in node:
            tag_name = node['tag']
            tag_totals[tag_name] = tag_totals.get(tag_name, 0) + 1
        if 'children' in node:
            pending.extend(reversed(node['children']))

    ranked = sorted(tag_totals.items(), key=lambda pair: pair[1], reverse=True)

    return {
        'url': url,
        'title': dom['title'],
        'total_elements': sum(tag_totals.values()),
        'element_counts': dict(ranked),
        'meta_tags_count': len(dom['meta'])
    }


def get_interactive_elements(url: str) -> List[Dict]:
    """
    List the interactive elements of a page: links, buttons, forms, inputs.

    Args:
        url (str): The website URL

    Returns:
        List[Dict]: One record per element, grouped in the order links,
        buttons, forms, inputs; each record carries a 'type' key. If anything
        fails, a single-item list ``[{'error': <message>}]`` is returned.
    """
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }

    try:
        page = requests.get(url, headers=request_headers, timeout=10)
        page.raise_for_status()
        document = BeautifulSoup(page.content, 'html.parser')

        found = []

        # Anchors that actually carry an href
        found.extend(
            {
                'type': 'link',
                'href': anchor.get('href'),
                'text': anchor.get_text(strip=True),
                'attributes': {
                    key: value for key, value in anchor.attrs.items() if key != 'href'
                },
            }
            for anchor in document.find_all('a', href=True)
        )

        # <button> elements
        found.extend(
            {
                'type': 'button',
                'text': btn.get_text(strip=True),
                'attributes': dict(btn.attrs),
            }
            for btn in document.find_all('button')
        )

        # Forms, with a count of the fields they contain
        found.extend(
            {
                'type': 'form',
                'action': frm.get('action'),
                'method': frm.get('method', 'GET'),
                'inputs': len(frm.find_all(['input', 'textarea', 'select'])),
            }
            for frm in document.find_all('form')
        )

        # Every <input>, whether or not it sits inside a form
        found.extend(
            {
                'type': 'input',
                'input_type': field.get('type', 'text'),
                'name': field.get('name'),
                'placeholder': field.get('placeholder'),
                'attributes': dict(field.attrs),
            }
            for field in document.find_all('input')
        )

        return found
    except Exception as failure:
        return [{'error': str(failure)}]

In [None]:
# Demo: extract the DOM of a public page, expanding at most 10 levels.
url = 'https://www.saucedemo.com/'
dom_structure = extract_dom_structure(url, max_depth=10)
print("Full DOM Structure:")
# Bare last expression so the notebook displays the returned dict as the cell result.
dom_structure

In [None]:
import requests
import json
from bs4 import BeautifulSoup, NavigableString, Tag, Comment

In [None]:
def _parse_element_to_dict(element):
    """
    Recursive helper function to convert a BeautifulSoup element into a 
    JSON-serializable dictionary.
    """
    
    # --- Handle Non-Tag Nodes ---
    
    # 1. Handle Text Nodes (NavigableString)
    if isinstance(element, NavigableString):
        # Strip whitespace, if text remains, return it
        text = element.string.strip()
        if text:
            return {"type": "text", "content": text}
        return None  # Ignore whitespace-only text nodes

    # 2. Ignore Comment Nodes
    if isinstance(element, Comment):
        return None

    # --- Handle Tag Nodes ---
    if isinstance(element, Tag):
        # 1. Build the basic node dictionary
        node = {
            "type": "element",
            "tag": element.name
        }
        
        # 2. Add attributes if they exist
        if element.attrs:
            # Convert attribute values (often lists) to strings 
            # for simpler JSON
            clean_attrs = {k: " ".join(v) if isinstance(v, list) else v 
                           for k, v in element.attrs.items()}
            node["attributes"] = clean_attrs
        
        # 3. Recursively handle children
        children = []
        for child in element.children:
            parsed_child = _parse_element_to_dict(child)
            if parsed_child:  # Only add valid (non-None) child nodes
                children.append(parsed_child)
        
        if children:
            node["children"] = children
            
        return node
    
    # Ignore other unknown element types
    return None

def get_dom_as_json(url: str) -> str:
    """
    Fetch a URL, parse its HTML and return the DOM as a JSON string.

    Args:
        url (str): Address of the page to fetch.

    Returns:
        str: Pretty-printed JSON (indent=2). On success this is the DOM tree
        rooted at <html> (or <body> / the whole document when <html> is
        absent). On failure it is an object with a single "error" key; this
        function does not raise for fetch or parse errors.
    """
    try:
        # 1. Fetch the HTML content
        # Set a User-Agent to mimic a browser. The timeout matches the other
        # fetch helpers in this notebook; without one an unresponsive server
        # would block the cell indefinitely.
        response = requests.get(
            url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10
        )
        response.raise_for_status()  # Raise an error on bad responses (e.g., 404)

        # 2. Parse the HTML with BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')

        # 3. Find the root element (<html>, or fallback to <body>)
        root_element = soup.find('html')
        if not root_element:
            root_element = soup.find('body') or soup  # Fallback

        # 4. Start the recursive conversion
        dom_structure = _parse_element_to_dict(root_element)

        # 5. Return the pretty-printed JSON string
        if dom_structure:
            # ensure_ascii=False allows for non-English characters
            return json.dumps(dom_structure, indent=2, ensure_ascii=False)
        else:
            return json.dumps({"error": "Could not parse DOM structure."}, indent=2)

    except requests.RequestException as e:
        return json.dumps({"error": f"Failed to fetch URL: {str(e)}"}, indent=2)
    except Exception as e:
        return json.dumps({"error": f"An error occurred: {str(e)}"}, indent=2)

In [None]:
# Demo against a server on localhost:7860; it must already be running,
# otherwise get_dom_as_json returns an {"error": ...} JSON string.
test_url = "http://127.0.0.1:7860/" 
    
dom_json = get_dom_as_json(test_url)

# dom_json is already a formatted JSON string, so print it as-is.
print(dom_json)

In [None]:
import requests
from bs4 import BeautifulSoup

In [None]:
def get_dom(url: str) -> str:
    """
    Download a page and return its HTML re-indented by BeautifulSoup.

    Args:
        url (str): The target website URL.

    Returns:
        str: The page's HTML as produced by ``prettify()``.

    Raises:
        requests.RequestException: If the request fails or the response has
            an error status; nothing is caught here.
    """
    user_agent = " ".join((
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "AppleWebKit/537.36 (KHTML, like Gecko)",
        "Chrome/120.0.0.0 Safari/537.36",
    ))

    page = requests.get(url, headers={"User-Agent": user_agent}, timeout=10)
    page.raise_for_status()

    # Round-trip through the parser so the markup comes back pretty-printed.
    return BeautifulSoup(page.text, "html.parser").prettify()

In [None]:
# Demo against a server on localhost:7860 (must already be running).
url = "http://127.0.0.1:7860"
html_dom = get_dom(url)
print(html_dom[:1000])  # print first 1000 chars for preview

In [26]:
import asyncio
from playwright.async_api import async_playwright

async def get_dom_with_js(url: str) -> str:
    """Load a page in headless Chromium and return its rendered HTML.

    Unlike a plain HTTP fetch, the page is loaded in a real browser, so the
    returned markup reflects the DOM after the page's scripts have run.

    Args:
        url: Address of the page to load.

    Returns:
        The page's HTML as reported by ``page.content()``.

    Raises:
        Whatever Playwright raises when launching, navigating (the navigation
        timeout is 30000, i.e. 30 s in Playwright's millisecond units) or
        reading the content; errors are not caught here.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            await page.goto(url, timeout=30000)
            return await page.content()
        finally:
            # Close the browser even when navigation or content retrieval
            # fails, so a failed call does not leave it running.
            await browser.close()

In [27]:
from bs4 import BeautifulSoup
import json

def element_to_dict(element):
    """Turn a BeautifulSoup node into a JSON-friendly structure, recursively.

    A node whose ``name`` is None is treated as text: its stripped content is
    returned, or None when nothing but whitespace remains. Any other node
    becomes a dict with "tag", "attributes" and "children", where children
    that converted to None are left out.
    """
    if element.name is None:
        stripped = element.strip()
        return stripped or None

    converted_children = []
    for node in element.contents:
        converted = element_to_dict(node)
        if converted is not None:
            converted_children.append(converted)

    return {
        "tag": element.name,
        "attributes": element.attrs,
        "children": converted_children,
    }

def dom_to_json_full(html: str) -> str:
    """Parse an HTML string and serialise its DOM tree to compact JSON.

    The tree is rooted at <body> when the document has one, otherwise at the
    parsed document itself. Non-ASCII characters are written unescaped.
    """
    parsed = BeautifulSoup(html, "html.parser")
    root = parsed.body or parsed  # use the whole document when <body> is absent
    return json.dumps(element_to_dict(root), ensure_ascii=False)

In [33]:
# Demo against a server on localhost:7860 (must already be running).
url = "http://127.0.0.1:7860"
# Top-level await: works because the notebook cell runs inside an event loop.
dom = await get_dom_with_js(url)
print(dom)
# Convert the rendered HTML to a JSON tree; printed in the next cell.
result = dom_to_json_full(dom)

<!DOCTYPE html><html lang="en" style="
		margin: 0;
		padding: 0;
		min-height: 100%;
		display: flex;
		flex-direction: column;
	"><head>
		<meta charset="utf-8">
		<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
		<meta property="og:title" content="Gradio">
		<meta property="og:type" content="website">
		<meta property="og:url" content="{url}">
		<meta property="og:description" content="Click to try out the app!">
		<meta property="og:image" content="https://raw.githubusercontent.com/gradio-app/gradio/main/js/_website/src/lib/assets/img/header-image.jpg">
		<meta name="twitter:card" content="summary_large_image">
		<meta name="twitter:creator" content="@Gradio">
		<meta name="twitter:title" content="Gradio">
		<meta name="twitter:description" content="Click to try out the app!">
		<meta name="twitter:image" content="https://raw.githubusercontent.com/gradio-app/gradio/main/js/_website/src/lib/assets/img/header-image.jpg">

		<style>
			:root {
		

In [34]:
# Show the JSON tree built from the rendered DOM in the previous cell.
print(result)

{"tag": "body", "attributes": {"style": "\n\t\t\twidth: 100%;\n\t\t\tmargin: 0;\n\t\t\tpadding: 0;\n\t\t\tdisplay: flex;\n\t\t\tflex-direction: column;\n\t\t\tflex-grow: 1;\n\t\t"}, "children": [{"tag": "gradio-app", "attributes": {"control_page_title": "true", "embed": "false", "eager": "true", "style": "display: flex; flex-direction: column; flex-grow: 1; background: var(--body-background-fill);"}, "children": [{"tag": "div", "attributes": {"class": ["gradio-container", "gradio-container-5-49-1", "svelte-18evea3"], "data-iframe-height": "", "style": "min-height: initial; flex-grow: 1;"}, "children": [{"tag": "main", "attributes": {"class": ["fillable", "svelte-18evea3", "app"]}, "children": [{"tag": "div", "attributes": {"class": ["wrap", "svelte-czcr5b"], "style": "min-height: 100%;"}, "children": [{"tag": "div", "attributes": {"class": ["contain", "svelte-czcr5b"], "style": "flex-grow: 1; margin-right: 0px;"}, "children": [{"tag": "div", "attributes": {"id": "component-0", "class":