## User-Agent Parsing

This notebook attempts to parse User-Agent strings in a way similar to the site below:
- `https://user-agents.net/string/mozilla-5-0-windows-nt-10-0-win64-x64-applewebkit-537-36-khtml-like-gecko-chrome-122-0-0-0-safari-537-36-edg-122-0-0-0-maglev-24004-1307-2669-7070-49`



In [None]:
# Raw pattern table consumed by load_user_agent_schema() / parse_user_agent().
# Each entry is a dict with:
#   - "regex": the pattern searched (case-insensitively) against a User-Agent string
#   - "type" / "brand" / "model": static device labels copied into the result when
#     the pattern is the first (highest-confidence) one to match
#   - "exemplar", "description", "source_links": descriptive metadata
#   - "extraction_confidence": hand-assigned score used to order the patterns
#   - "schema": shape of the device record (validated as Dict[str, Dict[str, str]])
# Only the unified pattern near the end defines named groups; the other patterns
# can only contribute their static labels and confidence.
user_agent_schema = [
    {
        "device": "Desktop",
        "regex": "(?:Mozilla|Chrome|Firefox|Safari|Edge|Opera|Internet Explorer|IE|[A-Za-z][A-Za-z0-9\\-_]*)",
        "exemplar": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "type": "Desktop",
        "brand": "Generic",
        "model": "PC",
        "schema": {
            "device": {
                "type": "string",
                "brand": "string",
                "model": "string"
            }
        },
        "description": "Browser name/family detection with fallback for unknown browsers starting with alphanumeric characters",
        "source_links": [
            "https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent",
            "https://github.com/ua-parser/uap-core",
            "https://datatracker.ietf.org/doc/html/rfc7231"
        ],
        "extraction_confidence": 0.7
    },
    {
        "device": "Desktop",
        "regex": "(?:Chrome|Firefox|Safari|Edge|Opera|IE|Version|[A-Za-z][A-Za-z0-9\\-_]*)\\/((\\d+)(?:\\.(\\d+))?(?:\\.(\\d+))?(?:\\.(\\d+))?|[\\w\\-\\.]+)",
        "exemplar": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
        "type": "Desktop",
        "brand": "Generic",
        "model": "PC",
        "schema": {
            "device": {
                "type": "string",
                "brand": "string",
                "model": "string"
            }
        },
        "description": "Browser version extraction with support for numeric versions and unknown alphanumeric versions",
        "source_links": [
            "https://github.com/ua-parser/uap-core",
            "https://www.keycdn.com/support/user-agent-string",
            "https://browscap.org"
        ],
        "extraction_confidence": 0.8
    },
    {
        "device": "Generic",
        "regex": "(?:WebKit|Gecko|Blink|Trident|EdgeHTML|Presto|[A-Za-z][A-Za-z0-9]*Engine?)(?:\\/(\\d+(?:\\.\\d+)*|[\\w\\-\\.]+))?",
        "exemplar": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "type": "Generic",
        "brand": "Generic",
        "model": "Generic",
        "schema": {
            "device": {
                "type": "string",
                "brand": "string",
                "model": "string"
            }
        },
        "description": "Rendering engine detection with fallback for unknown engines ending in 'Engine' or similar patterns",
        "source_links": [
            "https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent",
            "https://github.com/ua-parser/uap-core"
        ],
        "extraction_confidence": 0.9
    },
    {
        "device": "Generic",
        "regex": "(?:Windows(?:\\s+NT)?|Mac(?:\\s+OS(?:\\s+X)?)?|Linux|Android|iOS|iPhone(?:\\s+OS)?|iPad(?:\\s+OS)?|Ubuntu|CentOS|Debian|FreeBSD|[A-Za-z][A-Za-z0-9\\s]*(?:OS|Linux|BSD|Unix)?)",
        "exemplar": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:91.0) Gecko/20100101 Firefox/91.0",
        "type": "Generic",
        "brand": "Generic",
        "model": "Generic",
        "schema": {
            "device": {
                "type": "string",
                "brand": "string",
                "model": "string"
            }
        },
        "description": "Operating system name detection with fallback for unknown OS names ending in common OS suffixes",
        "source_links": [
            "https://datatracker.ietf.org/doc/html/rfc7231",
            "https://github.com/ua-parser/uap-core",
            "https://browscap.org"
        ],
        "extraction_confidence": 0.85
    },
    {
        "device": "Desktop",
        "regex": "(?:Windows\\s+NT\\s+(\\d+)\\.(\\d+)|Mac\\s+OS\\s+X\\s+(\\d+)[_._](\\d+)(?:[_._](\\d+))?|Android\\s+(\\d+)(?:\\.(\\d+))?(?:\\.(\\d+))?|iOS\\s+(\\d+)[_._](\\d+)(?:[_._](\\d+))?|iPhone\\s+OS\\s+(\\d+)[_._](\\d+)(?:[_._](\\d+))?|([A-Za-z][A-Za-z0-9\\s]*?)\\s+(\\d+(?:[\\._]\\d+)*|[\\w\\-\\.]+))",
        "exemplar": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15",
        "type": "Desktop",
        "brand": "Apple",
        "model": "Mac",
        "schema": {
            "device": {
                "type": "string",
                "brand": "string",
                "model": "string"
            }
        },
        "description": "OS version extraction with fallback for unknown OS name-version combinations",
        "source_links": [
            "https://github.com/ua-parser/uap-core",
            "https://browscap.org",
            "https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent"
        ],
        "extraction_confidence": 0.75
    },
    {
        "device": "Generic",
        "regex": "(?:x86_64|x64|Win64|Intel|AMD64|i386|i686|armv7l|aarch64|arm64|[A-Za-z0-9]+(?:64|32|86)?)",
        "exemplar": "Mozilla/5.0 (X11; Linux x86_64; rv:91.0) Gecko/20100101 Firefox/91.0",
        "type": "Generic",
        "brand": "Generic",
        "model": "Generic",
        "schema": {
            "device": {
                "type": "string",
                "brand": "string",
                "model": "string"
            }
        },
        "description": "CPU architecture detection with fallback for unknown architectures with common suffixes",
        "source_links": [
            "https://github.com/ua-parser/uap-core",
            "https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent"
        ],
        "extraction_confidence": 0.6
    },
    {
        "device": "Mobile",
        "regex": "(?:Mobile|Tablet|Desktop|TV|Console|(?:Smart)?Watch|Car|VR|IoT|Bot|Crawler|[A-Za-z][A-Za-z0-9]*(?:Device|Pad|Phone|TV|Console|Watch)?)",
        "exemplar": "Mozilla/5.0 (Linux; Android 11; SM-G991B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.120 Mobile Safari/537.36",
        "type": "Mobile",
        "brand": "Samsung",
        "model": "Galaxy",
        "schema": {
            "device": {
                "type": "string",
                "brand": "string",
                "model": "string"
            }
        },
        "description": "Device type classification with fallback for unknown device types with common suffixes",
        "source_links": [
            "https://github.com/ua-parser/uap-core",
            "https://browscap.org",
            "https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent"
        ],
        "extraction_confidence": 0.7
    },
    {
        "device": "Mobile",
        "regex": "(?:Apple|Samsung|Google|Huawei|Xiaomi|OnePlus|LG|Sony|Nokia|Motorola|HTC|Oppo|Vivo|Realme|Honor|Asus|Acer|Dell|HP|Lenovo|Microsoft|Amazon|Roku|PlayStation|Xbox|Nintendo|[A-Z][A-Za-z0-9]*(?:Corp|Inc|Ltd|Co)?)",
        "exemplar": "Mozilla/5.0 (Linux; Android 12; SM-S908B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Mobile Safari/537.36",
        "type": "Mobile",
        "brand": "Samsung",
        "model": "Galaxy S22",
        "schema": {
            "device": {
                "type": "string",
                "brand": "string",
                "model": "string"
            }
        },
        "description": "Device brand detection with fallback for unknown manufacturers starting with capital letter",
        "source_links": [
            "https://github.com/ua-parser/uap-core",
            "https://browscap.org",
            "https://pypi.org/project/user-agent-parser/"
        ],
        "extraction_confidence": 0.75
    },
    {
        "device": "Mobile",
        "regex": "(?:iPhone|iPad|iPod|Galaxy\\s+\\w+|Pixel\\s+\\w+|Nexus\\s+\\w+|SM-\\w+|LG-\\w+|HTC\\s+\\w+|Xperia\\s+\\w+|Nokia\\s+\\w+|Moto\\s+\\w+|[A-Z][A-Za-z0-9\\-]*(?:\\s+[A-Za-z0-9\\-]+)*)",
        "exemplar": "Mozilla/5.0 (iPhone; CPU iPhone OS 15_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.5 Mobile/15E148 Safari/604.1",
        "type": "Mobile",
        "brand": "Apple",
        "model": "iPhone",
        "schema": {
            "device": {
                "type": "string",
                "brand": "string",
                "model": "string"
            }
        },
        "description": "Device model detection with fallback for unknown models starting with capital letter and optional spaces",
        "source_links": [
            "https://github.com/ua-parser/uap-core",
            "https://browscap.org",
            "https://pypi.org/project/user-agent-parser/"
        ],
        "extraction_confidence": 0.65
    },
    {
        "device": "Bot",
        "regex": "(?:bot|crawler|spider|scraper|crawl|indexer|search|fetch|monitor|check|scan|agent|archiver|extractor|parser|reader|validator|preview|thumbnail|screenshot|headless|automated|script|tool|service|api|webhook|feed|rss|xml|sitemap|robot|^curl|^wget|^python|^java|^php|^ruby|^perl|^node|^go|^rust|[A-Za-z0-9\\-_]*(?:bot|crawler|spider|scraper|agent|tool))",
        "exemplar": "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
        "type": "Bot",
        "brand": "Google",
        "model": "Googlebot",
        "schema": {
            "device": {
                "type": "string",
                "brand": "string",
                "model": "string"
            }
        },
        "description": "Bot and crawler detection with fallback for unknown automated agents with common suffixes",
        "source_links": [
            "https://github.com/ua-parser/uap-core",
            "https://browscap.org",
            "https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent"
        ],
        "extraction_confidence": 0.85
    },
    {
        "device": "Bot",
        "regex": "(?:facebook|twitter|instagram|linkedin|pinterest|tiktok|snapchat|reddit|discord|telegram|whatsapp|[a-z]+)(?:bot|crawler|scraper|externalhit|preview|parser|linkexpander|cardvalidator|facebookexternalhit|twitterbot|linkedinbot|pinterestbot|telegrambot|whatsappbot|slackbot|discordbot|redditbot|instagrambot|tiktokbot|snapchatbot|[a-z]*bot)",
        "exemplar": "facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php)",
        "type": "Bot",
        "brand": "Facebook",
        "model": "External Hit",
        "schema": {
            "device": {
                "type": "string",
                "brand": "string",
                "model": "string"
            }
        },
        "description": "Social media bot detection with fallback for unknown platforms followed by bot indicators",
        "source_links": [
            "https://github.com/ua-parser/uap-core",
            "https://browscap.org"
        ],
        "extraction_confidence": 0.9
    },
    # Unified pattern: the only entry with named groups (browser, engine, os, ...).
    # NOTE(review): the regex requires five literal commas between its sections
    # (", " after engine, os, device_type, brand and model), so it probably never
    # matches ordinary User-Agent strings such as the exemplar below -- confirm.
    {
        "device": "Generic",
        "regex": "^(?:Mozilla\\/[\\d\\.]+\\s+)?(?:\\([^)]*\\))?\\s*(?:(?:(?P<browser>Chrome|Firefox|Safari|Edge|Opera|IE|Internet\\s+Explorer|[A-Za-z][A-Za-z0-9\\-_]*)(?:\\/(?P<browser_version>\\d+(?:\\.\\d+)*|[\\w\\-\\.]+))?)|(\\w+))(?:\\s+\\([^)]*\\))?\\s*(?:(?P<engine>WebKit|Gecko|Blink|Trident|EdgeHTML|Presto|[A-Za-z][A-Za-z0-9]*Engine?)(?:\\/(?P<engine_version>\\d+(?:\\.\\d+)*|[\\w\\-\\.]+))?)?,\\s*(?:(?P<os>Windows(?:\\s+NT)?|Mac(?:\\s+OS(?:\\s+X)?)?|Linux|Android|iOS|iPhone(?:\\s+OS)?|iPad(?:\\s+OS)?|[A-Za-z][A-Za-z0-9\\s]*(?:OS|Linux|BSD|Unix)?)(?:\\s+(?P<os_version>\\d+(?:[_\\.]\\d+)*|[\\w\\-\\.]+))?)?,\\s*(?:(?P<device_type>Mobile|Tablet|Desktop|TV|Console|Watch|Car|VR|IoT|[A-Za-z][A-Za-z0-9]*(?:Device|Pad|Phone|TV|Console|Watch)?))?,\\s*(?:(?P<brand>Apple|Samsung|Google|Huawei|Xiaomi|OnePlus|LG|Sony|Nokia|Motorola|HTC|Oppo|Vivo|Realme|Honor|[A-Z][A-Za-z0-9]*(?:Corp|Inc|Ltd|Co)?))?,\\s*(?:(?P<model>iPhone|iPad|iPod|Galaxy\\s+\\w+|Pixel\\s+\\w+|Nexus\\s+\\w+|SM-\\w+|LG-\\w+|HTC\\s+\\w+|Xperia\\s+\\w+|Nokia\\s+\\w+|Moto\\s+\\w+|[A-Z][A-Za-z0-9\\-]*(?:\\s+[A-Za-z0-9\\-]+)*))?,\\s*(?P<is_bot>bot|crawler|spider|scraper|[A-Za-z0-9\\-_]*(?:bot|crawler|spider|scraper|agent|tool))?.*$",
        "exemplar": "Mozilla/5.0 (Linux; Android 12; SM-G998B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Mobile Safari/537.36",
        "type": "Mobile",
        "brand": "Samsung",
        "model": "Galaxy S21",
        "schema": {
            "device": {
                "type": "string",
                "brand": "string",
                "model": "string"
            }
        },
        "description": "Comprehensive unified User-Agent parser with named capture groups and unknown value fallbacks for all components",
        "source_links": [
            "https://github.com/ua-parser/uap-core",
            "https://datatracker.ietf.org/doc/html/rfc7231",
            "https://browscap.org"
        ],
        "extraction_confidence": 0.6
    },
    {
        "device": "Mobile",
        "regex": "(?:Mobile|Android|iPhone|iPad|iPod|BlackBerry|IEMobile|Opera\\s+Mini|Opera\\s+Mobi|Windows\\s+Phone|Symbian|Palm|Pocket\\s+PC|Mobile\\s+Safari|Tablet|Kindle|Silk|CrOS|webOS|Tizen|Bada|MeeGo|Maemo|Series60|S60|UIWebView|Mobile\\/\\w+|[A-Za-z][A-Za-z0-9]*(?:Mobile|Tablet|Phone|OS|Web))",
        "exemplar": "Mozilla/5.0 (iPhone; CPU iPhone OS 16_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.0 Mobile/15E148 Safari/604.1",
        "type": "Mobile",
        "brand": "Apple",
        "model": "iPhone",
        "schema": {
            "device": {
                "type": "string",
                "brand": "string",
                "model": "string"
            }
        },
        "description": "Mobile-specific detection with fallback for unknown mobile platforms with common mobile suffixes",
        "source_links": [
            "https://github.com/ua-parser/uap-core",
            "https://browscap.org",
            "https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent"
        ],
        "extraction_confidence": 0.8
    }
]

In [None]:
from typing import List, Dict, Any, Optional
from pydantic import BaseModel, Field


class DeviceSchema(BaseModel):
    """Device descriptor with three required string fields: type, brand and model."""

    # `...` marks each field as required, same as omitting the default.
    type: str = Field(
        ...,
        description="The type of device (e.g., 'string')",
    )
    brand: str = Field(
        ...,
        description="The brand of device (e.g., 'string')",
    )
    model: str = Field(
        ...,
        description="The model of device (e.g., 'string')",
    )


class UserAgentPattern(BaseModel):
    """
    Model representing a pattern for parsing User-Agent strings.

    Each pattern carries the regex to search for, the static device labels
    (type / brand / model) reported when it matches, an exemplar string,
    descriptive metadata and a confidence score.

    The input key ``schema`` is stored on the attribute ``device_schema``
    (via the field alias), so raw dictionaries must use ``schema``.
    ``extraction_confidence`` is validated to lie within [0.0, 1.0], matching
    its documented range; out-of-range values raise a validation error.
    """
    device: str = Field(description="The device category (e.g., 'Desktop', 'Mobile', 'Bot')")
    regex: str = Field(description="Regular expression pattern to match in User-Agent strings")
    exemplar: str = Field(description="Example User-Agent string that matches this pattern")
    type: str = Field(description="The type of device (e.g., 'Desktop', 'Mobile', 'Bot')")
    brand: str = Field(description="The brand of the device (e.g., 'Generic', 'Apple', 'Samsung')")
    model: str = Field(description="The model of the device (e.g., 'PC', 'iPhone', 'Galaxy')")
    # Aliased because the raw data uses the key "schema"; presumably the attribute
    # is renamed to avoid clashing with BaseModel's own `schema` member.
    device_schema: Dict[str, Dict[str, str]] = Field(alias="schema", description="Schema defining the structure of the device information")
    description: str = Field(description="Description of what this pattern detects")
    source_links: List[str] = Field(description="Links to sources for this pattern")
    # ge/le enforce the 0.0-1.0 range that the description promises.
    extraction_confidence: float = Field(
        ge=0.0,
        le=1.0,
        description="Confidence score for this pattern (0.0 to 1.0)",
    )


class UserAgentSchema(BaseModel):
    """Container holding every UserAgentPattern used to parse User-Agent strings."""

    # `...` marks the field as required, same as omitting the default.
    patterns: List[UserAgentPattern] = Field(
        ...,
        description="List of patterns for parsing User-Agent strings",
    )


# Build a validated UserAgentSchema from the raw list-of-dicts schema
def create_user_agent_schema_model(user_agent_schema: List[Dict[str, Any]]) -> UserAgentSchema:
    """
    Validate raw pattern dictionaries and wrap them in a UserAgentSchema.

    Args:
        user_agent_schema: Raw pattern definitions, one dictionary per pattern

    Returns:
        A UserAgentSchema whose ``patterns`` hold one UserAgentPattern per
        input dictionary, in the same order
    """
    validated = []
    for entry in user_agent_schema:
        # Keyword-expand so each dict key feeds the matching model field.
        validated.append(UserAgentPattern(**entry))
    return UserAgentSchema(patterns=validated)


def load_user_agent_schema(user_agents_list: Optional[List[str]] = None) -> UserAgentSchema:
    """
    Load and create a UserAgentSchema from the predefined user_agent_schema.

    Reads the module-level ``user_agent_schema`` list and validates every
    entry as a UserAgentPattern. This is deliberately best-effort: if anything
    goes wrong while building the patterns, the error is printed and an empty
    schema is returned instead of raising.

    Args:
        user_agents_list: Optional list of user agent strings. Currently
            unused; kept for future use.

    Returns:
        UserAgentSchema: The validated schema, or a schema with no patterns
        if validation failed.
    """
    try:
        # Convert the raw schema dictionaries to UserAgentPattern objects
        patterns = [UserAgentPattern(**pattern_data) for pattern_data in user_agent_schema]
        return UserAgentSchema(patterns=patterns)
    except Exception as e:
        # Best-effort fallback: report the problem and hand back an empty schema
        print(f"Error creating user agent schema: {e}")
        return UserAgentSchema(patterns=[])




# Function to parse a User-Agent string using the schema
def parse_user_agent(user_agent: str, schema: UserAgentSchema) -> Dict[str, Any]:
    """
    Parse a User-Agent string using the provided schema.

    Every pattern in ``schema.patterns`` is searched case-insensitively
    against ``user_agent``, in descending ``extraction_confidence`` order.
    Patterns whose regex fails to compile are skipped.

    For each match:
      * a summary (regex, confidence, description) is appended to
        ``matched_patterns``;
      * the static device labels of the pattern are used if no device has been
        recorded yet or the pattern outranks the best confidence seen so far
        (with the descending order this means the first match wins);
      * ``confidence`` is raised to the highest matched confidence;
      * non-empty named groups overwrite the corresponding result fields
        (``device_type`` / ``brand`` / ``model`` go under ``device``; any
        non-empty ``is_bot`` group sets ``is_bot`` to True).

    Args:
        user_agent: The User-Agent string to parse
        schema: The UserAgentSchema to use for parsing

    Returns:
        A dictionary with the keys ``device`` (type/brand/model), ``browser``,
        ``browser_version``, ``engine``, ``engine_version``, ``os``,
        ``os_version``, ``is_bot``, ``confidence`` and ``matched_patterns``
    """
    import re

    # Initialize the result with default values
    result = {
        "device": {
            "type": None,
            "brand": None,
            "model": None
        },
        "browser": None,
        "browser_version": None,
        "engine": None,
        "engine_version": None,
        "os": None,
        "os_version": None,
        "is_bot": False,
        "confidence": 0.0,
        "matched_patterns": []
    }

    # Named groups copied to the top-level result key of the same name.
    top_level_groups = (
        "browser", "browser_version",
        "engine", "engine_version",
        "os", "os_version",
    )
    # Named groups stored under result["device"], mapped to their device key.
    device_groups = {"device_type": "type", "brand": "brand", "model": "model"}

    # Sort patterns by extraction_confidence in descending order
    sorted_patterns = sorted(
        schema.patterns,
        key=lambda p: p.extraction_confidence,
        reverse=True
    )

    for pattern in sorted_patterns:
        try:
            match = re.search(pattern.regex, user_agent, re.IGNORECASE)
        except re.error:
            # Skip patterns with invalid regex
            continue
        if not match:
            continue

        result["matched_patterns"].append({
            "pattern": pattern.regex,
            "confidence": pattern.extraction_confidence,
            "description": pattern.description
        })

        # Compare against the best confidence seen *before* this match; the
        # running confidence is only raised afterwards, so the comparison is
        # meaningful.
        if result["device"]["type"] is None or pattern.extraction_confidence > result["confidence"]:
            result["device"]["type"] = pattern.type
            result["device"]["brand"] = pattern.brand
            result["device"]["model"] = pattern.model

        if pattern.extraction_confidence > result["confidence"]:
            result["confidence"] = pattern.extraction_confidence

        # Named groups (if any) refine the static labels with extracted text.
        for key, value in match.groupdict().items():
            if not value:
                continue
            if key in device_groups:
                result["device"][device_groups[key]] = value
            elif key in top_level_groups:
                result[key] = value
            elif key == "is_bot":
                result["is_bot"] = True

    return result



In [None]:
# Sample User-Agent strings used to exercise the parser, grouped by browser / app / bot.
# NOTE: the first string under "# iPad" repeats the last string under
# "# Safari Mobile (iOS)", so the list contains one duplicate.
user_agents = [
    # Chrome (Windows)
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",

    # Chrome (macOS)
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36",

    # Safari (macOS)
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Safari/605.1.15",

    # Firefox (Windows)
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:140.0) Gecko/20100101 Firefox/140.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0",

    # Firefox (macOS)
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:140.0) Gecko/20100101 Firefox/140.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:121.0) Gecko/20100101 Firefox/121.0",

    # Edge (Windows)
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36 Edg/137.0.0.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36 Edg/136.0.0.0",

    # Opera
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36 OPR/122.0.0.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36 OPR/122.0.0.0",

    # Chrome Mobile (Android)
    "Mozilla/5.0 (Linux; Android 10; K) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 13; Pixel 7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 14; SM-G998B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Mobile Safari/537.36",

    # Safari Mobile (iOS)
    "Mozilla/5.0 (iPhone; CPU iPhone OS 17_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Mobile/15E148 Safari/604.1",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1",
    "Mozilla/5.0 (iPad; CPU OS 17_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Mobile/15E148 Safari/604.1",

    # Firefox Mobile (Android)
    "Mozilla/5.0 (Android 12; Mobile; rv:140.0) Gecko/16649680 Firefox/140.0",
    "Mozilla/5.0 (Android 13; Mobile; rv:121.0) Gecko/18431015 Firefox/121.0",

    # Samsung Internet
    "Mozilla/5.0 (Linux; Android 13; SM-A546B) AppleWebKit/537.36 (KHTML, like Gecko) SamsungBrowser/23.0 Chrome/115.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 12; SM-G998B) AppleWebKit/537.36 (KHTML, like Gecko) SamsungBrowser/22.0 Chrome/111.0.0.0 Mobile Safari/537.36",

    # iPad
    "Mozilla/5.0 (iPad; CPU OS 17_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Mobile/15E148 Safari/604.1",
    "Mozilla/5.0 (iPad; CPU OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1",

    # Android Tablets
    "Mozilla/5.0 (Linux; Android 12; SM-T970) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Linux; Android 13; Pixel Tablet) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36",

    # WhatsApp
    "WhatsApp/2.23.24.76 A",
    "Mozilla/5.0 (Linux; Android 13; Pixel 7) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/136.0.0.0 Mobile Safari/537.36 WhatsApp/1.0",

    # Facebook App
    "Mozilla/5.0 (Linux; Android 13; Pixel 7 Build/TQ3A.230901.001; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/136.0.0.0 Mobile Safari/537.36 [FB_IAB/FB4A;FBAV/439.0.0.29.115;]",

    # Instagram App
    "Mozilla/5.0 (iPhone; CPU iPhone OS 17_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 Instagram 307.0.0.34.111",

    # Googlebot
    "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
    "Googlebot/2.1 (+http://www.google.com/bot.html)",

    # Other Bots
    "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)",
    "facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php)",

    # Additional User Agents from web sources
    "Dalvik/2.1.0 (Linux; U; Android 16; Pixel Fold Build/BP2A.250605.031.A2)",
    "Mozilla/5.0 (Linux; Android 13; RX Plus Build/TP1A.220624.014; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/111.0.5563.116 Mobile Safari/537.36 GoNativeAndroid/1.0 gonative",
    "Dalvik/2.1.0 (Linux; U; Android 14; moto g24 Build/UTAS34.82-97-4)"
]

# Pick one sample string to experiment with in the cells below.
# Despite its name, random_ua is not random: it is always the first list entry.
random_ua = user_agents[0]  # first user agent in the list (Chrome on Windows)
print(f"Total user agents: {len(user_agents)}")
print(random_ua)

In [None]:
# parse_user_agent() reads `schema.patterns`, so it needs a UserAgentSchema;
# the raw `user_agent_schema` list of dicts has no such attribute. Build the
# validated schema first.
parse_user_agent(random_ua, load_user_agent_schema())

In [None]:
# Correct usage: build the validated schema once, then parse every sample with it.
user_agent_objects = []
schema = load_user_agent_schema(user_agents)  # This returns a UserAgentSchema object
for user_agent in user_agents:
    # Parse the current string (not random_ua) so each result reflects its own input.
    user_agent_objects.append(parse_user_agent(user_agent, schema))

# Do not pass the raw `user_agent_schema` list to parse_user_agent(); it must be
# a UserAgentSchema such as the one returned by load_user_agent_schema().


In [None]:
import json
# Pretty-print the first parsed result to inspect the extracted fields.
print(json.dumps(user_agent_objects[0], indent=4))

## All confidence scores in this notebook are artificial and based on surface-level logic.

1. **Field-based Scoring**: Counts extracted fields (device type/brand/model, browser, OS, etc.) and gives points for each successful extraction
2. **Pattern Quality Assessment**: Considers the confidence scores of matched regex patterns and gives bonus points for multiple pattern matches
3. **Regex Validation Bonus**: Provides additional confidence when related fields are extracted together (e.g., browser + version)
4. **Weighted Combination**: Combines multiple factors with appropriate weights:
    - 40% original confidence
    - 30% extracted field count
    - 20% pattern match quality
    - 10% regex validation bonus
#### Additional info
- **Range**: 0.0 to 1.0 (same as original confidence)
- **Calculation**: Based on both the quality of regex matches and the quantity/quality of extracted data
- **Metadata**: Includes breakdown of contributing factors for transparency


In [None]:
import re
from typing import Dict, Any, List


def calculate_post_parse_confidence(parsed_ua: Dict[str, Any]) -> float:
    """
    Calculate a new confidence score based on the parsed user-agent data.

    The score is a weighted sum of four components:
      * 40% - the ``confidence`` already stored on the parsed object;
      * 30% - a field score: 0.1 per populated field (device type/brand/model,
        browser, engine and OS names and versions, ``is_bot``), capped at 0.7;
      * 20% - a pattern score: the mean confidence of ``matched_patterns`` plus
        0.1 per matched pattern (bonus capped at 0.3); 0.0 if none matched;
      * 10% - a pairing bonus: 0.1 for each of device type+brand,
        browser+version and OS+version being present together.

    Args:
        parsed_ua: The parsed user-agent object from parse_user_agent().
            Missing keys are treated as empty.

    Returns:
        A new confidence score clamped to the range 0.0 to 1.0
    """
    device = parsed_ua.get("device", {})

    # Collect the names of every populated field; only the count is scored.
    extracted_fields = [
        f"device_{part}" for part in ("type", "brand", "model") if device.get(part)
    ]
    extracted_fields += [
        name
        for name in ("browser", "browser_version", "engine", "engine_version",
                     "os", "os_version")
        if parsed_ua.get(name)
    ]
    if parsed_ua.get("is_bot"):
        extracted_fields.append("is_bot")

    # Base score from number of extracted fields
    field_score = min(len(extracted_fields) * 0.1, 0.7)  # Cap at 0.7 for field count

    # Original confidence from matched patterns
    original_confidence = parsed_ua.get("confidence", 0.0)

    # Pattern match quality: average confidence plus a bonus for multiple matches
    matched_patterns = parsed_ua.get("matched_patterns", [])
    pattern_score = 0.0
    if matched_patterns:
        avg_pattern_confidence = sum(p["confidence"] for p in matched_patterns) / len(matched_patterns)
        pattern_multiplier = min(len(matched_patterns) * 0.1, 0.3)
        pattern_score = avg_pattern_confidence + pattern_multiplier

    # Bonus when related fields were extracted together
    regex_bonus = 0.0
    if device.get("type") and device.get("brand"):
        regex_bonus += 0.1
    if parsed_ua.get("browser") and parsed_ua.get("browser_version"):
        regex_bonus += 0.1
    if parsed_ua.get("os") and parsed_ua.get("os_version"):
        regex_bonus += 0.1

    # Combine scores with weights
    final_confidence = (
            original_confidence * 0.4 +  # 40% from original confidence
            field_score * 0.3 +  # 30% from extracted field count
            pattern_score * 0.2 +  # 20% from pattern match quality
            regex_bonus * 0.1  # 10% from regex validation bonus
    )

    # Ensure score is between 0.0 and 1.0
    return min(max(final_confidence, 0.0), 1.0)


def recalculate_confidence_scores(parsed_objects: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Attach a ``post_parse_confidence`` breakdown to each parsed user-agent.

    Each input dictionary is shallow-copied; the copy gains a
    ``post_parse_confidence`` entry holding the recomputed score together
    with the figures it was derived from. The inputs are not modified.

    Args:
        parsed_objects: List of parsed user-agent dictionaries

    Returns:
        New list with one enhanced copy per input, in the same order
    """
    device_parts = ("type", "brand", "model")
    scalar_fields = ("browser", "browser_version", "engine", "engine_version",
                     "os", "os_version")

    results = []
    for entry in parsed_objects:
        device_info = entry.get("device", {})

        # Names of every field the parser managed to populate.
        populated = [f"device_{part}" for part in device_parts if device_info.get(part)]
        populated.extend(name for name in scalar_fields if entry.get(name))
        if entry.get("is_bot"):
            populated.append("is_bot")

        patterns_hit = entry.get("matched_patterns", [])
        breakdown = {
            "score": calculate_post_parse_confidence(entry),
            "original_confidence": entry.get("confidence", 0.0),
            "extracted_field_count": len(populated),
            "extracted_fields": populated,
            "matched_pattern_count": len(patterns_hit),
            "matched_patterns": patterns_hit,
            "is_bot_detected": entry.get("is_bot", False),
            "has_device_info": bool(device_info.get("type") and device_info.get("brand")),
            "has_browser_info": bool(entry.get("browser") and entry.get("browser_version")),
            "has_os_info": bool(entry.get("os") and entry.get("os_version")),
        }

        # Shallow copy so the caller's dictionary does not gain the new key.
        annotated = entry.copy()
        annotated["post_parse_confidence"] = breakdown
        results.append(annotated)

    return results


# Apply the new confidence calculation to the parsed objects built earlier;
# the originals in user_agent_objects are left without the extra key.
enhanced_user_agent_objects = recalculate_confidence_scores(user_agent_objects)

# Display one example, including its post_parse_confidence breakdown
import json

print("Enhanced user agent object with post_parse_confidence:")
print(json.dumps(enhanced_user_agent_objects[0], indent=2))

# Compare original vs new confidence scores for the first five user agents
print("Confidence Score Comparison:")
print("=" * 50)
for i, (original, enhanced) in enumerate(zip(user_agent_objects[:5], enhanced_user_agent_objects[:5])):
    original_conf = original.get("confidence", 0.0)
    new_conf = enhanced["post_parse_confidence"]["score"]
    field_count = enhanced["post_parse_confidence"]["extracted_field_count"]

    # i is zero-based; print a one-based label
    print(f"User Agent {i + 1}:")
    print(f"  Original Confidence: {original_conf:.3f}")
    print(f"  Post-Parse Confidence: {new_conf:.3f}")
    print(f"  Extracted Fields: {field_count}")
    # Signed difference: positive means the post-parse score is higher
    print(f"  Difference: {new_conf - original_conf:+.3f}")
    print()

In [None]:
import re
from typing import Dict, Any, List


def calculate_post_parse_confidence(parsed_ua: Dict[str, Any]) -> float:
    """
    Calculate a new confidence score based on the parsed user-agent data.

    The score is a weighted blend of four signals, clamped to [0.0, 1.0]:

    - 40%: the parser's original ``confidence`` value.
    - 30%: field coverage, 0.1 per populated field, capped at 0.7.
    - 20%: pattern quality, the mean ``confidence`` of ``matched_patterns``
      plus 0.1 per matched pattern (that bonus is capped at 0.3).
    - 10%: completeness bonus, 0.1 for each complete pair among
      device type/brand, browser/browser_version and os/os_version.

    A ``device``, ``confidence`` or ``matched_patterns`` key that is missing
    or explicitly set to ``None`` is treated as empty / 0.0.

    Args:
        parsed_ua: The parsed user-agent object from parse_user_agent()

    Returns:
        A new confidence score between 0.0 and 1.0

    Raises:
        KeyError: If an entry of ``matched_patterns`` has no ``confidence`` key.
    """
    # `.get(key, default)` only falls back when the key is absent; `or` also
    # covers a key that is present but holds None.
    device = parsed_ua.get("device") or {}
    original_confidence = parsed_ua.get("confidence") or 0.0
    matched_patterns = parsed_ua.get("matched_patterns") or []

    # Count populated fields: device sub-fields, top-level fields, bot flag.
    top_level_fields = ("browser", "browser_version", "engine", "engine_version",
                        "os", "os_version")
    num_extracted = (
        sum(1 for key in ("type", "brand", "model") if device.get(key))
        + sum(1 for field in top_level_fields if parsed_ua.get(field))
        + (1 if parsed_ua.get("is_bot") else 0)
    )
    field_score = min(num_extracted * 0.1, 0.7)  # Cap at 0.7 for field count

    # Pattern match quality: mean pattern confidence plus a small bonus for
    # multiple matches. This may exceed 1.0; the final clamp bounds the result.
    pattern_score = 0.0
    if matched_patterns:
        avg_pattern_confidence = (
            sum(p["confidence"] for p in matched_patterns) / len(matched_patterns)
        )
        pattern_multiplier = min(len(matched_patterns) * 0.1, 0.3)
        pattern_score = avg_pattern_confidence + pattern_multiplier

    # Completeness bonus: reward name/version (or type/brand) pairs that were
    # both extracted.
    regex_bonus = 0.0
    if device.get("type") and device.get("brand"):
        regex_bonus += 0.1
    if parsed_ua.get("browser") and parsed_ua.get("browser_version"):
        regex_bonus += 0.1
    if parsed_ua.get("os") and parsed_ua.get("os_version"):
        regex_bonus += 0.1

    # Combine scores with weights
    final_confidence = (
        original_confidence * 0.4  # 40% from original confidence
        + field_score * 0.3  # 30% from extracted field count
        + pattern_score * 0.2  # 20% from pattern match quality
        + regex_bonus * 0.1  # 10% from completeness bonus
    )

    # Ensure score is between 0.0 and 1.0
    return min(max(final_confidence, 0.0), 1.0)


def recalculate_confidence_scores(parsed_objects: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Add post_parse_confidence scores to a list of parsed user-agent objects.

    Each returned dictionary is a shallow copy of its input with two keys added:

    - ``post_parse_confidence_total``: the score returned by
      ``calculate_post_parse_confidence`` for that object.
    - ``post_parse_confidence``: a breakdown dictionary holding the same score,
      the original ``confidence``, the names and count of populated fields,
      the matched patterns and their count, and the bot / device / browser /
      os summary flags.

    The input dictionaries are not modified. Because the copy is shallow,
    nested values (for example the ``matched_patterns`` list) are shared
    between input and output. A ``device`` or ``matched_patterns`` key that is
    missing or explicitly ``None`` is treated as empty.

    Args:
        parsed_objects: List of parsed user-agent dictionaries

    Returns:
        List of parsed objects with added post_parse_confidence fields
    """
    top_level_fields = ("browser", "browser_version", "engine", "engine_version",
                        "os", "os_version")
    enhanced_objects: List[Dict[str, Any]] = []

    for parsed_ua in parsed_objects:
        # `or` also covers keys that are present but hold None, which
        # `.get(key, default)` would pass through unchanged.
        device = parsed_ua.get("device") or {}
        matched_patterns = parsed_ua.get("matched_patterns") or []

        # Names of every populated field, in a stable order:
        # device sub-fields, then top-level fields, then the bot flag.
        extracted_fields = [
            f"device_{key}" for key in ("type", "brand", "model") if device.get(key)
        ]
        extracted_fields.extend(
            field for field in top_level_fields if parsed_ua.get(field)
        )
        if parsed_ua.get("is_bot"):
            extracted_fields.append("is_bot")

        confidence_score = calculate_post_parse_confidence(parsed_ua)

        # Work on a copy so the caller's dictionary keeps its original keys.
        enhanced_ua = parsed_ua.copy()
        enhanced_ua["post_parse_confidence_total"] = confidence_score
        enhanced_ua["post_parse_confidence"] = {
            "score": confidence_score,
            "original_confidence": parsed_ua.get("confidence", 0.0),
            "extracted_field_count": len(extracted_fields),
            "extracted_fields": extracted_fields,
            "matched_pattern_count": len(matched_patterns),
            "matched_patterns": matched_patterns,
            "is_bot_detected": parsed_ua.get("is_bot", False),
            "has_device_info": bool(device.get("type") and device.get("brand")),
            "has_browser_info": bool(parsed_ua.get("browser") and parsed_ua.get("browser_version")),
            "has_os_info": bool(parsed_ua.get("os") and parsed_ua.get("os_version")),
        }

        enhanced_objects.append(enhanced_ua)

    return enhanced_objects


# Re-score every previously parsed user agent with the post-parse model.
enhanced_user_agent_objects = recalculate_confidence_scores(user_agent_objects)

# Dump the first enhanced record in full so the added fields can be inspected.
import json

print("Enhanced user agent object with post_parse_confidence:")
print(json.dumps(enhanced_user_agent_objects[0], indent=2))

# Side-by-side view of the old and new scores for the first five records.
print("Confidence Score Comparison:")
print("=" * 50)
sample_pairs = zip(user_agent_objects[:5], enhanced_user_agent_objects[:5])
for ordinal, (before, after) in enumerate(sample_pairs, start=1):
    old_score = before.get("confidence", 0.0)
    new_score = after["post_parse_confidence_total"]
    populated = after["post_parse_confidence"]["extracted_field_count"]

    print(f"User Agent {ordinal}:")
    print(f"  Original Confidence: {old_score:.3f}")
    print(f"  Post-Parse Confidence Total: {new_score:.3f}")
    print(f"  Extracted Fields: {populated}")
    print(f"  Difference: {new_score - old_score:+.3f}")
    print()

## User-Agent Python Package

In [None]:
# Print the first parsed user-agent object as JSON with a 4-space indent.
print(json.dumps(user_agent_objects[0], indent=4))

In [None]:
from user_agents import parse
# Sample desktop Chrome-on-Windows User-Agent string to feed the parser.
ua_string = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
ua = parse(ua_string)

# Headline attributes first, then the parsed object's own string form.
browser_family, os_family = ua.browser.family, ua.os.family
print(browser_family, os_family, ua.is_mobile)
print(ua)