<a href="https://colab.research.google.com/github/gyasifred/webscraping-with-BeautifulSoup-Scrapy-and-Selenium/blob/main/anotation_tech_coding__challenge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Google Docs link (published "/pub" form) that is passed to
# print_grid_from_data(url) in the last cell of this notebook.
url = "https://docs.google.com/document/d/e/2PACX-1vSHesOf9hv2sPOntssYrEdubmMQm8lwjfwv6NPjjmIRYs_FOYXtqrYgjh85jBUebK9swPXh_a5TJ5Kl/pub"

In [None]:
import requests
from lxml import etree

# Function to retrieve and parse the Google Doc data using XPath
def get_google_doc_data(doc_url, timeout=30):
    """
    Retrieves and parses data from a Google Doc, extracting X and Y coordinates
    along with corresponding characters using XPath.

    The document is expected to contain a table whose first, second and third
    columns hold the X-coordinate, the character and the Y-coordinate. The
    first entry of every column is treated as the header and skipped.

    Args:
        doc_url (str): The URL of the Google Doc to be processed. A URL that
            contains '/edit' is rewritten to the HTML export endpoint; any
            other URL (for example a published '/pub' link) is fetched as is.
        timeout (float): Seconds to wait for the server before giving up.
            Defaults to 30.

    Returns:
        list: A list of tuples, each containing (x, character, y) extracted
            from the document. Rows that cannot be parsed are reported on
            stdout and skipped. If retrieval or parsing fails as a whole, the
            error is printed and an empty list is returned; no exception
            propagates to the caller.
    """
    try:
        # Convert an editor URL to its HTML export URL. Everything from
        # '/edit' onwards (e.g. '?usp=sharing') is dropped so that it cannot
        # end up glued onto the 'format' query parameter.
        base, marker, _ = doc_url.partition('/edit')
        export_url = (base + '/export?format=html') if marker else doc_url

        # Fetch the HTML content of the document. Without a timeout a stalled
        # connection would block forever.
        response = requests.get(export_url, timeout=timeout)

        # Check if the request was successful
        if response.status_code != 200:
            raise Exception(f"Failed to retrieve the document. Status code: {response.status_code}")

        # Parse the HTML content with lxml
        tree = etree.HTML(response.content)

        # Define XPaths for the first, second, and third columns
        x_path_xcoord = "//tr/td[1]//p//span/text()"  # X-coordinate
        x_path_char = "//tr/td[2]//p//span/text()"    # Character
        x_path_ycoord = "//tr/td[3]//p//span/text()"  # Y-coordinate

        # Extract values using XPath
        x_coords = tree.xpath(x_path_xcoord)[1:]  # Skip the first element (header)
        chars = tree.xpath(x_path_char)[1:]       # Skip the first element (header)
        y_coords = tree.xpath(x_path_ycoord)[1:]  # Skip the first element (header)

        # Make sure all lists are of equal length
        if len(x_coords) != len(chars) or len(chars) != len(y_coords):
            raise Exception("Mismatched data lengths between x, characters, and y.")

        table_data = []

        # Convert each row's raw text to the appropriate types
        for i, (raw_x, raw_char, raw_y) in enumerate(zip(x_coords, chars, y_coords)):
            try:
                x = int(raw_x.strip())  # Extract X-coordinate

                # The parser may have read the UTF-8 bytes as Latin-1, which
                # turns one character into several. Undo that when possible;
                # if the round trip fails the text was already decoded
                # correctly and is kept unchanged instead of dropping the row.
                text = raw_char.strip()
                try:
                    char = text.encode('latin1').decode('utf-8')
                except UnicodeError:
                    char = text

                y = int(raw_y.strip())  # Extract Y-coordinate

                # Ensure char is a single character
                if len(char) != 1:
                    raise ValueError(f"Expected a single character but got: '{char}'")

                # Append the parsed (x, char, y) tuple to the table_data list
                table_data.append((x, char, y))

            except ValueError as ve:
                print(f"Error processing row {i}: {ve}")
            except Exception as e:
                print(f"Unexpected error processing row {i}: {e}")

        if not table_data:
            raise Exception("No valid data found in the table.")

        return table_data

    except Exception as e:
        # Deliberate best-effort boundary: report the problem and hand the
        # caller an empty result instead of propagating.
        print(f"Error retrieving or parsing Google Doc: {e}")
        return []

# Function to create and print the grid
def print_grid_from_data(url):
    """
    Creates and prints a grid representation of characters based on their X and Y coordinates
    extracted from the specified Google Doc.

    Each (x, character, y) tuple returned by get_google_doc_data is written to
    row y, column x of a grid that is otherwise filled with spaces. The grid is
    printed one row per line, starting with row 0.

    Args:
        url (str): The URL of the Google Doc to retrieve data from.

    Returns:
        None

    Raises:
        TypeError: If the provided URL is not a string.

    Any other problem (no data, invalid tuple types, negative coordinates,
    unexpected errors) is reported on stdout and the function returns without
    printing a grid.
    """
    # Check if the URL is a string
    if not isinstance(url, str):
        raise TypeError("The provided URL must be a string.")

    try:
        table_data = get_google_doc_data(url)

        # Nothing to draw: say so directly instead of letting max() fail on
        # an empty sequence.
        if not table_data:
            print("No data available to build the grid.")
            return

        # Validate every entry before sizing the grid
        for x, char, y in table_data:
            # Validate x and y as integers and char as a string
            if not (isinstance(x, int) and isinstance(y, int) and isinstance(char, str)):
                raise TypeError(f"Invalid data types for (x, char, y): ({x}, {char}, {y})")

            # A negative index would silently wrap around to the other side
            # of the grid and put the character in the wrong cell.
            if x < 0 or y < 0:
                raise ValueError(f"Coordinates must be non-negative, got (x, y): ({x}, {y})")

        # Find the maximum x and y coordinates to determine grid size
        max_x = max(data[0] for data in table_data)
        max_y = max(data[2] for data in table_data)

        # Create an empty grid filled with spaces
        grid = [[' ' for _ in range(max_x + 1)] for _ in range(max_y + 1)]

        # Place characters in the grid based on their coordinates
        for x, char, y in table_data:
            grid[y][x] = char

        # Print the grid row by row
        for row in grid:
            print(''.join(row))

    except ValueError as ve:
        print(f"Value error: {ve}")
    except TypeError as te:
        print(f"Type error: {te}")
    except Exception as e:
        print(f"Unexpected error occurred: {e}")


In [None]:
# Fetch the table from the document at `url` and print the resulting
# character grid (the captured output follows).
print_grid_from_data(url)

██████████░ ██████░    ███████░     ██░     ██░     ████████░    ██░    ███░   ████████░  
██░           ██░    ███░    ██░   ████░   ████░    ██░     ██░  ██░  ███░   ███░     ███░
██░           ██░   ███░           ██░██░ ██░██░    ██░      ██░ ██░███░     ██░       ██░
████████░     ██░   ██░           ███░ ██░██░ ██░   ██░      ██░ ████░       ██░       ██░
██░           ██░   ███░          ██░  █████░ ███░  ██░      ██░ ██░███░     ██░       ██░
██░           ██░    ███░    ██░ ███░   ███░   ██░  ██░     ██░  ██░  ███░   ███░     ███░
██████████░ ██████░    ███████░  ██░           ███░ ████████░    ██░    ███░   ████████░  
