In [1]:
var = "Doesn\u00e2\u0080\u0099t really do the job"

In [2]:
print(var)

Doesnât really do the job


In [30]:
import csv
from io import StringIO
from typing import Generator, Dict, Any

def decode_latin1_to_utf8(text: str) -> str:
    """
    Decode a string from Latin-1 (ISO-8859-1) to UTF-8.
    
    Args:
        text (str): The misencoded text.
    
    Returns:
        str: The corrected UTF-8 string.
    """
    # Convert the misencoded string back to bytes
    byte_string = text.encode('latin1')
    # Decode it properly into UTF-8
    return byte_string.decode('utf-8')

def parse_document(document: str) -> Generator[Dict[str, Any], None, None]:
    """
    Parse a TSV document and yield the parsed rows as dictionaries.

    Args:
        document (str): A TSV-formatted string.

    Yields:
        Dict[str, Any]: The parsed rows as dictionaries with column headers as keys.
    """
    # Debug output
    print(f"Raw document: {document}")

    if not document:
        print("Document is empty or None")
        return

    # Correct the encoding of the document
    corrected_document = decode_latin1_to_utf8(document)

    # Use StringIO to treat the corrected string as a file-like object
    tsv_file = StringIO(corrected_document)

    # Create a DictReader to parse the TSV data
    reader = csv.DictReader(tsv_file, delimiter="\t")

    for row in reader:
        yield row

In [32]:
# Sample TSV data with encoding issues
tsv_data = """Date\tRating\tComments\tYour Response\tOrder ID\tRater Email
22/08/24\t3\tNot adhesive enough. Came off within twenty minutes ð¤¦ââï¸\t\t204-7285323-7000301\tno-reply@amazon.com
16/08/24\t2\tWrong size delivered\t\t205-6760583-6261939\tno-reply@amazon.com
06/07/24\t1\tThis yoga match is not worth 10pence never mind what I paid for it. Slides all over the place Very disappointed\t\t203-6402425-2052315\tno-reply@amazon.com"""

# Use the parse_document method
for row in parse_document(tsv_data):
    print(row)

Raw document: Date	Rating	Comments	Your Response	Order ID	Rater Email
22/08/24	3	Not adhesive enough. Came off within twenty minutes ð¤¦ââï¸		204-7285323-7000301	no-reply@amazon.com
16/08/24	2	Wrong size delivered		205-6760583-6261939	no-reply@amazon.com
06/07/24	1	This yoga match is not worth 10pence never mind what I paid for it. Slides all over the place Very disappointed		203-6402425-2052315	no-reply@amazon.com
{'Date': '22/08/24', 'Rating': '3', 'Comments': 'Not adhesive enough. Came off within twenty minutes 🤦\u200d♂️', 'Your Response': '', 'Order ID': '204-7285323-7000301', 'Rater Email': 'no-reply@amazon.com'}
{'Date': '16/08/24', 'Rating': '2', 'Comments': 'Wrong size delivered', 'Your Response': '', 'Order ID': '205-6760583-6261939', 'Rater Email': 'no-reply@amazon.com'}
{'Date': '06/07/24', 'Rating': '1', 'Comments': 'This yoga match is not worth 10pence never mind what I paid for it. Slides all over the place Very disappointed', 'Your Response': '', 'Order ID': '203-6

In [27]:
import chardet

def detect_encoding(document: str) -> str:
    """
    Detect the encoding of a document.

    Args:
        document (str): A string of the document's content.

    Returns:
        str: Detected encoding.
    """
    # Detect encoding
    result = chardet.detect(document.encode('utf-8'))
    return result['encoding']

# Example usage
raw_document = "Doesnât really do the job"
encoding = detect_encoding(raw_document)
print(f"Detected encoding: {encoding}")

Detected encoding: utf-8
