This removes all diacritics from a str. It is heavy handed and will do it for any character, even those that are not Latin languages

In [8]:
import unicodedata
import string


def shave_marks(txt):
    """Remove all diacritic marks"""
    norm_txt = unicodedata.normalize('NFD', txt)
    shaved = ''.join(c for c in norm_txt if not unicodedata.combining(c))
    return unicodedata.normalize('NFC', shaved)




In [9]:
# A list of strings with diacritic marks and special characters
strings = [
    'Résumé',
    'Piqué',
    'naïve',
    'São Paulo',
    'Él Niño',
    'Beyoncé',
    'café',
    'jalapeño',
    'façade',
    'coördinate',
    'à la carte',
    'Pokémon',
    'Mëtàl Hëàd',
    '🎉Celebration time!',
    'Hello, 世界',
    'Καλημέρα κόσμε',
    'Привет мир',
    'السلام عليكم',
]

# Apply the function to each string and print the result
for s in strings:
    print(f'Original: {s}, Shaved: {shave_marks(s)}')



Original: Résumé, Shaved: Resume
Original: Piqué, Shaved: Pique
Original: naïve, Shaved: naive
Original: São Paulo, Shaved: Sao Paulo
Original: Él Niño, Shaved: El Nino
Original: Beyoncé, Shaved: Beyonce
Original: café, Shaved: cafe
Original: jalapeño, Shaved: jalapeno
Original: façade, Shaved: facade
Original: coördinate, Shaved: coordinate
Original: à la carte, Shaved: a la carte
Original: Pokémon, Shaved: Pokemon
Original: Mëtàl Hëàd, Shaved: Metal Head
Original: 🎉Celebration time!, Shaved: 🎉Celebration time!
Original: Hello, 世界, Shaved: Hello, 世界
Original: Καλημέρα κόσμε, Shaved: Καλημερα κοσμε
Original: Привет мир, Shaved: Привет мир
Original: السلام عليكم, Shaved: السلام عليكم


This is a version that will only shave the marks off if it it a Latin character:

In [10]:
def shave_marks_latin(txt):
    """Remove all diacritic marks from Latin base characters"""
    norm_txt = unicodedata.normalize('NFD', txt)
    latin_base = False
    preserve = []

    for c in norm_txt:
        if unicodedata.combining(c) and latin_base:
            continue # ignore diacritic on Latin base char
        preserve.append(c)
        # if it isn't combining char, it's a new base char
        if not unicodedata.combining(c):
            latin_base = c in string.ascii_letters
    shaved = ''.join(preserve)
    return unicodedata.normalize('NFC', shaved)
    

In [11]:
# A list of strings with diacritic marks and special characters
strings = [
    'Résumé',
    'Piqué',
    'naïve',
    'São Paulo',
    'Él Niño',
    'Beyoncé',
    'café',
    'jalapeño',
    'façade',
    'coördinate',
    'à la carte',
    'Pokémon',
    'Mëtàl Hëàd',
    '🎉Celebration time!',
    'Hello, 世界',
    'Καλημέρα κόσμε',
    'Привет мир',
    'السلام عليكم',
]

# Apply the function to each string and print the result
for s in strings:
    print(f'Original: {s}, Shaved: {shave_marks(s)}')


Original: Résumé, Shaved: Resume
Original: Piqué, Shaved: Pique
Original: naïve, Shaved: naive
Original: São Paulo, Shaved: Sao Paulo
Original: Él Niño, Shaved: El Nino
Original: Beyoncé, Shaved: Beyonce
Original: café, Shaved: cafe
Original: jalapeño, Shaved: jalapeno
Original: façade, Shaved: facade
Original: coördinate, Shaved: coordinate
Original: à la carte, Shaved: a la carte
Original: Pokémon, Shaved: Pokemon
Original: Mëtàl Hëàd, Shaved: Metal Head
Original: 🎉Celebration time!, Shaved: 🎉Celebration time!
Original: Hello, 世界, Shaved: Hello, 世界
Original: Καλημέρα κόσμε, Shaved: Καλημερα κοσμε
Original: Привет мир, Shaved: Привет мир
Original: السلام عليكم, Shaved: السلام عليكم


This will remove characters that are common in Western texts, but are not ASCII characters, and replace them with ASCII characters:

In [12]:
single_map = str.maketrans("""‚ƒ„ˆ‹‘’“”•–—˜›""", """'f"^<''""---~>""")

multi_map = str.maketrans({
    '€': 'EUR',
    '…': '...',
    'Æ': 'AE',
    'æ': 'ae',
    'Œ': 'OE',
    'œ': 'oe',
    '™': '(TM)',
    '‰': '<per mille>',
    '†': '**',
    '‡': '***',
})

multi_map.update(single_map)

def dewinize(txt):
    """Replace Win1252 symbols with ASCII chars or sequences"""
    return txt.translate(multi_map)

def asciiize(txt):
    no_marks = shave_marks_latin(dewinize(txt))
    no_marks = no_marks.replace('ß', 'ss')
    return unicodedata.normalize('NFKC', no_marks)


In [13]:
order = '“Herr Voß: • ½ cup of Œtker™ caffè latte • bowl of açaí.”'

print("Dewinized:", dewinize(order))

print("ASCIIized:", asciiize(order))

Dewinized: "Herr Voß: - ½ cup of OEtker(TM) caffè latte - bowl of açaí."
ASCIIized: "Herr Voss: - 1⁄2 cup of OEtker(TM) caffe latte - bowl of acai."


This is a command line utility to find the UTF number and name of a character (especially emojis and weird chars): it is in cf.py

In [14]:
#!/usr/bin/env python3
import sys
import unicodedata

START, END = ord(' '), sys.maxunicode + 1

def find(*query_words, start=START, end=END):
    query = {w.upper() for w in query_words}
    for code in range(start, end):
        char = chr(code)
        name = unicodedata.name(char, None)
        if name and query.issubset(name.split()):
            print(f'U+{code:04x}\t{char}\t{name}')
            
def main(words):
    if words:
        find(*words)
    else:
        print('Please provide words to find')

if __name__ == '__main__':
    main(sys.argv[1:])