# Clean-text Documentation

### Default Parameters

from cleantext import clean

clean("some input",

    fix_unicode=True,               # fix various unicode errors
    
    to_ascii=True,                  # transliterate to closest ASCII representation
    
    lower=True,                     # lowercase text
    
    no_line_breaks=False,           # fully strip line breaks as opposed to only normalizing them
    
    no_urls=False,                  # replace all URLs with a special token
    
    no_emails=False,                # replace all email addresses with a special token
    
    no_phone_numbers=False,         # replace all phone numbers with a special token
    
    
    no_numbers=False,               # replace all numbers with a special token
    
    no_digits=False,                # replace all digits with a special token
    
    no_currency_symbols=False,      # replace all currency symbols with a special token
    
    no_punct=False,                 # remove punctuation
    
    replace_with_punct="",          # instead of removing punctuation, you may replace it with this string
    
    replace_with_url="<URL>",       # special token used for URLs (see no_urls)
    
    replace_with_email="<EMAIL>",   # special token used for email addresses (see no_emails)
    
    replace_with_phone_number="<PHONE>",  # special token used for phone numbers (see no_phone_numbers)
    
    replace_with_number="<NUMBER>",  # special token used for numbers (see no_numbers)
    
    replace_with_digit="0",         # special token used for each digit (see no_digits)
    
    replace_with_currency_symbol="<CUR>",  # special token used for currency symbols (see no_currency_symbols)
    
    lang="en"                       # set to 'de' for German special handling
)


Carefully choose the arguments that fit your task. The default parameters are listed above.

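You can also call individual cleaning functions on their own, such as `cleantext.replace_urls` or `cleantext.normalize_whitespace`, as shown in the examples below. See the library's source code for the full list of functions.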

So far, only English and German are fully supported.

In [1]:
# Import the clean() entry point from the clean-text library
from cleantext import clean
# Sample text containing non-ASCII letters, digits, a URL, punctuation and a currency symbol
text = """ Zürich, largest city of Switzerland and capital of the canton of 633Zürich. Located in an Al\u017eupine. (https://google.com). Currency is not ₹"""
# Clean the sample text: URLs and numbers are replaced with the empty string,
# digits and punctuation with a space, and currency symbols with "Rupees"
clean(text, 
      fix_unicode=True, 
      to_ascii=True, 
      lower=True, 
      no_urls=True, 
      no_numbers=True, 
      no_digits=True, 
      no_currency_symbols=True, 
      no_punct=True, 
      replace_with_punct=" ", 
      replace_with_url="", 
      replace_with_number="", 
      replace_with_digit=" ", 
      replace_with_currency_symbol="Rupees")


'zurich largest city of switzerland and capital of the canton of zurich located in an alzupine currency is not rupees'

In [40]:
import cleantext
def test_normalize_whitespace():
    """Print a sample with mixed spaces, tabs and line breaks after whitespace normalization."""
    sample = "Hello, world!  Hello...\t \tworld?\n\nHello:\r\n\n\nWorld. "
    normalized = cleantext.normalize_whitespace(sample, no_line_breaks=False)
    print(normalized)

In [41]:
test_normalize_whitespace()

Hello, world! Hello... world?
Hello:
World.


In [4]:
def test_replace_urls():
    """Check that ``cleantext.replace_urls`` swaps URLs for the ``*URL*`` token.

    Each case pairs an input sentence with the output expected after
    replacement. The result is printed first, so the notebook still shows
    it, and is then compared with the expected string. Previously the
    expected strings were defined but never checked.

    Raises:
        AssertionError: if any result differs from its expected string.
    """
    cases = [
        (
            "I learned everything I know from www.stackoverflow.com and http://wikipedia.org/ and Mom.",
            "I learned everything I know from *URL* and *URL* and Mom.",
        ),
        (
            "There's a bunch of references in that one scene alone, including [Moana](https://en.wikipedia.org/wiki/Moana_%282016_film%29), which comes out later this year.",
            "There's a bunch of references in that one scene alone, including [Moana](*URL*), which comes out later this year.",
        ),
        (
            # The bare "localhost:8080" (no scheme) is expected to stay untouched.
            "Also this should be fixed http://localhost:8080, https://localhost:8080, localhost:8080",
            "Also this should be fixed *URL*, *URL*, localhost:8080",
        ),
    ]

    for text, expected in cases:
        result = cleantext.replace_urls(text, "*URL*")
        print(result)
        assert result == expected

In [5]:
test_replace_urls()

I learned everything I know from *URL* and *URL* and Mom.
There's a bunch of references in that one scene alone, including [Moana](*URL*), which comes out later this year.
Also this should be fixed *URL*, *URL*, localhost:8080


In [6]:
# Inputs that test_email_addresses() feeds to cleantext.replace_emails; the
# recorded output shows each one fully replaced by the token. Includes
# obfuscated "at" spellings such as (at), <at>, [at] and {at}.
email_addresses = [
    "mustermann@fh-aachen.de",
    "mustermann(at)fh-aachen.de",
    "m.mustermann@fh-aachen.de",
    "m.mustermann(at)fh-aachen.de",
    "m.mustermann<at>fh-aachen.de",
    "m.mustermann[at]fh-aachen.de",
    "m.mustermann{at}fh-aachen.de",
    "m.mustermann@alumni.fh-aachen.de",
    "max.mustermann@alumni.fh-aachen.com",
    "hotbunny1337@test.mail.gg",
    "test@this.really.should.work.com",
]

# Inputs that test_not_email_addresses() feeds to cleantext.replace_emails;
# the recorded output shows each one left unchanged.
not_email_addresses = [
    "mustermann@ fh-aachen.de",
    "mustermannatfh-aachen.de",
    "mustermannat)fh-aachen.de",
    "@test.de",
    "hu@.de",
]


def test_replace_emails():
    """Check that ``cleantext.replace_emails`` swaps an address for ``*EMAIL*``.

    The result is printed first, so the notebook still shows it, and is
    then compared with the expected sentence. Previously the expected
    sentence was defined but never checked.

    Raises:
        AssertionError: if the result differs from the expected sentence.
    """
    text = "I can be reached at username@example.com through next Friday."
    expected = "I can be reached at *EMAIL* through next Friday."
    result = cleantext.replace_emails(text, "*EMAIL*")
    print(result)
    assert result == expected


In [7]:
test_replace_emails()

I can be reached at *EMAIL* through next Friday.


In [8]:
def test_email_addresses():
    """Print every entry of ``email_addresses`` after email replacement with ``*EMAIL*``."""
    for address in email_addresses:
        masked = cleantext.replace_emails(address, "*EMAIL*")
        print(masked)


def test_not_email_addresses():
    """Print every entry of ``not_email_addresses`` after email replacement with ``*EMAIL*``."""
    for candidate in not_email_addresses:
        processed = cleantext.replace_emails(candidate, "*EMAIL*")
        print(processed)


In [9]:
test_email_addresses()

*EMAIL*
*EMAIL*
*EMAIL*
*EMAIL*
*EMAIL*
*EMAIL*
*EMAIL*
*EMAIL*
*EMAIL*
*EMAIL*
*EMAIL*


In [10]:
test_not_email_addresses()

mustermann@ fh-aachen.de
mustermannatfh-aachen.de
mustermannat)fh-aachen.de
@test.de
hu@.de


In [11]:
# Phone-number spellings fed to cleantext.replace_phone_numbers by
# test_replace_phone_numbers(): with and without a "+" prefix, separated by
# spaces, dashes or slashes, and one embedded after a "Tel.:" label.
phone_numbers = [
    "+49 123 1548690",
    "555-123-4567",
    "2404 9099130",
    "024049099130",
    "02404 9099130",
    "02404/9099130",
    "+492404 9099130",
    "+4924049099130",
    "+492404/9099130",
    "0160 123456789",
    "0160/123456789",
    "+32160 123456789",
    "Tel.: 0160 123456789",
]


def test_replace_phone_numbers():
    """Print, for each entry of ``phone_numbers``, whether replacement succeeded.

    A replacement counts as successful when the output contains ``PHONE``
    and no digit characters remain. The flag is printed together with the
    "input / output" pair.
    """
    for number in phone_numbers:
        masked = cleantext.replace_phone_numbers(number, "*PHONE*")
        has_token = "PHONE" in masked
        digit_free = not any(ch.isdigit() for ch in masked)
        print(has_token and digit_free, " / ".join((number, masked)))


def test_replace_numbers():
    """Check that ``cleantext.replace_numbers`` swaps numbers for ``*NUM*``.

    The result is printed first, so the notebook still shows it, and is
    then compared with the expected sentence. Previously the expected
    sentence was defined but never checked.

    Raises:
        AssertionError: if the result differs from the expected sentence.
    """
    text = "I owe $1,000.99 to 123 people for 2 +1 reasons."
    expected = "I owe $*NUM* to *NUM* people for *NUM* *NUM* reasons."
    result = cleantext.replace_numbers(text, "*NUM*")
    print(result)
    assert result == expected


In [12]:
test_replace_phone_numbers()

True +49 123 1548690 / *PHONE*
True 555-123-4567 / *PHONE*
True 2404 9099130 / *PHONE*
True 024049099130 / *PHONE*
True 02404 9099130 / *PHONE*
True 02404/9099130 / *PHONE*
True +492404 9099130 / *PHONE*
True +4924049099130 / *PHONE*
True +492404/9099130 / *PHONE*
True 0160 123456789 / *PHONE*
True 0160/123456789 / *PHONE*
True +32160 123456789 / *PHONE*
True Tel.: 0160 123456789 / Tel.: *PHONE*


In [13]:
test_replace_numbers()

I owe $*NUM* to *NUM* people for *NUM* *NUM* reasons.


In [14]:
def test_remove_punct():
    """Check that ``cleantext.remove_punct`` strips punctuation from a sentence.

    The result is printed first, so the notebook still shows it, and is
    then compared with the expected string. Previously the expected string
    was defined but never checked. The function returns nothing.

    Raises:
        AssertionError: if the result differs from the expected string.
    """
    text = "I can't. No, I won't! It's a matter of \"principle\"; of -- what's the word? -- conscience."
    # Removing "--" leaves the spaces on both sides, hence the double spaces.
    expected = (
        "I cant No I wont Its a matter of principle of  whats the word  conscience"
    )
    result = cleantext.remove_punct(text)
    print(result)
    assert result == expected


In [15]:
test_remove_punct()

I cant No I wont Its a matter of principle of  whats the word  conscience


In [16]:
def test_replace_punct():
    """Check that ``cleantext.clean`` can replace punctuation with a space.

    Calls ``clean`` with ``no_punct=True`` and ``replace_with_punct=" "``.
    The result is printed first, so the notebook still shows it, and is
    then compared with the expected string. Previously the expected string
    was defined but never checked.

    Raises:
        AssertionError: if the result differs from the expected string.
    """
    text = "I can't. No, I won't!"
    expected = "i can t no i won t"
    result = cleantext.clean(text, no_punct=True, replace_with_punct=" ")
    print(result)
    assert result == expected


In [17]:
test_replace_punct()

i can t no i won t


In [18]:
def test_replace_currency_symbols():
    """Check ``cleantext.replace_currency_symbols`` in both replacement modes.

    Each case holds the input, the output expected with
    ``replace_with=None`` (symbols become currency codes such as ``USD``),
    and the output expected with ``replace_with="*CUR* "``. Both results
    are printed first, so the notebook still shows them, and are then
    compared with the expected strings. Previously the expected strings
    were unpacked but never checked.

    Raises:
        AssertionError: if any result differs from its expected string.
    """
    cases = [
        (
            "$1.00 equals £0.67 equals €0.91.",
            "USD1.00 equals GBP0.67 equals EUR0.91.",
            "*CUR* 1.00 equals *CUR* 0.67 equals *CUR* 0.91.",
        ),
        (
            "this zebra costs $100.",
            "this zebra costs USD100.",
            "this zebra costs *CUR* 100.",
        ),
    ]
    for text, expected_codes, expected_token in cases:
        as_codes = cleantext.replace_currency_symbols(text, replace_with=None)
        print(as_codes)
        as_token = cleantext.replace_currency_symbols(text, replace_with="*CUR* ")
        print(as_token)
        assert as_codes == expected_codes
        assert as_token == expected_token


In [19]:
test_replace_currency_symbols()

USD1.00 equals GBP0.67 equals EUR0.91.
*CUR* 1.00 equals *CUR* 0.67 equals *CUR* 0.91.
this zebra costs USD100.
this zebra costs *CUR* 100.


In [20]:
def test_fix_bad_unicode():
    """Assert that escaped curly quotes are repaired to plain quotes, then print the result."""
    raw = "and install a \\u2018new\\u2019 society in their"
    expected = "and install a 'new' society in their"
    assert cleantext.fix_bad_unicode(raw) == expected
    print(cleantext.fix_bad_unicode(raw))


In [21]:
test_fix_bad_unicode()

and install a 'new' society in their


In [22]:
def test_zero_digits():
    """Print two sample sentences after ``cleantext.replace_digits`` with its default replacement."""
    samples = (
        "in the 1970s there was 12.3 and 111 11 33 $23 03 wins",
        "7 Golf Records More 'Unbreakable' Than the Warriors' 73 Wins",
    )
    for sample in samples:
        print(cleantext.replace_digits(sample))


In [23]:
test_zero_digits()

in the 0000s there was 00.0 and 000 00 00 $00 00 wins
0 Golf Records More 'Unbreakable' Than the Warriors' 00 Wins


In [24]:
def test_to_ascii():
    """Print ASCII transliteration results for the default language and for German.

    The German sample is run with both ``"de"`` and ``"DE"`` as the
    language code.
    """
    default_result = cleantext.to_ascii_unicode("whatëver")
    print(default_result)
    for language in ("de", "DE"):
        print(cleantext.to_ascii_unicode("Äpfel»", lang=language))


In [25]:
test_to_ascii()

whatever
Äpfel"
Äpfel"


In [26]:
def test_whitespace():
    """Print ``cleantext.clean`` results for several whitespace-related option combinations."""
    cases = (
        (" peter", {"normalize_whitespace": False}),
        (" peter", {"normalize_whitespace": True}),
        (" pet\n\ner", {"normalize_whitespace": True, "no_line_breaks": True}),
        (" pet\n\ner", {"normalize_whitespace": True, "no_line_breaks": False}),
    )
    for raw, options in cases:
        print(cleantext.clean(raw, **options))


In [27]:
test_whitespace()

 peter
peter
pet er
pet
er


In [28]:
# Sample text mixing plain words with emojis, including skin-tone and
# multi-person sequences; shared by the emoji examples that follow.
emoji_line = (
    "🤔 🙈 me, se 😌 ds 💕👭👙 hello 👩🏾‍🎓 emoji hello 👨‍👩‍👦‍👦 how are 😊 you today🙅🏽🙅🏽"
)

In [29]:
def test_keep_emojis():
    """Print ``emoji_line`` after ``cleantext.clean`` with default options."""
    cleaned = cleantext.clean(emoji_line)
    print(cleaned)

In [30]:
def test_remove_emojis():
    """Print ``emoji_line`` after ``cleantext.clean`` with ``no_emoji=True``."""
    cleaned = cleantext.clean(emoji_line, no_emoji=True)
    print(cleaned)

In [31]:
def test_remove_emojis_no_ascii():
    """Print a short emoji sample cleaned with ``no_emoji=True`` and ``to_ascii=False``."""
    sample = "😊 you today🙅🏽🙅🏽"
    cleaned = cleantext.clean(sample, to_ascii=False, no_emoji=True)
    print(cleaned)

In [32]:
test_keep_emojis()
test_remove_emojis()
test_remove_emojis_no_ascii()

🤔 🙈 me, se 😌 ds 💕👭👙 hello 👩🏾‍🎓 emoji hello 👨‍👩‍👦‍👦 how are 😊 you today🙅🏽🙅🏽
me, se ds hello emoji hello how are you today
you today


In [33]:
def test_remove_trail_leading_whitespace():
    """Clean a German letter with stray indentation and compare it to the tidy version.

    The cleaned text is printed, then a second ``clean`` call with the same
    options is asserted to equal the expected output.
    """
    raw_letter = """
    Sehr geehrte Damen und Herren,
ich möchte Sie bitten, zu folgendem Fall Stellung zu nehmen. Ich habe einen Fotoautomaten für biometrische Passfotos benutzt, der mein Gesicht nicht erkannt hat. Es besteht die Vermutung, dass dieser Fotoautomat vom BSI zertifiziert ist (Zertifikat BSI-DSZ-CC-0985-2018).
Der Fotoautomat steht in  19061  Berlin.
		Marke: Fotofix
		Ort des Automats: Bezirksamt / Bürgeramt / Bürgerbüro
Mit freundlichen Grüßen,
Johannes dfdfd
    """

    expected_letter = """Sehr geehrte Damen und Herren,
ich möchte Sie bitten, zu folgendem Fall Stellung zu nehmen. Ich habe einen Fotoautomaten für biometrische Passfotos benutzt, der mein Gesicht nicht erkannt hat. Es besteht die Vermutung, dass dieser Fotoautomat vom BSI zertifiziert ist (Zertifikat BSI-DSZ-CC-0985-2018).
Der Fotoautomat steht in 19061 Berlin.
Marke: Fotofix
Ort des Automats: Bezirksamt / Bürgeramt / Bürgerbüro
Mit freundlichen Grüßen,
Johannes dfdfd"""

    # Same keyword arguments for the printed call and the asserted call.
    options = {
        "lower": False,
        "lang": "de",
        "no_line_breaks": False,
        "keep_two_line_breaks": True,
    }

    print(cleantext.clean(raw_letter, **options))

    assert expected_letter == cleantext.clean(raw_letter, **options)


In [34]:
test_remove_trail_leading_whitespace()

Sehr geehrte Damen und Herren,
ich möchte Sie bitten, zu folgendem Fall Stellung zu nehmen. Ich habe einen Fotoautomaten für biometrische Passfotos benutzt, der mein Gesicht nicht erkannt hat. Es besteht die Vermutung, dass dieser Fotoautomat vom BSI zertifiziert ist (Zertifikat BSI-DSZ-CC-0985-2018).
Der Fotoautomat steht in 19061 Berlin.
Marke: Fotofix
Ort des Automats: Bezirksamt / Bürgeramt / Bürgerbüro
Mit freundlichen Grüßen,
Johannes dfdfd


In [38]:
def test_remove_trail_leading_whitespace():
    """Decode an escaped byte string and print it after cleaning with German settings.

    The sample contains runs of CRLF line breaks and tabs. It is decoded
    with ``unicode_escape`` and passed to ``cleantext.clean`` with
    ``no_line_breaks=False`` and ``keep_two_line_breaks=True``.
    """
    escaped = b'Sehr geehrte Damen und Herren,\\r\\n\\r\\nich m\\xf6chte Sie bitten, zu folgendem Fall Stellung zu nehmen. Ich habe einen Fotoautomaten f\\xfcr biometrische Passfotos benutzt, der mein Gesicht nicht erkannt hat. Es besteht die Vermutung, dass dieser Fotoautomat vom BSI zertifiziert ist (Zertifikat BSI-DSZ-CC-0985-2018).\\r\\n\\r\\nDer Fotoautomat steht in  .\\r\\n\\r\\n\\r\\n\\t\\r\\n\\t\\tOrt des Automats: \\r\\n\\t\\r\\n\\r\\n\\r\\n\\r\\n \\r\\n\\t\\r\\n\\t\\tMarke: \\r\\n\\t\\r\\n\\r\\n\\r\\n\\r\\n\\r\\nHier noch Text von Anna Lena.\\r\\n\\r\\nMit freundlichen Gr\\xfc\\xdfen'
    decoded = escaped.decode("unicode_escape")
    cleaned = cleantext.clean(
        decoded,
        lower=False,
        lang="de",
        no_line_breaks=False,
        keep_two_line_breaks=True,
    )
    print(cleaned)

In [39]:
test_remove_trail_leading_whitespace()

Sehr geehrte Damen und Herren,

ich möchte Sie bitten, zu folgendem Fall Stellung zu nehmen. Ich habe einen Fotoautomaten für biometrische Passfotos benutzt, der mein Gesicht nicht erkannt hat. Es besteht die Vermutung, dass dieser Fotoautomat vom BSI zertifiziert ist (Zertifikat BSI-DSZ-CC-0985-2018).

Der Fotoautomat steht in .

Ort des Automats:

Marke:

Hier noch Text von Anna Lena.

Mit freundlichen Grüßen
