## Imports

In [1]:
import re
import string

## Frequent cleaning operations

In [2]:
def print_text(sample, clean):
    """Show a value before and after a cleaning step.

    Args:
        sample: The original, uncleaned value.
        clean: The result of applying the cleaning operation to ``sample``.

    Prints two lines to stdout: ``Before: <sample>`` and ``After: <clean>``.
    """
    for label, value in (("Before", sample), ("After", clean)):
        print("{}: {}".format(label, value))

### Lowercase text

In [3]:
# Lowercase every character of the sample string.
sample_text = "This is a SAMPLE TEXT"
clean_text = str.lower(sample_text)
print_text(sample=sample_text, clean=clean_text)

Before: This is a SAMPLE TEXT
After: this is a sample text


### Case-fold text (useful for caseless comparisons)

In [4]:
# casefold() is intended for caseless matching; for this ASCII-only sample
# the result is the same as lower().
sample_text = "This is a SAMPLE TEXT"
clean_text = str.casefold(sample_text)
print_text(sample=sample_text, clean=clean_text)

Before: This is a SAMPLE TEXT
After: this is a sample text


### Remove links

In [5]:
# Source: https://stackoverflow.com/questions/11331982/how-to-remove-any-url-within-a-string-in-python/40823105#40823105
sample_text = "Some URLs: https://example.com http://example.io http://exam-ple.com More text"
# Delete each http/https URL: the scheme plus every following non-whitespace
# character. The spaces that surrounded the URLs are left in place.
clean_text = re.sub(
    pattern=r"https?://\S+",
    repl="",
    string=sample_text,
)
print_text(sample=sample_text, clean=clean_text)

Before: Some URLs: https://example.com http://example.io http://exam-ple.com More text
After: Some URLs:    More text


### Remove `<a>` tags but keep content

In [6]:
# Source: https://stackoverflow.com/questions/20867719/removing-a-href-tag-using-regex
sample_text = "Here's <a href='https://example.com'> a tag</a>"
# Strip anchor tags but keep the text they wrap.
# "<a" must be followed either directly by ">" or by whitespace and attributes,
# so other tags whose name merely starts with "a" (<abbr>, <article>, ...) are
# not mistaken for anchors.
# IGNORECASE also handles <A ...>...</A>; DOTALL lets the link text span lines.
clean_text = re.sub(
    r"<a(?:\s[^>]*)?>(.*?)</a>",
    r"\1",
    sample_text,
    flags=re.IGNORECASE | re.DOTALL,
)
print_text(sample_text, clean_text)

Before: Here's <a href='https://example.com'> a tag</a>
After: Here's  a tag


### Remove multiple spaces, tabs, and indents

In [7]:
sample_text = "\t\tA      text\t\t\t\n\n Example"
# split() with no arguments breaks on any run of whitespace and discards
# leading/trailing whitespace; re-joining with one space normalizes the text.
clean_text = str.join(" ", sample_text.split())
print_text(sample=sample_text, clean=clean_text)

Before: 		A      text			

 Example
After: A text Example


### Remove punctuation

In [8]:
sample_text = "A lot of !!!! .... ,,,, ;;;;;;;?????"
# Delete every character listed in string.punctuation; all other characters
# (including the spaces between the punctuation runs) are kept.
clean_text = sample_text.translate(str.maketrans("", "", string.punctuation))
print_text(sample=sample_text, clean=clean_text)

Before: A lot of !!!! .... ,,,, ;;;;;;;?????
After: A lot of    


### Remove numbers (standalone digit sequences)

In [9]:
# Source: https://stackoverflow.com/questions/40020326/how-to-remove-words-containing-only-numbers-in-python
sample_text = "This are some numbers: 1919191 2229292 11.233 22/22/22. Don't remove this one H2O"
clean_text = re.sub(r"\b[0-9]+\b\s*", "", sample_text)
print_text(sample_text, clean_text)

Before: This are some numbers: 1919191 2229292 11.233 22/22/22. Don't remove this one H2O
After: This are some numbers: .//. Don't remove this one H2O


### Remove digit-only tokens

In [10]:
sample_text = "I want to keep this one: 10/10/20 but not this one 222333"
# Drop whitespace-separated tokens made up solely of digits. Filtering them out
# (rather than substituting an empty string) avoids leaving doubled or trailing
# spaces where a token was removed. Tokens that mix digits with other
# characters, such as 10/10/20, are kept.
clean_text = " ".join(w for w in sample_text.split() if not w.isdigit())
print_text(sample_text, clean_text)

Before: I want to keep this one: 10/10/20 but not this one 222333
After: I want to keep this one: 10/10/20 but not this one 


### Remove non-alphanumeric characters

In [11]:
sample_text = "Sample text 123 !!!! Haha.... !!!! ##$$$%%%%"
# Delete every run of characters that are neither ASCII letters, ASCII digits,
# nor whitespace.
clean_text = re.compile(r"[^A-Za-z0-9\s]+").sub("", sample_text)
print_text(sample=sample_text, clean=clean_text)

Before: Sample text 123 !!!! Haha.... !!!! ##$$$%%%%
After: Sample text 123  Haha  


### Remove stopwords

In [12]:
# Words to drop from the text.
stop_words = ["is", "a"]
sample_text = "this is a sample text"
# Split on whitespace, keep only tokens that are not stop words, then re-join.
tokens = sample_text.split()
clean_tokens = list(filter(lambda tok: tok not in stop_words, tokens))
clean_text = " ".join(clean_tokens)
print_text(sample=sample_text, clean=clean_text)

Before: this is a sample text
After: this sample text


### Remove short tokens

In [13]:
sample_text = "this is a sample text. I'll remove the a"
# Split on whitespace and keep only tokens that are at least two characters
# long, i.e. drop single-character tokens.
tokens = sample_text.split()
clean_tokens = [tok for tok in tokens if len(tok) >= 2]
clean_text = " ".join(clean_tokens)
print_text(sample=sample_text, clean=clean_text)

Before: this is a sample text. I'll remove the a
After: this is sample text. I'll remove the


### Tokenize text

In [14]:
from nltk.tokenize import word_tokenize

# Unlike the cells above, the result here is a list of tokens rather than a
# string, so the "After" line shows the list's repr.
sample_text = "this is a text ready to tokenize"
clean_text = word_tokenize(sample_text)
print_text(sample=sample_text, clean=clean_text)

Before: this is a text ready to tokenize
After: ['this', 'is', 'a', 'text', 'ready', 'to', 'tokenize']
