# Table of Content
1. [Text Cleaning](#textcleaning)
2. [Text Preprocessing](#textpreprocessing)

In [None]:
%time
import os
import sys
import time
import random
import string
import warnings
if not sys.warnoptions:
    warnings.simplefilter("ignore")
    
import numpy as np
import pandas as pd
import sklearn

# libraries for text cleaning
import contractions
from bs4 import BeautifulSoup
from textblob import TextBlob
from spellchecker import SpellChecker

# libraries and packages for text (pre-)processing 
import string
import re
import nltk

In [105]:
train_df = pd.read_csv("Data/train.csv")
print(train_df.shape)
train_df.head()

(159571, 8)


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\r\nWhy the edits made under my use...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\r\nMore\r\nI can't make any real suggestions...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


<a id="textcleaning"></a>
# 1. Text Cleaning

## Convert to Lower Case

We convert all letters to lower case to prepare for the following steps of text cleaning. Exceptional cases such as capital abbreviation will be solved by replacing typos, slang, acronyms or informal abbreviations technique in the subsquent steps.

In [106]:
train_df["clean_text"] = train_df["comment_text"].apply(lambda x: x.lower())
train_df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,clean_text
0,0000997932d777bf,Explanation\r\nWhy the edits made under my use...,0,0,0,0,0,0,explanation\r\nwhy the edits made under my use...
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,d'aww! he matches this background colour i'm s...
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,"hey man, i'm really not trying to edit war. it..."
3,0001b41b1c6bb37e,"""\r\nMore\r\nI can't make any real suggestions...",0,0,0,0,0,0,"""\r\nmore\r\ni can't make any real suggestions..."
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,"you, sir, are my hero. any chance you remember..."


## Expand Contractions


Contractions are words or combinations of words that are shortened by dropping letters and replacing them by an apostrophe. Removing contractions helps contribute to text standardization. We use contractions package to expand contractions.

In [107]:
train_df["clean_text"] = train_df["clean_text"].apply(lambda x: contractions.fix(x))

In [108]:
# check if expand contractions works
print("Original text: \n", train_df["comment_text"][2])
print("Clean text: \n", train_df["clean_text"][2])

Original text: 
 Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info.
Clean text: 
 hey man, i am really not trying to edit war. it is just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. he seems to care more about the formatting than the actual info.


## Remove Noise

Remove unnecessary characters or punctuation such as URLs, HTML tags, non-ASCII characters, or other special characters 

### Remove URL

In [117]:
# replace URL with space
train_df["clean_text"] = train_df["clean_text"].apply(lambda x: re.sub(r'https?://\S+|www\.\S+', ' ', x))

###  Remove Non-ASCI Characters

In [131]:
# replace Non_ASCI characters with space
train_df["clean_text"] = train_df["clean_text"].apply(lambda x: re.sub(r'[^\x00-\x7f]', ' ', x))

###  Remove Special Characters

In [147]:
regrex_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u'\U00002702-\U000027B0'
        u'\U000024C2-\U0001F251'
        u"\ufe0f"  # dingbats
        "]+", flags = re.UNICODE)

In [148]:
# replace special characters with space
train_df["clean_text"] = train_df["clean_text"].apply(lambda x: regrex_pattern.sub(' ', x))

In [149]:
# check if special characters are removed
print("Original text: \n", train_df["comment_text"][143])
print("Clean text: \n", train_df["clean_text"][143])

Original text: 
 "P.S. It's not polite to talk to people behind their backs, please remove your comments from Mrph's talk page.

Vaughan
You're right; I went to check your previous edit and found a page on the Marvel site that spelled it ""Vaughn"", but now I am finding many more that spell it correctly. Thanks for the edits.   (☎☓) 

"
Clean text: 
 "p.s. it is not polite to talk to people behind their backs, please remove your comments from mrph's talk page.

vaughan
you are right; i went to check your previous edit and found a page on the marvel site that spelled it ""vaughn"", but now i am finding many more that spell it correctly. thanks for the edits.   (  ) 

"


### Remove HTML Tag (BeautifulSoup not really useful? merely remove space?)

In [172]:
cleaned_text = train_df["clean_text"].apply(lambda x: BeautifulSoup(str(x)).get_text())

In [173]:
text_changed = cleaned_text!=train_df["clean_text"]

In [174]:
[i for i, x in enumerate(text_changed) if x][:10]

[228, 329, 3303, 3699, 3858, 4112, 4929, 5547, 5837, 6193]

In [175]:
train_df["clean_text"][228]

'   heritage from village           in macedonian          . sources claim that the village was pure slavic.'

In [176]:
cleaned_text[228]

'heritage from village           in macedonian          . sources claim that the village was pure slavic.'

In [229]:
# replace HTML tag with space
html = re.compile(r"<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});")
train_df["clean_text"] = train_df["clean_text"].apply(lambda x: re.sub(html, " ", x))

###  Remove Extra Space

In [190]:
# replace \r\n with space
train_df["clean_text"] = train_df["clean_text"].apply(lambda x: re.sub('\r\n', ' ', x))

In [230]:
# remove extra space
train_df["clean_text"] = train_df["clean_text"].apply(lambda x: re.sub(' +', ' ', x))

## Replace Common Slangs

Slang, acronyms or informal abbreviations should be replaced with formal English. The list of common slangs used in Tweets takes reference from https://www.kaggle.com/code/nmaguette/up-to-date-list-of-slangs-for-text-preprocessing.

In [51]:
# read abbreviation.csv
abbreviations = pd.read_csv('Data/abbreviations.csv')
abbreviations.head()

Unnamed: 0,abbreviation,translation
0,$,dollar
1,€,euro
2,4ao,for adults only
3,a.m,before midday
4,a3,anytime anywhere anyplace


In [58]:
# convert the data frame to a dictionary
abbreviations_dict = dict(zip(abbreviations.abbreviation, abbreviations.translation))

In [59]:
# define a helper function to replace the abbreviations
def convert_abbrev(text):
    # create a pattern of all abbreviations and make sure they are not part of a longer word
    abbreviations_pattern = re.compile(r'(?<!\w)(' + '|'.join(re.escape(key) for key in abbreviations_dict.keys()) + r')(?!\w)')
    # replace an abbreviation with its translation
    text = abbreviations_pattern.sub(lambda x: abbreviations_dict[x.group()], text)
    return text

In [192]:
# replace the slangs
train_df["clean_text"] = train_df["clean_text"].apply(convert_abbrev)

In [193]:
# check if slangs are replaced
print("Original text: \n", train_df["comment_text"][1])
print("Clean text: \n", train_df["clean_text"][1])

Original text: 
 D'aww! He matches this background colour I'm seemingly stuck with. Thanks.  (talk) 21:51, January 11, 2016 (UTC)
Clean text: 
 d'aww! he matches this background colour i am seemingly stuck with. thanks. (talk) 21:51, january 11, 2016 (coordinated universal time)


## Spelling Correction

We should correct the misspellings in the text. Both SpellChecker and TextBlob provide such functions, and we would like to compare their performance.

In [217]:
# select random texts from clean_text
length = len(train_df["clean_text"])
random_num = random.sample(range(length), 100)
random_text = train_df["clean_text"][random_num]

In [219]:
# using TextBlob package
start_time1 = time.time()
random_text.apply(lambda x: TextBlob(x).correct())
print("--- %s seconds ---" % (time.time() - start_time1))

--- 133.19340062141418 seconds ---


In [218]:
# using SpellChecker package
start_time2 = time.time()
random_text.apply(lambda x: SpellChecker().correction(x))
print("--- %s seconds ---" % (time.time() - start_time2))

--- 33.11142897605896 seconds ---


Randomly select 100 texts and apply spelling correction functions on them. Comparing the execution time of 2 different packages, SpellChecker is much faster than TextBlob. Considering we are using a large-scale dataset, SpellChecker is preferred.

In [246]:
def correct_spelling(text):
    start_time = time.time()
    cleaned_text = []
    spellchecker = SpellChecker()
    for i in range(text.shape[0]):
        if i%100==0:
            print(f'{i}-th text is being processed')
        cleaned_text.append(spellchecker.correction(text[i]))
    print("--- %s seconds ---" % (time.time() - start_time))
    return cleaned_text

In [247]:
cleaned_text = correct_spelling(train_df["clean_text"][:1001])

0-th text is being processed
100-th text is being processed
200-th text is being processed
300-th text is being processed
400-th text is being processed
500-th text is being processed
600-th text is being processed
700-th text is being processed
800-th text is being processed
900-th text is being processed
1000-th text is being processed
--- 124.26375722885132 seconds ---


In [254]:
train_df["clean_text"][:1001].index[cleaned_text!=train_df["clean_text"][:1001]]

Int64Index([ 62,  89, 101, 173, 175, 211, 217, 223, 226, 241, 250, 254, 259,
            268, 276, 299, 320, 323, 376, 381, 397, 408, 423, 448, 465, 470,
            504, 545, 592, 627, 632, 646, 715, 743, 758, 787, 806, 807, 814,
            823, 831, 844, 852, 874, 877, 883, 897, 899, 913, 923, 947, 971],
           dtype='int64')

In [266]:
train_df["clean_text"][971]

'. fu ck ing trollreasons'

In [265]:
cleaned_text[971]

However, many corrections do not make sense, and may omit some useful information. We decided not to use established package to perform spelling correction.

## Remove Punctuations

We remove punctuations from the text as the final step of text cleaning.

In [267]:
# remove punctuations
train_df["clean_text"] = train_df["clean_text"].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))

In [268]:
train_df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,clean_text
0,0000997932d777bf,Explanation\r\nWhy the edits made under my use...,0,0,0,0,0,0,explanation why the edits made under my userna...
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,daww he matches this background colour i am se...
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,hey man i am really not trying to edit war it ...
3,0001b41b1c6bb37e,"""\r\nMore\r\nI can't make any real suggestions...",0,0,0,0,0,0,more i cannot make any real suggestions on im...
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,you sir are my hero any chance you remember wh...


In [280]:
train_df.drop('comment_text', axis=1).to_csv('Data/cleaned_train.csv', index=False)

<a id="textpreprocessing"></a>
# 2. Text Preprocessing

In [281]:
cleaned_df = pd.read_csv("Data/cleaned_train.csv")
cleaned_df.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate,clean_text
0,0000997932d777bf,0,0,0,0,0,0,explanation why the edits made under my userna...
1,000103f0d9cfb60f,0,0,0,0,0,0,daww he matches this background colour i am se...
2,000113f07ec002fd,0,0,0,0,0,0,hey man i am really not trying to edit war it ...
3,0001b41b1c6bb37e,0,0,0,0,0,0,more i cannot make any real suggestions on im...
4,0001d958c54c6e35,0,0,0,0,0,0,you sir are my hero any chance you remember wh...
