In [74]:
import pandas as pd

# expand contractions
import contractions

# match regular expression
import re

In [105]:
# Read the training set from the CSV path below into a DataFrame.
train_df = pd.read_csv("Data/train.csv")
# Print the (rows, columns) tuple, then show the first rows as the cell output.
print(train_df.shape)
train_df.head()

(159571, 8)


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\r\nWhy the edits made under my use...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\r\nMore\r\nI can't make any real suggestions...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


After exploring `comment_text`, we found that the text contains noise that carries no useful information and would hinder the later preprocessing steps. Therefore, we first performed some data cleaning, including:
- Expand Contractions
- Remove URL
- Remove Non-ASCII Characters
- Remove Special Characters
- Remove Extra Spaces

## Expand Contractions


Contractions are words or combinations of words that are shortened by dropping letters and replacing them with an apostrophe (e.g. "I'm" for "I am"). Expanding contractions helps standardize the text. We use the `contractions` package to expand them.

In [76]:
# Expand contractions in every raw comment. The original comment_text column
# is left untouched; the expanded text is written to a new clean_text column.
train_df["clean_text"] = train_df["comment_text"].apply(contractions.fix)

In [77]:
# Spot-check row 2: print the raw comment next to its cleaned version to
# confirm that contractions were expanded.
for label, column in (("Original text: \n", "comment_text"), ("Clean text: \n", "clean_text")):
    print(label, train_df.loc[2, column])

Original text: 
 Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info.
Clean text: 
 Hey man, I am really not trying to edit war. It is just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info.


## Remove URL

In [78]:
# Swap every URL (http://..., https://... or www....) for a single space.
url_pattern = re.compile(r'https?://\S+|www\.\S+')
train_df["clean_text"] = train_df["clean_text"].apply(lambda text: url_pattern.sub(' ', text))

In [79]:
# Spot-check row 22: print the raw comment next to its cleaned version to
# confirm that the URL was removed.
for label, column in (("Original text: \n", "comment_text"), ("Clean text: \n", "clean_text")):
    print(label, train_df.loc[22, column])

Original text: 
 "

 Snowflakes are NOT always symmetrical! 

Under Geometry it is stated that ""A snowflake always has six symmetric arms."" This assertion is simply not true! According to Kenneth Libbrecht, ""The rather unattractive irregular crystals are by far the most common variety."" http://www.its.caltech.edu/~atomic/snowcrystals/myths/myths.htm#perfection Someone really need to take a look at his site and get FACTS off of it because I still see a decent number of falsities on this page. (forgive me Im new at this and dont want to edit anything)"
Clean text: 
 "

 Snowflakes are NOT always symmetrical! 

Under Geometry it is stated that ""A snowflake always has six symmetric arms."" This assertion is simply not true! According to Kenneth Libbrecht, ""The rather unattractive irregular crystals are by far the most common variety.""   Someone really need to take a look at his site and get FACTS off of it because I still see a decent number of falsities on this page. (forgi

## Remove Non-ASCII Characters

In [80]:
# Replace every character outside the 7-bit ASCII range (0x00-0x7f) with a space.
non_ascii_pattern = re.compile(r'[^\x00-\x7f]')
train_df["clean_text"] = train_df["clean_text"].apply(lambda text: non_ascii_pattern.sub(' ', text))

In [81]:
# Spot-check row 5: print the raw comment next to its cleaned version to
# confirm that non-ASCII characters were removed.
for label, column in (("Original text: \n", "comment_text"), ("Clean text: \n", "clean_text")):
    print(label, train_df.loc[5, column])

Original text: 
 "

Congratulations from me as well, use the tools well.  · talk "
Clean text: 
 "

Congratulations from me as well, use the tools well.    talk "


##  Remove Special Characters

In [82]:
# Matches one or more consecutive emoji / symbol code points.
regrex_pattern = re.compile(
    "["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    u'\U00002702-\U000027B0'  # dingbats
    u'\U000024C2-\U0001F251'  # wide range; also covers the dingbat and flag ranges above
    u"\ufe0f"  # variation selector-16
    "]+",
    re.UNICODE,
)

In [83]:
# Substitute one space for each run of characters matched by regrex_pattern.
# NOTE(review): every character regrex_pattern can match is non-ASCII, and the
# non-ASCII cell above has already replaced those with spaces, so on a
# top-to-bottom run this pass has nothing left to replace.
train_df["clean_text"] = train_df["clean_text"].map(lambda text: regrex_pattern.sub(' ', text))

In [84]:
# Spot-check row 143: print the raw comment next to its cleaned version to
# confirm that special characters were removed.
for label, column in (("Original text: \n", "comment_text"), ("Clean text: \n", "clean_text")):
    print(label, train_df.loc[143, column])

Original text: 
 "P.S. It's not polite to talk to people behind their backs, please remove your comments from Mrph's talk page.

Vaughan
You're right; I went to check your previous edit and found a page on the Marvel site that spelled it ""Vaughn"", but now I am finding many more that spell it correctly. Thanks for the edits.   (☎☓) 

"
Clean text: 
 "P.S. It is not polite to talk to people behind their backs, please remove your comments from Mrph's talk page.

Vaughan
You are right; I went to check your previous edit and found a page on the Marvel site that spelled it ""Vaughn"", but now I am finding many more that spell it correctly. Thanks for the edits.   (  ) 

"


##  Remove Extra Space

In [85]:
# Turn each CR LF line break into a single space.
train_df["clean_text"] = train_df["clean_text"].apply(lambda text: text.replace('\r\n', ' '))

In [86]:
# Turn each remaining LF line break into a single space.
train_df["clean_text"] = train_df["clean_text"].apply(lambda text: text.replace('\n', ' '))

In [87]:
# Squeeze each run of consecutive spaces down to one space.
multi_space_pattern = re.compile(' +')
train_df["clean_text"] = train_df["clean_text"].apply(lambda text: multi_space_pattern.sub(' ', text))

## Store the cleaned data

In [88]:
# Write the noise-removed data to CSV, leaving out the raw comment_text column
# and the DataFrame index.
noise_removed_df = train_df.drop(columns=['comment_text'])
noise_removed_df.to_csv('Data/noise_removed_train.csv', index=False)

## Convert to Lower Case

In [89]:
# Lower-case every cleaned comment, then preview the frame.
train_df["clean_text"] = train_df["clean_text"].apply(str.lower)
train_df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,clean_text
0,0000997932d777bf,Explanation\r\nWhy the edits made under my use...,0,0,0,0,0,0,explanation why the edits made under my userna...
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,d'aww! he matches this background colour i am ...
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,"hey man, i am really not trying to edit war. i..."
3,0001b41b1c6bb37e,"""\r\nMore\r\nI can't make any real suggestions...",0,0,0,0,0,0,""" more i cannot make any real suggestions on i..."
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,"you, sir, are my hero. any chance you remember..."


## Remove Repeated Punctuation

In [90]:
# Collapse every run of two or more consecutive punctuation marks down to the
# first mark of the run (e.g. "!!!" -> "!", "?!" -> "?", '""' -> '"').
#
# The same character class is used for the captured first mark and for the
# rest of the run. Previously the captured class left out the double quote, so
# a run that started with '"' was not collapsed while a run that merely
# contained one was.
# Note: the backslash is not part of the class (`\]` is an escaped `]`).
punct_class = r'''[!"#$%&'()*+,./:;<=>?@[\]^_`{|}~-]'''
r = re.compile('(' + punct_class + ')' + punct_class + '+')
train_df["clean_text"] = train_df["clean_text"].apply(lambda x: r.sub(r'\1', x))
train_df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,clean_text
0,0000997932d777bf,Explanation\r\nWhy the edits made under my use...,0,0,0,0,0,0,explanation why the edits made under my userna...
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,d'aww! he matches this background colour i am ...
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,"hey man, i am really not trying to edit war. i..."
3,0001b41b1c6bb37e,"""\r\nMore\r\nI can't make any real suggestions...",0,0,0,0,0,0,""" more i cannot make any real suggestions on i..."
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,"you, sir, are my hero. any chance you remember..."


## Store the cleaned data

In [91]:
# Write the fully cleaned data to CSV, leaving out the raw comment_text column
# and the DataFrame index.
cleaned_df = train_df.drop(columns=['comment_text'])
cleaned_df.to_csv('Data/cleaned_train.csv', index=False)