# Table of Contents
1. [Text Cleaning](#textcleaning)

In [23]:
%time
import os
import sys
import warnings
if not sys.warnoptions:
    warnings.simplefilter("ignore")
    
import numpy as np
import pandas as pd
import sklearn

#libraries for text cleaning
import contractions
from bs4 import BeautifulSoup

# Libraries and packages for text (pre-)processing 
import string
import re
import nltk

Wall time: 0 ns


In [24]:
# Read the training set from the relative path below into a DataFrame.
train_path = "Data/train.csv"
train_df = pd.read_csv(train_path)

# Print (rows, columns), then show the first five rows as the cell output.
print(train_df.shape)
train_df.head(n=5)

(159571, 8)


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


<a id="textcleaning"></a>
# 1. Text Cleaning

## Capitalization / Lower case (attention to exceptional cases)

Exceptional cases, such as capitalized abbreviations, are handled in subsequent steps by the technique for replacing typos, slang, acronyms, and informal abbreviations.

In [25]:
# Lower-case every comment into a new "clean_text" column; the raw
# "comment_text" column is left untouched so later cells can compare the two.
# The vectorised .str accessor is used instead of apply(lambda x: x.lower()):
# it propagates missing values (NaN) rather than raising AttributeError on them.
train_df["clean_text"] = train_df["comment_text"].str.lower()
display(train_df.head())

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,clean_text
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,explanation\nwhy the edits made under my usern...
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,d'aww! he matches this background colour i'm s...
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,"hey man, i'm really not trying to edit war. it..."
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,"""\nmore\ni can't make any real suggestions on ..."
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,"you, sir, are my hero. any chance you remember..."


## Expand Contractions


Contractions are words or combinations of words that are shortened by dropping letters and replacing them with an apostrophe. Expanding contractions contributes to text standardization.

In [26]:
# Expand contractions in the cleaned text (e.g. "i'm" -> "i am").
# contractions.fix is passed directly; it is called with one string per row.
train_df["clean_text"] = train_df["clean_text"].apply(contractions.fix)

# Check if expand contractions works: raw vs. cleaned text for two sample rows
for row_label in (2, 3):
    print("Original text: \n", train_df["comment_text"][row_label])
    print("Clean text: \n", train_df["clean_text"][row_label])

Original text: 
 Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info.
Clean text: 
 hey man, i am really not trying to edit war. it is just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. he seems to care more about the formatting than the actual info.
Original text: 
 "
More
I can't make any real suggestions on improvement - I wondered if the section statistics should be later on, or a subsection of ""types of accidents""  -I think the references may need tidying so that they are all in the exact same format ie date format etc. I can do that later on, if no-one else does first - if you have any preferences for formatting style on references or want to do it yourself please let me know.

There appears to be a backlog on articles for review so

## Remove Noise

Remove unnecessary characters or punctuation such as URLs, HTML tags, non-ASCII characters, or other special characters 

### Remove HTML Tag

In [27]:
# Remove HTML tags: parse each comment and keep only its text content.
# The parser is named explicitly. Without it BeautifulSoup picks whichever
# parser happens to be installed (and emits GuessedAtParserWarning), so the
# extracted text could differ from one environment to another.
train_df["clean_text"] = train_df["clean_text"].apply(
    lambda text: BeautifulSoup(str(text), "html.parser").get_text()
)

### Remove URLs

In [28]:
# Strip web addresses: anything starting with http://, https:// or "www."
# up to the next whitespace character.
url_pattern = re.compile(r"https?://\S+|www\.\S+")
train_df["clean_text"] = train_df["clean_text"].apply(lambda text: url_pattern.sub("", text))

### Remove Non-ASCII Characters

In [29]:
# Delete every character outside the 7-bit ASCII range (code points above 0x7F).
non_ascii_pattern = re.compile(r'[^\x00-\x7f]')
train_df["clean_text"] = train_df["clean_text"].apply(lambda text: non_ascii_pattern.sub(r'', text))

###  Remove Extra Space

In [43]:
# Collapse every run of whitespace (spaces, tabs, newlines) into one space.
# The previous pattern ' +' matched literal spaces only, so line breaks
# survived this step.
# NOTE(review): the saved outputs in this notebook already show newlines
# replaced by spaces although no visible cell does that (probably a deleted
# cell); matching all whitespace here lets a fresh Restart & Run All produce
# single-line text as well. Confirm this is the intended result.
whitespace_pattern = re.compile(r"\s+")
train_df["clean_text"] = train_df["clean_text"].apply(lambda text: whitespace_pattern.sub(" ", text))

###  Remove Special Characters (symbols, emojis, and other graphic characters)

In [32]:
# Character class matching runs of emoji and pictographic symbols.
#
# Fix: the class used to contain the single range \U000024C2-\U0001F251, which
# spans almost every code point above U+24C2 (CJK ideographs, kana, Hangul,
# ...) and therefore deleted ordinary non-Latin text, not just symbols. It is
# replaced by explicit symbol-only blocks. The re.UNICODE flag was dropped
# because str patterns are Unicode-aware by default in Python 3.
#
# NOTE(review): the non-ASCII removal cell earlier in this notebook already
# deletes every character this pattern can match, so applying the pattern after
# that step has no effect; run it before that step if it is meant to matter.
regrex_pattern = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicator symbols (flags)
    "\U0001F000-\U0001F251"  # mahjong/cards, enclosed alphanumerics & ideographs
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002702-\U000027B0"  # dingbats
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
    "\U000024C2"             # circled latin capital letter M
    "\ufe0f"                 # variation selector-16 (emoji presentation)
    "]+"
)

In [33]:
# Delete every run of characters matched by the emoji/symbol pattern.
strip_symbols = lambda text: regrex_pattern.sub(r'', text)
train_df["clean_text"] = train_df["clean_text"].map(strip_symbols)

In [34]:
#Check text: print the raw comment, then its cleaned counterpart, for row 143
for column_name in ("comment_text", "clean_text"):
    print(train_df[column_name][143])

"P.S. It's not polite to talk to people behind their backs, please remove your comments from Mrph's talk page.

Vaughan
You're right; I went to check your previous edit and found a page on the Marvel site that spelled it ""Vaughn"", but now I am finding many more that spell it correctly. Thanks for the edits.   (☎☓) 

"
"p.s. it is not polite to talk to people behind their backs, please remove your comments from mrph's talk page.  vaughan you are right; i went to check your previous edit and found a page on the marvel site that spelled it ""vaughn"", but now i am finding many more that spell it correctly. thanks for the edits.   ()   "


In [44]:
# Preview the first five rows, raw and cleaned text side by side.
train_df.head(n=5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,clean_text
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,explanation why the edits made under my userna...
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,d'aww! he matches this background colour i am ...
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,"hey man, i am really not trying to edit war. i..."
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,""" more i cannot make any real suggestions on i..."
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,"you, sir, are my hero. any chance you remember..."
