In [100]:
import pandas as pd

# for tokenization
from nltk.tokenize import TweetTokenizer

# for POS tagging
from nltk.tokenize import sent_tokenize
from nltk import pos_tag_sents

# for lemmatization
from nltk.stem import WordNetLemmatizer

In [101]:
cleaned_df = pd.read_csv("Data/cleaned_train.csv")
print(cleaned_df.shape)
cleaned_df.head()

(159571, 8)


Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate,clean_text
0,0000997932d777bf,0,0,0,0,0,0,explanation why the edits made under my userna...
1,000103f0d9cfb60f,0,0,0,0,0,0,d'aww! he matches this background colour i am ...
2,000113f07ec002fd,0,0,0,0,0,0,"hey man, i am really not trying to edit war. i..."
3,0001b41b1c6bb37e,0,0,0,0,0,0,""" more i cannot make any real suggestions on i..."
4,0001d958c54c6e35,0,0,0,0,0,0,"you, sir, are my hero. any chance you remember..."


Multiple data pre-processing methods are performed, including:
- Tokenization
- POS Tagging
- Lemmatization

## Tokenization

We used TweetTokenizer for tokenization, which is a package specially catered to tokenizing tweets. We created 2 versions with or without punctuations.

In [102]:
tt = TweetTokenizer()
cleaned_df['text_tokenized'] = cleaned_df['clean_text'].apply(tt.tokenize)

In [103]:
cleaned_df.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate,clean_text,text_tokenized
0,0000997932d777bf,0,0,0,0,0,0,explanation why the edits made under my userna...,"[explanation, why, the, edits, made, under, my..."
1,000103f0d9cfb60f,0,0,0,0,0,0,d'aww! he matches this background colour i am ...,"[d'aww, !, he, matches, this, background, colo..."
2,000113f07ec002fd,0,0,0,0,0,0,"hey man, i am really not trying to edit war. i...","[hey, man, ,, i, am, really, not, trying, to, ..."
3,0001b41b1c6bb37e,0,0,0,0,0,0,""" more i cannot make any real suggestions on i...","["", more, i, cannot, make, any, real, suggesti..."
4,0001d958c54c6e35,0,0,0,0,0,0,"you, sir, are my hero. any chance you remember...","[you, ,, sir, ,, are, my, hero, ., any, chance..."


In [104]:
# remove punctuations
clean_text_nopunc = cleaned_df["clean_text"].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))

In [105]:
cleaned_df['text_tokenized_nopunc'] = clean_text_nopunc.apply(tt.tokenize)

In [106]:
cleaned_df.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate,clean_text,text_tokenized,text_tokenized_nopunc
0,0000997932d777bf,0,0,0,0,0,0,explanation why the edits made under my userna...,"[explanation, why, the, edits, made, under, my...","[explanation, why, the, edits, made, under, my..."
1,000103f0d9cfb60f,0,0,0,0,0,0,d'aww! he matches this background colour i am ...,"[d'aww, !, he, matches, this, background, colo...","[daww, he, matches, this, background, colour, ..."
2,000113f07ec002fd,0,0,0,0,0,0,"hey man, i am really not trying to edit war. i...","[hey, man, ,, i, am, really, not, trying, to, ...","[hey, man, i, am, really, not, trying, to, edi..."
3,0001b41b1c6bb37e,0,0,0,0,0,0,""" more i cannot make any real suggestions on i...","["", more, i, cannot, make, any, real, suggesti...","[more, i, cannot, make, any, real, suggestions..."
4,0001d958c54c6e35,0,0,0,0,0,0,"you, sir, are my hero. any chance you remember...","[you, ,, sir, ,, are, my, hero, ., any, chance...","[you, sir, are, my, hero, any, chance, you, re..."


## Part of Speech Tagging (POS Tagging)

In [107]:
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\suyat\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [108]:
def to_word_tokens(sent_tokens):
    word_tokens = [] 
    for sent_token in sent_tokens:
        word_tokens.append(tt.tokenize(sent_token))
    return word_tokens

In [109]:
# tokenize sentences
cleaned_df['for_tagging_use_sent_token'] = cleaned_df['clean_text'].apply(sent_tokenize)
# tokenize words in sentences 
tt = TweetTokenizer()
cleaned_df['for_tagging_use_word_token'] = cleaned_df['for_tagging_use_sent_token'].apply(to_word_tokens)
# perform POS tagging
cleaned_df['POS_tagging'] = cleaned_df['for_tagging_use_word_token'].apply(pos_tag_sents)
cleaned_df.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate,clean_text,text_tokenized,text_tokenized_nopunc,for_tagging_use_sent_token,for_tagging_use_word_token,POS_tagging
0,0000997932d777bf,0,0,0,0,0,0,explanation why the edits made under my userna...,"[explanation, why, the, edits, made, under, my...","[explanation, why, the, edits, made, under, my...",[explanation why the edits made under my usern...,"[[explanation, why, the, edits, made, under, m...","[[(explanation, NN), (why, WRB), (the, DT), (e..."
1,000103f0d9cfb60f,0,0,0,0,0,0,d'aww! he matches this background colour i am ...,"[d'aww, !, he, matches, this, background, colo...","[daww, he, matches, this, background, colour, ...","[d'aww!, he matches this background colour i a...","[[d'aww, !], [he, matches, this, background, c...","[[(d'aww, NN), (!, .)], [(he, PRP), (matches, ..."
2,000113f07ec002fd,0,0,0,0,0,0,"hey man, i am really not trying to edit war. i...","[hey, man, ,, i, am, really, not, trying, to, ...","[hey, man, i, am, really, not, trying, to, edi...","[hey man, i am really not trying to edit war.,...","[[hey, man, ,, i, am, really, not, trying, to,...","[[(hey, NN), (man, NN), (,, ,), (i, JJ), (am, ..."
3,0001b41b1c6bb37e,0,0,0,0,0,0,""" more i cannot make any real suggestions on i...","["", more, i, cannot, make, any, real, suggesti...","[more, i, cannot, make, any, real, suggestions...","["" more i cannot make any real suggestions on ...","[["", more, i, cannot, make, any, real, suggest...","[[("", IN), (more, JJR), (i, JJ), (cannot, NNS)..."
4,0001d958c54c6e35,0,0,0,0,0,0,"you, sir, are my hero. any chance you remember...","[you, ,, sir, ,, are, my, hero, ., any, chance...","[you, sir, are, my, hero, any, chance, you, re...","[you, sir, are my hero., any chance you rememb...","[[you, ,, sir, ,, are, my, hero, .], [any, cha...","[[(you, PRP), (,, ,), (sir, VB), (,, ,), (are,..."


In [110]:
# flatten the POS tagging
cleaned_df['POS_tagging_flat'] = cleaned_df['POS_tagging'].apply(lambda x: [element for innerList in x for element in innerList])
cleaned_df.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate,clean_text,text_tokenized,text_tokenized_nopunc,for_tagging_use_sent_token,for_tagging_use_word_token,POS_tagging,POS_tagging_flat
0,0000997932d777bf,0,0,0,0,0,0,explanation why the edits made under my userna...,"[explanation, why, the, edits, made, under, my...","[explanation, why, the, edits, made, under, my...",[explanation why the edits made under my usern...,"[[explanation, why, the, edits, made, under, m...","[[(explanation, NN), (why, WRB), (the, DT), (e...","[(explanation, NN), (why, WRB), (the, DT), (ed..."
1,000103f0d9cfb60f,0,0,0,0,0,0,d'aww! he matches this background colour i am ...,"[d'aww, !, he, matches, this, background, colo...","[daww, he, matches, this, background, colour, ...","[d'aww!, he matches this background colour i a...","[[d'aww, !], [he, matches, this, background, c...","[[(d'aww, NN), (!, .)], [(he, PRP), (matches, ...","[(d'aww, NN), (!, .), (he, PRP), (matches, VBZ..."
2,000113f07ec002fd,0,0,0,0,0,0,"hey man, i am really not trying to edit war. i...","[hey, man, ,, i, am, really, not, trying, to, ...","[hey, man, i, am, really, not, trying, to, edi...","[hey man, i am really not trying to edit war.,...","[[hey, man, ,, i, am, really, not, trying, to,...","[[(hey, NN), (man, NN), (,, ,), (i, JJ), (am, ...","[(hey, NN), (man, NN), (,, ,), (i, JJ), (am, V..."
3,0001b41b1c6bb37e,0,0,0,0,0,0,""" more i cannot make any real suggestions on i...","["", more, i, cannot, make, any, real, suggesti...","[more, i, cannot, make, any, real, suggestions...","["" more i cannot make any real suggestions on ...","[["", more, i, cannot, make, any, real, suggest...","[[("", IN), (more, JJR), (i, JJ), (cannot, NNS)...","[("", IN), (more, JJR), (i, JJ), (cannot, NNS),..."
4,0001d958c54c6e35,0,0,0,0,0,0,"you, sir, are my hero. any chance you remember...","[you, ,, sir, ,, are, my, hero, ., any, chance...","[you, sir, are, my, hero, any, chance, you, re...","[you, sir, are my hero., any chance you rememb...","[[you, ,, sir, ,, are, my, hero, .], [any, cha...","[[(you, PRP), (,, ,), (sir, VB), (,, ,), (are,...","[(you, PRP), (,, ,), (sir, VB), (,, ,), (are, ..."


In [111]:
# drop intermediate columns
cleaned_df = cleaned_df.drop(columns=['for_tagging_use_sent_token', 'for_tagging_use_word_token'])
cleaned_df.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate,clean_text,text_tokenized,text_tokenized_nopunc,POS_tagging,POS_tagging_flat
0,0000997932d777bf,0,0,0,0,0,0,explanation why the edits made under my userna...,"[explanation, why, the, edits, made, under, my...","[explanation, why, the, edits, made, under, my...","[[(explanation, NN), (why, WRB), (the, DT), (e...","[(explanation, NN), (why, WRB), (the, DT), (ed..."
1,000103f0d9cfb60f,0,0,0,0,0,0,d'aww! he matches this background colour i am ...,"[d'aww, !, he, matches, this, background, colo...","[daww, he, matches, this, background, colour, ...","[[(d'aww, NN), (!, .)], [(he, PRP), (matches, ...","[(d'aww, NN), (!, .), (he, PRP), (matches, VBZ..."
2,000113f07ec002fd,0,0,0,0,0,0,"hey man, i am really not trying to edit war. i...","[hey, man, ,, i, am, really, not, trying, to, ...","[hey, man, i, am, really, not, trying, to, edi...","[[(hey, NN), (man, NN), (,, ,), (i, JJ), (am, ...","[(hey, NN), (man, NN), (,, ,), (i, JJ), (am, V..."
3,0001b41b1c6bb37e,0,0,0,0,0,0,""" more i cannot make any real suggestions on i...","["", more, i, cannot, make, any, real, suggesti...","[more, i, cannot, make, any, real, suggestions...","[[("", IN), (more, JJR), (i, JJ), (cannot, NNS)...","[("", IN), (more, JJR), (i, JJ), (cannot, NNS),..."
4,0001d958c54c6e35,0,0,0,0,0,0,"you, sir, are my hero. any chance you remember...","[you, ,, sir, ,, are, my, hero, ., any, chance...","[you, sir, are, my, hero, any, chance, you, re...","[[(you, PRP), (,, ,), (sir, VB), (,, ,), (are,...","[(you, PRP), (,, ,), (sir, VB), (,, ,), (are, ..."


## Lemmatization

Lemmatization is done based on POS tagging results.

In [112]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\suyat\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [113]:
lemmatizer = WordNetLemmatizer()
cleaned_df['lemmatization'] = cleaned_df['POS_tagging_flat'].apply(lambda x: [lemmatizer.lemmatize(word) for word, tag in x 
                                                                              if ((tag.startswith('JJ') or tag.startswith('NN') or tag.startswith('RB') or tag.startswith('VB')) and (word not in string.punctuation))])

In [114]:
cleaned_df.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate,clean_text,text_tokenized,text_tokenized_nopunc,POS_tagging,POS_tagging_flat,lemmatization
0,0000997932d777bf,0,0,0,0,0,0,explanation why the edits made under my userna...,"[explanation, why, the, edits, made, under, my...","[explanation, why, the, edits, made, under, my...","[[(explanation, NN), (why, WRB), (the, DT), (e...","[(explanation, NN), (why, WRB), (the, DT), (ed...","[explanation, edits, made, username, hardcore,..."
1,000103f0d9cfb60f,0,0,0,0,0,0,d'aww! he matches this background colour i am ...,"[d'aww, !, he, matches, this, background, colo...","[daww, he, matches, this, background, colour, ...","[[(d'aww, NN), (!, .)], [(he, PRP), (matches, ...","[(d'aww, NN), (!, .), (he, PRP), (matches, VBZ...","[d'aww, match, colour, i, am, seemingly, stuck..."
2,000113f07ec002fd,0,0,0,0,0,0,"hey man, i am really not trying to edit war. i...","[hey, man, ,, i, am, really, not, trying, to, ...","[hey, man, i, am, really, not, trying, to, edi...","[[(hey, NN), (man, NN), (,, ,), (i, JJ), (am, ...","[(hey, NN), (man, NN), (,, ,), (i, JJ), (am, V...","[hey, man, i, am, really, not, trying, edit, w..."
3,0001b41b1c6bb37e,0,0,0,0,0,0,""" more i cannot make any real suggestions on i...","["", more, i, cannot, make, any, real, suggesti...","[more, i, cannot, make, any, real, suggestions...","[[("", IN), (more, JJR), (i, JJ), (cannot, NNS)...","[("", IN), (more, JJR), (i, JJ), (cannot, NNS),...","[more, i, cannot, make, real, suggestion, impr..."
4,0001d958c54c6e35,0,0,0,0,0,0,"you, sir, are my hero. any chance you remember...","[you, ,, sir, ,, are, my, hero, ., any, chance...","[you, sir, are, my, hero, any, chance, you, re...","[[(you, PRP), (,, ,), (sir, VB), (,, ,), (are,...","[(you, PRP), (,, ,), (sir, VB), (,, ,), (are, ...","[sir, are, hero, chance, remember, page, is]"


## Store the processed data

In [115]:
cleaned_df.to_csv('Data/processed_train.csv', index=False)