# Pre-process the Comment_Text Train and Test Data
In this notebook, we pre-process the train and test data of the toxic comments task. 
The first step is to extract the comment sentences from the CSV files. 

In [23]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

## Extract the Sentences from the Annotations

In [25]:
# Load the training CSV into a DataFrame and display its (rows, columns) shape.
train_df = pd.read_csv("train.csv")
train_df.shape

(159571, 8)

In [26]:
# Preview the first rows of the training frame (id, comment text and label columns).
train_df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [27]:
# Load the test CSV into a DataFrame.
test_df = pd.read_csv("test.csv")

In [28]:
# Preview the first rows of the test frame (id and comment text only).
test_df.head()

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [29]:
# Display the (rows, columns) shape of the test frame.
test_df.shape

(153164, 2)

## Use NLTK to Tokenize 

In [31]:
from nltk.tokenize import word_tokenize

In [32]:
# Pull the comment text column out of each frame; these Series are what gets
# tokenized and saved further down.
train_text = train_df['comment_text']
test_text = test_df['comment_text']

In [35]:
# Sanity check: tokenize one test comment and re-join its tokens with single spaces.
" ".join(word_tokenize(test_text[2]))

'`` == Sources == * Zawe Ashton on Lapland — / ``'

In [66]:
def tokenizeAndSave(text, outfile):
    """
    Tokenize every sentence, lower-case the tokens and save them to a file.

    For each sentence the tokens returned by ``word_tokenize`` are
    lower-cased, joined by a single space and written followed by a newline
    (one write per input sentence).

    Input:
        text: an iterable of sentences (strings)
        outfile: path of the output file that stores the re-joined tokens
    Output:
        none
    """
    # The encoding is explicit because the comments contain non-ASCII
    # characters; relying on the platform default can raise
    # UnicodeEncodeError on systems whose default is not UTF-8.
    # The with-block closes the file, so no separate close() call is needed.
    with open(outfile, "w", encoding="utf-8") as f:
        for sent in text:
            toks_lower = [t.lower() for t in word_tokenize(sent)]
            f.write(" ".join(toks_lower) + "\n")

In [67]:
# Tokenize the training comments and write them to "train_sentences.tokenized".
tokenizeAndSave(train_text, "train_sentences.tokenized")

In [68]:
# Tokenize the test comments and write them to "test_sentences.tokenized".
tokenizeAndSave(test_text, "test_sentences.tokenized")

## Clean up the Sentences

In [69]:
import re
def cleanUpSentences(in_file, out_file):
    """
    Normalize every sentence of in_file and write the result to out_file.

    Each input line is run through a fixed, ordered list of regular
    expression substitutions (number-like tokens become NUM, the listed
    symbols become a space, 'm becomes am, single quotes are removed),
    stripped of surrounding whitespace and written as one output line.

    Input:
        in_file: the input file containing the sentences for cleaning up
        out_file: the output file to store the cleaned sentences
    Output:
        None
    """
    # (pattern, replacement) pairs, applied to each line strictly in this order.
    substitutions = (
        # substitute words with digits to NUM
        (r"[\w]*[\d]+[\d.-]+[\w]*|\d", "NUM"),
        # substitute symbols to a space; inside the class, `,-\.` is a
        # character range that covers ",", "-" and "."
        (r"[!\"#$%&()\*+,-\./:;<=>?@\[\\\]^_`{|}~\t\n]", " "),
        # substitute 'm to am
        (r"'m", "am"),
        # NOTE: the symbol step above has already turned every "." and "-"
        # into a space, so the patterns below that require one of those
        # characters can no longer match. They are kept, in their original
        # positions, so the sequence of substitutions is unchanged.
        # substitute .... to empty
        (r"[..][.]+", ""),
        # substitute --- to empty
        (r"[--][-]+", ""),
        # substitute '' to empty
        (r"''", ""),
        # substitute ' to empty
        (r"'", ""),
        # substitute -lrb- to empty
        (r"[-]lrb[-]", ""),
        # substitute -rrb- to empty
        (r"[-]rrb[-]", ""),
        # substitute . to empty
        (r"\s[.]", ""),
    )

    # Both files are managed by the with-block so they are closed even when
    # a read, substitution or write raises. UTF-8 is explicit because the
    # comments contain non-ASCII characters.
    with open(in_file, "r", encoding="utf-8") as f_in, \
            open(out_file, "w", encoding="utf-8") as f_out:
        for sentence in f_in:
            for pattern, replacement in substitutions:
                sentence = re.sub(pattern, replacement, sentence)
            f_out.write(sentence.strip() + "\n")

In [70]:
# Clean the tokenized training sentences and write them to "train_sentences.clean".
cleanUpSentences("train_sentences.tokenized", "train_sentences.clean")

In [71]:
# Clean the tokenized test sentences and write them to "test_sentences.clean".
cleanUpSentences("test_sentences.tokenized", "test_sentences.clean")

# Count the unique words after cleaning

In [72]:
import operator
def uniqueWord2Count(in_file):
    """
    Count how often each whitespace-separated word occurs in a text file.

    Input:
        in_file: text file containing sentences
    Output:
        sorted_word2count: a list of (word, count) tuples sorted by count
                    ascending (least frequent first); on Python 3.7+ words
                    with equal counts keep the order in which they were
                    first seen
    """
    word2count = {}

    # The with-block closes the file even if reading raises. UTF-8 is
    # explicit because the sentences contain non-ASCII characters.
    with open(in_file, "r", encoding="utf-8") as f:
        for sentence in f:
            for word in sentence.strip().split():
                word2count[word] = word2count.get(word, 0) + 1

    # Sort by the count (second tuple element), ascending. sorted() is
    # stable, so ties stay in dict iteration order.
    sorted_word2count = sorted(word2count.items(),
                               key=operator.itemgetter(1))

    return sorted_word2count

In [77]:
sorted_word2count = uniqueWord2Count("train_sentences.clean")

# Number of distinct words that occur at least 100 times in the cleaned
# training sentences.
count = sum(1 for _word, freq in sorted_word2count if freq >= 100)

print(count)

6007


In [75]:
# Save the frequent words (count >= 100) from the cleaned train sentences,
# one "word<TAB>count" line per word.
train_clean_all_words = "train_clean_all_words.txt"

# The with-block closes the file even if a write fails part-way through.
# UTF-8 is explicit because the words can contain non-ASCII characters.
with open(train_clean_all_words, "w", encoding="utf-8") as f:
    for w, c in sorted_word2count:
        if c >= 100:
            f.write("%s\t%s\n" % (w, c))

# ====================================

# Test Keras Text Preprocessing

In [12]:
# Show the raw text of the test comment at positional index 2.
test_df['comment_text'].iloc[2]

'" \n\n == Sources == \n\n * Zawe Ashton on Lapland —  /  "'

In [16]:
# Create a Keras text Tokenizer configured with num_words=100.
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(num_words = 100)

Using TensorFlow backend.


In [17]:
# Use the test comment at positional index 2 as a one-sentence sample.
test_sent = test_df['comment_text'].iloc[2]

In [18]:
# Fit the tokenizer on the sample sentence, then encode that same sentence
# as a list of integer word ids.
tokenizer.fit_on_texts([test_sent])
tokenizer.texts_to_sequences([test_sent])

[[1, 2, 3, 4, 5, 6]]

In [19]:
# Inspect the word -> integer id mapping built by fit_on_texts.
tokenizer.word_index

{'ashton': 3, 'lapland': 5, 'on': 4, 'sources': 1, 'zawe': 2, '—': 6}

In [20]:
from keras.preprocessing.sequence import pad_sequences

In [21]:
# Encode the sample and pad it to length 25; with padding="post" the zeros
# are appended after the word ids.
pad_sequences(tokenizer.texts_to_sequences([test_sent]), maxlen = 25, padding="post")

array([[1, 2, 3, 4, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0]], dtype=int32)

## Count the Unique Words from the Sentences

In [1]:
# Paths to the train and test sentence files.
train_sentence_path = "data/seq2seq/train_sentences.txt"
test_sentence_path = "data/seq2seq/test_sentences.txt"

In [3]:
# Count how often each whitespace-separated token occurs in the train sentences.
train_word2count = {}

# The with-block closes the file even if reading fails part-way through.
with open(train_sentence_path, "r") as f:
    for sentence in f:
        for word in sentence.strip().split():
            train_word2count[word] = train_word2count.get(word, 0) + 1

In [18]:
# Number of distinct tokens counted in the train sentences.
len(train_word2count)

220460

In [19]:
# Look up how many times the "$" token was counted.
train_word2count['$']

29304

In [21]:
# sort the words by their frequencies (most frequent first)
import operator
sorted_word2count = sorted(train_word2count.items(), key=operator.itemgetter(1),reverse=True)

# Print the most frequent entries as "word<TAB>count". The loop stops only
# after the counter exceeds 10, so 11 entries are shown.
# print() is called as a function with a single argument, which works on
# both Python 2 and Python 3 (the Python 2 print statement is a SyntaxError
# on a Python 3 kernel).
count = 0
for w, c in sorted_word2count:
    print("%s\t%s" % (w, c))
    count += 1
    if count > 10:
        break

In [26]:
# Save all unique words from the train sentences, one "word<TAB>count" line
# per word, in the order of sorted_word2count.
train_all_words = "data/seq2seq/train_all_words.txt"
# The with-block closes the file even if a write fails part-way through.
with open(train_all_words, "w") as f:
    for w, c in sorted_word2count:
        f.write("%s\t%s\n" % (w, c))

In [121]:
import re
# Keep the (word, count) pairs whose word contains a digit run followed by
# at least one more digit, "." or "-" (the same pattern re.search is given
# for every word).
number_like = re.compile(r'[\w]*[\d]+[\d.-]+[\w]*')
sorted_symbol2count = [(w, c) for w, c in sorted_word2count if number_like.search(w)]

In [122]:
# Number of words that matched the number-like pattern.
len(sorted_symbol2count)

17041

In [118]:
# Peek at five of the matched (word, count) pairs.
sorted_symbol2count[1000:1005]

[('10018', 32),
 ('2-family', 32),
 ('5,500', 32),
 ('14.95', 32),
 ('265-8133', 32)]

In [148]:
# Collect the (word, count) pairs whose word contains the substring "i'm".
contraction = re.compile(r'i\'m')
sorted_special2count = [(w, c) for w, c in sorted_word2count if contraction.search(w)]

In [149]:
# Number of words that contain "i'm".
len(sorted_special2count)

0

## Count the unique words after cleaning

In [190]:
# Count how often each whitespace-separated token occurs in the cleaned
# train sentences.
# NOTE(review): train_sentence_clean_path is not assigned in any cell visible
# in this notebook; it must be defined before this cell can run on a fresh
# kernel.
train_word2count = {}

# The with-block closes the file even if reading fails part-way through.
with open(train_sentence_clean_path, "r") as f:
    for sentence in f:
        for word in sentence.strip().split():
            train_word2count[word] = train_word2count.get(word, 0) + 1

In [191]:
# Number of distinct tokens counted in the cleaned train sentences.
len(train_word2count)

203464

In [192]:
# sort the words by their frequencies (most frequent first)
import operator
sorted_word2count = sorted(train_word2count.items(), key=operator.itemgetter(1),reverse=True)

# Print the most frequent entries as "word<TAB>count". The loop stops only
# after the counter exceeds 10, so 11 entries are shown.
# print() is called as a function with a single argument, which works on
# both Python 2 and Python 3 (the Python 2 print statement is a SyntaxError
# on a Python 3 kernel).
count = 0
for w, c in sorted_word2count:
    print("%s\t%s" % (w, c))
    count += 1
    if count > 10:
        break

,	1937005
the	1098862
and	709989
of	638332
in	581732
a	438584
to	408159
NUM	331761
s	229116
for	157843
that	155391


In [193]:
# Save all unique words from the cleaned train sentences, one
# "word<TAB>count" line per word, in the order of sorted_word2count.
train_clean_all_words = "data/seq2seq/train_clean_all_words.txt"
# The with-block closes the file even if a write fails part-way through.
with open(train_clean_all_words, "w") as f:
    for w, c in sorted_word2count:
        f.write("%s\t%s\n" % (w, c))

In [194]:
# Count how often each whitespace-separated token occurs in the cleaned
# test sentences.
# NOTE(review): test_sentence_clean_path is not assigned in any cell visible
# in this notebook; it must be defined before this cell can run on a fresh
# kernel.
test_word2count = {}

# The with-block closes the file even if reading fails part-way through.
with open(test_sentence_clean_path, "r") as f:
    for sentence in f:
        for word in sentence.strip().split():
            test_word2count[word] = test_word2count.get(word, 0) + 1

In [195]:
# Number of distinct tokens counted in the cleaned test sentences.
len(test_word2count)

76858

In [196]:
# sort the words by their frequencies (most frequent first)
import operator
sorted_word2count = sorted(test_word2count.items(), key=operator.itemgetter(1),reverse=True)

# Print the most frequent entries as "word<TAB>count". The loop stops only
# after the counter exceeds 10, so 11 entries are shown.
# print() is called as a function with a single argument, which works on
# both Python 2 and Python 3 (the Python 2 print statement is a SyntaxError
# on a Python 3 kernel).
count = 0
for w, c in sorted_word2count:
    print("%s\t%s" % (w, c))
    count += 1
    if count > 10:
        break

,	617689
the	322699
and	237388
of	193819
in	156377
a	127134
to	116663
NUM	94805
s	62914
;	52423
for	46862


In [197]:
# Save all unique words from the cleaned test sentences, one
# "word<TAB>count" line per word, in the order of sorted_word2count.
test_clean_all_words = "data/seq2seq/test_clean_all_words.txt"
# The with-block closes the file even if a write fails part-way through.
with open(test_clean_all_words, "w") as f:
    for w, c in sorted_word2count:
        f.write("%s\t%s\n" % (w, c))