# Text Normalization

This notebook normalizes the comment text from the Jigsaw Unintended Bias in Toxicity Classification competition, to the point where we have a suitable stream of text from which features can be extracted.

In [1]:
# Import libraries
# Operating system functions
import os
# Random generators
import random
# Time measurement and processing time
import time
# Data frames and data manipulation
import pandas as pd
# Numerical library
import numpy as np
# TQDM: progress bar (https://github.com/tqdm/tqdm)
import tqdm
# Plotting library
import matplotlib.pyplot as plt
# Statistical data visualization
import seaborn as sns
# Natural language toolkit
import nltk
import spacy
# Various operations on collections
import operator

In [20]:
# local helper functions
import helpers
# Importlib allows reloading a file (for example when it is changed)
# https://docs.python.org/3/library/importlib.html#importlib.reload
import importlib
importlib.reload(helpers)

<module 'helpers' from 'C:\\Users\\Gerald\\kg-jig\\kg-jig\\helpers.py'>

In [3]:
# some initialization
def set_seed(seed=0):
    """Seed the random number sources used by this notebook.

    Exports the seed as the ``PYTHONHASHSEED`` environment variable and seeds
    both Python's ``random`` module and NumPy's global generator with it.

    Args:
        seed: Seed value shared by all generators (defaults to 0).
    """
    # NOTE(review): the environment variable is presumably only honoured by
    # interpreters started after this point, not by the running kernel - confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


set_seed(1)

### Define Where Data is Located

In [6]:
# Root folder holding the competition CSV files. The paths are absolute and
# machine-specific: exactly one `data_path` assignment should be active, the
# other one stays commented out.
# Gamer (presumably the author's desktop machine)
#data_path = 'F:/kg-jigsaw-data/data'
# Laptop
data_path = 'C:/Users/Gerald/kg-jig/kg-jigsaw-data/data'
# Individual data files, all located directly under data_path.
train_data_path=data_path + "/train.csv"
test_data_path=data_path + "/test.csv"
train_sample_data_path= data_path + "/train-sample.csv"

# Echo the folder and list its contents (IPython `%ls` magic) as a sanity check.
print (data_path)
%ls "$data_path"

C:/Users/Gerald/kg-jig/kg-jigsaw-data/data
 Volume in drive C is OS
 Volume Serial Number is B41D-1118

 Directory of C:\Users\Gerald\kg-jig\kg-jigsaw-data\data

04/06/2019  01:50 PM    <DIR>          .
04/06/2019  01:50 PM    <DIR>          ..
03/28/2019  09:17 PM         1,167,854 sample_submission.csv
03/28/2019  09:17 PM        30,179,878 test.csv
03/28/2019  09:16 PM       816,211,476 train.csv
04/06/2019  01:32 PM        25,584,557 train-sample.csv
04/06/2019  01:36 PM        15,077,629 train-sample.xlsx
               5 File(s)    888,221,394 bytes
               2 Dir(s)  388,400,234,496 bytes free


### Load Data

In [21]:
# load the two data sets
# `helpers.load_data` comes from the local helpers module; the second argument
# is the label that appears in the timing line it prints.
# NOTE(review): judging by the manual variant kept below, it presumably wraps
# pd.read_csv with a timer - confirm in helpers.py.
train_df = helpers.load_data(train_data_path, "Training set")
test_df = helpers.load_data(test_data_path, "Test set")

# Manual equivalent, kept for reference. Reading train_sample_data_path instead
# of train_data_path loads the smaller sample file.
#timer = helpers.start_timer()
#train_df = pd.read_csv(train_sample_data_path)
#train_df = pd.read_csv(train_data_path)
#print(helpers.elapsed_time_ext("Training set load time", helpers.stop_timer(timer), "record", len(train_df)))

#timer = helpers.start_timer()
#test_df = pd.read_csv(test_data_path)
#print(helpers.elapsed_time_ext("Test set load time", helpers.stop_timer(timer), "record", len(test_df)))

Training setload time: 15982.300 ms for 1804874 records (112929.551 record/s, 0.009 ms/record)
Test setload time: 440.999 ms for 97320 records (220680.638 record/s, 0.005 ms/record)


The test set is 97K samples. According to the Kernel requirements, a kernel must complete in 9 hours or less without GPU, and in 2 hours or less with GPU. I understand this to mean that we need to process 97K records in less than 9 hours, i.e. an average of about 330 ms per text.<br/>

See https://www.kaggle.com/c/jigsaw-unintended-bias-in-toxicity-classification/overview/kernels-requirements

### Partition Data in Pickle Files

In [9]:
#train_df.head()
# in Pandas v24:
#import pyarrow.parquet as pq
#output_train = data_path + "test.pqt"

# Optional one-off export of both data sets as gzip-compressed pickles.
# This takes several minutes and the compression is not fantastic, so the
# export stays disabled until the condition below is switched on by hand.
if (False):
    # The "/" separator keeps the pickles inside the data folder, next to the
    # CSV files, instead of creating ".../datatrain.pickle" beside it.
    output_train = data_path + "/train.pickle"
    train_df.to_pickle(output_train, compression="gzip")
    print("Data saved to ", output_train, " ", len(train_df), " rows")
    output_test = data_path + "/test.pickle"
    test_df.to_pickle(output_test, compression="gzip")
    print("Data saved to ", output_test, " ", len(test_df), " rows")

In [13]:
# list columns present in each dataset
# `.shape` is (rows, columns); `.columns` lists the field names, which shows
# which fields the two sets have in common.
print("Training set size: ", train_df.shape)
print("Test set size: ", test_df.shape)
print("List of fields for training set:\n", train_df.columns)
print("List of fields for test set:\n", test_df.columns)
# Disabled: summary statistics restricted to the label column.
#labels = ["target"]
#train_df[labels].describe()

Training set size:  (1804874, 45)
Test set size:  (97320, 2)
List of fields for training set:
 Index(['id', 'target', 'comment_text', 'severe_toxicity', 'obscene',
       'identity_attack', 'insult', 'threat', 'asian', 'atheist', 'bisexual',
       'black', 'buddhist', 'christian', 'female', 'heterosexual', 'hindu',
       'homosexual_gay_or_lesbian', 'intellectual_or_learning_disability',
       'jewish', 'latino', 'male', 'muslim', 'other_disability',
       'other_gender', 'other_race_or_ethnicity', 'other_religion',
       'other_sexual_orientation', 'physical_disability',
       'psychiatric_or_mental_illness', 'transgender', 'white', 'created_date',
       'publication_id', 'parent_id', 'article_id', 'rating', 'funny', 'wow',
       'sad', 'likes', 'disagree', 'sexual_explicit',
       'identity_annotator_count', 'toxicity_annotator_count'],
      dtype='object')
List of fields for test set:
 Index(['id', 'comment_text'], dtype='object')


In [10]:
train_df.describe()

Unnamed: 0,id,target,severe_toxicity,obscene,identity_attack,insult,threat,asian,atheist,bisexual,...,parent_id,article_id,funny,wow,sad,likes,disagree,sexual_explicit,identity_annotator_count,toxicity_annotator_count
count,55609.0,55609.0,55609.0,55609.0,55609.0,55609.0,55609.0,10916.0,10916.0,10916.0,...,29500.0,55609.0,55609.0,55609.0,55609.0,55609.0,55609.0,55609.0,55609.0,55609.0
mean,275836.704274,0.087631,0.004207,0.015072,0.015166,0.068099,0.00972,0.004826,0.003699,0.0036,...,275663.980441,72872.734791,0.000755,0.00027,3.6e-05,2.247262,0.000629,0.006266,1.55054,8.410779
std,19902.28419,0.185047,0.022414,0.072204,0.063472,0.162639,0.052146,0.055233,0.054464,0.039496,...,19581.267018,23392.185467,0.034443,0.019431,0.005997,3.799358,0.025787,0.043501,29.777183,46.979452
min,59848.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,64824.0,2006.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
25%,259321.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,259302.75,54610.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
50%,275924.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,275582.0,67456.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,4.0
75%,292684.0,0.108696,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,291990.5,98136.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,6.0
max,310126.0,1.0,0.534819,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,310119.0,106453.0,4.0,2.0,1.0,60.0,2.0,0.948495,1848.0,3509.0


In [11]:
test_df.describe()

Unnamed: 0,id
count,97320.0
mean,7048660.0
std,28094.01
min,7000000.0
25%,7024330.0
50%,7048660.0
75%,7072989.0
max,7097319.0


In [12]:
train_df.head(20)

Unnamed: 0,id,target,comment_text,severe_toxicity,obscene,identity_attack,insult,threat,asian,atheist,...,article_id,rating,funny,wow,sad,likes,disagree,sexual_explicit,identity_annotator_count,toxicity_annotator_count
0,59848,0.0,"This is so cool. It's like, 'would you want yo...",0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
1,59849,0.0,Thank you!! This would make my life a lot less...,0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
2,59852,0.0,This is such an urgent design problem; kudos t...,0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
3,59855,0.0,Is this something I'll be able to install on m...,0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
4,59856,0.893617,haha you guys are a bunch of losers.,0.021277,0.0,0.021277,0.87234,0.0,0.0,0.0,...,2006,rejected,0,0,0,1,0,0.0,4,47
5,59859,0.666667,ur a sh*tty comment.,0.047619,0.638095,0.0,0.333333,0.0,,,...,2006,rejected,0,0,0,0,0,0.009524,0,105
6,59861,0.457627,hahahahahahahahhha suck it.,0.050847,0.305085,0.0,0.254237,0.0,,,...,2006,rejected,0,0,0,0,0,0.220339,0,59
7,59863,0.0,FFFFUUUUUUUUUUUUUUU,0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
8,239575,0.0,The ranchers seem motivated by mostly by greed...,0.0,0.0,0.0,0.0,0.0,,,...,26662,approved,0,0,0,0,0,0.0,0,4
9,239576,0.0,It was a great show. Not a combo I'd of expect...,0.0,0.0,0.0,0.0,0.0,,,...,26650,approved,0,0,0,1,0,0.0,0,4


# Get Text

In [13]:
texts = train_df[['comment_text','target']]

In [14]:
# Widen pandas' display limits for the rest of the notebook. The larger
# max_colwidth lets the comment text print in full instead of being cut off
# with "..." as in the previous train_df.head() output.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 500)
pd.set_option('display.max_colwidth', 500)
# Preview the first comments together with their toxicity target.
texts.head(10)

Unnamed: 0,comment_text,target
0,"This is so cool. It's like, 'would you want your mother to read this??' Really great idea, well done!",0.0
1,"Thank you!! This would make my life a lot less anxiety-inducing. Keep it up, and don't let anyone get in your way!",0.0
2,This is such an urgent design problem; kudos to you for taking it on. Very impressive!,0.0
3,Is this something I'll be able to install on my site? When will you be releasing it?,0.0
4,haha you guys are a bunch of losers.,0.893617
5,ur a sh*tty comment.,0.666667
6,hahahahahahahahhha suck it.,0.457627
7,FFFFUUUUUUUUUUUUUUU,0.0
8,The ranchers seem motivated by mostly by greed; no one should have the right to allow their animals destroy public land.,0.0
9,It was a great show. Not a combo I'd of expected to be good together but it was.,0.0


# Load SpaCy Libraries

The choice was between SpaCy and NLTK. SpaCy was recommended as the better library for NLP work.

NLTK does not come with all of its packages pre-installed. If you run the command below and it raises an error,
uncomment the "nltk.download()" line and run it again. It will open a separate window (on Windows) from which you
can download the NLTK packages.
Once done, comment the line out again.

In [38]:
#nltk.download()

In order to install SpaCy, open Anaconda Navigator, choose your environment and open a terminal for it. In the terminal, type the following two lines (the model name must match the one passed to `spacy.load` below):
<pre>
conda install -c conda-forge spacy
python -m spacy download en_core_web_sm
</pre>


In [22]:
# Load the `en_core_web_sm` spaCy model. `spl` is the callable that the 'spacy'
# branch of show_corpus_stats applies to each text.
# The model must be installed in the active environment, otherwise spacy.load
# raises OSError [E050].
spl = spacy.load("en_core_web_sm")

OSError: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a shortcut link, a Python package or a valid path to a data directory.

# Helper Functions

In [41]:
def show_corpus_stats(title, texts, toolkit='spacy', max_texts=2001):
    """Time tokenization over a sample of a corpus and print throughput figures.

    Args:
        title: Heading printed above the statistics.
        texts: Object indexable by ``'comment_text'`` that yields the texts to
            tokenize (the notebook passes a DataFrame slice).
        toolkit: ``'spacy'`` to tokenize with the notebook-level ``spl`` object,
            or ``'nltk'`` to use ``nltk.word_tokenize``.
        max_texts: Upper bound on the number of texts tokenized; ``None``
            processes every text. The default of 2001 is the sample size this
            function has always used.

    Raises:
        ValueError: If ``toolkit`` is neither ``'spacy'`` nor ``'nltk'``.
    """
    # Resolve the tokenizer once, up front, so that an unsupported toolkit is
    # rejected with a clear message instead of failing on an unbound `tokens`
    # variable in the middle of the timed loop.
    if toolkit == 'nltk':
        tokenize = nltk.word_tokenize
    elif toolkit == 'spacy':
        tokenize = spl
    else:
        raise ValueError("Unsupported toolkit: " + repr(toolkit) + " (expected 'spacy' or 'nltk')")

    text_count = 0
    token_count = 0
    text_len = 0
    timer = helpers.start_timer()
    for t in texts['comment_text']:
        # Stop once the sample is large enough for a stable timing estimate.
        if max_texts is not None and text_count >= max_texts:
            break
        tokens = tokenize(t)
        text_len += len(t)
        token_count += len(tokens)
        text_count += 1
    elapsed = helpers.stop_timer(timer)

    # The same elapsed time is reported per text, per token and per character.
    print(title)
    print("- ", helpers.elapsed_time_ext("Tokenized text", elapsed, "sentence", text_count))
    print("- ", helpers.elapsed_time_ext("Tokenized token", elapsed, "token", token_count))
    print("- ", helpers.elapsed_time_ext("Tokenized characters", elapsed, "char", text_len))

# Tokenize

In [42]:
# NLTK
show_corpus_stats("Training Corpus (NLTK)", train_df[['comment_text', 'target']],toolkit='nltk')

Training Corpus (NLTK)
-  Tokenized text: 974.406 ms for 2001 sentences (2053.558 sentence/s, 0.487 ms/sentence)
-  Tokenized token: 974.406 ms for 127127 tokens (130466.118 token/s, 0.008 ms/token)
-  Tokenized characters: 974.406 ms for 633071 chars (649699.245 char/s, 0.002 ms/char)


NLTK is fairly efficient at tokenizing, with an average of 0.5ms per sentence.

In [43]:
# SpaCy
show_corpus_stats("Training Corpus (SpaCy)", train_df[['comment_text', 'target']])

Training Corpus (SpaCy)
-  Tokenized text: 27514.396 ms for 2001 sentences (72.726 sentence/s, 13.750 ms/sentence)
-  Tokenized token: 27514.396 ms for 131119 tokens (4765.469 token/s, 0.210 ms/token)
-  Tokenized characters: 27514.396 ms for 633071 chars (23008.719 char/s, 0.043 ms/char)


SpaCy is much more expensive if you just need tokenization. It takes around 14ms per sentence.

In [31]:
show_corpus_stats("Test Corpus", test_df[['comment_text']], toolkit='nltk')

Test Corpus
-  Tokenized text: 12586.374 ms for 1001 sentences (79.530 sentence/s, 12.574 ms/sentence)
-  Tokenized token: 12586.374 ms for 60241 tokens (4786.208 token/s, 0.209 ms/token)
-  Tokenized characters: 12586.374 ms for 291367 chars (23149.399 char/s, 0.043 ms/char)


There is a total of 5,822,679 tokens in the test set. This means that
the average processing time per token must stay below about 5.5 ms (9 × 3600 s / 5,822,679 tokens ≈ 0.0055 s).

# Looking at Vocabulary

In this section, we look at words present in the training corpus. The steps are:

- Tokenize every sentence in the training set
- Create a dictionary of all words used, and count the number of occurrences

In [65]:
def create_dictionary(texts):
    """Count how often each NLTK token occurs across a collection of texts.

    Progress is printed as a percentage for the first text, after every
    5000th text thereafter, and once more at the end.

    Args:
        texts: Sized iterable of strings (it must support ``len``); the
            notebook passes the ``comment_text`` column.

    Returns:
        dict mapping each token produced by ``nltk.word_tokenize`` to its
        number of occurrences. Empty input yields an empty dict.
    """
    word_dict = {}
    text_count = len(texts)

    def progress(done):
        # An empty corpus is reported as fully processed rather than dividing
        # by zero.
        share = done * 100 / text_count if text_count else 100.0
        return "{0:.1f}".format(share)

    count = 0
    for t in texts:
        for tk in nltk.word_tokenize(t):
            word_dict[tk] = word_dict.get(tk, 0) + 1
        if count % 5000 == 0:
            print("Processed ", progress(count), "%")
        count += 1
    print("Processed ", progress(count), "%")
    return word_dict

In [66]:
texts = train_df['comment_text']

In [67]:
word_dict = create_dictionary(texts)

Processed  0.0 %
Processed  9.0 %
Processed  18.0 %
Processed  27.0 %
Processed  36.0 %
Processed  45.0 %
Processed  53.9 %
Processed  62.9 %
Processed  71.9 %
Processed  80.9 %
Processed  89.9 %
Processed  98.9 %


In [70]:
print("Shape of training set: ", train_df.shape)
print("Number of tokens (words) in training set: ", len(word_dict))

(55609, 45)
82570


In [110]:
# Rank the vocabulary from the most to the least frequent token.
sorted_words = sorted(word_dict.items(), key=lambda kv: kv[1], reverse=True)
word_df = pd.DataFrame(sorted_words, columns =['word', 'count'])

# Count the rare tokens. A bare `word_df.head(10)` used to sit here, but an
# expression in the middle of a cell is never displayed, so it was dropped.
for label, occurrences in (("only once", 1), ("twice", 2), ("three times", 3)):
    print("Number of words appearing " + label + ": ", int((word_df['count'] == occurrences).sum()))


Number of words appearing only once:  40780
Number of words appearing twice:  10712
Number of words appearing three times:  5420


The observation here is that the dictionary can be shrunk by about 57K words (out of 82.5K) if we are OK with dropping words that appear three times or fewer.

## Cleanup
Look at the set of words that appear more than once, and look for ways to normalize them.

In [112]:
display(word_df.head(10))

Unnamed: 0,word,count
0,.,160674
1,the,136724
2,",",113719
3,to,87354
4,and,68416
5,of,64504
6,a,64153
7,is,46103
8,in,42008
9,that,40754


### Look for one letter words

In [164]:
def get_codepoint(word):
    """Describe the Unicode code point(s) of a one- or two-character token.

    Args:
        word: The token to inspect; anything supporting ``len`` and iteration
            over characters.

    Returns:
        The integer code point for a single character, the two code points
        joined by a comma (as a string) for a two-character token, and the
        integer 0 for any other length.
    """
    size = len(word)
    if size not in (1, 2):
        return 0
    if size == 1:
        return ord(word)
    return ",".join(str(ord(letter)) for letter in word)
    
def is_emoji(char):
    """Heuristically tell whether a single character is an emoji or symbol.

    A character counts as an emoji when its code point lies in one of three
    hand-picked ranges: above 0x1F000, within 0xFE00-0xFEFF, or within
    0x2000-0x2800.

    NOTE(review): the 0x2000-0x2800 range is broad and presumably also covers
    typographic punctuation such as curly quotes and dashes - confirm that
    this is acceptable for the normalization step.

    Args:
        char: A one-character string (``ord`` rejects anything else).

    Returns:
        True when the code point falls in one of the ranges, False otherwise.
    """
    val = ord(char)
    # The upper bound of the last range used to be the decimal literal 2800,
    # which lies below 0x2000 and made that range impossible to match.
    return (
        val > 0x1F000
        or 0xFE00 <= val <= 0xFEFF
        or 0x2000 <= val <= 0x2800
    )

import string

def is_punct(char):
    """Tell whether ``char`` is a single ASCII punctuation character.

    Args:
        char: The string to test; expected to be one character long.

    Returns:
        True only when ``char`` is exactly one character and that character
        is listed in ``string.punctuation``.
    """
    # `in` on a string is a substring test, so without the length check the
    # empty string and runs such as '!"' would also count as punctuation.
    return len(char) == 1 and char in string.punctuation

In [165]:
# Annotate every vocabulary entry with its length, its code point(s), and
# whether its first character is punctuation or an emoji. Each value depends
# only on the word itself, so the 'word' column is mapped directly instead of
# building a row object per entry with DataFrame.apply(axis=1).
word_df['sz'] = word_df['word'].map(len)
word_df['unicode'] = word_df['word'].map(get_codepoint)
word_df['punct'] = word_df['word'].map(lambda word: is_punct(word[0]))
word_df['emoji'] = word_df['word'].map(lambda word: is_emoji(word[0]))
# Show the single-character tokens.
word_df[word_df['sz'] == 1]

Unnamed: 0,word,count,sz,unicode,emoji,punct
0,.,160674,1,46,False,True
2,",",113719,1,44,False,True
6,a,64153,1,97,False,False
10,I,40300,1,73,False,False
14,?,23930,1,63,False,True
29,!,14405,1,33,False,True
48,),8939,1,41,False,True
53,(,8274,1,40,False,True
62,:,7403,1,58,False,True
77,$,5323,1,36,False,True


We see the following:

- Punctuation marks
- Parentheses
- Emojis

Emoji code points seem to start around 126976 (0x1F000).

- https://emojipedia.org/emoji/
- https://unicode.org/emoji/charts/full-emoji-list.html

### Look for two-letter words

In [166]:
# Keep only the tokens that are exactly two characters long ('sz' holds the
# token length) and show them; the frame keeps word_df's frequency ordering.
two_letter_words = word_df[word_df['sz'] == 2]
display(two_letter_words)
# Earlier display attempts, kept for reference.
#two_letter_words.head(100)
#display(sorted(two_letter_words, key=lambda kv: kv[1], reverse=True))


Unnamed: 0,word,count,sz,unicode,emoji,punct
3,to,87354,2,116111,False,False
5,of,64504,2,111102,False,False
7,is,46103,2,105115,False,False
8,in,42008,2,105110,False,False
13,it,27463,2,105116,False,False
16,'s,23318,2,39115,False,True
19,be,20671,2,98101,False,False
21,on,20275,2,111110,False,False
22,'',18251,2,3939,False,True
23,``,17918,2,9696,False,True


In [152]:
print(len(two_letter_words), " words with 2-letters")

1111  words with 2-letters
