# A Tour of Natural Language Processing

In [20]:
import pandas as pd
from nltk.tokenize import word_tokenize # workhorse for tokenizing each sentence --> individual words
import nltk
nltk.download('punkt')                  # Required by word_tokenize(). 'punkt' is a sentence-boundary detector, an
                                        # implementation of this paper: https://dl.acm.org/citation.cfm?id=1245122.
                                        # Source of word_tokenize():
                                        # https://www.nltk.org/_modules/nltk/tokenize.html#word_tokenize.
                                        # It first splits its input into sentences, and then each sentence into words.

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\r\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Load the training data; 'train.csv' is resolved relative to the notebook's working directory.
df = pd.read_csv('train.csv')

In [4]:
# Preview the first rows to confirm the file loaded as expected.
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [5]:
# (number of rows, number of columns)
df.shape

(159571, 8)

# Clean the data
Remove generic text that can safely be ignored (URLs, @-mentions, and unusual punctuation), then lowercase everything.

In [11]:
# do some standard cleaning
def standardize_text(df, text_field):
    """Clean the text column ``text_field`` of ``df`` and return ``df``.

    The column is rewritten in place on the DataFrame that was passed in
    (the same object is returned), applying these steps in order:

    1. drop URLs (``http`` followed by any run of non-whitespace),
    2. drop any bare ``http`` left over,
    3. drop @-mentions (``@`` followed by non-whitespace),
    4. replace every character outside the allowed set (letters, digits,
       a few punctuation marks, underscore, newline) with a space,
    5. spell out any remaining ``@`` as ``at``,
    6. lowercase the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column; it is modified in place.
    text_field : str
        Name of the string column to clean.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``text_field`` cleaned.
    """
    # (pattern, replacement) pairs; order matters, e.g. mentions must be
    # removed before the leftover "@" characters are turned into "at".
    substitutions = [
        (r"http\S+", ""),
        (r"http", ""),
        (r"@\S+", ""),
        (r"[^A-Za-z0-9(),!?@\'\`\"\_\n]", " "),
        (r"@", "at"),
    ]

    cleaned = df[text_field]
    for pattern, replacement in substitutions:
        # regex=True must be explicit: since pandas 2.0 the default is
        # regex=False, which would treat these patterns as literal strings.
        cleaned = cleaned.str.replace(pattern, replacement, regex=True)

    df[text_field] = cleaned.str.lower()
    return df
# Clean the raw comments; the 'comment_text' column is overwritten with the cleaned text.
df = standardize_text(df, 'comment_text')

In [12]:
# Check the cleaning result: the comments should now be lowercase.
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,explanation\nwhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,d'aww! he matches this background colour i'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"hey man, i'm really not trying to edit war it...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nmore\ni can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"you, sir, are my hero any chance you remember...",0,0,0,0,0,0


# Tokenize the text data
Tokenizing refers to splitting text into lists of individual words (tokens). This can be a difficult process, and it is language dependent. The Python NLTK package represents a lot of work on tokenizing English, so I simply rely on it. Note, however, that its output contains some apparent errors.

In [13]:
from nltk.tokenize import word_tokenize

In [16]:
# Tokenize each cleaned comment; the new 'tokens' column holds one list of word tokens per row.
df['tokens'] = df['comment_text'].apply(word_tokenize)

In [17]:
# Inspect the new 'tokens' column next to the cleaned text.
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,tokens
0,0000997932d777bf,explanation\nwhy the edits made under my usern...,0,0,0,0,0,0,"[explanation, why, the, edits, made, under, my..."
1,000103f0d9cfb60f,d'aww! he matches this background colour i'm s...,0,0,0,0,0,0,"[d'aww, !, he, matches, this, background, colo..."
2,000113f07ec002fd,"hey man, i'm really not trying to edit war it...",0,0,0,0,0,0,"[hey, man, ,, i, 'm, really, not, trying, to, ..."
3,0001b41b1c6bb37e,"""\nmore\ni can't make any real suggestions on ...",0,0,0,0,0,0,"[``, more, i, ca, n't, make, any, real, sugges..."
4,0001d958c54c6e35,"you, sir, are my hero any chance you remember...",0,0,0,0,0,0,"[you, ,, sir, ,, are, my, hero, any, chance, y..."


In [18]:
# Note row 3: the word "can't" is tokenized into two separate tokens: "ca" and "n't". At first glance that looks like an error,
# but it is deliberate: NLTK's word tokenizer follows the Penn Treebank convention of splitting contractions, so "n't" stands for "not".

# Tour of Models

## Bag of Words
The simplest method is to use a bag of words: each comment is represented by the counts of the words it contains, ignoring word order.

In [None]:
# Tokenize every comment individually: word_tokenize() expects a single string, not a whole column.
# This produces the same lists as the 'tokens' column built earlier from df['comment_text'].
tokenized_text = [word_tokenize(comment) for comment in df['comment_text']]
