In [15]:
from nltk.tokenize import RegexpTokenizer
from nltk import pos_tag, word_tokenize
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

import numpy as np
import pandas as pd
import time

import progressbar

In [2]:
# Read the three CSV files of the dataset into DataFrames in one pass.
train, test, test_labels = (
    pd.read_csv(csv_path)
    for csv_path in (
        './dataset/train.csv',
        './dataset/test.csv',
        './dataset/test_labels.csv',
    )
)

# Show (rows, columns) of each frame, one per line, as a quick sanity check.
print(*(frame.shape for frame in (train, test, test_labels)), sep="\n")

(159571, 8)
(153164, 2)
(153164, 7)


In [3]:
# Preview the first five rows of the training frame.
train.head(n=5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [4]:
# Preview the first five rows of the test frame.
test.head(n=5)

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


## Pre-processing

In [8]:
# Raw comment text of the train and test frames; these are the inputs to the
# lemmatization and stemming steps below.
X, X_TEST = train['comment_text'], test['comment_text']

#### Lemmatization

In [16]:
def lemmatize_all(sentence):
    """Yield the tokens of ``sentence``, lemmatized according to their POS tag.

    The sentence is tokenized with ``word_tokenize`` and tagged with
    ``pos_tag``. A token whose tag starts with one of the prefixes below is
    passed to ``WordNetLemmatizer.lemmatize`` with the matching ``pos``
    argument; any other token is yielded unchanged.

    Args:
        sentence: The text to tokenize and lemmatize.

    Yields:
        One string per token, in token order.
    """
    # Tag prefix -> ``pos`` argument handed to the lemmatizer.
    # NOTE(review): the prefixes look like Penn Treebank tags (noun, verb,
    # adjective, adverb) — confirm against the tagger in use.
    prefix_to_pos = (("NN", 'n'), ('VB', 'v'), ('JJ', 'a'), ('R', 'r'))
    lemmatizer = WordNetLemmatizer()

    for token, tag in pos_tag(word_tokenize(sentence)):
        # First matching prefix wins; None means "no lemmatization rule".
        wordnet_pos = next(
            (pos for prefix, pos in prefix_to_pos if tag.startswith(prefix)),
            None,
        )
        if wordnet_pos is None:
            yield token
        else:
            yield lemmatizer.lemmatize(token, pos=wordnet_pos)

In [17]:
# Empty, independent containers for the lemmatized train and test comments.
X_lemma, X_TEST_lemma = [], []

In [24]:
# Lemmatize every train and test comment into X_lemma / X_TEST_lemma.
#
# Each list is built in a single assignment so that re-running this cell (for
# example after an interrupted run) replaces the previous result instead of
# appending a second copy of every comment, and so that an interruption never
# leaves a half-filled list behind.
# .iloc is positional, so the loops do not depend on X / X_TEST carrying a
# default 0..n-1 index.
print("Train data lemmatization begins")
X_lemma = [
    " ".join(lemmatize_all(str(X.iloc[i])))
    for i in progressbar.progressbar(range(len(X)))
]
print("Train data lemmatization ends")
print("Test data lemmatization begins")
X_TEST_lemma = [
    " ".join(lemmatize_all(str(X_TEST.iloc[i])))
    for i in progressbar.progressbar(range(len(X_TEST)))
]
print("Test data lemmatization ends")

  0% (43 of 159571) |                    | Elapsed Time: 0:00:00 ETA:   0:12:49

Train data lemmatization begins


100% (159571 of 159571) |################| Elapsed Time: 0:10:56 Time:  0:10:56
  0% (33 of 153164) |                    | Elapsed Time: 0:00:00 ETA:   0:07:51

Train data lemmatization ends
Test data lemmatization begins


  9% (14138 of 153164) |#                | Elapsed Time: 0:00:54 ETA:   0:08:59

KeyboardInterrupt: 

#### Stemming

In [None]:
# Empty, independent containers for the stemmed train and test comments.
X_stem, X_TEST_stem = [], []

In [None]:
# Stem every train and test comment into X_stem / X_TEST_stem.
#
# Each list is built in a single assignment so that re-running this cell
# replaces the previous result instead of appending a second copy of every
# comment, and so that an interruption never leaves a half-filled list behind.
# .iloc is positional, so the loops do not depend on X / X_TEST carrying a
# default 0..n-1 index.
ps = PorterStemmer()

print("Train data Stemming begins")
X_stem = [
    " ".join(map(ps.stem, word_tokenize(str(X.iloc[i]))))
    for i in progressbar.progressbar(range(len(X)))
]
print("Train data Stemming ends")
print("Test data Stemming begins")
X_TEST_stem = [
    " ".join(map(ps.stem, word_tokenize(str(X_TEST.iloc[i]))))
    for i in progressbar.progressbar(range(len(X_TEST)))
]
print("Test data Stemming ends")