In [2]:
import os
import time

import numpy as np
import pandas as pd
import progressbar
from nltk import pos_tag, word_tokenize
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC, LinearSVC

In [3]:
# Read the three raw CSV files (training split, test split, test labels),
# in that order.
train, test, test_labels = map(
    pd.read_csv,
    ('./dataset/train.csv', './dataset/test.csv', './dataset/test_labels.csv'),
)

# Report the (rows, columns) shape of each frame, one per line.
print(train.shape, test.shape, test_labels.shape, sep='\n')

(159571, 8)
(153164, 2)
(153164, 7)


In [4]:
# Preview the first five rows of the training frame.
train.head(n=5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [5]:
# Preview the first five rows of the test frame.
test.head(n=5)

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


## Pre-processing

In [6]:
# Raw comment text of each split; the lemmatization and stemming cells
# below read from these two Series.
X, X_TEST = train['comment_text'], test['comment_text']

#### Lemmatization

In [7]:
def lemmatize_all(sentence):
    """Lazily yield the WordNet lemma of every token in ``sentence``.

    The text is split with ``word_tokenize`` and tagged with ``pos_tag``.
    Each token is then lemmatized with the WordNet part of speech that
    corresponds to the first two characters of its tag; tokens whose tag has
    no counterpart in the table below are yielded unchanged.

    Parameters
    ----------
    sentence : str
        Raw text to lemmatize.

    Yields
    ------
    str
        One lemma (or the untouched token) per token, in input order.
    """
    # Tag prefix -> WordNet POS code.  Assumes Penn Treebank tags (NLTK's
    # default English tagset).  Adverbs are matched on 'RB' (RB/RBR/RBS)
    # rather than a bare 'R', so particles tagged 'RP' pass through instead
    # of being looked up as adverbs.
    wordnet_pos_by_prefix = {'NN': 'n', 'VB': 'v', 'JJ': 'a', 'RB': 'r'}

    wnl = WordNetLemmatizer()
    for word, tag in pos_tag(word_tokenize(sentence)):
        # All prefixes are two characters long, so a slice + dict lookup is
        # equivalent to a chain of startswith() checks.
        pos = wordnet_pos_by_prefix.get(tag[:2])
        if pos is None:
            yield word
        else:
            yield wnl.lemmatize(word, pos=pos)

In [8]:
# Empty accumulators for the lemmatized training and test comments.
X_lemma, X_TEST_lemma = [], []

In [9]:
# Lemmatize every comment of both splits.
#
# The result lists are rebuilt from scratch in this cell rather than appended
# to, so re-running the cell cannot duplicate entries (which would make the
# lists longer than the frames they are later attached to).  The Series are
# iterated by value, so the loop does not depend on the index labels.
# str() guards against non-string cells.

# Train data lemmatization begins
X_lemma = [
    " ".join(lemmatize_all(str(comment)))
    for comment in progressbar.progressbar(X)
]
print ("Train data lemmatization ends")

# Test data lemmatization begins
X_TEST_lemma = [
    " ".join(lemmatize_all(str(comment)))
    for comment in progressbar.progressbar(X_TEST)
]
print ("Test data lemmatization ends")

N/A% (0 of 159571) |                     | Elapsed Time: 0:00:00 ETA:  --:--:--

Train data lemmatization begins


100% (159571 of 159571) |################| Elapsed Time: 0:13:30 Time:  0:13:30
  0% (30 of 153164) |                    | Elapsed Time: 0:00:00 ETA:   0:08:50

Train data lemmatization ends
Test data lemmatization begins


100% (153164 of 153164) |################| Elapsed Time: 0:11:42 Time:  0:11:42


Test data lemmatization ends


#### Stemming

In [24]:
# Empty accumulators for the stemmed training and test comments.
X_stem, X_TEST_stem = [], []

In [25]:
ps = PorterStemmer()

# Positional row number of the one training comment that is truncated to its
# first line before stemming.
# NOTE(review): the reason for this special case is not recorded in the
# notebook (presumably the full comment is problematic for the stemmer) --
# confirm before changing or removing it.
TRUNCATED_TRAIN_ROW = 115606

# Both result lists are rebuilt from scratch in this cell, so re-running it
# cannot duplicate entries.  The Series are iterated by value; `row` is the
# positional row number, which equals the index label for the default
# RangeIndex produced by read_csv.

# Train data Stemming begins
X_stem = []
for row, comment in enumerate(progressbar.progressbar(X)):
    text = str(comment)  # str() guards against non-string cells
    if row == TRUNCATED_TRAIN_ROW:
        text = text.split('\n')[0]
    X_stem.append(" ".join(map(ps.stem, word_tokenize(text))))
print ("Train data Stemming ends")

# Test data Stemming begins
X_TEST_stem = [
    " ".join(map(ps.stem, word_tokenize(str(comment))))
    for comment in progressbar.progressbar(X_TEST)
]
print ("Test data Stemming ends")

100% (159571 of 159571) |################| Elapsed Time: 0:05:34 Time:  0:05:34
  0% (56 of 153164) |                    | Elapsed Time: 0:00:00 ETA:   0:04:44

Train data Stemming ends


100% (153164 of 153164) |################| Elapsed Time: 0:04:52 Time:  0:04:52


Test data Stemming ends


In [26]:
# Sanity check: stemming must have produced exactly one string per training
# comment.  A mismatch means the stemming cell was run more than once (or only
# partially) against the same list.
assert len(X_stem) == len(X), "X_stem is out of sync with X; rebuild it before continuing"
len(X_stem)

159571

### Saving the Preprocessed Data to Files

In [27]:
# Work on copies so the frames loaded from disk stay unmodified when the
# derived text columns are added.
train_new, test_new = train.copy(), test.copy()

In [28]:
# Attach the lemmatized and stemmed text to the training copy ...
train_new['comment_lemma'] = X_lemma
train_new['comment_stem'] = X_stem

# ... and to the test copy (same column names, same column order).
test_new['comment_lemma'] = X_TEST_lemma
test_new['comment_stem'] = X_TEST_stem

In [29]:
# Preview the first five rows of the augmented training frame.
train_new.head(n=5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,comment_lemma,comment_stem
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,Explanation Why the edits make under my userna...,explan whi the edit made under my usernam hard...
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,D'aww ! He match this background colour I 'm s...,d'aww ! He match thi background colour I 'm se...
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,"Hey man , I 'm really not try to edit war . It...","hey man , I 'm realli not tri to edit war . It..."
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,`` More I ca n't make any real suggestion on i...,`` more I ca n't make ani real suggest on impr...
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,"You , sir , be my hero . Any chance you rememb...","you , sir , are my hero . ani chanc you rememb..."


In [30]:
# Preview the first five rows of the augmented test frame.
test_new.head(n=5)

Unnamed: 0,id,comment_text,comment_lemma,comment_stem
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...,Yo bitch Ja Rule be more succesful then you 'l...,Yo bitch Ja rule is more succes then you 'll e...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...,"== From RfC == The title be fine as it be , IMO .","== from rfc == the titl is fine as it is , imo ."
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap...",`` == Sources == * Zawe Ashton on Lapland — / ``,`` == sourc == * zaw ashton on lapland — / ``
3,00017563c3f7919a,":If you have a look back at the source, the in...",": If you have a look back at the source , the ...",": If you have a look back at the sourc , the i..."
4,00017695ad8997eb,I don't anonymously edit articles at all.,I do n't anonymously edit article at all .,I do n't anonym edit articl at all .


In [35]:
# Persist the augmented training frame.  The output directory is created
# first because to_csv does not create missing parent directories.
os.makedirs('./preprocessed', exist_ok=True)
train_new.to_csv('./preprocessed/train.csv', index=False)

In [34]:
# Persist the augmented test frame.  The output directory is created here as
# well, so this cell also works when it is run on its own.
os.makedirs('./preprocessed', exist_ok=True)
test_new.to_csv('./preprocessed/test.csv', index=False)