In [1]:
import re
import numpy as np
from nltk import WordNetLemmatizer
from nltk import pos_tag, word_tokenize
from nltk.corpus import stopwords

from keras.preprocessing import text, sequence
from sklearn.model_selection import train_test_split
import itertools


# Vocabulary cap: passed as `num_words` to the Keras tokenizer and used as the
# upper bound on the number of rows in the embedding matrices built below.
maxWords = 30000
# Sequence length cap: `maxlen` for pad_sequences and the token limit applied
# in preprocess().
maxLengthInWords = 400

# Single WordNet lemmatizer instance shared by every lemmatize() call.
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    """Clean, tokenize, truncate, stop-word-filter and lemmatize one comment.

    Args:
        text: Raw comment string.

    Returns:
        Whatever ``lemmatize`` returns for the first ``maxLengthInWords``
        tokens of the cleaned text, with English stop words removed.

    Note:
        Cleaning is delegated to ``clean_and_save`` (the only cleaning routine
        in this notebook), so every call also appends the cleaned text to
        ``corpus-clean.txt``.
    """
    # The original called an undefined `clean`, which raised NameError.
    cleanedText = clean_and_save(text)
    # Materialise the truncated tokens so later stages get a reusable list
    # rather than a one-shot iterator.
    tokenizedText = list(itertools.islice(word_tokenize(cleanedText), maxLengthInWords))
    withoutStopwords = removeStopwords(tokenizedText)
    lemmatizedText = lemmatize(withoutStopwords)

    return lemmatizedText

def clean_and_save(text):
    """Normalise a raw comment and append the result to ``corpus-clean.txt``.

    The text is lower-cased, common English contractions are expanded, every
    non-word character is replaced by a space, runs of whitespace are
    collapsed, and surrounding spaces are stripped.

    Args:
        text: Raw comment string.

    Returns:
        The cleaned string (the same value that is written to the corpus
        file, without the trailing newline).

    Side effects:
        Appends one line to ``corpus-clean.txt`` in the current working
        directory on every call.
    """
    text = text.lower()
    text = re.sub(r"what's", "what is ", text)
    # Must run before the generic 's rule: that rule would consume the
    # leading "'s" and leave "cuse", making this substitution unreachable.
    text = re.sub(r"\'scuse", " excuse ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    # "can't" has to be handled before the generic "n't" rule to yield "cannot".
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    # Raw strings: '\W' and '\s' are invalid escapes in plain string literals.
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.strip(' ')

    # Context manager closes the handle on every call (the original leaked
    # it); explicit UTF-8 keeps non-ASCII comments writable on any platform.
    with open('corpus-clean.txt', 'a', encoding='utf-8') as corpus_file:
        corpus_file.write(text + '\n')

    return text

def removeStopwords(words):
    """Return the items of ``words`` that are not English stop words.

    Args:
        words: Iterable of token strings.

    Returns:
        A list with the non-stop-word tokens, in their original order.
    """
    # Load the stop-word list once per call and use a set for membership
    # tests; the original re-evaluated stopwords.words('english') and scanned
    # the resulting list for every single token.
    englishStopwords = frozenset(stopwords.words('english'))
    return [word for word in words if word not in englishStopwords]

def lemmatize(text):
    """Lemmatize every token according to its part-of-speech tag.

    Tokens are tagged with ``pos_tag``; tags starting with ``NN``, ``VB``,
    ``JJ`` or ``R`` are lemmatized as WordNet noun, verb, adjective or adverb
    respectively. Tokens with any other tag are kept unchanged.

    Args:
        text: Sequence of token strings.

    Returns:
        A list with one lemma per input token, in input order (an empty list
        for empty input). The original returned from inside the loop and so
        produced only the first token's lemma, or ``None`` for empty input.
    """
    # Checked in order, mirroring the original if/elif chain.
    tagPrefixToWordnetPos = (("NN", 'n'), ('VB', 'v'), ('JJ', 'a'), ('R', 'r'))

    lemmas = []
    for word, tag in pos_tag(text):
        for tagPrefix, wordnetPos in tagPrefixToWordnetPos:
            if tag.startswith(tagPrefix):
                lemmas.append(lemmatizer.lemmatize(word, pos=wordnetPos))
                break
        else:
            # No lemmatizable part of speech: keep the token as-is.
            lemmas.append(word)
    return lemmas

Using TensorFlow backend.
  return f(*args, **kwds)


In [2]:
import os 
import pandas as pd

# Show complete comment texts when frames are displayed. `None` is the
# current spelling of "no limit"; pandas releases before 1.0 only accept an
# int and used -1 for the same purpose, hence the fallback.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

# Both CSV files are read from the current working directory.
trainTestValData = pd.read_csv(os.path.join(os.getcwd(), 'train.csv'))
submissionData = pd.read_csv(os.path.join(os.getcwd(), 'test.csv'))

In [3]:
# Target columns; selected together from the training frame to build the
# label matrix.
labels = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# Replace missing comments with the literal placeholder 'fillna'.
# The result is assigned back to the column: calling fillna(inplace=True) on
# the object returned by frame['comment_text'] is chained in-place mutation,
# which is not guaranteed to reach the frame (it warns on recent pandas and
# does nothing under Copy-on-Write).
trainTestValData['comment_text'] = trainTestValData['comment_text'].fillna('fillna')
submissionData['comment_text'] = submissionData['comment_text'].fillna('fillna')

In [4]:
# Clean every comment (training frame first, then submission frame) and store
# the result in a new 'preprocessed_text' column; clean_and_save also appends
# each cleaned comment to the corpus file as it goes.
for frame in (trainTestValData, submissionData):
    frame['preprocessed_text'] = frame['comment_text'].apply(clean_and_save)

trainTestValData.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,preprocessed_text
0,0000997932d777bf,"Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27",0,0,0,0,0,0,explanation why the edits made under my username hardcore metallica fan were reverted they were not vandalisms just closure on some gas after i voted at new york dolls fac and please do not remove the template from the talk page since i am retired now 89 205 38 27
1,000103f0d9cfb60f,"D'aww! He matches this background colour I'm seemingly stuck with. Thanks. (talk) 21:51, January 11, 2016 (UTC)",0,0,0,0,0,0,d aww he matches this background colour i am seemingly stuck with thanks talk 21 51 january 11 2016 utc
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info.",0,0,0,0,0,0,hey man i am really not trying to edit war it just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page he seems to care more about the formatting than the actual info
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on improvement - I wondered if the section statistics should be later on, or a subsection of """"types of accidents"""" -I think the references may need tidying so that they are all in the exact same format ie date format etc. I can do that later on, if no-one else does first - if you have any preferences for formatting style on references or want to do it yourself please let me know.\n\nThere appears to be a backlog on articles for review so I guess there may be a delay until a reviewer turns up. It's listed in the relevant form eg Wikipedia:Good_article_nominations#Transport """,0,0,0,0,0,0,more i cannot make any real suggestions on improvement i wondered if the section statistics should be later on or a subsection of types of accidents i think the references may need tidying so that they are all in the exact same format ie date format etc i can do that later on if no one else does first if you have any preferences for formatting style on references or want to do it yourself please let me know there appears to be a backlog on articles for review so i guess there may be a delay until a reviewer turns up it listed in the relevant form eg wikipedia good_article_nominations transport
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember what page that's on?",0,0,0,0,0,0,you sir are my hero any chance you remember what page that on


In [5]:
# Model inputs: the cleaned comment text of every labelled row.
X_train_full = trainTestValData['preprocessed_text']
# Targets: one column per entry of `labels`, taken as the frame's `.values`.
y_train_full = trainTestValData[labels].values

# Cleaned comment text of the rows read from test.csv.
X_submission = submissionData['preprocessed_text']

In [6]:
# Fit one tokenizer on the labelled and the submission texts together.
tokenizer = text.Tokenizer(num_words=maxWords)
tokenizer.fit_on_texts([*X_train_full, *X_submission])

# Turn each text collection into integer sequences and pad them to
# maxLengthInWords. The tuple of inputs is evaluated before either name is
# rebound, so both conversions see the original texts.
X_train_full, X_submission = (
    sequence.pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=maxLengthInWords)
    for texts in (X_train_full, X_submission)
)

# Construct custom fasttext embeddings matrix

In [8]:
# Dimensionality of the word vectors; every row of the matrix has this width.
embeddingSize = 300


def parse(word, *embeddingVector):
    """Turn the fields of one embeddings-file line into (word, float32 vector)."""
    return word, np.asarray(embeddingVector, dtype='float32')


fastTextEmbeddingsFile = os.getcwd() + '/toxic-embeddings.vec'

# Read the file inside a context manager so the handle is closed afterwards.
# Lines that do not consist of a word plus exactly `embeddingSize` values
# (such as a leading "<count> <dim>" header or a blank line) are skipped.
# Otherwise a header would become a bogus word whose 1-element vector would
# be broadcast over a whole matrix row.
with open(fastTextEmbeddingsFile, encoding='utf-8') as embeddingsFile:
    fastTextEmbeddings = dict(
        parse(*fields)
        for fields in (line.rstrip().rsplit(' ') for line in embeddingsFile)
        if len(fields) == embeddingSize + 1
    )

tokenizerDictionary = tokenizer.word_index

# Keras word indices start at 1 (row 0 stays all-zero for padding), so a
# vocabulary of V words needs V + 1 rows. The tokenizer never emits indices
# >= maxWords, which caps the matrix at maxWords rows.
wordsNumber = min(maxWords, len(tokenizerDictionary) + 1)
fastTextEmbeddingsMatrix = np.zeros((wordsNumber, embeddingSize))

for word, i in tokenizerDictionary.items():
    if i >= wordsNumber:
        # `continue` rather than `break`: correctness must not depend on the
        # dictionary being iterated in ascending index order.
        continue

    embeddingVector = fastTextEmbeddings.get(word)

    # Words without a vector keep their all-zero row.
    if embeddingVector is not None:
        fastTextEmbeddingsMatrix[i] = embeddingVector

np.save('custom_fast_text_embeddings.npy', fastTextEmbeddingsMatrix)

# Construct pretrained fasttext embeddings matrix

In [None]:
# Dimensionality of the word vectors; every row of the matrix has this width.
embeddingSize = 300


def parse(word, *embeddingVector):
    """Turn the fields of one embeddings-file line into (word, float32 vector)."""
    return word, np.asarray(embeddingVector, dtype='float32')


fastTextEmbeddingsFile = os.getcwd() + '/crawl-300d-2M.vec'

# Read the file inside a context manager so the handle is closed afterwards.
# Lines that do not consist of a word plus exactly `embeddingSize` values
# (such as a leading "<count> <dim>" header or a blank line) are skipped.
# Otherwise a header would become a bogus word whose 1-element vector would
# be broadcast over a whole matrix row.
with open(fastTextEmbeddingsFile, encoding='utf-8') as embeddingsFile:
    fastTextEmbeddings = dict(
        parse(*fields)
        for fields in (line.rstrip().rsplit(' ') for line in embeddingsFile)
        if len(fields) == embeddingSize + 1
    )

tokenizerDictionary = tokenizer.word_index

# Keras word indices start at 1 (row 0 stays all-zero for padding), so a
# vocabulary of V words needs V + 1 rows. The tokenizer never emits indices
# >= maxWords, which caps the matrix at maxWords rows.
wordsNumber = min(maxWords, len(tokenizerDictionary) + 1)
fastTextEmbeddingsMatrix = np.zeros((wordsNumber, embeddingSize))

for word, i in tokenizerDictionary.items():
    if i >= wordsNumber:
        # `continue` rather than `break`: correctness must not depend on the
        # dictionary being iterated in ascending index order.
        continue

    embeddingVector = fastTextEmbeddings.get(word)

    # Words without a vector keep their all-zero row.
    if embeddingVector is not None:
        fastTextEmbeddingsMatrix[i] = embeddingVector

np.save('fast_text_embeddings.npy', fastTextEmbeddingsMatrix)

In [None]:
def getDatasetSplits(maxWords = 30000, maxSequenceLengthInWords = 400):
    """Split the module-level training arrays into train / validation / test.

    The full data is first split 80/20 into (train + validation) and test;
    the 80% part is then split 80/20 again into train and validation. Both
    splits use ``random_state=256``.

    Args:
        maxWords: Accepted but not used by this function.
        maxSequenceLengthInWords: Accepted but not used by this function.

    Returns:
        A 9-tuple ``(X_train, y_train, X_val, y_val, X_test, y_test,
        X_train_full, y_train_full, X_submission)``; the last three are the
        module-level objects passed through unchanged.
    """
    # X_train_full, y_train_full and X_submission are only read here, so they
    # resolve to the module-level names without a `global` declaration.
    splitOptions = dict(train_size=0.8, random_state=256)

    X_rest, X_test, y_rest, y_test = train_test_split(X_train_full, y_train_full, **splitOptions)
    X_train, X_val, y_train, y_val = train_test_split(X_rest, y_rest, **splitOptions)

    return (
        X_train, y_train,
        X_val, y_val,
        X_test, y_test,
        X_train_full, y_train_full,
        X_submission,
    )
    

In [None]:
# Unpack the nine values returned by getDatasetSplits(): the train,
# validation and test splits, followed by the full training arrays and the
# submission array, which the function passes through unchanged.
X_train, y_train, X_val, y_val, X_test, y_test, X_train_full, y_train_full, X_submission = getDatasetSplits()

In [None]:
# One line per kind of array (inputs, then targets), listing the shapes of
# the train, validation and test splits in that order.
for splits in ((X_train, X_val, X_test), (y_train, y_val, y_test)):
    print(*(split.shape for split in splits))

In [None]:
# Persist every split to its own .npy file in the current working directory,
# in the same order as before: inputs, targets, then the submission inputs.
arraysToSave = (
    ('X_train.npy', X_train),
    ('X_val.npy', X_val),
    ('X_test.npy', X_test),
    ('X_train_full.npy', X_train_full),
    ('y_train.npy', y_train),
    ('y_val.npy', y_val),
    ('y_test.npy', y_test),
    ('y_train_full.npy', y_train_full),
    ('X_submission.npy', X_submission),
)

for fileName, array in arraysToSave:
    np.save(fileName, array)