In [0]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
plt.style.use("ggplot")

In [15]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
#filename = '/content/drive/My Drive/imbalance-classification/mammography.csv'

path = '/content/drive/My Drive/toxic_comment/'

TRAIN_DATA_FILE = path + 'train.csv'
#TEST_DATA_FILE = path + 'test.csv'

In [0]:
train_df = pd.read_csv(TRAIN_DATA_FILE)

In [18]:
print('Processing text dataset')
from nltk.tokenize import WordPunctTokenizer
from collections import Counter
from string import punctuation, ascii_lowercase
import regex as re
from tqdm import tqdm

Processing text dataset


In [0]:
# replace urls
re_url = re.compile(r"((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\
                    .([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*",
                    re.MULTILINE|re.UNICODE)

In [0]:
# replace ips
re_ip = re.compile("\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")

In [0]:
# setup tokenizer
tokenizer = WordPunctTokenizer()

In [0]:
vocab = Counter()

In [0]:
def text_to_wordlist(text, lower=False):
    # replace URLs
    text = re_url.sub("URL", text)
    
    # replace IPs
    text = re_ip.sub("IPADDRESS", text)
    
    # Tokenize
    text = tokenizer.tokenize(text)
    
    # optional: lower case
    if lower:
        text = [t.lower() for t in text]
    
    # Return a list of words
    vocab.update(text)
    return text

In [0]:
def process_comments(list_sentences, lower=False):
    comments = []
    for text in tqdm(list_sentences):
        txt = text_to_wordlist(text, lower=lower)
        comments.append(txt)
    return comments

In [25]:
list_sentences_train = list(train_df["comment_text"].fillna("NAN_WORD").values)
#list_sentences_test = list(test_df["comment_text"].fillna("NAN_WORD").values)

comments = process_comments(list_sentences_train , lower=True)

100%|██████████| 159571/159571 [00:10<00:00, 15720.15it/s]


In [26]:
print("The vocabulary contains {} unique tokens".format(len(vocab)))

The vocabulary contains 195325 unique tokens


In [18]:
comments[10]

['"',
 'fair',
 'use',
 'rationale',
 'for',
 'image',
 ':',
 'wonju',
 '.',
 'jpg',
 'thanks',
 'for',
 'uploading',
 'image',
 ':',
 'wonju',
 '.',
 'jpg',
 '.',
 'i',
 'notice',
 'the',
 'image',
 'page',
 'specifies',
 'that',
 'the',
 'image',
 'is',
 'being',
 'used',
 'under',
 'fair',
 'use',
 'but',
 'there',
 'is',
 'no',
 'explanation',
 'or',
 'rationale',
 'as',
 'to',
 'why',
 'its',
 'use',
 'in',
 'wikipedia',
 'articles',
 'constitutes',
 'fair',
 'use',
 '.',
 'in',
 'addition',
 'to',
 'the',
 'boilerplate',
 'fair',
 'use',
 'template',
 ',',
 'you',
 'must',
 'also',
 'write',
 'out',
 'on',
 'the',
 'image',
 'description',
 'page',
 'a',
 'specific',
 'explanation',
 'or',
 'rationale',
 'for',
 'why',
 'using',
 'this',
 'image',
 'in',
 'each',
 'article',
 'is',
 'consistent',
 'with',
 'fair',
 'use',
 '.',
 'please',
 'go',
 'to',
 'the',
 'image',
 'description',
 'page',
 'and',
 'edit',
 'it',
 'to',
 'include',
 'a',
 'fair',
 'use',
 'rationale',
 '.',


# Model The Word Vectors With Gensim

In [0]:
from gensim.models import Word2Vec

In [0]:
model = Word2Vec(comments, size=100, window=5, min_count=5, workers=16, sg=0, negative=5)

In [0]:
word_vectors = model.wv

In [30]:
print("Number of word vectors: {}".format(len(word_vectors.vocab)))

Number of word vectors: 44264


In [31]:
model.wv.most_similar_cosmul(positive=['woman', 'king'], negative=['man'])

[('prince', 1.041556715965271),
 ('queen', 1.0297127962112427),
 ('bishop', 0.9998652935028076),
 ('princess', 0.996989905834198),
 ('jordan', 0.9941806197166443),
 ('edward', 0.9867451786994934),
 ('mayor', 0.9847586154937744),
 ('sultan', 0.9823850989341736),
 ('henry', 0.9803484082221985),
 ('mary', 0.979623556137085)]

# Initialize The Embeddings In Keras

In [0]:
MAX_NB_WORDS = len(word_vectors.vocab)
MAX_SEQUENCE_LENGTH = 200

In [0]:
from keras.preprocessing.sequence import pad_sequences

In [0]:
word_index = {t[0]: i+1 for i,t in enumerate(vocab.most_common(MAX_NB_WORDS))}
sequences = [[word_index.get(t, 0) for t in comment]
             for comment in comments[:len(list_sentences_train)]]
test_sequences = [[word_index.get(t, 0)  for t in comment] 
                  for comment in comments[len(list_sentences_train):]]

In [28]:
sequences

[[722,
  89,
  2,
  146,
  149,
  200,
  41,
  708,
  4551,
  11593,
  1138,
  99,
  357,
  34,
  61,
  2301,
  10,
  30,
  11594,
  3,
  66,
  6911,
  20,
  74,
  2818,
  167,
  5,
  2989,
  47,
  137,
  1244,
  15421,
  2852,
  1,
  7,
  59,
  72,
  10,
  30,
  271,
  2,
  392,
  43,
  2,
  52,
  40,
  163,
  5,
  10,
  82,
  3425,
  102,
  1,
  134],
 [184,
  10,
  16739,
  51,
  63,
  2689,
  18,
  586,
  3834,
  5,
  10,
  82,
  4552,
  2760,
  31,
  1,
  110,
  1,
  23,
  52,
  37,
  908,
  25,
  3400,
  3,
  1023,
  660,
  3,
  8346,
  23,
  205,
  37],
 [441,
  453,
  3,
  5,
  10,
  82,
  154,
  19,
  277,
  4,
  88,
  342,
  1,
  13,
  10,
  26,
  66,
  12,
  18,
  640,
  11,
  2343,
  526,
  536,
  120,
  7,
  645,
  4,
  49,
  352,
  146,
  390,
  6,
  41,
  52,
  40,
  1,
  63,
  236,
  4,
  463,
  73,
  50,
  2,
  2462,
  109,
  2,
  769,
  502,
  1],
 [21,
  73,
  5,
  46,
  10,
  30,
  113,
  69,
  365,
  1486,
  20,
  2162,
  17,
  5,
  5992,
  32,
  2,
  142,
  2548,


In [35]:
# pad
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, 
                     padding="pre", truncating="post")
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train_df[list_classes].values
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', y.shape)

test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding="pre",
                          truncating="post")
print('Shape of test_data tensor:', test_data.shape)

Shape of data tensor: (159571, 200)
Shape of label tensor: (159571, 6)
Shape of test_data tensor: (0, 200)


In [0]:
WV_DIM = 100
nb_words = min(MAX_NB_WORDS, len(word_vectors.vocab))
# we initialize the matrix with random numbers
wv_matrix = (np.random.rand(nb_words, WV_DIM) - 0.5) / 5.0
for word, i in word_index.items():
    if i >= MAX_NB_WORDS:
        continue
    try:
        embedding_vector = word_vectors[word]
        # words not found in embedding index will be all-zeros.
        wv_matrix[i] = embedding_vector
    except:
        pass        

In [0]:
from keras.layers import Dense, Input, LSTM, Embedding, Dropout,SpatialDropout1D, Bidirectional
from keras.models import Model
from keras.optimizers import Adam
from keras.layers.normalization import BatchNormalization

In [38]:
wv_layer = Embedding(nb_words,
                     WV_DIM,
                     mask_zero=False,
                     weights=[wv_matrix],
                     input_length=MAX_SEQUENCE_LENGTH,
                     trainable=False)

# Inputs
comment_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = wv_layer(comment_input)

# biGRU
embedded_sequences = SpatialDropout1D(0.2)(embedded_sequences)
x = Bidirectional(LSTM(64, return_sequences=False))(embedded_sequences)

# Output
x = Dropout(0.2)(x)
x = BatchNormalization()(x)
preds = Dense(6, activation='sigmoid')(x)

# build the model
model = Model(inputs=[comment_input], outputs=preds)
model.compile(loss='binary_crossentropy',
              optimizer=Adam(lr=0.001, clipnorm=.25, beta_1=0.7, beta_2=0.99),
              metrics=[])


Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [39]:
hist = model.fit([data], y, validation_split=0.1,
                 epochs=10, batch_size=256, shuffle=True)

Train on 143613 samples, validate on 15958 samples
Epoch 1/10

InvalidArgumentError: ignored

In [0]:
history = pd.DataFrame(hist.history)
plt.figure(figsize=(12,12));
plt.plot(history["loss"]);
plt.plot(history["val_loss"]);
plt.title("Loss with pretrained word vectors");
plt.show();


In [0]:
wv_layer = Embedding(nb_words,
                     WV_DIM,
                     mask_zero=False,
                     # weights=[wv_matrix],
                     input_length=MAX_SEQUENCE_LENGTH,
                     trainable=False)

# Inputs
comment_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = wv_layer(comment_input)

# biGRU
embedded_sequences = SpatialDropout1D(0.2)(embedded_sequences)
x = Bidirectional(LSTM(64, return_sequences=False))(embedded_sequences)

# Output
x = Dropout(0.2)(x)
x = BatchNormalization()(x)
preds = Dense(6, activation='sigmoid')(x)

# build the model
model = Model(inputs=[comment_input], outputs=preds)
model.compile(loss='binary_crossentropy',
              optimizer=Adam(lr=0.001, clipnorm=.25, beta_1=0.7, beta_2=0.99),
              metrics=[])

In [0]:
hist = model.fit([data], y, validation_split=0.1,
                 epochs=10, batch_size=256, shuffle=True)

In [0]:
history = pd.DataFrame(hist.history)
plt.figure(figsize=(12,12));
plt.plot(history["loss"]);
plt.plot(history["val_loss"]);
plt.title("Loss with random word vectors");
plt.show();

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use("ggplot")
%matplotlib inline

In [0]:
train_df = pd.read_csv("/content/drive/My Drive/toxic_comment/train.csv").fillna("sterby")

In [3]:
train_df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [0]:
X_train = train_df["comment_text"].values
y_train = train_df[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values
#X_test = test_df["comment_text"].values

In [0]:
from keras.preprocessing import sequence
from keras.models import Model, Input
from keras.layers import Dense, Embedding, GlobalMaxPooling1D
from keras.preprocessing.text import Tokenizer
from keras.optimizers import Adam

In [0]:
max_features = 20000  # number of words we want to keep
maxlen = 100  # max length of the comments in the model
batch_size = 64  # batch size for the model
embedding_dims = 20  # dimension of the hidden variable, i.e. the embedding dimension

In [8]:
tok = Tokenizer(num_words=max_features)
tok.fit_on_texts(list(X_train))
x_train = tok.texts_to_sequences(X_train)
#x_test = tok.texts_to_sequences(X_test)
print(len(x_train), 'train sequences')
#print(len(x_test), 'test sequences')
print('Average train sequence length: {}'.format(np.mean(list(map(len, x_train)), dtype=int)))
#print('Average test sequence length: {}'.format(np.mean(list(map(len, x_test)), dtype=int)))

159571 train sequences
Average train sequence length: 65


In [9]:
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
#x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
#print('x_test shape:', x_test.shape)

x_train shape: (159571, 100)


In [0]:
comment_input = Input((maxlen,))

# we start off with an efficient embedding layer which maps
# our vocab indices into embedding_dims dimensions
comment_emb = Embedding(max_features, embedding_dims, input_length=maxlen, 
                        embeddings_initializer="uniform")(comment_input)

# we add a GlobalMaxPooling1D, which will extract features from the embeddings
# of all words in the comment
h = GlobalMaxPooling1D()(comment_emb)

# We project onto a six-unit output layer, and squash it with a sigmoid:
output = Dense(6, activation='sigmoid')(h)

model = Model(inputs=comment_input, outputs=output)

model.compile(loss='binary_crossentropy',
              optimizer=Adam(0.01),
              metrics=['accuracy'])

In [12]:
hist = model.fit(x_train, y_train, batch_size=batch_size, epochs=3, validation_split=0.1)




Train on 143613 samples, validate on 15958 samples
Epoch 1/3





Epoch 2/3
Epoch 3/3
