In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from keras.layers import Dropout, Dense, GRU, LSTM, Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, multilabel_confusion_matrix

device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
df = pd.read_csv('train.csv', engine='python')

In [None]:
df['not_toxic'] = (df.iloc[:, 2:].sum(axis=1) == 0 ).astype(int)

In [None]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,not_toxic
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,1
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,1
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,1
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,1
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,1


In [None]:
#return dictionary {word: embedding_vector}
def load_embedding(fname):
    embeddings_index = {}
    f = open(fname, encoding="utf8")
    for line in f:
        values = line.split()
        word = values[0]
        try:
            embedding_vector = np.asarray(values[1:], dtype='float32')
        except:
            pass
        embeddings_index[word] = embedding_vector
    f.close()

    return embeddings_index

In [None]:
np.random.seed(42)
sample = np.random.choice(df.shape[0], 50000, replace=False)
text = df.comment_text[sample].values
target = df.iloc[sample, 2:].values

N_CLASSES=target.shape[1]
MAX_WORDS=10000
MAX_SEQUENCE_LENGTH=100
EMBEDDING_DIMS = 50
HIDDEN_UNITS=50
HIDDEN_LAYER=2

tokenizer = Tokenizer(num_words=MAX_WORDS) 
tokenizer.fit_on_texts(text)
sequences = tokenizer.texts_to_sequences(text) 
sequences = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

word_index = tokenizer.word_index
#embeddings_index = load_embedding("./glove.6B.50d.txt")

x_train, x_test, y_train, y_test = train_test_split(sequences, target, test_size=0.2, random_state=42)

In [None]:
def build_RNN_model(word_index, embeddings_index, N_CLASSES, MAX_SEQUENCE_LENGTH, EMBEDDING_DIMS, HIDDEN_UNITS, HIDDEN_LAYER):
  print('Build model...')
  model = Sequential()
  embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIMS))

  for word, i in word_index.items():
      embedding_vector = embeddings_index.get(word)
      if embedding_vector is not None:
          embedding_matrix[i] = embedding_vector

  model.add(Embedding(len(word_index) + 1,
                              EMBEDDING_DIMS,
                              weights=[embedding_matrix],
                              input_length=MAX_SEQUENCE_LENGTH,
                              trainable=False))

  for i in range(0, HIDDEN_LAYER):
    model.add(LSTM(HIDDEN_UNITS, return_sequences=True, dropout=0.2, recurrent_dropout=0.2))
  
  model.add(Dense(64, activation='relu'))
  model.add(Dense(N_CLASSES, activation='sigmoid'))

  model.compile(loss='binary_crossentropy',
                    optimizer='adam',
                    metrics=['accuracy'])
  
  return model



In [None]:
model = build_RNN_model(word_index, {}, N_CLASSES, MAX_SEQUENCE_LENGTH, EMBEDDING_DIMS, HIDDEN_UNITS, HIDDEN_LAYER)
model.summary()

Build model...
Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 100, 50)           5121800   
                                                                 
 lstm_5 (LSTM)               (None, 100, 50)           20200     
                                                                 
 lstm_6 (LSTM)               (None, 100, 50)           20200     
                                                                 
 dense_6 (Dense)             (None, 100, 64)           3264      
                                                                 
 dense_7 (Dense)             (None, 100, 7)            455       
                                                                 
Total params: 5,165,919
Trainable params: 44,119
Non-trainable params: 5,121,800
_________________________________________________________________


In [None]:
history = model.fit(x_train, y_train,
              validation_data=(x_test, y_test),
              epochs=3,
              batch_size=32)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [None]:
predictions = model.predict(x_test)
predictions = predictions.round()

In [None]:
print(classification_report(y_test, predictions))
multilabel_confusion_matrix(y_test, predictions)

              precision    recall  f1-score   support

           0       0.86      0.64      0.74       934
           1       0.34      0.12      0.18        89
           2       0.80      0.73      0.77       501
           3       0.00      0.00      0.00        25
           4       0.70      0.65      0.67       485
           5       0.00      0.00      0.00        99
           6       0.96      0.99      0.98      9011

   micro avg       0.94      0.92      0.93     11144
   macro avg       0.52      0.45      0.48     11144
weighted avg       0.92      0.92      0.92     11144
 samples avg       0.95      0.94      0.94     11144



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


array([[[8968,   98],
        [ 334,  600]],

       [[9890,   21],
        [  78,   11]],

       [[9409,   90],
        [ 134,  367]],

       [[9975,    0],
        [  25,    0]],

       [[9382,  133],
        [ 172,  313]],

       [[9901,    0],
        [  99,    0]],

       [[ 637,  352],
        [  89, 8922]]])

In [None]:
!pip install fasttext

Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[?25l[K     |████▊                           | 10 kB 23.5 MB/s eta 0:00:01[K     |█████████▌                      | 20 kB 9.9 MB/s eta 0:00:01[K     |██████████████▎                 | 30 kB 8.5 MB/s eta 0:00:01[K     |███████████████████             | 40 kB 8.0 MB/s eta 0:00:01[K     |███████████████████████▉        | 51 kB 5.5 MB/s eta 0:00:01[K     |████████████████████████████▋   | 61 kB 5.5 MB/s eta 0:00:01[K     |████████████████████████████████| 68 kB 3.3 MB/s 
[?25hCollecting pybind11>=2.2
  Using cached pybind11-2.9.1-py2.py3-none-any.whl (211 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp37-cp37m-linux_x86_64.whl size=3132249 sha256=f0f4087088dcafa23ddcafecb3081d48cc5f8da550ebaa4064f2a152f923a1f8
  Stored in directory: /root/.cache/pip/wheels/4e/ca/bf/b020d2be95f7641801a6597a

In [None]:
import io

def load_vectors(fname):
    fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
    n, d = map(int, fin.readline().split())
    data = {}
    for line in fin:
        tokens = line.rstrip().split(' ')
        data[tokens[0]] =  np.array([float(v) for v in tokens[1:]])
    return data

In [None]:
import fasttext.util
fasttext.util.download_model('en', if_exists='ignore')  # English
ft = fasttext.load_model('wiki.simple.vec')

embedding = load_vectors('wiki.simple.vec')


Downloading https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz


KeyboardInterrupt: ignored

In [None]:
embedding['ciao']

array([ 1.7826e-01,  4.0100e-01,  2.0527e-01,  1.9235e-01, -1.1595e-02,
        2.4436e-01, -2.4738e-01,  5.7712e-02,  5.1554e-02,  2.1348e-01,
        1.8758e-01, -8.0496e-02, -1.2098e-01,  5.3264e-04,  2.1572e-02,
       -1.1008e-01, -2.7953e-01, -6.3232e-01,  1.5574e-01, -1.2167e-01,
        2.6415e-01, -1.9252e-01,  4.0597e-02,  5.6585e-02,  8.4474e-02,
       -1.3311e-01,  4.0583e-01, -1.2415e-01, -2.0119e-02,  2.0903e-01,
       -3.6345e-01, -3.1191e-02, -2.1535e-01,  4.9066e-01,  2.4230e-01,
       -4.1609e-01,  3.1602e-01,  1.0106e-01, -1.9878e-01,  5.1571e-02,
       -1.2399e-01,  9.2534e-02, -1.7056e-01, -6.1603e-02, -7.1328e-02,
       -3.3209e-02,  1.2767e-01,  2.9721e-01,  3.0716e-01,  3.3349e-01,
       -1.3044e-01, -1.9081e-02, -1.9244e-01, -1.4157e-01, -2.5533e-01,
       -1.8843e-01,  3.2075e-02, -2.6600e-01,  2.4838e-01,  1.8560e-01,
       -6.9006e-02,  3.5800e-01, -2.3988e-01,  6.8542e-03, -4.4892e-01,
       -2.0046e-01, -3.8911e-01, -1.3708e-01, -3.1330e-01,  7.79

In [None]:
ft.get_dimension()

300

In [None]:
ft.get_word_vector('')

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

In [None]:
df = pd.read_csv('train.csv', engine='python')
df['target'] = (df[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].sum(axis=1) > 0 ).astype(int)
class_names = np.unique(df.target)
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,target
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,0


In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

X = df.comment_text.to_list()
y = df.target

# tokenize the sentences
tokenizer = Tokenizer(lower=False, num_words=10000)
tokenizer.fit_on_texts(X)

X = tokenizer.texts_to_sequences(X)

# pad the sequences
X = pad_sequences(X, maxlen=512)

X.shape

(159571, 512)

In [None]:
num_tokens = len(tokenizer.word_index) + 2
hits = 0
misses = 0
embedding_dims = 300

# init embedding matrix
embedding_matrix = np.zeros((num_tokens, embedding_dims))

for word, i in tokenizer.word_index.items():
    #embedding_vector = ft.get_word_vector(word)
    embedding_vector = embedding.get(word)
    if embedding_vector is not None:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1
        
print("Converted %d words (%d misses)" % (hits, misses))

Converted 40349 words (216762 misses)


In [None]:
embedding_layer = tf.keras.layers.Embedding(
    num_tokens,
    embedding_dims,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=False,
)

In [None]:
from tensorflow.keras import layers

int_sequences_input = keras.Input(shape=(None,), dtype="int64")
embedded_sequences = embedding_layer(int_sequences_input)
x = layers.Dropout(0.2)(embedded_sequences)
x = layers.Conv1D(128, 5, activation="relu")(embedded_sequences)
x = layers.MaxPooling1D(5)(x)
x = layers.Conv1D(128, 5, activation="relu")(x)
x = layers.GlobalMaxPooling1D()(x)
x = layers.Dense(128, activation="relu")(x)
x = layers.Dropout(0.2)(x)
#preds = layers.Dense(len(class_names), activation="softmax")(x)
preds = layers.Dense(1, activation="sigmoid")(x)
model = keras.Model(int_sequences_input, preds)
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding (Embedding)       (None, None, 300)         77133900  
                                                                 
 conv1d (Conv1D)             (None, None, 128)         192128    
                                                                 
 max_pooling1d (MaxPooling1D  (None, None, 128)        0         
 )                                                               
                                                                 
 conv1d_1 (Conv1D)           (None, None, 128)         82048     
                                                                 
 global_max_pooling1d (Globa  (None, 128)              0         
 lMaxPooling1D)                                              

In [None]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [None]:
from keras.callbacks import EarlyStopping
epochs = 20
batch_size = 128

early_stopping = EarlyStopping(monitor='val_loss', patience=5)

history = model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          callbacks=early_stopping,
          shuffle=True,
          validation_data=(x_test, y_test))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20


In [None]:
from sklearn.metrics import classification_report, confusion_matrix

y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred.round()))
confusion_matrix(y_test, y_pred.round())

              precision    recall  f1-score   support

           0       0.93      0.98      0.96     28671
           1       0.68      0.35      0.46      3244

    accuracy                           0.92     31915
   macro avg       0.80      0.66      0.71     31915
weighted avg       0.90      0.92      0.90     31915



array([[28141,   530],
       [ 2122,  1122]])

In [None]:
del X, y, df

In [None]:
embedding = {}

for word, i in tokenizer.word_index.items():
    embedding_vector = ft.get_word_vector(word)
    embedding[word] = list(embedding_vector)

In [None]:
len(embedding)

In [None]:
embedding['ciao']

In [None]:
import json

out_file = open("embedding.cc.en.300.jigsaw.json", "w")
json.dump(embedding, out_file)
out_file.close()

In [None]:
import pickle

with open('embedding.cc.en.300.jigsaw.pickle', 'wb') as handle:
    pickle.dump(embedding, handle, protocol=pickle.HIGHEST_PROTOCOL)