In [5]:
import os
import sys
import math
import time
import itertools

import tensorflow as tf
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tensorflow import keras
from sklearn.preprocessing import OneHotEncoder

%matplotlib inline
# Colab-only step: mount Google Drive at /content/gdrive. The data and model
# paths used in this notebook are written relative ('gdrive/My Drive/...'), so
# they only resolve to this mount when the working directory is /content.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [6]:
# CSV holding the encoded comments and their labels. The path is relative, so
# it is resolved against the notebook's current working directory.
TRAIN_DATA_FILE = 'gdrive/My Drive/dl/data/sentences_encoded.csv'
train_df = pd.read_csv(TRAIN_DATA_FILE)

# Sanity check: show the column index first, then the number of label rows.
for summary in (train_df.columns, len(train_df['toxic'])):
    print(summary)

Index(['comment_text', 'toxic'], dtype='object')
159571


In [7]:
# Each `comment_text` entry is a whitespace-separated string of integer tokens.
# Parse it straight into an integer matrix in a single pass, instead of first
# materialising a full array of token strings and converting it afterwards.
x_train = np.array([[int(word) for word in sent.split()]
                    for sent in train_df['comment_text']])
y_train = train_df['toxic']
print('loaded')

# The model needs a rectangular (n_samples, sequence_length) matrix; fail with a
# clear message here if the rows do not all contain the same number of tokens.
assert x_train.ndim == 2, 'comment_text rows must all have the same number of tokens'

MAX_SEQUENCE_LEN = x_train.shape[1]
# Largest token id + 1, i.e. the number of rows the embedding table needs.
WORDS_IN_CORPORA = int(x_train.max()) + 1
print(WORDS_IN_CORPORA)

loaded
134053


In [0]:
# Model entry point: one row of MAX_SEQUENCE_LEN int32 values per sample.
sequence_input = keras.layers.Input(
    dtype='int32',
    shape=(MAX_SEQUENCE_LEN,),
)

In [9]:
# Trainable lookup table with WORDS_IN_CORPORA rows of 100-dimensional vectors,
# applied to every position of the input sequence.
embedding_layer = keras.layers.Embedding(
    input_dim=WORDS_IN_CORPORA,
    output_dim=100,
    input_length=MAX_SEQUENCE_LEN,
)
embedded_sequences = embedding_layer(sequence_input)

Instructions for updating:
Colocations handled automatically by placer.


In [10]:
# Settings shared by all three convolutions in the feature extractor.
conv_kwargs = dict(filters=128, kernel_size=5, activation='relu', padding='same')

# Two identical stages: convolution -> batch norm -> max-pool by a factor of 5.
x = embedded_sequences
for stage in range(2):
    x = keras.layers.Conv1D(**conv_kwargs)(x)
    x = keras.layers.BatchNormalization()(x)
    x = keras.layers.MaxPooling1D(pool_size=5)(x)

# Final stage: convolution with dropout regularisation and no pooling.
x = keras.layers.Conv1D(**conv_kwargs)(x)
x = keras.layers.Dropout(0.2)(x)
x = keras.layers.BatchNormalization()(x)

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [0]:
# Reduce over the sequence axis with a global max-pool.
x = keras.layers.GlobalMaxPooling1D()(x)
# NOTE(review): the global pool should already yield a (batch, channels)
# tensor, which would make this Flatten a no-op -- confirm before removing it.
x = keras.layers.Flatten()(x)

In [0]:
# Classifier head: 128-unit ReLU layer, batch norm, then a 2-way softmax.
x = keras.layers.Dense(units=128, activation='relu')(x)
x = keras.layers.BatchNormalization()(x)
sequence_output = keras.layers.Dense(units=2, activation='softmax')(x)

In [0]:
# Tie the input tensor and the softmax output together into a functional model.
model = keras.models.Model(
    outputs=[sequence_output],
    inputs=[sequence_input],
)

In [14]:
# Print the layer-by-layer table (output shapes and parameter counts) to verify
# the architecture before training.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 300)               0         
_________________________________________________________________
embedding (Embedding)        (None, 300, 100)          13405300  
_________________________________________________________________
conv1d (Conv1D)              (None, 300, 128)          64128     
_________________________________________________________________
batch_normalization_v1 (Batc (None, 300, 128)          512       
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 60, 128)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 60, 128)           82048     
_________________________________________________________________
batch_normalization_v1_1 (Ba (None, 60, 128)           512       
__________

In [0]:
# The labels are a single integer column rather than one-hot vectors, hence the
# "sparse" variant of categorical cross-entropy against the 2-way softmax.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [17]:
# Hold-out split in file order (no shuffling): the first TRAIN_SIZE rows are
# used for training and the remaining rows for validation.
TRAIN_SIZE = 120000

x_train_to_use, x_test_to_use = x_train[:TRAIN_SIZE], x_train[TRAIN_SIZE:]
y_train_to_use, y_test_to_use = y_train[:TRAIN_SIZE], y_train[TRAIN_SIZE:]

print(x_train_to_use.shape)
print(y_train_to_use.shape)
print(x_test_to_use.shape)
print(y_test_to_use.shape)
# Sum of the training labels (the number of positive rows for 0/1 labels);
# the vectorised Series.sum() avoids iterating the labels at Python level.
print(y_train_to_use.sum())

(120000, 300)
(120000,)
(39571, 300)
(39571,)
11420


In [18]:

def _epoch_learning_rate(epoch):
    """Return 0.001 when `epoch` is 0 and 0.0001 for every other epoch value."""
    if epoch == 0:
        return 0.001
    return 0.0001


# Train for 4 epochs, validating on the held-out rows after each one; the
# scheduler callback sets the learning rate from the function above.
model.fit(
    x_train_to_use,
    y_train_to_use,
    validation_data=(x_test_to_use, y_test_to_use),
    batch_size=32,
    epochs=4,
    callbacks=[tf.keras.callbacks.LearningRateScheduler(schedule=_epoch_learning_rate)],
)

Train on 120000 samples, validate on 39571 samples
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<tensorflow.python.keras.callbacks.History at 0x7f8ed30ecef0>

In [19]:
# Persist the model as two files: the architecture as JSON text and the
# weights as HDF5.
model_json = model.to_json()
with open("gdrive/My Drive/dl/models/model_easier.json", mode="w") as json_file:
    json_file.write(model_json)

model.save_weights("gdrive/My Drive/dl/models/model_easier.h5")
print("Saved model to disk")

Saved model to disk


In [20]:
# Round-trip check: rebuild the architecture from its JSON description using
# tf.keras, the same Keras implementation that built and saved the model.
# The context manager closes the file even if reading it raises.
with open('gdrive/My Drive/dl/models/model_easier.json', 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = tf.keras.models.model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("gdrive/My Drive/dl/models/model_easier.h5")
print("Loaded model from disk")

Using TensorFlow backend.


Loaded model from disk


In [0]:
# Root directory for model directories, and the name of this model within it.
all_models_path, MODEL_NAME = 'models', "model_easier"

In [0]:
# The version is the current Unix time truncated to whole seconds, so each run
# gets its own <all_models_path>/<MODEL_NAME>/<version> directory.
model_version = int(time.time())
model_path = os.path.join(
    all_models_path,
    MODEL_NAME,
    str(model_version),
)
# Creates any missing parents; raises if the version directory already exists.
os.makedirs(name=model_path)