In [1]:
import logging
import re
import string
import time
from typing import Tuple, Union, List, Dict
import random

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

level = logging.INFO
logging.basicConfig(level=level)
logger = logging.getLogger(__name__)


In [2]:
embed_size   = 128
max_features = 20000
epochs       = 10
batch_size   = 128
max_len      = 500

In [3]:
def fetch_data() -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    filename = 'jigsaw_subset.csv'
    f = open(filename,'r')
    records = f.read().split('\n')
    
    header  = records[0].split(',')
    records = [(record.split(',')[0], re.sub('[0-9\.]+\,','',record)) for record in records[1:]]
    
    train_labels   = []
    train_examples = []
    test_labels    = []
    test_examples  = []
    
    for label,text in records:
        try:
            if random.random() <= 0.80:
                train_labels.append(float(label))
                train_examples.append(text)
            else:
                test_labels.append(float(label))
                test_examples.append(text)
        except:
            print('[ EXCEPTION ] {}')
    
    #logger.info(f'There are {train_examples.shape[0]} comments in the training set')
    #logger.info(f'There are {test_examples.shape[0]} comments in the testing set')
    return np.array(train_examples), np.array(train_labels), np.array(test_examples), np.array(test_labels)

In [4]:
def custom_preprocessing(raw_text: str) -> tf.string:
    lowercase = tf.strings.lower(raw_text)
    stripped_html = tf.strings.regex_replace(lowercase, '<br />', ' ')
    return tf.strings.regex_replace(stripped_html, '[%s]' % re.escape(string.punctuation), '')

In [5]:
def init_vectorize_layer(text_dataset: np.ndarray) -> TextVectorization:
    text_vectorizer = TextVectorization(max_tokens=max_features,
                                        standardize=custom_preprocessing,
                                        output_mode='int',
                                        output_sequence_length=max_len)
    text_vectorizer.adapt(text_dataset)
    return text_vectorizer

In [6]:
def init_model(text_dataset: np.ndarray) -> tf.keras.Model:
    vectorize_layer = init_vectorize_layer(text_dataset)
    raw_input = tf.keras.Input(shape=(1,), dtype=tf.string)
    x = vectorize_layer(raw_input)
    x = tf.keras.layers.Embedding(max_features + 1, embed_size)(x)
    x = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(100, return_sequences=True))(x)
    x = tf.keras.layers.GlobalMaxPooling1D()(x)
    predictions = tf.keras.layers.Dense(1, activation='sigmoid')(x)
    model = tf.keras.Model(raw_input, predictions)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [7]:
'''
def train():
    train_examples, train_labels, _, _ = fetch_data()
    model = init_model(train_examples)
    model.fit(train_examples, train_labels, epochs=epochs, batch_size=batch_size)
    tf_model_wrapper = TFModel(model)
    tf.saved_model.save(tf_model_wrapper.model, f'saved_models/{int(time.time())}',
                        signatures={'serving_default': tf_model_wrapper.prediction})
    
    logger.info('saving SavedModel to ./saved_models')
'''

"\ndef train(self) -> None:\n    train_examples, train_labels, _, _ = self.fetch_data()\n    model = self.init_model(train_examples)\n    model.fit(train_examples, train_labels, epochs=self.epochs, batch_size=self.batch_size)\n    self.tf_model_wrapper = TFModel(model)\n    tf.saved_model.save(self.tf_model_wrapper.model, f'saved_models/{int(time.time())}',\n                        signatures={'serving_default': self.tf_model_wrapper.prediction})\n    \n    logger.info('saving SavedModel to saved_models')\n"

In [8]:
train_examples, train_labels, test_examples, test_labels = fetch_data()

[ EXCEPTION ] {}
[ EXCEPTION ] {}
[ EXCEPTION ] {}
[ EXCEPTION ] {}
[ EXCEPTION ] {}
[ EXCEPTION ] {}
[ EXCEPTION ] {}
[ EXCEPTION ] {}
[ EXCEPTION ] {}
[ EXCEPTION ] {}
[ EXCEPTION ] {}
[ EXCEPTION ] {}


In [9]:
model = init_model(train_examples)

In [10]:
model.fit(train_examples, train_labels, epochs=epochs, batch_size=batch_size)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f41ccca46a0>

In [None]:
tf_model_wrapper = TFModel(model)

In [None]:
!mkdir saved_models

In [None]:
tf.saved_model.save(tf_model_wrapper.model, f'saved_models/{int(time.time())}',
                        signatures={'serving_default': tf_model_wrapper.prediction})

Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.


Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.


Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.


Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.


INFO:tensorflow:Assets written to: saved_models/1606500237/assets


INFO:tensorflow:Assets written to: saved_models/1606500237/assets


In [None]:
!tar -zcvf toxicity_model_z1.tar.gz saved_models/

saved_models/
saved_models/1606500237/
saved_models/1606500237/variables/
saved_models/1606500237/variables/variables.index
saved_models/1606500237/variables/variables.data-00000-of-00001
saved_models/1606500237/assets/
saved_models/1606500237/saved_model.pb


In [None]:
#ZEND