In [303]:
import tensorflow as tf
import pandas as pd
import numpy as np
import glob
import re

from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional

np.set_printoptions(precision=3, suppress=True)
print(tf.__version__)

2.4.0


In [289]:
!unzip ./glove.6B.zip -d glove

In [304]:
df = pd.read_csv("data/amazon_cells_labelled.txt", sep="\t",header=None)
df2 = pd.read_csv("data/imdb_labelled.txt", sep="\t",header=None)
df3 = pd.read_csv("data/yelp_labelled.txt", sep="\t",header=None)
df = df.append(df2)
df = df.append(df3)

In [305]:
def preprocess_text(sen):
    # Removing html tags
    sentence = remove_tags(sen)

    # Remove punctuations and numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)

    # Single character removal
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)

    # Removing multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)

    return sentence

In [306]:
TAG_RE = re.compile(r'<[^>]+>')

def remove_tags(text):
    return TAG_RE.sub('', text)

In [307]:
X = []
sentences = list(df[0])
y = df[1].to_numpy()
for sen in sentences:
    X.append(preprocess_text(sen))

In [308]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [309]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

In [310]:
# Adding 1 because of reserved 0 index
vocab_size = len(tokenizer.word_index) + 1
MAX_LENGTH = 300

X_train = pad_sequences(X_train, padding='post', maxlen=MAX_LENGTH)
X_test = pad_sequences(X_test, padding='post', maxlen=MAX_LENGTH)

In [311]:
BATCH_SIZE = 32
REPEAT_SIZE = 100

def train_input_fn():
    return tf.data.Dataset.from_tensor_slices((X_train, y_train)).repeat(REPEAT_SIZE).batch(BATCH_SIZE)

def test_input_fn():
    return tf.data.Dataset.from_tensor_slices((X_test, y_test)).repeat(REPEAT_SIZE).batch(BATCH_SIZE)

In [312]:
from numpy import array
from numpy import asarray
from numpy import zeros

embeddings_dictionary = dict()
glove_file = open('./glove/glove.6B.300d.txt', encoding="utf8")

for line in glove_file:
    records = line.split()
    word = records[0]
    vector_dimensions = asarray(records[1:], dtype='float32')
    embeddings_dictionary [word] = vector_dimensions
glove_file.close()

In [313]:
embedding_matrix = zeros((vocab_size, 300))
for word, index in tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

In [314]:
import tensorflow.compat.v1 as tf1
tf1.disable_v2_behavior()

def model_fn(features, labels, mode):
    
    layer = Embedding(vocab_size, MAX_LENGTH, weights=[embedding_matrix], input_length=maxlen , trainable=False)(features)
    layer = Bidirectional(LSTM(128, return_sequences=True))(layer)
    layer = Bidirectional(LSTM(128))(layer)    
    logits = tf.keras.layers.Dense(units=2)(layer) 
    
    predictions = {
      # Generate predictions (for PREDICT and EVAL mode)
      "classes": tf.argmax(input=logits, axis=1, name="classes"),
      "probabilities": tf.nn.softmax(logits, name="softmax_tensor")
    }
    
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)
    
     # Calculate Loss   
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits)    
    loss = tf.reduce_mean(loss)
    
    # Configure the Training Op (for TRAIN mode)
    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf1.train.AdamOptimizer(learning_rate=0.001)
        train_op = optimizer.minimize(
            loss = loss,
            global_step = tf1.train.get_global_step()
            )
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
    
    # Add evaluation metrics Evaluation mode
    eval_metric_ops = {
        "accuracy": tf1.metrics.accuracy(
            labels=labels,
            predictions=predictions["classes"]
        )}
    
    return tf.estimator.EstimatorSpec(
        mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)


In [315]:
# Create the Estimator
import tempfile
model_dir = tempfile.mkdtemp()
my_estimator = tf.estimator.Estimator(
    model_fn=model_fn, model_dir=model_dir)

# Set up training logging for predictions
tensors_to_log = {"probabilities": "softmax_tensor", "classes": "classes"}
logging_hook = tf.estimator.LoggingTensorHook(tensors=tensors_to_log, every_n_iter=100)

# Build specification 
train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn, max_steps=500, hooks=[logging_hook])
eval_spec = tf.estimator.EvalSpec(input_fn=test_input_fn)

tf.estimator.train_and_evaluate(
    my_estimator,
    train_spec,
    eval_spec)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmppozzz5wc', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Not using Distribute Coordinator.
INFO:tensorflow:Running trainin

({'accuracy': 0.8, 'loss': 0.84808326, 'global_step': 500}, [])