In [0]:
!pip3 install ttictoc
!pip3 uninstall -y tensorflow
!pip3 install tensorflow==2.1.0
%tensorflow_version 2.x
import tensorflow as tf
print(tf.__version__)
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

In [0]:
import os
import os.path

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Dropout
from tensorflow.keras.layers import LSTM, Bidirectional, Conv1D, GlobalAveragePooling1D, GlobalMaxPooling1D
import tensorflow.keras
from tensorflow import keras

BASE_PATH=""
NEGATIVE_DATASET_DIRECTORY = '/neg/'
POSITIVE_DATASET_DIRECTORY = '/pos/'
TRAIN_DATASET_DIRECTORY = BASE_PATH + 'train'
TEST_DATASET_DIRECTORY = BASE_PATH + 'test'
TRAIN_INDICES_DATASET_FILE_NAME = BASE_PATH + 'train_processed_indices.csv'
TEST_INDICES_DATASET_FILE_NAME = BASE_PATH + 'test_processed_indices.csv'

EPOCHS_NUMBER = 200

TPU_BATCH_SIZE = 128*8

MAX_LENGTH = 128
EMBEDDING_VECTOR_SIZE=300
LSTM_UNITS_COUNT=512
VOCABULARY_SIZE = 89527


def read_dataset(dataset_directory):
    dataset = []
    labels = []
    positive_reviews_directory = dataset_directory + POSITIVE_DATASET_DIRECTORY
    for review_file_name in os.listdir(positive_reviews_directory):
        full_file_name = positive_reviews_directory + review_file_name
        dataset.append(open(full_file_name, 'r').read())
        labels.append(1)
    negative_reviews_directory = dataset_directory + NEGATIVE_DATASET_DIRECTORY
    for review_file_name in os.listdir(negative_reviews_directory):
        full_file_name = negative_reviews_directory + review_file_name
        dataset.append(open(full_file_name, 'r').read())
        labels.append(0)
    return np.array(dataset), np.array(labels)


def load_vocabulary():
    vocabulary = {}
    i = 0
    with open('imdb.vocab', 'r') as vocabulary_file:
        for line in vocabulary_file:
            vocabulary[line.replace('\n', '')] = i
            i += 1
    return vocabulary


def find_words_from_vocabulary(text, vocabulary):
    found_words_codes = set()
    for word in text.split(' '):
        if word in vocabulary:
            found_words_codes.add(vocabulary[word])
    return found_words_codes


def find_words_from_vocabulary_preserving_order(text, vocabulary):
    found_words_codes = []
    for word in text.split(' '):
        if word in vocabulary:
            found_words_codes.append(vocabulary[word])
    return pd.Series(found_words_codes).drop_duplicates().tolist()

def to_feature_vector(found_words_codes, vocabulary):
    x = np.array([0]*len(vocabulary))
    for code in found_words_codes:
        x[int(code) - 1] = 1
    return x.reshape(1, -1)


def dataset_to_features(dataset, vocabulary):
    mapped_dataset = []
    for sample in dataset:
        found_words_codes = find_words_from_vocabulary(sample, vocabulary)
        mapped_sample = to_feature_vector(found_words_codes, vocabulary)
        mapped_dataset.append(mapped_sample)
    return np.array(mapped_dataset)


def dataset_to_indices(dataset, vocabulary):
    mapped_dataset = []
    for sample in dataset:
        found_words_codes = find_words_from_vocabulary_preserving_order(sample, vocabulary)
        mapped_dataset.append(found_words_codes)
    return np.array(mapped_dataset)


def save_indices_processed_as_csv(mapped_dataset, labels, output):
    mapped_dataset_in_string = []
    for sample in mapped_dataset:
        mapped_dataset_in_string.append(' '.join(map(str, sample)) if sample else '')
    dataset = np.column_stack([labels, np.array(mapped_dataset_in_string)])
    np.savetxt(output, dataset, delimiter=',', fmt='%s')


def load_indices_processed_as_csv(output):
    csv = pd.read_csv(output)
    labels = csv.iloc[:, 0].to_numpy()
    dataset = []
    for sample in csv.iloc[:, 1:].to_numpy():
        dataset.append(list(map(np.float32, sample[0].split(' '))) if isinstance(sample[0], str) else [])
    return np.array(dataset), labels


def add_conv_layer(model, filters, dropout_rate):
    model.add(
        keras.layers.Conv1D(filters, 11, padding='same', kernel_initializer='he_normal', activation="relu"))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.MaxPooling1D(padding='same'))
    model.add(keras.layers.Dropout(dropout_rate))


if __name__ == "__main__":
    # train_dataset, train_labels = read_dataset(TRAIN_DATASET_DIRECTORY)
    # test_dataset, test_labels = read_dataset(TEST_DATASET_DIRECTORY)
    # print(train_dataset.shape)
    # print(test_dataset.shape)

    vocabulary = load_vocabulary()

    # mapped_train_dataset = dataset_to_indices(train_dataset, vocabulary)
    # print(mapped_train_dataset.shape)
    # mapped_test_dataset = dataset_to_indices(test_dataset, vocabulary)
    # print(mapped_test_dataset.shape)

    # save_indices_processed_as_csv(mapped_train_dataset, train_labels, TRAIN_INDICES_DATASET_FILE_NAME)
    # save_indices_processed_as_csv(mapped_test_dataset, test_labels, TEST_INDICES_DATASET_FILE_NAME)

    mapped_train_dataset, train_labels = load_indices_processed_as_csv(TRAIN_INDICES_DATASET_FILE_NAME)
    print(np.max([len(v) for v in mapped_train_dataset]))
    print(mapped_train_dataset.shape)
    batch_remainder = mapped_train_dataset.shape[0] % (TPU_BATCH_SIZE)
    mapped_train_dataset=mapped_train_dataset[:-batch_remainder]
    train_labels=train_labels[:-batch_remainder]
    mapped_test_dataset, test_labels = load_indices_processed_as_csv(TEST_INDICES_DATASET_FILE_NAME)
    print(np.max([len(v) for v in mapped_test_dataset]))
    print(mapped_test_dataset.shape)

    print('Pad sequences (samples x time)')
    x_train = sequence.pad_sequences(mapped_train_dataset, maxlen=MAX_LENGTH)
    x_test = sequence.pad_sequences(mapped_test_dataset, maxlen=MAX_LENGTH)
    print('x_train shape:', x_train.shape)
    print('x_test shape:', x_test.shape)

    embeddings_index = {}
    f = open(os.path.join(BASE_PATH, 'glove.6B.300d.txt'))
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    f.close()

    embedding_matrix = np.zeros((VOCABULARY_SIZE, EMBEDDING_VECTOR_SIZE))
    asd = 0
    for word, i in vocabulary.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector
            asd+=1
    print(asd)

    resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='grpc://' + os.environ['COLAB_TPU_ADDR'])
    tf.config.experimental_connect_to_cluster(resolver)
    tf.tpu.experimental.initialize_tpu_system(resolver)
    strategy = tf.distribute.experimental.TPUStrategy(resolver)
    with strategy.scope():
        print('Build model...')
        model = Sequential()
        model.add(Embedding(VOCABULARY_SIZE,
                            EMBEDDING_VECTOR_SIZE,
                            weights=[embedding_matrix],
                            input_length=MAX_LENGTH,
                            trainable=False))
        model.add(Bidirectional(LSTM(LSTM_UNITS_COUNT, return_sequences=True, dropout=0.5, recurrent_dropout=0.5)))
        model.add(Bidirectional(LSTM(LSTM_UNITS_COUNT, return_sequences=True, dropout=0.5, recurrent_dropout=0.5)))
        # model.add(Conv1D(filters=128,
        #        kernel_size=8,
        #        strides=1,
        #        activation='relu',
        #        padding='same'))
        add_conv_layer(model, 48, 0.2)
        add_conv_layer(model, 64, 0.25)
        add_conv_layer(model, 128, 0.3)
        add_conv_layer(model, 160, 0.4)
        add_conv_layer(model, 192, 0.5)
        add_conv_layer(model, 192, 0.5)
        add_conv_layer(model, 192, 0.5)
        add_conv_layer(model, 192, 0.5)
        model.add(Dense(1024, activation='relu', kernel_initializer='he_normal'))
        model.add(Dropout(0.5))
        model.add(Dense(1024, activation='relu', kernel_initializer='he_normal'))
        model.add(Dropout(0.5))
        model.add(Dense(1, activation='sigmoid'))

        # try using different optimizers and different optimizer configs
        model.compile(loss='binary_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])
        model.summary()

        print('Train...')
        model.fit(x_train, train_labels, batch_size=TPU_BATCH_SIZE, epochs=EPOCHS_NUMBER)
        score, acc = model.evaluate(x_test, test_labels)
        print('Test accuracy:', acc)


535
(24999,)
678
(24999,)
Pad sequences (samples x time)
x_train shape: (24576, 128)
x_test shape: (24999, 128)
62596




INFO:tensorflow:Initializing the TPU system: 10.14.202.202:8470


INFO:tensorflow:Initializing the TPU system: 10.14.202.202:8470


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Found TPU system:


INFO:tensorflow:Found TPU system:


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


Build model...
Model: "sequential_16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_16 (Embedding)     (None, 128, 300)          26858100  
_________________________________________________________________
bidirectional_23 (Bidirectio (None, 128, 1024)         3330048   
_________________________________________________________________
bidirectional_24 (Bidirectio (None, 128, 1024)         6295552   
_________________________________________________________________
conv1d_63 (Conv1D)           (None, 128, 48)           540720    
_________________________________________________________________
batch_normalization_62 (Batc (None, 128, 48)           192       
_________________________________________________________________
max_pooling1d_62 (MaxPooling (None, 64, 48)            0         
_________________________________________________________________
dropout_88 (Dropout)         (None, 64