In [None]:
from minio import Minio
import os
import math
import tensorflow as tf
from tensorflow import keras

In [None]:
 # Config Parameters
# Connection settings for the MinIO service. Each value can be overridden
# through an environment variable so that real credentials do not have to be
# committed with the notebook; the literals are only fallbacks that preserve
# the previous behaviour when nothing is set.
minio_address = os.environ.get("MINIO_ADDRESS", "minio.ns-1.svc.cluster.local")
minio_access_key = os.environ.get("MINIO_ACCESS_KEY", "kubeflow")
minio_secret_key = os.environ.get("MINIO_SECRET_KEY", "kubeflow123")

# Bucket and key prefix under which the preprocessed TFRecord files are stored.
datasets_bucket = "datasets"
preprocessed_data_folder = "preprocessed-data"

# Not read by any cell visible in this notebook.
# NOTE(review): presumably the number of examples per TFRecord file used by
# the preprocessing step - confirm before relying on it.
tf_record_file_size = 500

In [None]:
# Client used to enumerate the dataset objects stored in MinIO.
# secure=False is passed, matching the S3_USE_HTTPS="0" setting used for
# TensorFlow's own S3 access further down.
minioClient = Minio(
    minio_address,
    secure=False,
    access_key=minio_access_key,
    secret_key=minio_secret_key,
)

In [None]:
# Collect the object keys of every entry whose key starts with
# "<preprocessed_data_folder>/train" in the datasets bucket.
objects = minioClient.list_objects(
    datasets_bucket,
    prefix=f"{preprocessed_data_folder}/train",
)
training_files_list = [entry.object_name for entry in objects]

In [None]:
# Point TensorFlow's S3 access at MinIO by exporting the credentials and
# endpoint as environment variables. HTTPS and certificate verification are
# switched off ("0"), in line with the secure=False MinIO client above.
os.environ.update(
    {
        "AWS_ACCESS_KEY_ID": minio_access_key,
        "AWS_SECRET_ACCESS_KEY": minio_secret_key,
        "AWS_REGION": "us-east-1",
        "S3_ENDPOINT": minio_address,
        "S3_USE_HTTPS": "0",
        "S3_VERIFY_SSL": "0",
    }
)

In [None]:
# Turn each object key into an s3:// URI. The bucket comes from
# `datasets_bucket` (rather than a hard-coded name) so the URIs always refer
# to the same bucket the keys were listed from.
all_training_filenames = [f"s3://{datasets_bucket}/{key}" for key in training_files_list]

In [None]:
# Split the file list roughly 90/10: the leading files are used for training,
# the remaining ones for validation.
available_file_count = len(all_training_filenames)
if available_file_count < 2:
    # With zero or one file, floor(count * 0.9) is 0, which would leave the
    # training set empty (and an empty list would also leave validation
    # empty). Stop here with a clear message instead of failing later
    # inside the input pipeline or model.fit.
    raise ValueError(
        "Need at least 2 TFRecord files to build a train/validation split, "
        f"found {available_file_count}"
    )

# For two or more files this is always >= 1 and strictly less than the total,
# so both slices below are non-empty.
total_train_data_files = math.floor(available_file_count * 0.9)
training_files = all_training_filenames[:total_train_data_files]
validation_files = all_training_filenames[total_train_data_files:]

In [None]:
# Dataset options that turn off deterministic element ordering; applied to
# both the training and validation datasets below.
ignore_order = tf.data.Options()
ignore_order.experimental_deterministic = False

# Sentinel that lets tf.data choose the degree of parallelism itself.
AUTO = tf.data.experimental.AUTOTUNE

In [None]:
# Raw (still serialized) training records read from the GZIP-compressed
# TFRecord files, with the non-deterministic ordering options applied.
dataset = tf.data.TFRecordDataset(
    training_files,
    compression_type="GZIP",
    num_parallel_reads=AUTO,
).with_options(ignore_order)


In [None]:
# Raw (still serialized) validation records, read the same way as the
# training records but from the held-out files.
validation = tf.data.TFRecordDataset(
    validation_files,
    compression_type="GZIP",
    num_parallel_reads=AUTO,
).with_options(ignore_order)


In [None]:
 def decode_fn(record_bytes):
    """Parse one serialized tf.Example into a (sentence, label) pair.

    Args:
        record_bytes: A single serialized example, as yielded by the
            TFRecord datasets.

    Returns:
        A tuple ``(sentence, label)`` where ``sentence`` is the 512-element
        float32 "sentence" feature reshaped to ``[1, 512]`` and ``label`` is
        the 2-element int64 "label" feature reshaped to ``[1, 2]``.
    """
    feature_spec = {
        "label": tf.io.FixedLenFeature([2], dtype=tf.int64),
        "sentence": tf.io.FixedLenFeature([512], dtype=tf.float32),
    }
    parsed = tf.io.parse_single_example(record_bytes, feature_spec)

    # The leading dimension of 1 matches the (1, 512) input shape that the
    # model's first layer declares.
    sentence = tf.reshape(parsed["sentence"], [1, 512])
    label = tf.reshape(parsed["label"], [1, 2])
    return sentence, label

In [None]:
# Build model: start from an empty Sequential container; the layers are
# appended one by one in the next cell.
model = keras.Sequential()


In [None]:
# Layer stack, in order: two ReLU dense layers (256 and 16 units), each
# followed by 50% dropout, then a 2-unit softmax output layer. The first
# layer declares the (1, 512) per-example input shape.
for layer in (
    keras.layers.Dense(units=256, input_shape=(1, 512), activation="relu"),
    keras.layers.Dropout(rate=0.5),
    keras.layers.Dense(units=16, activation="relu"),
    keras.layers.Dropout(rate=0.5),
    keras.layers.Dense(2, activation="softmax"),
):
    model.add(layer)

# Categorical cross-entropy pairs with the 2-element label vectors and the
# softmax output; Adam is given 0.001 as its first argument.
model.compile(
    optimizer=keras.optimizers.Adam(0.001),
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

model.summary()

In [None]:
# Decode every record, repeat the data five times and group it into batches
# of 128 examples.
# NOTE(review): because of repeat(5), one pass over `mapped_ds` covers the
# training files five times - confirm this is intended together with the
# epoch count passed to model.fit.
mapped_ds = dataset.map(decode_fn).repeat(5).batch(128)

# Same pipeline for the held-out files.
# NOTE(review): repeating the validation data looks unnecessary - confirm.
mapped_validation = validation.map(decode_fn).repeat(5).batch(128)

In [None]:
# Checkpoint callback that stores only the model weights in the datasets
# bucket. The path contains no epoch placeholder, so every save targets the
# same location.
checkpoint_path = f"s3://{datasets_bucket}/checkpoints/cp.ckpt"
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    verbose=1,
    save_weights_only=True,
)

In [None]:
from datetime import datetime

# TensorBoard logs go to a per-run directory in the datasets bucket, named
# "<model_note>-<timestamp>" so separate runs do not overwrite each other.
model_note = "256"
run_timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
logdir = f"s3://{datasets_bucket}/logs/imdb/{model_note}-" + run_timestamp
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=logdir)

In [None]:
# Train for 10 epochs, evaluating on the held-out dataset and running the
# checkpoint and TensorBoard callbacks; the returned object is kept in
# `history`.
history = model.fit(
    mapped_ds,
    validation_data=mapped_validation,
    epochs=10,
    callbacks=[cp_callback, tensorboard_callback],
)

In [None]:
# Save the trained model into the datasets bucket.
# NOTE(review): the trailing "/1" is presumably a version directory expected
# by a model server - confirm with whatever consumes this path.
model_destination = f"s3://{datasets_bucket}/imdb_sentiment_analysis/1"
model.save(filepath=model_destination)

In [None]:
# Write a short completion marker to `output_text_path`, then report on stdout.
# NOTE(review): `output_text_path` is not defined in any cell visible here -
# presumably it is injected by whatever executes this notebook; confirm, since
# a plain "Restart & Run All" would raise NameError on this line.
with open(output_text_path, mode="w") as marker_file:
    marker_file.write("done training!")
print("Done!")