## Train a neural network using AutoKeras

## Set paths and other variables

In [14]:
# Path to the zipped CSV with the labelled training data
train_input_file = "data/train.csv.zip"
BATCH_SIZE = 8 # It runs out of memory quite easily :/

In [15]:
# Set the GPU allocator environment variable; this cell runs before the TensorFlow import below
%env TF_GPU_ALLOCATOR=cuda_malloc_async

env: TF_GPU_ALLOCATOR=cuda_malloc_async


## Import libs

In [16]:
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

import autokeras as ak
import keras_tuner as kt

In [17]:
# Show the installed TensorFlow version
tf.__version__

'2.5.0'

## Load ground truth dataset

In [18]:
# Read the zipped training CSV into a DataFrame
train_df = pd.read_csv(train_input_file, compression="zip")


In [19]:
# List the column names of the loaded frame
train_df.columns


Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate'],
      dtype='object')

### Split ground truth dataset into training, validation and test

In [20]:
# Hold out 10% of the rows for testing, then 10% of the remainder for validation.
# A fixed random_state makes both splits (and every metric computed from them)
# reproducible across kernel restarts.
# NOTE: this cell rebinds `train_df`, so re-running it without reloading the data
# splits the already-reduced training frame again.
train_df, test_df = train_test_split(train_df, test_size=0.1, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=42)

train_df.shape, val_df.shape, test_df.shape


((129251, 8), (14362, 8), (15958, 8))

In [21]:
# Peek at the six label columns of the training split as a raw NumPy matrix
train_df.loc[
    :,
    ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"],
].values


array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

### Convert pandas dataframes into tensorflow datasets

In [22]:
def _make_text_dataset(df, batch_size=BATCH_SIZE):
    """Build a batched tf.data.Dataset of ``((comment_text,), labels)`` elements.

    Args:
        df: DataFrame holding a ``comment_text`` column and the six label columns.
        batch_size: Number of rows per batch; defaults to the notebook's BATCH_SIZE.

    Returns:
        A batched ``tf.data.Dataset`` whose inputs are a 1-tuple with the raw
        comment strings and whose targets are the label matrix.
    """
    label_columns = [
        "toxic",
        "severe_toxic",
        "obscene",
        "threat",
        "insult",
        "identity_hate",
    ]
    return tf.data.Dataset.from_tensor_slices(
        (
            (df.comment_text.values,),
            df[label_columns].values,
        )
    ).batch(batch_size)


# Same construction for both splits, so it lives in one helper instead of two
# copy-pasted expressions.
train_set = _make_text_dataset(train_df)
val_set = _make_text_dataset(val_df)


2021-08-06 05:48:35.067071: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1
2021-08-06 05:48:35.440316: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-08-06 05:48:35.441356: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: 
pciBusID: 0000:00:04.0 name: Tesla P100-PCIE-16GB computeCapability: 6.0
coreClock: 1.3285GHz coreCount: 56 deviceMemorySize: 15.90GiB deviceMemoryBandwidth: 681.88GiB/s
2021-08-06 05:48:35.441433: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
2021-08-06 05:48:35.473985: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11
2021-08-06 05:48:35.474120: I tensorflow/stream_executor/plat

## Train AutoKeras AutoML model

### Init AutoKeras text classifier model

In [23]:
# Multi-label text classifier search tracked with an AUC metric.
# With overwrite=False an existing ./text_classifier project is reloaded
# instead of starting the search from scratch (see the log output below).
clf = ak.TextClassifier(
    multi_label=True,
    metrics=[tf.keras.metrics.AUC()],
    max_trials=10,
    overwrite=False,
)


INFO:tensorflow:Reloading Oracle from existing project ./text_classifier/oracle.json
INFO:tensorflow:Reloading Tuner from ./text_classifier/tuner0.json


### Define an early-stopping callback to halt training once the validation loss stops improving

In [24]:
# Early stopping on validation loss: patience=0 means no extra epochs are
# tolerated without improvement, and the best weights seen are restored.
earlystop = tf.keras.callbacks.EarlyStopping(
    restore_best_weights=True,
    monitor="val_loss",
    mode="auto",
    patience=0,
    min_delta=0,
    verbose=0,
)


In [25]:
# Repeats the TF_GPU_ALLOCATOR setting made near the top of the notebook.
# NOTE(review): the earlier log output shows TensorFlow has already found the GPU
# by this point -- confirm whether setting the variable again has any effect.
%env TF_GPU_ALLOCATOR=cuda_malloc_async

env: TF_GPU_ALLOCATOR=cuda_malloc_async


### Start training a text classifier using AutoKeras AutoML

In [26]:
# Run the AutoKeras search on the pre-batched datasets; `earlystop` is passed
# as a callback to each training run.
clf.fit(
    train_set,
    epochs=10,
    verbose=1,
    batch_size=BATCH_SIZE,
    validation_data=val_set,
    callbacks=[earlystop],
)


2021-08-06 05:48:42.879095: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)
2021-08-06 05:48:42.881597: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 2299995000 Hz


INFO:tensorflow:Oracle triggered exit
Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.


2021-08-06 05:49:15.570537: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: ./text_classifier/best_model/assets


INFO:tensorflow:Assets written to: ./text_classifier/best_model/assets






















In [38]:
# Export the best model from the search and display its layer-by-layer architecture
clf.export_model().summary()


Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None,)]            0                                            
__________________________________________________________________________________________________
expand_last_dim (ExpandLastDim) (None, 1)            0           input_1[0][0]                    
__________________________________________________________________________________________________
bert_tokenizer (BertTokenizer)  ((None, None), (None 0           expand_last_dim[0][0]            
__________________________________________________________________________________________________
bert_encoder (BertEncoder)      (None, 768)          109482240   bert_tokenizer[0][0]             
                                                                 bert_tokenizer[0][1]         

## Model evaluation

In [39]:
# Keep the exported model in `model`; the prediction and evaluation cells below use it
model = clf.export_model()

In [None]:
# Ground-truth label matrix (six columns) for the held-out test split
y_test = test_df.loc[
    :,
    ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"],
].values


In [44]:
# Batched test dataset with the same ((text,), labels) element structure as
# train_set / val_set: the labels are passed as a bare array rather than being
# wrapped in a 1-tuple, so all three datasets share one layout.
test_set = tf.data.Dataset.from_tensor_slices(
    (
        (test_df.comment_text.values,),
        test_df[
            [
                "toxic",
                "severe_toxic",
                "obscene",
                "threat",
                "insult",
                "identity_hate",
            ]
        ].values,
    )
).batch(BATCH_SIZE)


In [40]:
# Score the held-out test comments; raw strings are passed in and no batch_size is given
predicted_y = model.predict(test_df.comment_text.values)


In [41]:
# ROC AUC on the held-out test split. Reuses the `y_test` label matrix built in
# the earlier cell instead of re-selecting the same six columns from `test_df`.
# roc_auc_score is called with its default averaging, so the value need not
# match the AUC metric reported by model.evaluate.
roc_auc_score(y_test, predicted_y)


0.990859527129107

In [45]:
# Evaluate on the batched test dataset; the output lists the loss followed by the metric value
model.evaluate(test_set)




[0.03898276016116142, 0.9880177974700928]

In [46]:
# Evaluate on the batched validation dataset for comparison with the test result
model.evaluate(val_set)




[0.03896249458193779, 0.9869228005409241]

In [47]:
# Show the exported model's layer summary
model.summary()


Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None,)]            0                                            
__________________________________________________________________________________________________
expand_last_dim (ExpandLastDim) (None, 1)            0           input_1[0][0]                    
__________________________________________________________________________________________________
bert_tokenizer (BertTokenizer)  ((None, None), (None 0           expand_last_dim[0][0]            
__________________________________________________________________________________________________
bert_encoder (BertEncoder)      (None, 768)          109482240   bert_tokenizer[0][0]             
                                                                 bert_tokenizer[0][1]         

## Predict unseen labels (for the Kaggle competition)

### Load the actual test data

In [48]:
# Read the competition test CSV (the path is hard-coded here, unlike train_input_file)
real_test_df = pd.read_csv("data/test.csv.zip", compression="zip")

### Predict unseen samples

In [64]:
# Pass the comments as a NumPy array of strings, matching how the held-out
# split is fed to model.predict earlier in the notebook (rather than handing
# Keras a pandas Series).
real_test_pred = model.predict(real_test_df.comment_text.values)

### Combine predictions with sample IDs so the result can be stored as a CSV file

In [65]:
# Wrap the raw prediction matrix in a frame with one column per label
predictions_df = pd.DataFrame(
    data=real_test_pred,
    columns=["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"],
)
# Attach the sample ids, then reorder so that `id` is the first column
predictions_df["id"] = real_test_df["id"]
predictions_df = predictions_df.loc[
    :,
    ["id", "toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"],
]


In [67]:
# Preview the first rows of the prediction frame
predictions_df.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.988485,0.511712,0.979811,0.087071,0.96104,0.757864
1,0000247867823ef7,0.000625,5.7e-05,0.000142,8.5e-05,0.000137,5.9e-05
2,00013b17ad220c46,0.000909,6.3e-05,0.000141,0.000114,0.000116,5.6e-05
3,00017563c3f7919a,0.000336,0.000108,0.000111,0.000152,0.000195,0.000107
4,00017695ad8997eb,0.000703,5.4e-05,0.000142,0.000102,0.000122,5.6e-05


In [66]:
# Write the predictions to CSV (without the index column) for submission to Kaggle
predictions_df.to_csv("data/autokeras_predictions.csv", index=False)