## Train a neural network using AutoKeras

## Set paths and other variables

In [None]:
# Mini-batch size used when batching the tf.data datasets; kept small because
# training runs out of memory quite easily.
BATCH_SIZE = 8
# Zipped CSV file holding the ground-truth (labelled) training data.
train_input_file = "data/train.csv.zip"

In [None]:
# Set the TF_GPU_ALLOCATOR environment variable for this kernel; this cell sits
# before the cell that imports TensorFlow.
# NOTE(review): presumably chosen to ease the out-of-memory problems mentioned
# next to BATCH_SIZE — confirm.
%env TF_GPU_ALLOCATOR=cuda_malloc_async

## Import libs

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

import autokeras as ak
import keras_tuner as kt

In [None]:
# Show the installed TensorFlow version (displayed as the cell output).
tf.__version__

## Load ground truth dataset

In [None]:
# Read the zipped ground-truth CSV into a DataFrame.
train_df = pd.read_csv(
    filepath_or_buffer=train_input_file,
    compression="zip",
)


In [None]:
# List the column names of the loaded training data (cell output).
train_df.columns


### Split ground truth dataset into training, validation and test

In [None]:
# Fixed seed so the train/validation/test partition is identical on every run.
# The classifier in this notebook is created with overwrite=False and may
# therefore reuse results from an earlier run; with an unseeded split, rows an
# earlier model was trained on could end up in the new test split and inflate
# the evaluation scores.
SPLIT_SEED = 42

# Hold out 10% of all rows for testing, then 10% of the remainder for validation.
train_df, test_df = train_test_split(
    train_df, test_size=0.1, random_state=SPLIT_SEED
)
train_df, val_df = train_test_split(
    train_df, test_size=0.1, random_state=SPLIT_SEED
)

# Row/column counts of the three splits (displayed as the cell output).
train_df.shape, val_df.shape, test_df.shape


In [None]:
# Peek at the six label columns of the training split as a NumPy array
# (displayed as the cell output).
label_columns = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]
train_df[label_columns].values


### Convert pandas dataframes into tensorflow datasets

In [None]:
def _to_batched_dataset(frame):
    """Turn one split DataFrame into a batched ``tf.data.Dataset``.

    Every element has the structure ``((comment_text,), labels)``: the text is
    wrapped in a 1-tuple and ``labels`` holds the six label columns of the same
    rows. The dataset is batched with ``BATCH_SIZE``.
    """
    label_columns = [
        "toxic",
        "severe_toxic",
        "obscene",
        "threat",
        "insult",
        "identity_hate",
    ]
    texts = frame.comment_text.values
    labels = frame[label_columns].values
    slices = tf.data.Dataset.from_tensor_slices(((texts,), labels))
    return slices.batch(BATCH_SIZE)


# Training and validation inputs for the AutoKeras search.
train_set = _to_batched_dataset(train_df)
val_set = _to_batched_dataset(val_df)


## Train AutoKeras AutoML model

### Init AutoKeras text classifier model

In [None]:
# Settings for the AutoKeras text classifier search.
classifier_options = {
    # NOTE(review): per the AutoKeras docs, overwrite=False reloads an existing
    # project of the same name instead of starting from scratch — confirm that
    # resuming is intended (the alternative is True).
    "overwrite": False,
    "multi_label": True,  # several labels may apply to the same comment
    "max_trials": 10,  # number of candidate models the search may try
    "metrics": [tf.keras.metrics.AUC()],
}
clf = ak.TextClassifier(**classifier_options)


### Define an early-stopping callback that halts training once the validation loss stops improving

In [None]:
# Early-stopping callback passed to the training call.
earlystop = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",  # quantity watched after every epoch
    mode="auto",
    min_delta=0,
    patience=0,  # stop at the first epoch without improvement
    restore_best_weights=True,  # go back to the weights of the best epoch
    verbose=0,
)


In [None]:
# Sets TF_GPU_ALLOCATOR again, with the same value as the %env cell near the
# top of the notebook, right before training starts.
# NOTE(review): TensorFlow has already been imported at this point, so this
# repeated assignment may have no effect — confirm whether it is still needed.
%env TF_GPU_ALLOCATOR=cuda_malloc_async

### Start training a text classifier using AutoKeras AutoML

In [None]:
# Run the AutoKeras search on the training set, validating each candidate on
# the validation set and stopping early through the `earlystop` callback.
fit_options = dict(
    validation_data=val_set,
    epochs=10,
    batch_size=BATCH_SIZE,
    callbacks=[earlystop],
    verbose=1,
)
clf.fit(train_set, **fit_options)


In [None]:
# Export the best model found by the search and print its architecture.
best_model = clf.export_model()
best_model.summary()


## Model evaluation

In [None]:
# Export the best model found by the AutoKeras search; the evaluation and
# prediction cells below use this `model` object.
model = clf.export_model()

In [None]:
# Ground-truth label matrix of the held-out test split, as a NumPy array.
label_columns = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]
y_test = test_df[label_columns].values


In [None]:
# Batched dataset for the held-out test split. Each element has the structure
# ((comment_text,), labels), where the labels are a plain array rather than a
# 1-tuple, so the structure matches train_set and val_set exactly.
label_columns = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]
test_set = tf.data.Dataset.from_tensor_slices(
    (
        (test_df.comment_text.values,),
        test_df[label_columns].values,
    )
).batch(BATCH_SIZE)


In [None]:
# Predict label scores for every comment of the held-out test split.
test_texts = test_df.comment_text.values
predicted_y = model.predict(test_texts)


In [None]:
# ROC AUC of the predictions against the true labels of the test split
# (displayed as the cell output).
label_columns = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]
true_labels = test_df[label_columns].values
roc_auc_score(true_labels, predicted_y)


In [None]:
# Evaluate the exported model on the batched held-out test split; the result
# is displayed as the cell output.
model.evaluate(test_set)


In [None]:
# Evaluate the exported model on the validation split as well, for comparison
# with the test-split result.
model.evaluate(val_set)


In [None]:
# Print the architecture of the exported model.
model.summary()


## Predict unseen labels (for the Kaggle competition)

### Load the actual test data

In [None]:
# Read the zipped Kaggle competition test data into a DataFrame.
real_test_df = pd.read_csv(
    filepath_or_buffer="data/test.csv.zip",
    compression="zip",
)

### Predict unseen samples

In [None]:
# Predict label scores for the competition comments. The texts are passed as a
# NumPy array (.values), the same input form used when predicting on the
# held-out test split, instead of a pandas Series.
real_test_pred = model.predict(real_test_df.comment_text.values)

### Combine predictions with sample ids to store result file in a csv

In [None]:
# Assemble the submission table: the sample id first, followed by one
# prediction column per label.
label_columns = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]
predictions_df = pd.DataFrame(real_test_pred, columns=label_columns)
predictions_df.insert(0, "id", real_test_df["id"])


In [None]:
# Preview the first rows of the submission table (displayed as the cell output).
predictions_df.head()

In [None]:
# Write the predictions to a CSV file for submission to Kaggle; the DataFrame
# index is left out of the file.
predictions_df.to_csv(
    path_or_buf="data/autokeras_predictions.csv",
    index=False,
)