In [1]:
# !pip install autokeras


In [4]:
import numpy as np
import pandas as pd
import tensorflow as tf

import logging
logging.getLogger('tensorflow').disabled = True

from sklearn.datasets import load_files
import autokeras as ak

import plotly.express as px
import copy 
import os
import json

In [17]:
import plotly.express as px
import plotly.graph_objects as go
from sklearn.metrics import roc_curve, auc, precision_recall_curve, roc_auc_score, average_precision_score
from sklearn.datasets import make_classification
from plotly.subplots import make_subplots

def plot_roc(y, y_score):
    fpr, tpr, thresholds = roc_curve(y, y_score)

    fig = px.area(
        x=fpr, y=tpr,
        title=f'ROC Curve (AUC={auc(fpr, tpr):.4f})',
        labels=dict(x='False Positive Rate', y='True Positive Rate'),
        width=700, height=500
    )
    fig.add_shape(
        type='line', line=dict(dash='dash'),
        x0=0, x1=1, y0=0, y1=1
    )

    fig.update_yaxes(scaleanchor="x", scaleratio=1)
    fig.update_xaxes(constrain='domain')
    fig.show()
    
def plot_pr_curve(y, y_score):    
    precision, recall, thresholds = precision_recall_curve(y, y_score)

    fig = px.area(
        x=recall, y=precision,
        title=f'Precision-Recall Curve (AUC={auc(recall, precision):.4f})',
        labels=dict(x='Recall', y='Precision'),
        width=700, height=500
    )
    fig.add_shape(
        type='line', line=dict(dash='dash'),
        x0=0, x1=1, y0=1, y1=0
    )
    fig.update_yaxes(scaleanchor="x", scaleratio=1)
    fig.update_xaxes(constrain='domain')
    fig.show()    
    
def plot_roc_train_test(y_train, y_score_train, y_test, y_score_test):
    fig = go.Figure()
    fig.add_shape(
        type='line', line=dict(dash='dash'),
        x0=0, x1=1, y0=0, y1=1
    )
    Y_Scores = [y_score_train, y_score_test]
    Y_Obs = [y_train, y_test]
    Y_Names = ["Train", "Test"]
    for i in range(len(Y_Scores)):
        y_true = Y_Obs[i]
        y_score = Y_Scores[i]

        fpr, tpr, _ = roc_curve(y_true, y_score)
        auc_score = roc_auc_score(y_true, y_score)

        name = f"{Y_Names[i]} ROC (AUC={auc_score:.4f})"
        fig.add_trace(go.Scatter(x=fpr, y=tpr, name=name, mode='lines'))

    fig.update_layout(
        xaxis_title='False Positive Rate',
        yaxis_title='True Positive Rate',
        yaxis=dict(scaleanchor="x", scaleratio=1),
        xaxis=dict(constrain='domain'),
        width=700, height=500
    )
    fig.show()
    
def plot_pr_train_test(y_train, y_score_train, y_test, y_score_test):
    #fig = go.Figure()
    fig = make_subplots(rows=1, cols=2)
    
    # subplot for ROC curve
    fig.add_shape(
        type='line', line=dict(dash='dash'),
        x0=0, x1=1, y0=0, y1=1,
        row=1, col=1
    )
    Y_Scores = [y_score_train, y_score_test]
    Y_Obs = [y_train, y_test]
    Y_Names = ["Train", "Test"]
    for i in range(len(Y_Scores)):
        y_true = Y_Obs[i]
        y_score = Y_Scores[i]

        fpr, tpr, _ = roc_curve(y_true, y_score)
        auc_score = roc_auc_score(y_true, y_score)

        name = f"{Y_Names[i]} ROC (AUC={auc_score:.4f})"
        fig.add_trace(go.Scatter(x=fpr, y=tpr, name=name, mode='lines'),row=1, col=1)
    
    fig.update_xaxes(title_text="False Positive Rate", row=1, col=1)
    fig.update_yaxes(title_text="True Positive Rate", row=1, col=1)
    
    # subplot for PR curve
    fig.add_shape(
        type='line', line=dict(dash='dash'),
        x0=0, x1=1, y0=1, y1=0, 
        row=1,col=2
    )
    Y_Scores = [y_score_train, y_score_test]
    Y_Obs = [y_train, y_test]
    Y_Names = ["Train", "Test"]
    for i in range(len(Y_Scores)):
        y_true = Y_Obs[i]
        y_score = Y_Scores[i]

        precision, recall, _ = precision_recall_curve(y_true, y_score)
        auc_score = average_precision_score(y_true, y_score)
    
        name = f"{Y_Names[i]} PR (AUC={auc_score:.4f})"
        fig.add_trace(go.Scatter(x=recall, y=precision, name=name, mode='lines'),row=1,col=2)

    fig.update_xaxes(title_text="Recall", row=1, col=2)
    fig.update_yaxes(title_text="Precision", row=1, col=2)
    fig.update_layout(
        title="ROC & Precision-Recall Curves",
        yaxis=dict(scaleanchor="x", scaleratio=1),
        xaxis=dict(constrain='domain'),
        width=1000, height=500
    )
    fig.show()
    
def hyper_table(path='structured_data_classifier'):
    trial_json = os.popen('ls ./{}/trial_*/trial.json'.format(path)).read().split('\n')[:-1]
    DATA = []
    for file in trial_json:
        with open(file) as f: 
            DATA.append(json.load(f))
    for k in range(len(DATA)):
        DATA[k]['hyperparameters']['values']['score'] = DATA[k]['score']
    hyper_df = pd.concat([pd.DataFrame.from_dict(data['hyperparameters']['values'], orient='index') for data in DATA], axis=1)
    hyper_df.columns = ["trial#{}".format(k+1) for k in range(len(DATA))]
    return(hyper_df)

def predic_error_analysis(x_train, y_train, y_score_train, x_test, y_test, y_score):
    df1 = copy.deepcopy(pd.DataFrame(x_train))
    df1['obs'] = pd.DataFrame(y_train)
    df1['predict'] = pd.DataFrame(y_score_train)
    df1['data'] = 'Train'
    #display(df1)
    df2 = copy.deepcopy(pd.DataFrame(x_test))
    df2['obs'] = pd.DataFrame(y_test)
    df2['predict'] = pd.DataFrame(y_score)
    df2['data'] = 'Test'
    #display(df2)
    df3 = pd.concat([df1,df2])

    fig = px.scatter(
        df3, x='obs', y='predict',
        marginal_x='histogram', marginal_y='histogram',
        color='data', trendline='ols'
    )
    fig.update_traces(histnorm='probability', selector={'type':'histogram'})
    fig.add_shape(
        type="line", line=dict(dash='dash'),
        x0=df3['obs'].min(), y0=df3['obs'].min(),
        x1=df3['obs'].max(), y1=df3['obs'].max()
    )
    fig.update_layout(title="Prediction Error Analysis", 
                      yaxis=dict(range=[df3['obs'].min(), df3['obs'].max()]),
                      xaxis=dict(range=[df3['obs'].min(), df3['obs'].max()]),
                      width=1000, height=1000)
    fig.show()

To make this tutorial easy to follow, we just treat IMDB dataset as a
regression dataset. It means we will treat prediction targets of IMDB dataset,
which are 0s and 1s as numerical values, so that they can be directly used as
the regression targets.

## A Simple Example
The first step is to prepare your data. Here we use the [IMDB
dataset](https://keras.io/datasets/#imdb-movie-reviews-sentiment-classification)
as an example.


In [2]:

dataset = tf.keras.utils.get_file(
    fname="aclImdb.tar.gz",
    origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
    extract=True,
)

# set path to dataset
IMDB_DATADIR = os.path.join(os.path.dirname(dataset), "aclImdb")

classes = ["pos", "neg"]
train_data = load_files(
    os.path.join(IMDB_DATADIR, "train"), shuffle=True, categories=classes
)
test_data = load_files(
    os.path.join(IMDB_DATADIR, "test"), shuffle=False, categories=classes
)

x_train = np.array(train_data.data)
y_train = np.array(train_data.target)
x_test = np.array(test_data.data)
y_test = np.array(test_data.target)

print(x_train.shape)  # (25000,)
print(y_train.shape)  # (25000, 1)
print(x_train[0][:50])  # <START> this film was just brilliant casting <UNK>


(25000,)
(25000,)
b'Zero Day leads you to think, even re-think why two'


The second step is to run the [TextRegressor](/text_regressor).  As a quick
demo, we set epochs to 2.  You can also leave the epochs unspecified for an
adaptive number of epochs.


In [28]:
train_df = pd.DataFrame(zip(train_data.data, train_data.target), columns = ['data','target'])
test_df = pd.DataFrame(zip(test_data.data, test_data.target), columns = ['data','target'])
display(train_df)

Unnamed: 0,data,target
0,"b""Zero Day leads you to think, even re-think w...",1
1,b'Words can\'t describe how bad this movie is....,0
2,b'Everyone plays their part pretty well in thi...,1
3,b'There are a lot of highly talented filmmaker...,0
4,b'I\'ve just had the evidence that confirmed m...,0
...,...,...
24995,b'089: Footlight Parade (1933) - released 9/30...,1
24996,b'Deeply humorous yet honest comedy about a bu...,1
24997,b'1st watched 2/28/2006 - 4 out of 10(Dir-Sydn...,0
24998,"b""I watch lots of scary movies (or at least th...",0


In [29]:
# Initialize the text regressor.
reg = ak.TextRegressor(overwrite=True, max_trials=1)  # It tries 10 different models.
# Feed the text regressor with training data.
reg.fit(x_train, y_train, epochs=2)
# Predict with the best model.
predicted_y = reg.predict(x_test)
# Evaluate the best model with testing data.
print(reg.evaluate(x_test, y_test))

# structure of the model
display(hyper_table(path='text_regressor'))
model = reg.export_model()
model.summary()

Trial 1 Complete [00h 00m 15s]
val_loss: 0.16080547869205475

Best val_loss So Far: 0.16080547869205475
Total elapsed time: 00h 00m 15s
Epoch 1/2
Epoch 2/2
[0.15651313960552216, 0.15651313960552216]


Unnamed: 0,trial#1
text_block_1/block_type,vanilla
text_block_1/max_tokens,5000
text_block_1/text_to_int_sequence_1/output_sequence_length,64
text_block_1/embedding_1/pretraining,none
text_block_1/embedding_1/embedding_dim,128
text_block_1/embedding_1/dropout,0.25
text_block_1/conv_block_1/kernel_size,3
text_block_1/conv_block_1/separable,False
text_block_1/conv_block_1/max_pooling,True
text_block_1/conv_block_1/dropout,0


Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None,)]                 0         
_________________________________________________________________
expand_last_dim (ExpandLastD (None, 1)                 0         
_________________________________________________________________
text_vectorization (TextVect (None, 64)                0         
_________________________________________________________________
embedding (Embedding)        (None, 64, 128)           640128    
_________________________________________________________________
dropout (Dropout)            (None, 64, 128)           0         
_________________________________________________________________
conv1d (Conv1D)              (None, 62, 32)            12320     
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 60, 32)            3104  

## Validation Data
By default, AutoKeras use the last 20% of training data as validation data.  As
shown in the example below, you can use `validation_split` to specify the
percentage.


In [30]:
reg.fit(
    x_train,
    y_train,
    # Split the training data and use the last 15% as validation data.
    validation_split=0.15,
)

You can also use your own validation set instead of splitting it from the
training data with `validation_data`.


In [31]:
split = 5000
x_val = x_train[split:]
y_val = y_train[split:]
x_train = x_train[:split]
y_train = y_train[:split]
reg.fit(
    x_train,
    y_train,
    epochs=2,
    # Use your own validation set.
    validation_data=(x_val, y_val),
)


## Customized Search Space
For advanced users, you may customize your search space by using
[AutoModel](/auto_model/#automodel-class) instead of
[TextRegressor](/text_regressor). You can configure the
[TextBlock](/block/#textblock-class) for some high-level configurations, e.g.,
`vectorizer` for the type of text vectorization method to use.  You can use
'sequence', which uses [TextToInteSequence](/block/#texttointsequence-class) to
convert the words to integers and use [Embedding](/block/#embedding-class) for
embedding the integer sequences, or you can use 'ngram', which uses
[TextToNgramVector](/block/#texttongramvector-class) to vectorize the
sentences.  You can also do not specify these arguments, which would leave the
different choices to be tuned automatically.  See the following example for
detail.


In [32]:

input_node = ak.TextInput()
output_node = ak.TextBlock(block_type="ngram")(input_node)
output_node = ak.RegressionHead()(output_node)
reg = ak.AutoModel(
    inputs=input_node, outputs=output_node, overwrite=True, max_trials=1
)
reg.fit(x_train, y_train, epochs=2)

# structure of the model
display(hyper_table(path='text_regressor'))
model = reg.export_model()
model.summary()

Trial 1 Complete [00h 00m 21s]
val_loss: 0.20535746216773987

Best val_loss So Far: 0.20535746216773987
Total elapsed time: 00h 00m 21s
Epoch 1/2
Epoch 2/2


Unnamed: 0,trial#1
text_block_1/block_type,vanilla
text_block_1/max_tokens,5000
text_block_1/text_to_int_sequence_1/output_sequence_length,64
text_block_1/embedding_1/pretraining,none
text_block_1/embedding_1/embedding_dim,128
text_block_1/embedding_1/dropout,0.25
text_block_1/conv_block_1/kernel_size,3
text_block_1/conv_block_1/separable,False
text_block_1/conv_block_1/max_pooling,True
text_block_1/conv_block_1/dropout,0


Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None,)]                 0         
_________________________________________________________________
expand_last_dim (ExpandLastD (None, 1)                 0         
_________________________________________________________________
text_vectorization (TextVect (None, 5000)              0         
_________________________________________________________________
dense (Dense)                (None, 256)               1280256   
_________________________________________________________________
re_lu (ReLU)                 (None, 256)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 32)                8224      
_________________________________________________________________
re_lu_1 (ReLU)               (None, 32)                0     

The usage of [AutoModel](/auto_model/#automodel-class) is similar to the
[functional API](https://www.tensorflow.org/guide/keras/functional) of Keras.
Basically, you are building a graph, whose edges are blocks and the nodes are
intermediate outputs of blocks.  To add an edge from `input_node` to
`output_node` with `output_node = ak.[some_block]([block_args])(input_node)`.

You can even also use more fine grained blocks to customize the search space
even further. See the following example.


In [33]:

input_node = ak.TextInput()
output_node = ak.TextToIntSequence()(input_node)
output_node = ak.Embedding()(output_node)
# Use separable Conv layers in Keras.
output_node = ak.ConvBlock(separable=True)(output_node)
output_node = ak.RegressionHead()(output_node)
reg = ak.AutoModel(
    inputs=input_node, outputs=output_node, overwrite=True, max_trials=1
)
reg.fit(x_train, y_train, epochs=2)

# structure of the model
display(hyper_table(path='text_regressor'))
model = reg.export_model()
model.summary()

Trial 1 Complete [00h 00m 05s]
val_loss: 0.24539391696453094

Best val_loss So Far: 0.24539391696453094
Total elapsed time: 00h 00m 05s
Epoch 1/2
Epoch 2/2


Unnamed: 0,trial#1
text_block_1/block_type,vanilla
text_block_1/max_tokens,5000
text_block_1/text_to_int_sequence_1/output_sequence_length,64
text_block_1/embedding_1/pretraining,none
text_block_1/embedding_1/embedding_dim,128
text_block_1/embedding_1/dropout,0.25
text_block_1/conv_block_1/kernel_size,3
text_block_1/conv_block_1/separable,False
text_block_1/conv_block_1/max_pooling,True
text_block_1/conv_block_1/dropout,0


Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None,)]                 0         
_________________________________________________________________
expand_last_dim (ExpandLastD (None, 1)                 0         
_________________________________________________________________
text_vectorization (TextVect (None, 64)                0         
_________________________________________________________________
embedding (Embedding)        (None, 64, 128)           2560128   
_________________________________________________________________
dropout (Dropout)            (None, 64, 128)           0         
_________________________________________________________________
separable_conv1d (SeparableC (None, 62, 32)            4512      
_________________________________________________________________
separable_conv1d_1 (Separabl (None, 60, 32)            1152  

## Data Format
The AutoKeras TextRegressor is quite flexible for the data format.

For the text, the input data should be one-dimensional For the regression
targets, it should be a vector of numerical values.  AutoKeras accepts
numpy.ndarray.

We also support using [tf.data.Dataset](
https://www.tensorflow.org/api_docs/python/tf/data/Dataset?version=stable)
format for the training data.


In [34]:

train_set = tf.data.Dataset.from_tensor_slices(((x_train,), (y_train,))).batch(32)
test_set = tf.data.Dataset.from_tensor_slices(((x_test,), (y_test,))).batch(32)

reg = ak.TextRegressor(overwrite=True, max_trials=2)
# Feed the tensorflow Dataset to the regressor.
reg.fit(train_set, epochs=2)
# Predict with the best model.
predicted_y = reg.predict(test_set)
# Evaluate the best model with testing data.
print(reg.evaluate(test_set))

# structure of the model
display(hyper_table(path='text_regressor'))
model = reg.export_model()
model.summary()

Trial 2 Complete [00h 00m 04s]
val_loss: 0.18503132462501526

Best val_loss So Far: 0.17859812080860138
Total elapsed time: 00h 00m 08s
Epoch 1/2
Epoch 2/2
[0.21852049231529236, 0.21852049231529236]


Unnamed: 0,trial#1,trial#2
text_block_1/block_type,vanilla,vanilla
text_block_1/max_tokens,5000,5000
text_block_1/text_to_int_sequence_1/output_sequence_length,64,64
text_block_1/embedding_1/pretraining,none,none
text_block_1/embedding_1/embedding_dim,128,128
text_block_1/embedding_1/dropout,0.25,0.25
text_block_1/conv_block_1/kernel_size,3,3
text_block_1/conv_block_1/separable,False,False
text_block_1/conv_block_1/max_pooling,True,True
text_block_1/conv_block_1/dropout,0,0.5


Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None,)]                 0         
_________________________________________________________________
expand_last_dim (ExpandLastD (None, 1)                 0         
_________________________________________________________________
text_vectorization (TextVect (None, 64)                0         
_________________________________________________________________
embedding (Embedding)        (None, 64, 128)           640128    
_________________________________________________________________
dropout (Dropout)            (None, 64, 128)           0         
_________________________________________________________________
conv1d (Conv1D)              (None, 62, 32)            12320     
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 60, 32)            3104  

## Reference
[TextRegressor](/text_regressor),
[AutoModel](/auto_model/#automodel-class),
[TextBlock](/block/#textblock-class),
[TextToInteSequence](/block/#texttointsequence-class),
[Embedding](/block/#embedding-class),
[TextToNgramVector](/block/#texttongramvector-class),
[ConvBlock](/block/#convblock-class),
[TextInput](/node/#textinput-class),
[RegressionHead](/block/#regressionhead-class).
