# Text Classification of StackOverflow using BiGRU RNNs with deep self-attention

In [22]:
%load_ext autoreload
%load_ext nb_black
%autoreload 2

import sys
import os
import parent_modules

import warnings
import sklearn.exceptions
import talos as ta

from os import pardir, getcwd
from os.path import join, abspath
from definitions import *

warnings.filterwarnings("ignore", category=sklearn.exceptions.UndefinedMetricWarning)
warnings.simplefilter(action="ignore", category=FutureWarning)

from definitions import TALOS_DIR
from app.preprocessing import (
    load_dataset,
    load_embeddings,
    preprocess_data,
    save_embeddings_matrix,
)
from app.models import load_bi_gru_model, find_best_model_over_scan_logs
from app.metrics import *

# Comment out In case of Testing use only a set of the tags as dataset
RUN_STATE = "testing"

# Comment out In case of Production use all the tags of the dataset
# RUN_STATE = 'production'

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The nb_black extension is already loaded. To reload it, use:
  %reload_ext nb_black


<IPython.core.display.Javascript object>

## Preprocessing for the loaded Dataset
1. Format into *lowercase*
2. Remove some of the *punctuation* characters
3. Remove *Numbers*
4. Remove *stopwords*
5. Remove *links*

In [11]:
data, test_data = load_dataset(load_from_pickle=True)
classes_counts = data["label"].value_counts().where(lambda cls: cls > 0).dropna()
Classes = list(classes_counts.index)
Nclasses = len(Classes)
print(classes_counts, Classes, Nclasses)

__label__0     5622
__label__2     2377
__label__3     1400
__label__4     1065
__label__1      573
__label__10     501
__label__14     326
__label__11     324
__label__12     299
__label__13     207
__label__5      180
__label__8      122
__label__9      113
__label__6       94
__label__7       18
Name: label, dtype: int64 ['__label__0', '__label__2', '__label__3', '__label__4', '__label__1', '__label__10', '__label__14', '__label__11', '__label__12', '__label__13', '__label__5', '__label__8', '__label__9', '__label__6', '__label__7'] 15


<IPython.core.display.Javascript object>

In [16]:
# 70% Train & 30% Test
# 70% Train-Dev % 30* Train-Dev
embeddings_voc, embeddings_vec = load_embeddings(load_from_pickle=True)
model_data = preprocess_data(data, "label", "post", cv_split_dev=0.2)
embeddings_matrix_path = save_embeddings_matrix(
    embeddings_voc, embeddings_vec, model_data["words_index"]
)

<IPython.core.display.Javascript object>

In [15]:
model_data["x_train"][1]

array([ 5377,  7020,  3288,  7021, 10777,   388,   941,    54,   198,
          80, 10778,     2,  2631,     2,     3, 10779,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,

<IPython.core.display.Javascript object>

## Models Definition and Hyperparameter Tuning

> Below it will be defined, trained and evaluate the 2 models of RNNs the first with a unistack RNN with BiGRU and MLP on top of it and the second with a multistack RNN with BiGRU and MLP on top of it. 
The hyperparameter tuning is implemented throw the Talos library and will calculate the best configuration for each of the 2 models.

### Unistacked RNN with BiGRU

In [17]:
###### Production configuration
rnn_deep_gru_config = {
    "model_type": ["keras_deep_BiGRU_model"],
    "embedding_dim": [embeddings_vec.shape[1]],
    "gru_size": [200],
    "dense": [300],
    "embeddings_matrix_path": [embeddings_matrix_path],
    "visualize_process": [True],
    "with_early_stoping": [True],
    "multistack_run": [False],
    "early_stopping": [True],
    "early_stopping_config__monitor": ["val_f1"],
    "early_stopping_config__min_delta": [0],
    "early_stopping_config__patience": [5],
    "early_stopping_config__mode": ["max"],
    "embeddings_dropout": [0.2],
    "var_dropout": [0.2, 0.6],
    "mlp_dropout": [0.2],
    "mlp_activation": ["softmax"],
    "rnn_activation": ["relu", "tanh"],
    "optimizer": ["Nadam", "Adam"],
    "batch_size": [32, 64],
    "epochs": [3 if RUN_STATE == "testing" else 10],
}

<IPython.core.display.Javascript object>

In [31]:
TALOS_BiGRU_DEEP_LOG_FILENAME = 'talos_logs/talos_bigru_deep_log'
if RUN_STATE == 'testing':
    TALOS_BiGRU_DEEP_LOG_FILENAME += '_test'
# talos_bigru_deep_log_pathname = os.path.join(TALOS_DIR, TALOS_BiGRU_DEEP_LOG_FILENAME)

history_model = ta.Scan(model_data['x_train'],
                        model_data['y_train'],
                        x_val=model_data['x_train_dev'],
                        y_val=model_data['y_train_dev'],
                        model=load_bi_gru_model,
                        params=rnn_deep_gru_config,
                        fraction_limit=0.1,
                        print_params=True,
                        seed=(123),
                        experiment_name=TALOS_BiGRU_DEEP_LOG_FILENAME
                        )


NotADirectoryError: [Errno 20] Not a directory: './talos_logs/talos_bigru_deep_log_test/061520020559.csv'

<IPython.core.display.Javascript object>

> Finds the best model configuration based for our RNN based on the highest value of the *val_f1*.

In [10]:
report_talos = ta.Reporting(history_model)
best_model_idx = report_talos.data['val_f1'].idxmax()
best_model_params = report_talos.data.loc[best_model_idx].to_dict()
best_model_params

{'round_epochs': 3,
 'val_loss': 0.1728040204445521,
 'val_precision': 0.9533045596168155,
 'val_recall': 0.933333333446866,
 'val_f1': 0.9431521987915039,
 'val_accuracy': 0.9625396849995568,
 'val_categorical_accuracy': 0.9447619048754374,
 'loss': 0.1799544942835156,
 'precision': 0.947758856533336,
 'recall': 0.9221768709104888,
 'f1': 0.9346929049816262,
 'accuracy': 0.957097516887042,
 'categorical_accuracy': 0.9352380954002848,
 'model_type': 'keras_deep_BiGRU_model',
 'embedding_dim': 300,
 'gru_size': 200,
 'dense': 300,
 'embeddings_matrix_path': 'embeddings-matrix-pickle',
 'visualize_process': 'True',
 'with_early_stoping': 'True',
 'multistack_run': 'False',
 'early_stopping': 'True',
 'early_stopping_config__monitor': 'val_f1',
 'early_stopping_config__min_delta': 0,
 'early_stopping_config__patience': 5,
 'early_stopping_config__mode': 'max',
 'embeddings_dropout': 0.2,
 'var_dropout': 0.2,
 'mlp_dropout': 0.2,
 'mlp_activation': 'softmax',
 'rnn_activation': 'tanh',
 'o

In [14]:
from app.models import (load_bi_gru_model, 
                        load_bi_lstm_model, 
                        find_best_model_over_scan_logs)

TALOS_BiGRU_DEEP_LOG_FILENAME = 'talos_bigru_deep_log'
talos_bigru_deep_log_pathname = os.path.join(TALOS_DIR, TALOS_BiGRU_DEEP_LOG_FILENAME)

best_model_params = find_best_model_over_scan_logs('val_f1', *[talos_bigru_deep_log_pathname+ '_.csv'])

> Train a new RNN model based on the best Talos configuration.

 ****Note: the below cells have been explicitly removed from the PRODUCTION run because they required an extra model generation which was demanding in terms of recources and time. For extra information about the PRODUCTION run check the README instructions.*

In [15]:
if RUN_STATE == 'testing':
    # Train and Load the best model of given the tuned featured model
    best_model_params['early_stopping'] = True
    best_model_params['with_early_stopping'] = True
    best_model_params['visualize_process'] = True
    model_history, model = load_bi_gru_model(model_data['x_train'],
                                             model_data['y_train'],
                                             model_data['x_train_dev'],
                                             model_data['y_train_dev'],
                                             best_model_params)

TypeError: object of type 'numpy.int64' has no len()

### Visualize Model History Scores

In [None]:
if RUN_STATE == 'testing':
    from app.visualization import plot_history_metrics
    import matplotlib.pylab as plt

    %matplotlib inline
    plot_history_metrics(history_obj=model_history)

### Evaluate performance model

Evaluates the performance of the best trained model in the **test** dataset. 

In [None]:
if RUN_STATE == 'testing':
    score = model.evaluate(model_data_ftc['x_test'],
                           model_data_ftc['y_test'],
                           batch_size=best_model_params['batch_size'],
                           verbose=1)

    print('\nTest f1: %.4f' % (score[1]))
    print('\nTest categorical accuracy: %.4f'% (score[2]))

### Visualize Prediction Perfomance  model

In [None]:
if RUN_STATE == 'testing':
    import numpy as np
    from app.visualization import (plot_prediction_metrics,
                                   create_clf_report,
                                   plot_roc_curve,
                                   plot_precision_recall_curve,
                                   plot_confusion_matrix)
    import matplotlib.pylab as plt

    prediction_val = model.predict(model_data['x_test'], batch_size=best_model_params['batch_size'])

    # returns each entry result to the classification with the relevant probabilities
    y_pred_processed = np.array([np.argmax(val) for val in prediction_val])
    y_true_processed = np.array([np.argmax(val) for val in model_data['y_test']])

    # If you want to see the OneVSAll ROC Curves of each class uncomment the below line
    # plot_roc_curve(model_data['y_test'], prediction_val, Classes, 1)

    # If you want to see the OneVSAll Precission Recall Curves of each class, comment out the below line
    # plot_precision_recall_curve(model_data['y_test'], prediction_val, Classes , 1)

    # If you want to get the Classification Report, comment out the below line
    create_clf_report(model_data['y_test'], (prediction_val > 0.5).astype('int32'),
                      y_true_processed, y_pred_processed)

    # If you want to get the confusion matrix,comment out the below line
    plot_confusion_matrix(y_true_processed, y_pred_processed, Classes)

### Multistack RNN with BiGRU

In [None]:
TALOS_BiGRU_DEEP_MUTLI_LOG_FILENAME = 'talos_bigru_deep_multi_log'
if RUN_STATE == 'testing':
    TALOS_BiGRU_DEEP_MUTLI_LOG_FILENAME += '_test'
talos_bigru_deep_multi_log_pathname = os.path.join(TALOS_DIR, TALOS_BiGRU_DEEP_MUTLI_LOG_FILENAME)


###### Production configuration
rnn_deep_gru_multi_config = rnn_deep_gru_config.copy()
rnn_deep_gru_multi_config.update({
    "model_type": ["keras_deep_BiGRU_multi_model"],
    "multistack_run": [True],
})

history_model_multi = ta.Scan(model_data['x_train'],
                        model_data['y_train'],
                        x_val=model_data['x_train_dev'],
                        y_val=model_data['y_train_dev'],
                        model=load_bi_gru_model,
                        params=rnn_deep_gru_multi_config,
                        grid_downsample=0.1,
                        print_params=True,
                        last_epoch_value=True,
                        seed=(123),
                        dataset_name=talos_bigru_deep_multi_log_pathname
                        )


> Finds the best model configuration based for our RNN based on the highest value of the *val_f1*.

In [None]:
report_talos_multi = ta.Reporting(history_model_multi)
best_model_idx = report_talos_multi.data['val_f1'].idxmax()
best_model_params_multi = report_talos_multi.data.loc[best_model_idx].to_dict()
best_model_params_multi

> Train a new RNN model based on the best Talos configuration.

In [None]:
if RUN_STATE == 'testing':
    # Train and Load the best model of given the tuned featured model
    model_history_multi, model_multi = load_bi_gru_model(model_data['x_train'],
                                                         model_data['y_train'],
                                                         model_data['x_train_dev'],
                                                         model_data['y_train_dev'],
                                                         best_model_params)

### Visualize Model History Scores

In [None]:
if RUN_STATE == 'testing':
    from app.visualization import plot_history_metrics
    import matplotlib.pylab as plt

    %matplotlib inline
    plot_history_metrics(history_obj=model_history_multi)

### Evaluate performance model

Evaluates the performance of the best trained model in the **test** dataset. 

In [None]:
if RUN_STATE == 'testing':
    score_multi = model_multi.evaluate(model_data['x_test'],
                                       model_data['y_test'],
                                       batch_size=best_model_params_multi['batch_size'],
                                       verbose=1)

    print('\nTest f1: %.4f' % (score_multi[1]))
    print('\nTest categorical accuracy: %.4f'% (score_multi[2]))

### Visualize Prediction Perfomance  model

In [None]:
if RUN_STATE == 'testing':
    import numpy as np
    from app.visualization import (plot_prediction_metrics,
                                   create_clf_report,
                                   plot_roc_curve,
                                   plot_precision_recall_curve,
                                   plot_confusion_matrix)
    import matplotlib.pylab as plt

    prediction_val_multi = model_multi.predict(model_data['x_test'], batch_size=best_model_params['batch_size'])

    # returns each entry result to the classification with the relevant probabilities
    y_pred_processed_multi = np.array([np.argmax(val) for val in prediction_val_multi])
    y_true_processed_multi = np.array([np.argmax(val) for val in model_data['y_test']])

    # If you want to see the OneVSAll ROC Curves of each class uncomment the below line
    # plot_roc_curve(model_data['y_test'], prediction_val, Classes, 1)

    # If you want to see the OneVSAll Precission Recall Curves of each class, comment out the below line
    # plot_precision_recall_curve(model_data['y_test'], prediction_val, Classes , 1)

    # If you want to get the Classification Report, comment out the below line
    create_clf_report(model_data['y_test'], (prediction_val_multi > 0.5).astype('int32'),
                      y_true_processed_multi, y_pred_processed_multi)

    # If you want to get the confusion matrix,comment out the below line
    plot_confusion_matrix(y_true_processed_multi, y_pred_processed_multi, Classes)