### Utility packages 

In [None]:
import warnings
warnings.filterwarnings( 'ignore' )
import gc
import os
import time
import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.model_selection import StratifiedKFold, train_test_split

import tensorflow as tf
import keras.backend as K
from keras.models import load_model


from keras.preprocessing import text
from keras_preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import classification_report

## Model Performance Tracking

In [None]:
from kaggle_secrets import UserSecretsClient
import wandb
from wandb.keras import WandbCallback
# Client used to read secrets stored with the Kaggle notebook.
user_secrets = UserSecretsClient()

# The Weights & Biases API token is stored as a secret under the label "wandb_api".
# If your secret uses a different label, change the string passed to get_secret below.
wandb_api = user_secrets.get_secret("wandb_api")


# Authenticate with Weights & Biases using the retrieved token.
wandb.login(key=wandb_api)

In [None]:
import wandb
from wandb.keras import WandbCallback

# Start a Weights & Biases run under the project "CNN-BiLSTM".
wandb.init(project="CNN-BiLSTM")

## Hyperparameters

In [None]:
# Hyperparameters for this model.
# Each value is read by later cells (tokenisation, model construction, training).

max_len = 150                       # padded sequence length; also the model's input length
embed_size = 300                    # number of columns in the embedding matrix / Embedding output_dim
pre_trained_flag = True             # when true, pre-trained vectors are loaded from embeddingfile
embed_trainable = False             # passed as `trainable` to the Embedding layer
emb_weights_init = 'glorot_normal'  # Embedding initializer, used only when no pre-trained matrix is supplied
optimizer = 'adam'                  # passed to model.compile
ker_regularizer = 'L1L2'            # passed as kernel_regularizer to both Conv1D layers
decay = True                        # NOTE(review): not referenced by the code visible in this file

lr_rate = 0.02                      # written to model.optimizer.lr before training starts
drpt = 0.4                          # dropout rate used by the model's Dropout layers
batch = 258                         # batch_size for model.fit; NOTE(review): unusual value, possibly 256 was intended - confirm
nepochs = 50                        # maximum number of epochs per fold
patience = 10                       # patience shared by EarlyStopping and ReduceLROnPlateau
decay_rate = 0.2                    # NOTE(review): not referenced by the code visible in this file
decay_after = 5                     # NOTE(review): not referenced by the code visible in this file
fc_act = 'relu'                     # activation of the two fully connected layers
fc_weights_init = 'glorot_uniform'  # kernel initializer of the two fully connected layers

In [None]:
# Scratch arithmetic: the value is only displayed by the notebook and is not assigned to anything.
258+64

In [None]:
# Pre-trained embedding file to load. The commented-out assignments below are
# alternative embedding files; exactly one `embeddingfile` assignment is active.
#embeddingfile = './General_Embeddings/glove.txt'
#embeddingfile = './General_Embeddings/w2v_cbow.txt'
#embeddingfile = './General_Embeddings/w2v_sg.txt'
#embeddingfile = './General_Embeddings/ft_cbow.vec'
embeddingfile = '../input/final-ruhsp-experiments/General_Embeddings/General_Embeddings/ft_sg.vec'

# Placeholder; the training loop overwrites it with the output of get_vectors
# when pre_trained_flag is true.
embedding_matrix = []
# Passed as num_words to the Keras Tokenizer and used by get_vectors to cap
# the number of rows in the embedding matrix.
max_features = 100000



In [None]:
def hms_string(sec_elapsed):
    """Format a duration given in seconds as ``H:MM:SS.ss``.

    Hours are not padded, minutes are zero-padded to two digits and seconds
    are zero-padded to five characters with two decimal places.
    """
    secs_per_minute = 60
    secs_per_hour = 60 * 60

    hours = int(sec_elapsed / secs_per_hour)
    minutes = int((sec_elapsed % secs_per_hour) / secs_per_minute)
    seconds = sec_elapsed % secs_per_minute

    return f"{hours}:{minutes:>02}:{seconds:>05.2f}"

In [None]:
# Load the cleaned dataset; later cells read its 'Comment' (text) and 'Toxic' (label) columns.
df = pd.read_csv('../input/final-ruhsp-experiments/cleaned_data.csv')
# Display (rows, columns) as the cell output.
df.shape

In [None]:
def get_coefs( word, *arr ):
    """Pair a token with its vector.

    `word` is returned unchanged; the remaining positional arguments are
    converted into a single float32 NumPy array.
    """
    vector = np.array( arr, dtype='float32' )
    return word, vector

def get_vectors( tokenizer ):
    """Build the embedding matrix for the vocabulary of `tokenizer`.

    Row i holds the pre-trained vector (looked up in the module-level
    `embeddings_index`) of the word whose tokenizer index is i. Words with no
    pre-trained vector keep an all-zero row. Words whose index is
    `max_features` or larger are ignored. The matrix has
    min(max_features, vocabulary size + 1) rows and `embed_size` columns.
    """
    vocab = tokenizer.word_index
    n_rows = min( max_features, len( vocab ) + 1 )
    matrix = np.zeros( ( n_rows, embed_size ) )

    for token, idx in vocab.items():
        if idx < max_features:
            vec = embeddings_index.get( token )
            if vec is not None:
                matrix[idx] = vec

    # Reclaim memory before returning the matrix.
    gc.collect()
    return matrix

# Load the pre-trained vectors into a {word: float32 vector} dictionary.
if pre_trained_flag:
    embeddings_index = {}
    # The context manager closes the file once parsing is done.
    with open( embeddingfile, encoding='utf-8' ) as emb_file:
        for line in emb_file:
            word, vector = get_coefs( *line.rstrip().rsplit(' ') )
            # Skip rows that do not hold exactly embed_size values, e.g. the
            # "<count> <dim>" header line of .vec files or blank lines. If
            # stored, such a short vector would later be broadcast across a
            # whole embedding row.
            if vector.shape[0] != embed_size:
                continue
            embeddings_index[ word ] = vector

## Stratified K-fold cross-validation
The original dataset is randomly partitioned into K folds, where K is a user-specified number. The folds are stratified, meaning that the distribution of the target classes is approximately the same across all folds.

In [None]:
# 5-fold stratified splitter; shuffling with a fixed random_state keeps the folds reproducible.
skf = StratifiedKFold( n_splits=5, random_state=0, shuffle=True )
# Show the splitter's configuration as the cell output.
print(skf)

## Model Architecture 

In [None]:
from keras.models import Model
from keras.layers import Input, Embedding
from keras.layers import CuDNNGRU, CuDNNLSTM, Conv1D, Conv2D, Dense, Bidirectional, GRU, LSTM, MaxPool1D
from keras.layers import SpatialDropout1D, Dropout, Concatenate, concatenate, Softmax, Flatten, Reshape
from keras.layers import GlobalMaxPooling1D, GlobalAveragePooling1D, GlobalMaxPooling2D, GlobalAveragePooling2D
from keras.optimizers import *
from keras.callbacks import EarlyStopping

def CNN_BiLSTM( tokenizer, max_len, embed_size=300, embedding_matrix=None, embed_trainable=False,
            drpt=.1, emb_weights_init='', optimizer='', ker_regularizer='',fc_act='',fc_weights_init='',
            num_classes=5 ):
    """Build and compile the two-branch CNN + bidirectional LSTM classifier.

    Architecture: Embedding -> Dropout -> two parallel Conv1D/MaxPool1D
    branches (64 and 100 filters) -> one bidirectional CuDNNLSTM per branch
    (128 and 64 units) -> concatenation -> global average and global max
    pooling, concatenated -> Dropout -> Dense(100) -> Dropout -> Dense(50)
    -> softmax output.

    Parameters
    ----------
    tokenizer : fitted tokenizer exposing `word_index`; used to size the
        Embedding layer when no pre-trained matrix is given.
    max_len : length of the padded input sequences.
    embed_size : embedding dimensionality.
    embedding_matrix : optional pre-trained weights (rows = token indices).
        `None` or an empty sequence means "initialise randomly".
    embed_trainable : whether the Embedding weights are updated in training.
    drpt : dropout rate used by every Dropout layer.
    emb_weights_init : initializer for a randomly initialised Embedding.
    optimizer : optimizer passed to `model.compile`.
    ker_regularizer : kernel regularizer of both Conv1D layers.
    fc_act, fc_weights_init : activation / initializer of the Dense layers.
    num_classes : number of output classes (softmax width).

    Returns
    -------
    The compiled Keras `Model` (categorical cross-entropy loss, accuracy metric).
    """
    inp = Input( shape=(max_len, ), name='InputLayer' )

    # `embedding_matrix == []` is not a valid emptiness test for a NumPy array
    # (it triggers an elementwise comparison), so test None / length instead.
    has_pretrained = embedding_matrix is not None and len( embedding_matrix ) > 0

    if has_pretrained:
        # Size the layer from the matrix itself: its row count can be smaller
        # than the full vocabulary when the vocabulary was capped upstream.
        x = Embedding( input_dim=len( embedding_matrix ), output_dim=embed_size,
                      weights=[embedding_matrix], trainable=embed_trainable, name='Embedding' )( inp )
    else:
        x = Embedding( input_dim=len(tokenizer.word_index)+1, output_dim=embed_size,
                      embeddings_initializer=emb_weights_init, trainable=embed_trainable, name='Embedding' )( inp )
    x = Dropout( drpt, name='EmbDropout' )( x )

    # Two convolutional branches over the same embedded sequence.
    conv1 = Conv1D( filters=64, kernel_size=4, padding='same', activation='relu',
                   kernel_regularizer=ker_regularizer, name='conv1' )( x )
    pool1 = MaxPool1D( pool_size=4, name='pool1' )( conv1 )

    conv2 = Conv1D( filters=100, kernel_size=4, padding='same', activation='relu',
                   kernel_regularizer=ker_regularizer, name='conv2' )( x )
    pool2 = MaxPool1D( pool_size=4, name='pool2' )( conv2 )

    # One bidirectional LSTM per branch; sequences are kept for global pooling.
    x1 = Bidirectional( CuDNNLSTM( 128, return_sequences=True ), name='BLSTM1' )( pool1 )
    x2 = Bidirectional( CuDNNLSTM( 64, return_sequences=True ), name='BLSTM2' )( pool2 )

    conc = concatenate( [ x1, x2 ] )

    # Summarise the sequence with both average and max pooling.
    avg_pool = GlobalAveragePooling1D( name='GlobalAvgPool' )( conc )
    max_pool = GlobalMaxPooling1D( name='GlobalMaxPool' )( conc )
    x = Concatenate( axis=1, name='ConcatenateLayer' )( [ avg_pool, max_pool ] )

    x = Dropout( drpt, name='DropoutConcat' )( x )

    fc1 = Dense( 100, activation=fc_act, kernel_initializer=fc_weights_init, name='FC1' )( x )
    # This dropout previously reused the name 'DropoutConcat' and its output
    # was discarded (FC2 read fc1 directly); give it a unique name and wire it in.
    x = Dropout( drpt, name='DropoutFC1' )( fc1 )

    fc2 = Dense( 50, activation=fc_act, kernel_initializer=fc_weights_init, name='FC2' )( x )
    outp = Dense( num_classes, activation='softmax', name='Output' )( fc2 )
    model = Model( inputs=inp, outputs=outp )

    model.compile( loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'] )
    return model



In [None]:
import tensorflow as tf
# Callback that multiplies the learning rate by 0.1 when val_loss has not
# improved by at least min_delta for `patience` epochs.
# NOTE(review): the model.fit call visible in this file passes only
# WandbCallback() and the EarlyStopping callback, so this object appears to be
# unused; it also shares its patience with EarlyStopping - confirm the intent.
lr_reduce_callback = tf.keras.callbacks.ReduceLROnPlateau(
 monitor='val_loss', factor=0.1, patience=patience, verbose=1,
 mode='auto', min_delta=1E-7)


In [None]:
warnings.filterwarnings( 'ignore' )
start_time = time.time()

# Per-fold metric accumulators for the validation split and the held-out test fold.
valaccuracy, valprecision, valrecall, valf1, valcm = [], [], [], [], []
testaccuracy, testprecision, testrecall, testf1, testcm = [], [], [], [], []
history_ = dict()                       # fold number (1-based) -> Keras History
n_classes = 5                           # width of the one-hot training targets
class_ids = list( range( n_classes ) )  # fixes confusion-matrix shape even if a class is absent

for fold, ( train_index, test_index ) in enumerate( skf.split( df.Comment, df.Toxic ), start=1 ):

    # skf.split yields positional indices, so select rows with .iloc
    # (.loc is label-based and only coincides for a default RangeIndex).
    train_texts = df[ 'Comment' ].iloc[ train_index ].values.astype('U')
    test_texts = df[ 'Comment' ].iloc[ test_index ].values.astype('U')

    # tokenization with keras tokenizer, fitted on this fold's training texts only
    tokenizer = text.Tokenizer( num_words=max_features )
    tokenizer.fit_on_texts( train_texts )

    # tokenize and pad the sequences
    xtrain = pad_sequences( tokenizer.texts_to_sequences( train_texts ), maxlen=max_len )
    xtest = pad_sequences( tokenizer.texts_to_sequences( test_texts ), maxlen=max_len )

    ytrain = df[ 'Toxic' ].iloc[ train_index ].values
    ytest = df[ 'Toxic' ].iloc[ test_index ].values

    # split train and val; stratify so the validation split keeps the class
    # balance that the stratified folds were built to preserve
    xtrain, xval, ytrain, yval = train_test_split( xtrain, ytrain, test_size=0.20,
                                                   random_state=0, stratify=ytrain )

    # Only the targets fed to the network need one-hot encoding; yval and
    # ytest stay as integer labels for scoring.
    ytrain = to_categorical( ytrain, n_classes )
    yval_onehot = to_categorical( yval, n_classes )

    # check if pre-trained word embeddings flag is true
    if pre_trained_flag:
        embedding_matrix = get_vectors( tokenizer=tokenizer )

    # define a model
    model = CNN_BiLSTM( tokenizer=tokenizer, max_len=max_len, embed_size=embed_size,
                        embedding_matrix=embedding_matrix, embed_trainable=embed_trainable,
                        emb_weights_init=emb_weights_init, optimizer=optimizer,
                        ker_regularizer=ker_regularizer, drpt=drpt, fc_act=fc_act,
                        fc_weights_init=fc_weights_init )

    # override the optimizer's default learning rate
    K.set_value( model.optimizer.lr, lr_rate )

    earlystop = EarlyStopping( monitor='val_loss', min_delta=0, patience=patience, verbose=0, mode='auto' )

    history = model.fit( xtrain, ytrain, batch_size=batch, validation_data=( xval, yval_onehot ),
                         epochs=nepochs, verbose=0, callbacks=[ WandbCallback(), earlystop ] )

    # save history of each fold
    history_[ fold ] = history

    # class with the highest predicted probability, per sample
    valpredictions = np.argmax( model.predict( xval, verbose=0 ), axis=1 )
    testpredictions = np.argmax( model.predict( xtest, verbose=0 ), axis=1 )

    print( 'Fold: {:02d} out of {:02d} completed.'.format( fold, skf.get_n_splits() ) )

    # validation metrics (previously predicted but never scored)
    val_report = classification_report( yval, valpredictions, output_dict=True )
    valaccuracy.append( val_report[ 'accuracy' ] )
    valprecision.append( val_report[ 'macro avg' ][ 'precision' ] )
    valrecall.append( val_report[ 'macro avg' ][ 'recall' ] )
    valf1.append( val_report[ 'macro avg' ][ 'f1-score' ] )
    valcm.append( confusion_matrix( yval, valpredictions, labels=class_ids ) )

    # test metrics
    test_report = classification_report( ytest, testpredictions, output_dict=True )
    testaccuracy.append( test_report[ 'accuracy' ] )
    testprecision.append( test_report[ 'macro avg' ][ 'precision' ] )
    testrecall.append( test_report[ 'macro avg' ][ 'recall' ] )
    testf1.append( test_report[ 'macro avg' ][ 'f1-score' ] )
    testcm.append( confusion_matrix( ytest, testpredictions, labels=class_ids ) )
    print( classification_report( ytest, testpredictions ) )

time_took = time.time() - start_time
print(f"Total runtime: {hms_string(time_took)}")

In [None]:
# Close the active Weights & Biases run now that all folds have been trained.
wandb.finish()

## Test Set Evaluation over 5-Folds

In [None]:
# For every test metric print its title, then the per-fold scores (four
# decimals) followed by their mean and standard deviation across folds.
metric_table = [
    ( 'Test Accuracy', testaccuracy ),
    ( 'Test Precision', testprecision ),
    ( 'Test Recall', testrecall ),
    ( 'Test F1', testf1 ),
]
last_row = len( metric_table ) - 1

for row, ( title, scores ) in enumerate( metric_table ):
    per_fold = [ '{:0.4f}'.format( score ) for score in scores ]
    summary = ( per_fold, np.mean( scores ), '+-', np.std( scores ) )
    print( title )
    if row < last_row:
        # every metric except the last is followed by an extra newline
        print( *summary, '\n' )
    else:
        print( *summary )

## Training and Evaluation Graphs: Accuracy and Loss

In [None]:
import matplotlib.pyplot as plt


def plot_graphs(history, metric):
    """Plot the training and validation curves of one metric.

    `history` is an object whose `history` attribute maps metric names to
    per-epoch values; the curves for `metric` and 'val_' + `metric` are drawn
    on the current matplotlib axes, with axis labels and a legend.
    """
    val_metric = 'val_'+metric
    curves = history.history

    plt.plot(curves[metric])
    plt.plot(curves[val_metric], '')
    plt.xlabel("Epochs")
    plt.ylabel(metric)
    plt.legend([metric, val_metric])

    

In [None]:
# Accuracy curves (training vs. validation) of the first fold; figure 1 is cleared and reused.
plt.figure(1,figsize=(7,5),dpi=100,clear=True)
plot_graphs(history_[1], 'accuracy')
# Cap the y-axis at 1; the lower limit is left automatic.
plt.ylim(None, 1)

In [None]:
# Loss curves (training vs. validation) of the first fold; figure 1 is cleared and reused.
plt.figure(1,figsize=(7,5),dpi=100,clear=True)
plot_graphs(history_[1], 'loss')
# Start the y-axis at 0; the upper limit is left automatic.
plt.ylim(0, None)

## Graphs for all Folds

In [None]:
# Grid of training curves: one row per recorded fold, accuracy in the left
# column and loss in the right column. The rows are derived from history_, so
# the grid follows the number of folds actually trained instead of assuming 5.
fold_ids = sorted( history_ )
n_rows = len( fold_ids )

plt.figure(figsize=(30,30),dpi=100,)
for row, fold_id in enumerate( fold_ids ):
    # subplot positions are 1-based and numbered row by row
    plt.subplot( n_rows, 2, 2 * row + 1 )
    plot_graphs( history_[ fold_id ], 'accuracy' )
    plt.ylim(None, 1)
    plt.title( 'Fold {} - accuracy'.format( fold_id ) )

    plt.subplot( n_rows, 2, 2 * row + 2 )
    plot_graphs( history_[ fold_id ], 'loss' )
    plt.ylim(0, None)
    plt.title( 'Fold {} - loss'.format( fold_id ) )
plt.show()