In [46]:
import tensorflow as tf
import time
import os
import numpy as np
import json
import pandas as pd
from tensorflow.keras.callbacks import ModelCheckpoint, Callback
from tensorflow.keras.models import load_model
from tensorflow.keras import backend as K
from tqdm import tqdm
from tensorflow.keras.layers import Input, Embedding, SpatialDropout1D, Bidirectional,LSTM, GRU, GlobalAveragePooling1D, GlobalMaxPool1D, Concatenate, Add, Dense
from tensorflow.keras import Model, regularizers, initializers, constraints
import random
from tensorflow.keras.layers import Layer
import torch.nn as nn

In [47]:
with open('../params.json', 'r') as f:
    params = json.load(f)

max_length = params['max_length']
padding_type = params['padding_type']
vocab_size = params['vocab_size']
embedding_dim = params['embedding_dim']
trunc_type = params['trunc_type']
oov_tok = params['oov_tok']
embedding_dim=300

In [48]:
# processed data:
train_x=np.load('../experiment/processed/train_padded.npy')
train_y=np.load('../experiment/processed/train_y.npy')
val_x=np.load('../experiment/processed/val_padded.npy')
val_y=np.load('../experiment/processed/val_y.npy')
train_y_meta=pd.read_csv('../kaggle_data/train_y.csv')
val_y_meta=pd.read_csv('../kaggle_data/val_y.csv')
word_index=json.load(open('../experiment/processed/word_index.json','r'))
train_meta=pd.read_csv('../processed/train_meta.csv').iloc[:,9:15]
embed_mat=np.load('../experiment/processed/embedding_matrix_fasttext.npy')

In [49]:
identity_columns=train_y_meta.columns[:8]

In [50]:
toxicity_feat=train_meta.columns

In [51]:
def calculate_wga(y,y_pred):
    y.loc[:, 'pred'] = y_pred
    categories = ['male', 'female', 'LGBTQ', 'christian', 'muslim', 'other_religions', 'black', 'white']
    accuracies = []
    groups=[]
    for category in categories:
        for label in [0, 1]:
            group = y.loc[y[category] == label]
            group_accuracy = (group['y'] == (group['pred'] > 0.5)).mean()
            groups.append(category+'_'+str(label))
            accuracies.append(group_accuracy)
    wga = np.min(accuracies)
    return wga, dict(zip(groups,accuracies))

In [52]:
def batching_columns(val_x,val_meta,model):
    val_x = val_x.reshape((val_x.shape[0], -1))
    data=tf.data.Dataset.from_tensor_slices((val_x,val_meta))
    data=data.batch(32)

    predictions, indices = [], []
    for idx, (x, y) in tqdm(enumerate(data), leave=False):
        pred = model(x, training=False)
        predictions.extend(tf.squeeze(pred).numpy().tolist())
    return predictions

In [53]:
class WorstGroupAccuracy(Callback):
    def __init__(self, train_data, val_data):
        super(WorstGroupAccuracy, self).__init__()
        self.train_data = train_data
        self.val_data = val_data

    def on_epoch_end(self, epoch, logs=None):
        train_x,_,train_meta = self.train_data
        val_x,_,val_meta = self.val_data
        
        # train_y_pred=batching_columns(train_x, train_meta,self.model)
        # train_wga, train_metric = calculate_wga(train_meta,train_y_pred)
        
        val_y_pred=batching_columns(val_x,val_meta,self.model)
        val_wga,val_metric = calculate_wga(val_meta,val_y_pred)
        
        # print(f'{train_wga},Train WGA: {train_metric}')
        print(f'{val_wga},Val WGA: {val_metric}')
        
wga = WorstGroupAccuracy((train_x,train_y,train_meta), (val_x,val_y,val_y_meta))

In [54]:


class AttentionWithContext(Layer):
    """
    Basically, hidden state of each timestep is passed through a hidden dense layer of n units, with a softmax layer on top which returns the attention weights for each timestep.
    The context vector is then calculated as the attention weighted sum of timestep hidden states.
    Input shape
        3D tensor with shape: (samples, steps, features).
    Output shape
        2D tensor with shape: (samples, features).
    """

    def __init__(self,
                 W_regularizer=None, u_regularizer=None, b_regularizer=None,
                 W_constraint=None, u_constraint=None, b_constraint=None,
                 bias=True, units=None,return_weights=False,**kwargs):
        tf.random.set_seed(69)
        self.init = initializers.get('glorot_uniform')
        self.supports_masking = True

        self.W_regularizer = regularizers.get(W_regularizer)
        self.u_regularizer = regularizers.get(u_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.u_constraint = constraints.get(u_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.return_weights = return_weights
        self.units = units
        super(AttentionWithContext, self).__init__(**kwargs)

        self.supports_masking = True
        tf.random.set_seed(69)
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.u_regularizer = regularizers.get(u_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.u_constraint = constraints.get(u_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.return_weights = return_weights
        self.units = units
        super(AttentionWithContext, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3
        
        if self.units!=None:
            num_units = self.units
        else:
          num_units = input_shape[-1]
            
        self.W = self.add_weight(name = 'att_W',
                             shape =  (input_shape[-1], num_units),
                             initializer=self.init,
                             regularizer=self.W_regularizer,
                             constraint=self.W_constraint,
                             trainable=True)
        if self.bias:
            self.b = self.add_weight(name = 'att_b',
                                 shape = (num_units,),
                                 initializer='zero',
                                 regularizer=self.b_regularizer,
                                 constraint=self.b_constraint,
                                 trainable=True)
        self.u = self.add_weight(name = 'att_u',
                                 shape = (num_units,1),
                                 initializer=self.init,
                                 regularizer=self.u_regularizer,
                                 constraint=self.u_constraint,
                                 trainable=True)

        super(AttentionWithContext, self).build(input_shape)

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def call(self, x, mask=None):
        uit = tf.matmul(x, self.W) #x(256,).W(256,128) -> uit(128,)

        if self.bias:
            uit += self.b #uit(128,)+b(128,) -> uit(128,)

        uit = tf.keras.activations.tanh(uit) #uit(128,)
        ait = tf.matmul(uit, self.u) #uit(128,).u(128,) -> ait(1,)
        a = tf.keras.activations.exponential(ait) #a(105,1) for all timesteps

        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            a *= tf.cast(mask, K.floatx())

        # in some cases especially in the early stages of training the sum may be almost zero
        # and this results in NaN's. A workaround is to add a very small positive number ε to the sum.
        a /= tf.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx()) 

        weighted_input = x * a #x(105,256)*a(105,1)
        weighted_sum = K.sum(weighted_input,axis=1) #weighted_sum(256,) : context vector
        if self.return_weights:
            return [weighted_sum, a]
        return weighted_sum

    def compute_output_shape(self, input_shape):
        if self.units!=None:
            return input_shape[0], self.units
        return input_shape[0], input_shape[-1]
    
    def get_config(self):
        config = super(AttentionWithContext, self).get_config().copy()
        config.update({
            'W_regularizer': self.W_regularizer,
            'u_regularizer': self.u_regularizer,
            'b_regularizer': self.b_regularizer,
            'W_constraint': self.W_constraint,
            'u_constraint': self.u_constraint,
            'b_constraint': self.b_constraint,
            'bias' : self.bias,
            'units': self.units,
            'return_weights' : self.return_weights
        })
        return config


In [55]:
# Overall
weights = np.ones((len(train_y_meta),)) / 4

# Subgroup
weights += (train_y_meta[identity_columns].fillna(0).values>=0.5).sum(axis=1).astype(bool).astype(int) / 4

# Background Positive, Subgroup Negative
weights += (( (train_y_meta['y'].values>=0.5).astype(bool).astype(int) +
   (train_y_meta[identity_columns].fillna(0).values<0.5).sum(axis=1).astype(bool).astype(int) ) > 1 ).astype(bool).astype(int) / 4

# Background Negative, Subgroup Positive
weights += (( (train_y_meta['y'].values<0.5).astype(bool).astype(int) +
   (train_y_meta[identity_columns].fillna(0).values>=0.5).sum(axis=1).astype(bool).astype(int) ) > 1 ).astype(bool).astype(int) / 4

# for later normalization the loss
loss_weight = 1.0 / weights.mean()

In [56]:
y_columns = ['target'] #0/1
y_aux_columns = \
['target_prob','target_prob','severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat','sexual_explicit']
# two target_prob is for adjusting the weight of aux_columns

In [57]:
def custom_loss_wrapper(weights):
    def tf__custom_loss(y_true, y_pred):
        def custom_loss(preds,targets,weights):
            bce_loss_1 = nn.BCEWithLogitsLoss(weight=weights)(preds[:,0],targets[:,0]) #weighted y_columns
            bce_loss_2 = nn.BCEWithLogitsLoss()(preds[:,1:],targets[:,1:]) # y_aux_columns
            return ((bce_loss_1 * loss_weight)*0.60 + bce_loss_2*0.40)*2 
        pass
    return tf__custom_loss



In [58]:

#Config

SEED = 69
INPUT_LEN = 150 
DROPOUT_RATIO = 0.25
LSTM_UNITS = 128
GRU_UNITS = 128
ATT_UNITS = 128
DENSE_UNITS = 768
NUM_AUX_TARG = len(toxicity_feat)

#Architecture
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

tf.keras.backend.clear_session()

sequences = Input(shape = (INPUT_LEN,))
x = Embedding(input_dim=embed_mat.shape[0], output_dim=embed_mat.shape[1], input_length = INPUT_LEN, weights=[embed_mat], trainable=False)(sequences)
x = SpatialDropout1D(DROPOUT_RATIO)(x)
x = Bidirectional(LSTM(LSTM_UNITS, return_sequences=True))(x)
x = Bidirectional(GRU(GRU_UNITS, return_sequences=True))(x)

a,w = AttentionWithContext(units=ATT_UNITS, return_weights=True)(x)
p1 = GlobalAveragePooling1D()(x)
p2 = GlobalMaxPool1D()(x) 

x = Concatenate()([a,p1,p2])
x = Add()([x, Dense(DENSE_UNITS, activation='relu')(x)])
x = Add()([x, Dense(DENSE_UNITS, activation='relu')(x)])

pred = Dense(1, activation = "sigmoid")(x)
aux_preds = Dense(NUM_AUX_TARG, activation='sigmoid')(x) 

model = Model(inputs = sequences, outputs = [pred, aux_preds])

#Training
model.compile(loss=[custom_loss_wrapper(weights), 'binary_crossentropy'], loss_weights=[loss_weight, 1.0], optimizer=tf.keras.optimizers.Adam(learning_rate=0.0025))


checkpoint = ModelCheckpoint('./attention/model-{epoch:03d}.keras', monitor='val_loss', save_best_only=False, mode='auto')
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=5)

history = model.fit(x = train_x, y = [train_y, train_meta], batch_size = 2048, epochs = 5, callbacks = [early_stopping,checkpoint,wga], verbose = 1)




Epoch 1/5


ValueError: in user code:

    File "/Users/sb/anaconda3/envs/dl/lib/python3.9/site-packages/keras/src/engine/training.py", line 1401, in train_function  *
        return step_function(self, iterator)
    File "/Users/sb/anaconda3/envs/dl/lib/python3.9/site-packages/keras/src/engine/training.py", line 1384, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Users/sb/anaconda3/envs/dl/lib/python3.9/site-packages/keras/src/engine/training.py", line 1373, in run_step  **
        outputs = model.train_step(data)
    File "/Users/sb/anaconda3/envs/dl/lib/python3.9/site-packages/keras/src/engine/training.py", line 1151, in train_step
        loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "/Users/sb/anaconda3/envs/dl/lib/python3.9/site-packages/keras/src/engine/training.py", line 1209, in compute_loss
        return self.compiled_loss(
    File "/Users/sb/anaconda3/envs/dl/lib/python3.9/site-packages/keras/src/engine/compile_utils.py", line 277, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "/Users/sb/anaconda3/envs/dl/lib/python3.9/site-packages/keras/src/losses.py", line 161, in __call__
        return losses_utils.compute_weighted_loss(
    File "/Users/sb/anaconda3/envs/dl/lib/python3.9/site-packages/keras/src/utils/losses_utils.py", line 328, in compute_weighted_loss
        losses = tf.convert_to_tensor(losses)

    ValueError: None values not supported.
