In [1]:
import re
from nltk.stem import WordNetLemmatizer
import itertools
wordnet_lemmatizer = WordNetLemmatizer()
from tqdm.notebook import tqdm
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint, Callback
from tensorflow.keras.layers import LSTM, Dense, Embedding, Bidirectional, Dropout, GlobalMaxPool1D, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras import Model
import json
import time
import os

In [12]:
# Load the pre-processed token-id arrays and their targets from ./processed.
train_x = np.load('./processed/train_padded.npy')
train_y = np.load('./processed/train_y.npy')
val_x = np.load('./processed/val_padded.npy')
val_y = np.load('./processed/val_y.npy')

# Per-example metadata frames; calculate_wga reads the identity columns and
# the 'y' column from these.
train_y_meta = pd.read_csv('../kaggle_data/train_y.csv')
val_y_meta = pd.read_csv('../kaggle_data/val_y.csv')

# Use a context manager so the JSON file handle is closed deterministically.
with open('./processed/word_index.json', 'r') as f:
    word_index = json.load(f)
embedding_matrix_fasttext = np.load('./processed/embedding_matrix_fasttext.npy')

# Sequence length fed to the model's Input layer. Taken from the loaded array
# itself: `max_length` is only defined by a later cell, so referencing it here
# fails on a fresh kernel (Restart & Run All).
maxpadlen = train_x.shape[1]

In [13]:
# Read the shared preprocessing settings from ../params.json and expose each
# one as a module-level name.
with open('../params.json', 'r') as f:
    params = json.load(f)

(
    max_length,
    padding_type,
    vocab_size,
    embedding_dim_fasttext,
    trunc_type,
    oov_tok,
) = (
    params[key]
    for key in (
        'max_length',
        'padding_type',
        'vocab_size',
        'embedding_dim',
        'trunc_type',
        'oov_tok',
    )
)


In [14]:
# Sanity check: display the shape of the validation array loaded from
# ./processed/val_padded.npy.
val_x.shape

(45180, 150)

In [15]:
# max_features=100000
# tokenizer = Tokenizer(num_words=max_features)
# tokenizer.fit_on_texts(list(processed_train_data))
# list_tokenized_train = tokenizer.texts_to_sequences(processed_train_data)
# list_tokenized_val = tokenizer.texts_to_sequences(processed_val_data)
# list_tokenized_test = tokenizer.texts_to_sequences(processed_test_data)

In [16]:
# maxpadlen = 200
# X_t=pad_sequences(list_tokenized_train, maxlen=maxpadlen, padding = 'post')
# X_v=pad_sequences(list_tokenized_val, maxlen=maxpadlen, padding = 'post')
# X_te=pad_sequences(list_tokenized_test, maxlen=maxpadlen, padding = 'post')

In [17]:
# x_train=X_t
# y_train=pd.read_csv('../kaggle_data/train_y.csv')
# x_val=X_v
# y_val=pd.read_csv('../kaggle_data/val_y.csv')

In [18]:
# Re-read the per-example metadata CSVs. The same two files are also read in
# the data-loading cell, so this rebinds both names to freshly parsed frames.
train_y_meta=pd.read_csv('../kaggle_data/train_y.csv')
val_y_meta=pd.read_csv('../kaggle_data/val_y.csv')

In [19]:
def calculate_wga(y, y_pred):
    """Compute worst-group accuracy (WGA) over identity subgroups.

    Every identity column is split into its ``0`` and ``1`` members, giving
    16 groups. A prediction counts as correct when ``pred > 0.5`` equals the
    ``'y'`` column.

    Args:
        y: DataFrame holding a ``'y'`` target column and one column per
            identity category listed below. It is not modified.
        y_pred: One score per row of ``y`` (same length and order).

    Returns:
        ``(wga, per_group)`` where ``wga`` is the lowest accuracy among the
        groups that contain at least one row (NaN if every group is empty) and
        ``per_group`` maps ``'<category>_<label>'`` to that group's accuracy
        (NaN for an empty group).
    """
    categories = ['male', 'female', 'LGBTQ', 'christian', 'muslim', 'other_religions', 'black', 'white']

    # Score a copy so the caller's frame does not silently gain a 'pred' column.
    scored = y.assign(pred=y_pred)
    correct = scored['y'] == (scored['pred'] > 0.5)

    groups = []
    accuracies = []
    for category in categories:
        for label in [0, 1]:
            groups.append(category + '_' + str(label))
            accuracies.append(correct.loc[scored[category] == label].mean())

    # An empty group has a NaN mean; np.min would propagate it and turn the
    # whole metric into NaN, so only populated groups compete for the minimum.
    populated = [acc for acc in accuracies if not np.isnan(acc)]
    wga = np.min(populated) if populated else np.nan
    return wga, dict(zip(groups, accuracies))

In [20]:
def batching_columns(val_x, val_meta, model, batch_size=32):
    """Run ``model`` over ``val_x`` in mini-batches and collect its scores.

    Args:
        val_x: NumPy array of inputs; it is flattened to 2-D
            ``(n_examples, -1)`` before batching.
        val_meta: Unused. Kept so existing positional callers keep working.
        model: Callable invoked as ``model(batch, training=False)``.
            Expected to emit one score per example.
        batch_size: Number of examples per forward pass (default 32).

    Returns:
        A flat Python list with one float per example, in input order.
    """
    n_examples = val_x.shape[0]
    if n_examples == 0:
        return []

    val_x = val_x.reshape((n_examples, -1))
    data = tf.data.Dataset.from_tensor_slices(val_x).batch(batch_size)
    # Passing the batch count gives tqdm a real progress bar instead of "0it".
    n_batches = int(np.ceil(n_examples / batch_size))

    predictions = []
    for x in tqdm(data, total=n_batches, leave=False):
        pred = model(x, training=False)
        # Flatten to 1-D. A bare tf.squeeze turns a final batch of size one
        # into a scalar, and list.extend() then fails on the resulting float.
        predictions.extend(tf.reshape(pred, [-1]).numpy().tolist())
    return predictions

In [21]:
class WorstGroupAccuracy(Callback):
    """Keras callback that reports worst-group accuracy after every epoch.

    At the end of each epoch the current model is run over the full training
    and validation inputs, ``calculate_wga`` is evaluated on both, and the
    results are printed. The two scalar WGA values are also stored in the
    epoch ``logs`` under ``'train_wga'`` and ``'val_wga'``.

    Args:
        train_data: ``(inputs, targets, metadata_frame)`` for the training
            split. The targets element is ignored.
        val_data: Same triple for the validation split.
    """

    def __init__(self, train_data, val_data):
        super().__init__()
        self.train_data = train_data
        self.val_data = val_data

    def on_epoch_end(self, epoch, logs=None):
        """Compute, print and log train/validation WGA for this epoch."""
        train_inputs, _, train_meta = self.train_data
        val_inputs, _, val_meta = self.val_data

        train_y_pred = batching_columns(train_inputs, train_meta, self.model)
        train_wga, train_metric = calculate_wga(train_meta, train_y_pred)

        val_y_pred = batching_columns(val_inputs, val_meta, self.model)
        val_wga, val_metric = calculate_wga(val_meta, val_y_pred)

        print(f'{train_wga},Train WGA: {train_metric}')
        print(f'{val_wga},Val WGA: {val_metric}')

        # Expose the scalars through `logs` so callbacks that run after this
        # one can see them. Plain floats keep the history JSON-serialisable.
        if logs is not None:
            logs['train_wga'] = float(train_wga)
            logs['val_wga'] = float(val_wga)


wga = WorstGroupAccuracy((train_x,train_y,train_y_meta), (val_x,val_y,val_y_meta))

In [22]:
# Model-building function in the signature Talos expects.
def toxic_classifier(x_train, y_train, x_val, y_val, params):
    """Build, compile and fit one LSTM classifier for a hyper-parameter scan.

    The network is: frozen pre-trained embedding -> LSTM -> global max pooling
    -> dropout -> dense -> dropout -> output layer. It reads the module-level
    names ``maxpadlen``, ``word_index``, ``embedding_dim_fasttext`` and
    ``embedding_matrix_fasttext``.

    Args:
        x_train, y_train: Training inputs and targets passed to ``fit``.
        x_val, y_val: Validation inputs and targets passed to ``fit``.
        params: Dict with the keys ``output_count_lstm``,
            ``output_count_dense``, ``dropout``, ``activation``,
            ``last_activation``, ``loss``, ``optimizer``, ``epochs`` and
            ``batch_size``.

    Returns:
        ``(history, model)``: the object returned by ``model.fit`` and the
        fitted model.
    """
    # Size the output layer from the targets instead of hard-coding 6 units:
    # a 1-D target gets a single unit, a 2-D target one unit per column.
    target_shape = np.shape(y_train)
    n_outputs = target_shape[-1] if len(target_shape) > 1 else 1

    inp = Input(shape=(maxpadlen, ), dtype='int32')

    embedding_layer = Embedding(len(word_index) + 1,
                                embedding_dim_fasttext,
                                weights=[embedding_matrix_fasttext],
                                input_length=maxpadlen,
                                trainable=False,
                                name='embeddings')
    embedded_sequences = embedding_layer(inp)

    x = LSTM(params['output_count_lstm'], return_sequences=True, name='lstm_layer')(embedded_sequences)
    x = GlobalMaxPool1D()(x)
    x = Dropout(params['dropout'])(x)
    x = Dense(params['output_count_dense'], activation=params['activation'], kernel_initializer='he_uniform')(x)
    x = Dropout(params['dropout'])(x)

    preds = Dense(n_outputs, activation=params['last_activation'], kernel_initializer='glorot_uniform')(x)

    model = Model(inputs=inp, outputs=preds)
    model.compile(loss=params['loss'], optimizer=params['optimizer'], metrics=['accuracy'])

    model_info = model.fit(x_train, y_train,
                           epochs=params['epochs'],
                           batch_size=params['batch_size'],
                           validation_data=(x_val, y_val))

    return model_info, model

In [13]:
# `talos` is used below but is not imported anywhere else in this notebook.
import talos

# Hyper-parameter grid: every key is read by toxic_classifier, and each list
# holds the candidate values to try.
p={
    'output_count_lstm': [40,50,60],
    'output_count_dense': [30,40,50],
    'batch_size': [32],
    'epochs':[2],
    'optimizer':['adam'],
    'activation':['relu'],
    'last_activation': ['sigmoid'],
    'dropout':[0.1,0.2],
    'loss': ['binary_crossentropy']   
}

# Run the Talos scan over the grid. The arrays are the ones loaded at the top
# of the notebook; the x_train/y_train/x_val/y_val names used previously are
# never defined (their assignments are commented out).
scan_results = talos.Scan(x=train_x,
               y=train_y,
               x_val=val_x,
               y_val=val_y,
               model=toxic_classifier,
               params=p,
               experiment_name='tcc',
               print_params=True)

In [31]:
# LSTM classifier graph: frozen pre-trained embedding -> LSTM -> global max
# pooling -> dropout -> dense -> dropout -> single sigmoid output.
inp = Input(shape=(maxpadlen,), dtype='int32')

embedding_layer = Embedding(
    input_dim=len(word_index) + 1,
    output_dim=embedding_dim_fasttext,
    weights=[embedding_matrix_fasttext],
    input_length=maxpadlen,
    trainable=False,
    name='embeddings',
)
embedded_sequences = embedding_layer(inp)

# Hidden stack, applied in order on top of the embedded sequence.
hidden_layers = [
    LSTM(50, return_sequences=True, name='lstm_layer'),
    GlobalMaxPool1D(),
    Dropout(0.2),
    Dense(40, activation="relu", kernel_initializer='he_uniform'),
    Dropout(0.2),
]
x = embedded_sequences
for hidden_layer in hidden_layers:
    x = hidden_layer(x)

preds = Dense(1, activation="sigmoid", kernel_initializer='glorot_uniform')(x)

In [32]:
# Wrap the graph defined above into a trainable model and configure training.
compile_options = {
    'loss': 'binary_crossentropy',
    'optimizer': 'adam',
    'metrics': ['accuracy'],
}
model_1 = Model(inputs=inp, outputs=preds)
model_1.compile(**compile_options)

Epoch 1/2


2024-01-27 11:24:27.227485: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.




0it [00:00, ?it/s]

0it [00:00, ?it/s]

0.6680440771349863,Train WGA: {'male_0': 0.8786471483271189, 'male_1': 0.8277088225427709, 'female_0': 0.8770329132194129, 'female_1': 0.8471195232314314, 'LGBTQ_0': 0.8773223645335319, 'LGBTQ_1': 0.7393111638954869, 'christian_0': 0.8701485761452744, 'christian_1': 0.8988705213553744, 'muslim_0': 0.88041194273259, 'muslim_1': 0.7375662892360614, 'other_religions_0': 0.87399711993417, 'other_religions_1': 0.8331295843520783, 'black_0': 0.8802085343170926, 'black_1': 0.6843168957154406, 'white_0': 0.8865657446302607, 'white_1': 0.6680440771349863}
0.6546913149633764,Val WGA: {'male_0': 0.8798218483236422, 'male_1': 0.8327387198321091, 'female_0': 0.8793046399755656, 'female_1': 0.8451875742658292, 'LGBTQ_0': 0.8787137204674885, 'LGBTQ_1': 0.7590940288263556, 'christian_0': 0.8711543194683731, 'christian_1': 0.9079120879120879, 'muslim_0': 0.8831669375435338, 'muslim_1': 0.70521327014218, 'other_religions_0': 0.8758428745983617, 'other_religions_1': 0.8306288032454361, 'black_0': 0.88255

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0.705032336297494,Train WGA: {'male_0': 0.8871202367615831, 'male_1': 0.8529017108352902, 'female_0': 0.8864489634612576, 'female_1': 0.8632877165875731, 'LGBTQ_0': 0.8880507102349032, 'LGBTQ_1': 0.7371733966745844, 'christian_0': 0.8811019397441189, 'christian_1': 0.903508115790261, 'muslim_0': 0.8908947640777156, 'muslim_1': 0.745019349290526, 'other_religions_0': 0.8844811690933888, 'other_religions_1': 0.8371026894865525, 'black_0': 0.8901374535968697, 'black_1': 0.705032336297494, 'white_0': 0.893619719426171, 'white_1': 0.7278117139777219}
0.6967312348668281,Val WGA: {'male_0': 0.8888778918718298, 'male_1': 0.8522560335781741, 'female_0': 0.8875003181552088, 'female_1': 0.8684433882193177, 'LGBTQ_0': 0.8891430139743385, 'LGBTQ_1': 0.7611530542210021, 'christian_0': 0.8820329805562392, 'christian_1': 0.9116483516483517, 'muslim_0': 0.8932899930345949, 'muslim_1': 0.7161137440758294, 'other_religions_0': 0.8863646648866362, 'other_religions_1': 0.8245436105476673, 'black_0': 0.8921

In [20]:
# Load a previously saved model from disk; this rebinds `model_1`.
saved_model_file = '../models/1706353340.637279_new/0.89.keras'
model_1 = tf.keras.models.load_model(saved_model_file)

In [21]:
# Train for two epochs; the `wga` callback prints worst-group accuracy at the
# end of each one.
model_info_1 = model_1.fit(
    x=train_x,
    y=train_y,
    epochs=2,
    batch_size=32,
    validation_data=(val_x, val_y),
    callbacks=[wga],
)

Epoch 1/2

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0.7001818916734034,Train WGA: {'male_0': 0.8902344207199826, 'male_1': 0.852230795035223, 'female_0': 0.8897780870640996, 'female_1': 0.8619081779053085, 'LGBTQ_0': 0.890744307760784, 'LGBTQ_1': 0.7399049881235155, 'christian_0': 0.8836112257531985, 'christian_1': 0.9078839105393074, 'muslim_0': 0.8924589546972762, 'muslim_1': 0.7683818259997134, 'other_religions_0': 0.8870602756634437, 'other_religions_1': 0.844437652811736, 'black_0': 0.8931203741577977, 'black_1': 0.7001818916734034, 'white_0': 0.8965007529523659, 'white_1': 0.7276919391543898}
0.6803874092009685,Val WGA: {'male_0': 0.8911047878263021, 'male_1': 0.8467995802728226, 'female_0': 0.889663773575301, 'female_1': 0.8648786284162281, 'LGBTQ_0': 0.8906982594972898, 'LGBTQ_1': 0.75840768702814, 'christian_0': 0.8833620477479694, 'christian_1': 0.9138461538461539, 'muslim_0': 0.8939400975156722, 'muslim_1': 0.7331753554502369, 'other_religions_0': 0.8875186676924469, 'other_religions_1': 0.8377281947261663, 'black_0': 0.89425

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0.6963419563459984,Train WGA: {'male_0': 0.8908948785259251, 'male_1': 0.8522978866152298, 'female_0': 0.8902591991202522, 'female_1': 0.8632325350402825, 'LGBTQ_0': 0.8914886922622383, 'LGBTQ_1': 0.7358669833729217, 'christian_0': 0.8841683862979777, 'christian_1': 0.9088189094173087, 'muslim_0': 0.8926667293911026, 'muslim_1': 0.7760498781711337, 'other_religions_0': 0.8875860019657592, 'other_religions_1': 0.847799511002445, 'black_0': 0.893884434016871, 'black_1': 0.6963419563459984, 'white_0': 0.8974003328842038, 'white_1': 0.7236794825727632}
0.6846246973365617,Val WGA: {'male_0': 0.892836818013114, 'male_1': 0.8474291710388248, 'female_0': 0.8910127516607702, 'female_1': 0.8682736377525038, 'LGBTQ_0': 0.8924593463394552, 'LGBTQ_1': 0.755662319835278, 'christian_0': 0.8850110755599311, 'christian_1': 0.9151648351648352, 'muslim_0': 0.8948456001857441, 'muslim_1': 0.7492890995260664, 'other_religions_0': 0.8890799656061908, 'other_religions_1': 0.8417849898580122, 'black_0': 0.895

In [22]:
# Save the trained model and its training history into a new, timestamped
# directory under ../models. Both files are named after the final-epoch
# validation accuracy rounded to two decimals.
t = str(time.time())
path = os.path.join('../models', f'{t}_new')
# exist_ok avoids the check-then-create race of a separate os.path.exists test.
os.makedirs(path, exist_ok=True)

model_path = str(round(model_info_1.history["val_accuracy"][-1], 2))
export_path = os.path.join(path, model_path)
model_1.save(f'{export_path}.keras')

# Context manager guarantees the history file is flushed and closed.
with open(f'{export_path}.json', 'w') as f:
    json.dump(model_info_1.history, f)

In [23]:
# Write the preprocessing settings, plus the final validation accuracy, next
# to the saved model so the run directory is self-describing.
final_val_accuracy = model_info_1.history["val_accuracy"][-1]

params = dict(
    max_length=max_length,
    padding_type=padding_type,
    vocab_size=vocab_size,
    embedding_dim=embedding_dim_fasttext,
    trunc_type=trunc_type,
    oov_tok=oov_tok,
    model_accuracy=str(round(final_val_accuracy, 2)),
)

params_json = json.dumps(params, indent=4)
with open(f'{path}/params.json', 'w') as f:
    f.write(params_json)


In [34]:
# #Adding Input Parameters to the Function.
# def toxic_classifier(x_train,y_train,x_val,y_val,params):

#   inp=Input(shape=(maxpadlen, ),dtype='int32')

#   embedding_layer = Embedding(len(word_index) + 1,
#                            embedding_dim_fasttext,
#                            weights = [embedding_matrix_fasttext],
#                            input_length = maxpadlen,
#                            trainable=False,
#                            name = 'embeddings')
#   embedded_sequences = embedding_layer(inp)

#   x = LSTM(params['output_count_lstm'], return_sequences=True,name='lstm_layer')(embedded_sequences)

#   x = Conv1D(filters=params['filters'], kernel_size=params['kernel_size'], padding='same', activation='relu', kernel_initializer='he_uniform')(x)

#   x = MaxPooling1D(params['pool_size'])(x)
  
#   x = GlobalMaxPool1D()(x)
  
#   x = BatchNormalization()(x)
  
#   x = Dense(params['output_1_count_dense'], activation=params['activation'], kernel_initializer='he_uniform')(x)
  
#   x = Dropout(params['dropout'])(x)

#   x = Dense(params['output_2_count_dense'], activation=params['activation'], kernel_initializer='he_uniform')(x)
  
#   x = Dropout(params['dropout'])(x)
  
#   preds = Dense(6, activation=params['last_activation'], kernel_initializer='glorot_uniform')(x)

#   model = Model(inputs=inp, outputs=preds)

#   model.compile(loss=params['loss'], optimizer=params['optimizer'], metrics=['accuracy'])

#   model_info=model.fit(x_train,y_train, epochs=params['epochs'], batch_size=params['batch_size'],  validation_data=(x_val, y_val))

#   return model_info, model

# #Creating a dictionary of Parameters.
# p={
#     'output_count_lstm': [50,60],
#     'output_1_count_dense': [40,50],
#     'output_2_count_dense': [30,40],
#     'filters' : [64],
#     'kernel_size' : [3],
#     'batch_size': [32],
#     'pool_size': [3],
#     'epochs':[2],
#     'optimizer':['adam'],
#     'activation':['relu'],
#     'last_activation': ['sigmoid'],
#     'dropout':[0.1,0.2],
#     'loss': ['binary_crossentropy']   
# }

# #Initiating GridSearchCV.
# scan_results = talos.Scan(x=x_train,
#                y=y_train,
#                x_val=x_val,
#                y_val=y_val,
#                model=toxic_classifier,
#                params=p,
#                experiment_name='tcc',
#                print_params=True)



In [35]:
# # Define the LSTM-CNN Model.
# inp=Input(shape=(maxpadlen, ),dtype='int32')
# embedding_layer = Embedding(len(word_index) + 1,
#                            embedding_dim_fasttext,
#                            weights = [embedding_matrix_fasttext],
#                            input_length = maxpadlen,
#                            trainable=False,
#                            name = 'embeddings')
# embedded_sequences = embedding_layer(inp)
# x = LSTM(50, return_sequences=True,name='lstm_layer')(embedded_sequences)
# x = Conv1D(filters=64, kernel_size=3, padding='same', activation='relu', kernel_initializer='he_uniform')(x)
# x = MaxPooling1D(3)(x)
# x = GlobalMaxPool1D()(x)
# x = BatchNormalization()(x)
# x = Dense(40, activation="relu", kernel_initializer='he_uniform')(x)
# x = Dropout(0.2)(x)
# x = Dense(30, activation="relu", kernel_initializer='he_uniform')(x)
# x = Dropout(0.2)(x)
# preds = Dense(6, activation="sigmoid", kernel_initializer='glorot_uniform')(x)


