In [1]:
import sys
import os  
import traceback

# Default TFRecords root, used when the notebook is not running on Kaggle.
RECORDS_PATH = '../input/tfrecords'

# A '/kaggle' directory is treated as the marker for running on Kaggle.
if os.path.exists('/kaggle'):
  # Make the helper packages shipped as Kaggle datasets importable.
  sys.path.append('/kaggle/input/gezi-melt/utils')
  sys.path.append('/kaggle/input/official')
  from kaggle_datasets import KaggleDatasets
  try:
    # Resolve the GCS locations of the competition data and of the TFRecords dataset.
    GCS_PATH = KaggleDatasets().get_gcs_path('jigsaw-multilingual-toxic-comment-classification')
    RECORDS_PATH = KaggleDatasets().get_gcs_path('toxic-multi-tfrecords') + '/tfrecords'
    # Hide all CUDA devices once the GCS paths resolved.
    # NOTE(review): presumably because a TPU is used in that case — confirm.
    os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
  except Exception:
    # GCS lookup failed: report why and fall back to the locally mounted dataset.
    # GCS_PATH stays undefined on this path.
    print(traceback.format_exc())
    RECORDS_PATH = '../input/toxic-multi-tfrecords/tfrecords'
    pass
!ls ../input
RECORDS_PATH

bert-multi					  tf-xlm-roberta-base
jigsaw-multilingual-toxic-comment-classification  tf-xlm-roberta-large
tfrecords


'../input/tfrecords'

In [2]:
import official
import gezi
import melt
import lele
import husky
import pandas as pd
import numpy as np
import tensorflow as tf
tf.__version__
from absl import app, flags
# Notebook-specific absl flags (all registered with empty help strings).
# NOTE(review): re-running this cell in a live kernel will likely raise absl's
# DuplicateFlagError because the flags are already registered — confirm.
FLAGS = flags.FLAGS
flags.DEFINE_string('model', None, '')
# When true, the model/labels cover 'toxic' plus every entry of toxic_types.
flags.DEFINE_bool('multi_head', False, '')
# Directory (or name) of the pretrained transformer loaded by xlm_model().
flags.DEFINE_string('pretrained', '../input/tf-xlm-roberta-large', '')
# Token sequence length expected in the TFRecords and used for the model input.
flags.DEFINE_integer('max_len', 192, 'xlm 192 bert 128')
# When true, xlm_model() marks the transformer as non-trainable.
flags.DEFINE_bool('freeze_pretrained', False, '')

In [3]:
# flags
# Parse an empty command line so FLAGS can be read and assigned inside the notebook.
argv=['']
FLAGS(argv)
# Sub-directory of RECORDS_PATH holding the records for the chosen tokenizer.
mark='xlm'
# Most flags assigned below are not defined in this notebook;
# presumably they are registered by melt on import — TODO confirm.
FLAGS.train_input=f'{RECORDS_PATH}/{mark}/jigsaw-toxic-comment-train'
FLAGS.valid_input=f'{RECORDS_PATH}/{mark}/validation'
FLAGS.test_input=f'{RECORDS_PATH}/{mark}/test'
FLAGS.valid_interval_steps=100 
FLAGS.verbose=1 
FLAGS.num_epochs=1
FLAGS.keras=1 
FLAGS.buffer_size=2048
FLAGS.learning_rate=1e-5 
FLAGS.min_learning_rate=0.
# Disabled optimizer alternatives, kept for reference:
# FLAGS.opt_epsilon=1e-8 
# FLAGS.optimizer='bert-adamw'
FLAGS.optimizer='adam'
FLAGS.metrics=[] 
FLAGS.test_names=['id', 'toxic']
FLAGS.valid_interval_epochs=0.1
FLAGS.test_interval_epochs=1.
FLAGS.num_gpus=1
FLAGS.cache=0
FLAGS.model_dir='../working/exps/v1/base'
FLAGS.multi_head=0
FLAGS.batch_parse=1
FLAGS.save_model=0
FLAGS.pretrained = '../input/tf-xlm-roberta-large/'
# FLAGS.pretrained = '../input/tf-xlm-roberta-base/'
# Batch size is derived from the checkpoint path: 16 when it contains 'large', else 32.
FLAGS.batch_size=16 if 'large' in FLAGS.pretrained else 32
# FLAGS.batch_size=16
FLAGS.debug=0

# Auxiliary label names; together with 'toxic' they form the multi-head targets.
toxic_types = ['severe_toxic', 'obscene', 'identity_hate', 'threat', 'insult']

In [4]:
# evaluate
import pandas as pd
from sklearn.metrics import roc_auc_score, log_loss
from sklearn.utils import shuffle

import gezi
# gezi's logger object; evaluate() below logs through it.
logging = gezi.logging

# Languages for which evaluate() reports a separate 'auc/<lang>' entry.
langs = ['es', 'it', 'tr']

def evaluate(y_true, y_pred, x):
  """Compute log loss and ROC AUC for a set of predictions.

  Args:
    y_true: ground-truth labels; when the array has more than one dimension
      only column 0 is scored.
    y_pred: model outputs; when the array has more than one dimension only
      column 0 is scored. If any value lies outside [0, 1] the whole array is
      passed through gezi.sigmoid first.
    x: dict of extra per-example columns. When it contains 'lang', the keys
      'y_true' and 'pred' are written into it and 'lang' is replaced by
      gezi.decode(x['lang']) (the caller's dict is modified, as before).

  Returns:
    dict with 'loss' and 'auc/all', plus 'auc/<lang>' for every language in
    `langs` when 'lang' is in x. A per-language AUC is NaN when that language
    has no rows or only one label class.

  Raises:
    ValueError: propagated from scikit-learn when the overall AUC / log loss
      cannot be computed (e.g. y_true holds a single class).
  """
  # Score only the first column of 2-D inputs; 1-D inputs are used as they are.
  if getattr(y_true, 'ndim', 1) > 1:
    y_true = y_true[:,0]
  if getattr(y_pred, 'ndim', 1) > 1:
    y_pred = y_pred[:,0]
  # Values outside [0, 1] mean the model emitted logits rather than probabilities.
  if y_pred.max() > 1. or y_pred.min() < 0:
    y_pred = gezi.sigmoid(y_pred)
  result = {}
  result['loss'] = log_loss(y_true, y_pred)
  result['auc/all'] = roc_auc_score(y_true, y_pred)

  if 'lang' in x:
    x['y_true'] = y_true
    x['pred'] = y_pred
    x['lang'] = gezi.decode(x['lang'])

    df = pd.DataFrame(x)
    df = shuffle(df)
    logging.info('\n', df)

    for lang in langs:
      df_ = df[df.lang==lang]
      # roc_auc_score raises ValueError unless both classes are present, which
      # easily happens for a language on a small validation sample. Record NaN
      # for that language instead of aborting the whole evaluation.
      if df_.y_true.nunique() < 2:
        logging.info(f'auc/{lang} not computable: {len(df_)} rows, fewer than 2 label classes')
        result[f'auc/{lang}'] = float('nan')
        continue
      result[f'auc/{lang}'] = roc_auc_score(df_.y_true, df_.pred)

  return result

In [5]:
# dataset
import tensorflow as tf
from tensorflow.keras import backend as K
import melt

class Dataset(melt.Dataset):
  """TFRecord dataset for the toxic-comment records, built on melt.Dataset."""

  def __init__(self, subset='valid', **kwargs):
    """Forward `subset` (default 'valid') and any keyword args to melt.Dataset."""
    super().__init__(subset, **kwargs)

  def parse(self, example):
    """Parse serialized example(s) into a (features, label) pair.

    Builds a FixedLenFeature spec, parses through self.parse_, casts the
    id/mask/segment features to int32, makes sure every key of
    ['toxic', *toxic_types] exists (zero-filled like 'toxic' when absent),
    casts the toxic_types entries to float32 and applies melt.append_dim to
    those keys.

    Returns:
      (features, y): `features` is the parsed dict. `y` is features['toxic']
      as read before melt.append_dim is applied, or, when FLAGS.multi_head is
      set, the concatenation along axis 1 of all label keys.
    """
    seq_len = FLAGS.max_len
    spec = {
      'input_word_ids': tf.io.FixedLenFeature([seq_len], tf.int64),
      'toxic': tf.io.FixedLenFeature([], tf.float32),
      'id': tf.io.FixedLenFeature([], tf.string),
      # 'comment_text': tf.io.FixedLenFeature([], tf.string), # TODO
    }

    def _register(names, dtype=None, length=None):
      # Request a feature only when it is present in self.example (supplied by
      # the base class); without an explicit dtype, take it from that entry.
      for name in names:
        if name not in self.example:
          continue
        resolved = dtype or self.example[name][0].dtype
        shape = [length] if length else []
        spec[name] = tf.io.FixedLenFeature(shape, resolved)

    _register(['lang'], tf.string)
    _register(['input_mask', 'all_segment_id'], tf.int64, seq_len)
    _register(toxic_types)

    features = self.parse_(serialized=example, features=spec)

    def _cast_present(names, dtype=tf.int32):
      # Cast only the features that were actually parsed.
      for name in [n for n in names if n in features]:
        features[name] = tf.cast(features[name], dtype)

    _cast_present(['input_word_ids', 'input_mask', 'all_segment_id'])

    # Single-head label, read before melt.append_dim is applied to `features`.
    # Disabled alternative: tf.cast(features['toxic'] > 0.5, tf.float32)
    y = features['toxic']

    label_keys = ['toxic', *toxic_types]
    absent = [key for key in label_keys if key not in features]
    for key in absent:
      features[key] = tf.zeros_like(features['toxic'])

    _cast_present(toxic_types, tf.float32)

    melt.append_dim(features, label_keys)

    if FLAGS.multi_head:
      y = tf.concat([features[key] for key in label_keys], 1)

    return features, y
 

In [6]:
# loss
import tensorflow as tf

def calc_loss(y_true, y_pred):
  """Placeholder loss hook: ignores both arguments and returns None."""
  return None

def focal_loss(gamma=1.5, alpha=.2):
    """Build a binary focal-loss function.

    Args:
        gamma: exponent of the modulating factors (1 - p)^gamma for the
            y_true == 1 term and p^gamma for the y_true == 0 term.
        alpha: weight of the y_true == 1 term; the y_true == 0 term is
            weighted by 1 - alpha.

    Returns:
        A function (y_true, y_pred) -> scalar loss, where y_pred holds
        probabilities. Entries whose y_true is neither 0 nor 1 contribute zero
        to both terms.
    """
    def focal_loss_fixed(y_true, y_pred):
        # Keep probabilities strictly inside (0, 1): a saturated sigmoid output
        # (exactly 0 or 1) would otherwise make K.log return -inf and turn the
        # loss into inf/NaN.
        eps = K.epsilon()
        y_pred = K.clip(y_pred, eps, 1. - eps)
        # Masked-out positions get 1 (resp. 0) so their term evaluates to 0.
        pt_1 = tf.where(tf.equal(y_true, 1), y_pred, tf.ones_like(y_pred))
        pt_0 = tf.where(tf.equal(y_true, 0), y_pred, tf.zeros_like(y_pred))
        pos_term = K.mean(alpha * K.pow(1. - pt_1, gamma) * K.log(pt_1))
        neg_term = K.mean((1 - alpha) * K.pow(pt_0, gamma) * K.log(1. - pt_0))
        return -pos_term - neg_term
    return focal_loss_fixed

def get_loss_fn():
  """Return the loss object used for training: Keras binary cross-entropy.

  Disabled alternatives kept for reference:
    tf.compat.v1.losses.sigmoid_cross_entropy
    focal_loss()
  """
  loss = tf.keras.losses.BinaryCrossentropy()
  return loss

In [7]:
# model
import tensorflow as tf
import tensorflow_hub as hub

from tensorflow import keras
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Dense, Input

import melt
import gezi 
# Same gezi logger binding as in the evaluate cell, repeated in this cell.
logging = gezi.logging

# class Model(keras.Model):
#   def __init__(self):
#     super(Model, self).__init__() 

#     self.bert_layer = bert_layer
#     dims = [32]
#     self.mlp = melt.layers.MLP(dims)
#     odim = len(toxic_types) + 1 if FLAGS.multi_head else 1
#     self.dense = keras.layers.Dense(odim, activation='sigmoid')

#   def call(self, input):
#     input_word_ids = input['input_word_ids']
#     input_mask = input['input_mask']
#     segment_ids = input['all_segment_id']
  
#     x, _ = self.bert_layer([input_word_ids, input_mask, segment_ids])
#     x = self.mlp(x)
#     x = self.dense(x)
#     return x

In [8]:
import transformers
from transformers import TFAutoModel

def xlm_model():
  """Build the transformer-based toxicity classifier as a Keras functional model.

  Loads the checkpoint named by FLAGS.pretrained through TFAutoModel, marks it
  non-trainable when FLAGS.freeze_pretrained is set, and puts a sigmoid Dense
  head on the hidden state at sequence position 0.

  Returns:
    keras.Model taking int32 'input_word_ids' of shape (batch, FLAGS.max_len)
    and producing sigmoid outputs of width 1, or len(toxic_types) + 1 when
    FLAGS.multi_head is set.

  Raises:
    ValueError: if FLAGS.pretrained is empty and no XLM_PATH global is defined.
  """
  # XLM_PATH is not defined anywhere in this notebook, so the former
  # `FLAGS.pretrained or XLM_PATH` raised NameError whenever the flag was empty.
  # Look the fallback up defensively and fail with an actionable message.
  pretrained = FLAGS.pretrained or globals().get('XLM_PATH')
  if not pretrained:
    raise ValueError('FLAGS.pretrained is empty and XLM_PATH is not defined; '
                     'set FLAGS.pretrained to a pretrained model directory or name')
  with gezi.Timer(f'load xlm_model from {pretrained}', True, logging.info):
    transformer = TFAutoModel.from_pretrained(pretrained)
  if FLAGS.freeze_pretrained:
    transformer.trainable = False
  input_word_ids = Input(shape=(FLAGS.max_len,), dtype=tf.int32, name="input_word_ids")
  # First element of the transformer output is the per-token hidden states.
  sequence_output = transformer(input_word_ids)[0]
  # Use the hidden state of the first token as the sequence representation.
  cls_token = sequence_output[:, 0, :]
  odim = len(toxic_types) + 1 if FLAGS.multi_head else 1
  out = keras.layers.Dense(odim, activation='sigmoid')(cls_token)

  model = keras.Model(inputs=input_word_ids, outputs=out)

  return model

I0418 10:09:46.857226 140390474569536 file_utils.py:41] PyTorch version 1.4.0 available.
I0418 10:09:46.858443 140390474569536 file_utils.py:57] TensorFlow version 2.2.0-dev20200411 available.


In [9]:
# Model factory: the build cell calls Model() to construct the network.
Model = xlm_model

In [10]:
# train

import os
import melt

# Shorthand for melt's training entry point; run() calls it.
fit = melt.fit
# Initialise melt (its log lines are shown in this cell's output).
melt.init()
# Loss object later handed to fit() by run().
loss_fn = get_loss_fn()

2020-04-18 10:09:47 0:00:00 fcntl.floc with lock_file /root/.melt.lock (If hang here means other programs calling melt.init have not finished yet)
2020-04-18 10:09:47 0:00:00 Tf dataset and Tf model train in Eager mode, keras 1, distributed:False
2020-04-18 10:09:47 0:00:00 log_level: 20 (try --debug to show more or --log_level=(> 20) to show less(no INFO), try --verbose to show train/valid loss intervaly)
2020-04-18 10:09:47 0:00:00 batch_size: 16 eval_batch_size: 16 batch_size_per_gpu: 16 num_gpus: 1 gpu: [4] CUDA_VISIABLE_DEVICES=[] work_mode: train distributed: False horovod: False
2020-04-18 10:09:49 0:00:02 model: [base] model_dir: [../working/exps/v1/base]


In [11]:
# Build the model inside the scope of the strategy returned by melt
# (presumably a tf.distribute strategy — TODO confirm).
strategy = melt.distributed.get_strategy()
with strategy.scope():
  model = Model()
try:
  model.summary()
except Exception:
  # summary() is informational only: keep going, but show why it failed
  # instead of silently hiding the error (same reporting style as cell 1).
  print(traceback.format_exc())

2020-04-18 10:09:49 0:00:02 load xlm_model from ../input/tf-xlm-roberta-large/ start
I0418 10:09:49.689640 140390474569536 configuration_utils.py:281] loading configuration file ../input/tf-xlm-roberta-large/config.json
I0418 10:09:49.691958 140390474569536 configuration_utils.py:319] Model config XLMRobertaConfig {
  "_num_labels": 2,
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bad_words_ids": null,
  "bos_token_id": 0,
  "decoder_start_token_id": null,
  "do_sample": false,
  "early_stopping": false,
  "eos_token_id": 2,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-05,
  "length_penalty": 1.0,
  "max_length": 20,
  "max_position

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_word_ids (InputLayer)  [(None, 192)]             0         
_________________________________________________________________
tf_roberta_model (TFRobertaM ((None, 192, 1024), (None 559890432 
_________________________________________________________________
tf_op_layer_strided_slice (T [(None, 1024)]            0         
_________________________________________________________________
dense (Dense)                (None, 1)                 1025      
Total params: 559,891,457
Trainable params: 559,891,457
Non-trainable params: 0
_________________________________________________________________


In [12]:
# model.load_weights('../input/toxic-multi/xlm.toxic.h5')

In [13]:
def run(model=None):
  """Launch melt.fit with the notebook's loss, Dataset class and evaluate().

  Args:
    model: Keras model to train. When omitted, the notebook-level `model` is
      looked up at call time.
  """
  if model is None:
    # Resolve late: a `model=model` default is evaluated once, when this cell
    # runs, so a model rebuilt in a later cell would be silently ignored.
    model = globals()['model']
  fit(model,
      loss_fn,
      Dataset,
      eval_fn=evaluate,
      eval_keys=['lang'],
      )

In [14]:
# Small run on the 'xlm-sample1' unintended-bias records, capped at
# 100 train / 100 valid examples.
FLAGS.train_input=f'{RECORDS_PATH}/xlm-sample1/jigsaw-unintended-bias-train'
FLAGS.learning_rate=1e-5
# FLAGS.opt_epsilon=1e-8
FLAGS.num_epochs=1
FLAGS.num_train=100
FLAGS.num_valid=100
# NOTE(review): vie / tie look like short aliases of valid_interval_epochs /
# test_interval_epochs — confirm in melt.
FLAGS.vie=1
FLAGS.tie=0
# The recorded output of this cell ends in a GPU ResourceExhaustedError (OOM)
# while allocating a [16,192,4096] float tensor.
run()

2020-04-18 10:10:15 0:00:27 -------Round: 0 mode: None train_input:[jigsaw-unintended-bias-train] valid_input:[validation] train_dirs:[1] valid_dir: ../input/tfrecords/xlm/validation
2020-04-18 10:10:15 0:00:27 --start_hour=jigsaw-unintended-bias-train --end_hour=validation root: ../input/tfrecords/xlm-sample1
2020-04-18 10:10:17 0:00:30 num_train_examples: 100    
2020-04-18 10:10:17 0:00:30 num_valid_examples: 100   
2020-04-18 10:10:17 0:00:30 num_test_examples: 63812  
2020-04-18 10:10:18 0:00:30 latest ckpt to restore: None


ResourceExhaustedError: 2 root error(s) found.
  (0) Resource exhausted:  OOM when allocating tensor with shape[16,192,4096] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[node model/tf_roberta_model/roberta/encoder/layer_._18/intermediate/activation/Erf (defined at /home/gezi/env/anaconda3/envs/tf2/lib/python3.6/site-packages/transformers/modeling_tf_bert.py:63) ]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

	 [[gradient_tape/model/tf_roberta_model/roberta/embeddings/token_type_embeddings/embedding_lookup/Reshape/_1026]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

  (1) Resource exhausted:  OOM when allocating tensor with shape[16,192,4096] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[node model/tf_roberta_model/roberta/encoder/layer_._18/intermediate/activation/Erf (defined at /home/gezi/env/anaconda3/envs/tf2/lib/python3.6/site-packages/transformers/modeling_tf_bert.py:63) ]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

0 successful operations.
0 derived errors ignored. [Op:__inference_train_function_58859]

Function call stack:
train_function -> train_function


In [None]:
# Same small run as the previous cell, but on the 'xlm-sample2'
# unintended-bias records (100 train / 100 valid examples).
FLAGS.train_input=f'{RECORDS_PATH}/xlm-sample2/jigsaw-unintended-bias-train'
FLAGS.learning_rate=1e-5
# FLAGS.opt_epsilon=1e-8
FLAGS.num_epochs=1
FLAGS.num_train=100
FLAGS.num_valid=100
# NOTE(review): vie / tie look like short aliases of valid_interval_epochs /
# test_interval_epochs — confirm in melt.
FLAGS.vie=1
FLAGS.tie=0
run()

In [None]:
# Small run on the toxic-comment training records of the `mark` tokenizer
# directory (100 train / 100 valid examples).
FLAGS.train_input=f'{RECORDS_PATH}/{mark}/jigsaw-toxic-comment-train'
FLAGS.learning_rate=1e-5
# FLAGS.opt_epsilon=1e-8
FLAGS.num_epochs=1
FLAGS.num_train=100
FLAGS.num_valid=100
# NOTE(review): vie / tie look like short aliases of valid_interval_epochs /
# test_interval_epochs — confirm in melt.
FLAGS.vie=1
FLAGS.tie=0
run()

In [None]:
# cv
# Train on the validation records with num_folds=5
# (presumably k-fold cross-validation inside melt.fit — TODO confirm).
valid_input = FLAGS.valid_input
FLAGS.train_input=valid_input
FLAGS.learning_rate=1e-5
FLAGS.num_folds = 5
FLAGS.vie=1.
try:
  run()
finally:
  # Undo the CV-only settings even when run() raises, so later cells do not
  # inherit num_folds=5. valid_input is restored from the value saved above
  # (equal to FLAGS.train_input unless fit changed it).
  FLAGS.num_folds = None
  FLAGS.vie=0.1
  FLAGS.valid_input = valid_input

In [None]:
# cv
# Train on the validation records with num_folds=5 and no cap on the number
# of training examples (presumably k-fold cross-validation inside melt.fit —
# TODO confirm).
valid_input = FLAGS.valid_input
FLAGS.train_input=valid_input
FLAGS.learning_rate=1e-5
FLAGS.num_folds = 5
FLAGS.num_train=None
FLAGS.vie=1.
try:
  run()
finally:
  # Undo the CV-only settings even when run() raises, so later cells do not
  # inherit num_folds=5. valid_input is restored from the value saved above
  # (equal to FLAGS.train_input unless fit changed it).
  FLAGS.num_folds = None
  FLAGS.vie=0.1
  FLAGS.valid_input = valid_input

In [None]:
# model.save_weights('./xlm.toxic-uint1.h5')

In [None]:
# FLAGS.train_input=FLAGS.valid_input
# FLAGS.learning_rate=1e-5
# # FLAGS.opt_epsilon=1e-8
# FLAGS.num_epochs=1
# run()

In [None]:
# model.save_weights('./xlm.final.h5')

In [None]:
# # with strategy.scope():
# #   model = Model()
# FLAGS.train_input=f'{RECORDS_GCS_PATH}/{mark}/jigsaw-toxic-comment-train,{RECORDS_GCS_PATH}/xlm-sample2/jigsaw-unintended-bias-train'
# # FLAGS.train_input=f'{RECORDS_GCS_PATH}/xlm/jigsaw-unintended-bias-train'
# FLAGS.learning_rate=3e-5
# FLAGS.opt_epsilon=1e-8
# FLAGS.num_epochs=1  
# FLAGS.valid_interval_epochs=0.1
# run()

In [None]:
# FLAGS.train_input=FLAGS.valid_input
# FLAGS.learning_rate=3e-5
# FLAGS.opt_epsilon=1e-8
# FLAGS.num_epochs=1
# FLAGS.valid_interval_epochs=0.2
# FLAGS.optimizer='bert-adamw'
# run()

In [None]:
# d = pd.read_csv('../working/exps/v1/base/submission.csv')
# d

In [None]:
# d.to_csv('submission.csv', index=False)
# d.head()