## Dependencies

In [1]:
import json, warnings, shutil, glob, json
from jigsaw_utility_scripts import *
from transformers import TFXLMRobertaModel, XLMRobertaConfig
from tensorflow.keras.models import Model
from tensorflow.keras import optimizers, metrics, losses, layers

## TPU configuration

In [2]:
# The shared utility script hands back the distribution strategy together with
# the TPU handle; the handle is passed to initialize_tpu_system in the
# prediction loop further down.
strategy, tpu = set_up_strategy()

# tf.data autotune sentinel, forwarded to get_test_dataset when predicting.
AUTO = tf.data.experimental.AUTOTUNE

print("REPLICAS: ", strategy.num_replicas_in_sync)

Running on TPU  grpc://10.0.0.2:8470
REPLICAS:  8


# Load data

In [3]:
# Location of the pre-split dataset that ships the test inputs as a .npy file.
database_base_path = '/kaggle/input/jigsaw-data-split-roberta-192-ratio-2-upper/'
x_test = np.load(f'{database_base_path}x_test.npy')

# Samples are counted along axis 1 of the array
# (axis 0 presumably separates input ids from attention masks -- TODO confirm).
print('Test samples %d' % (x_test.shape[1],))

Test samples 63812


# Model parameters

In [4]:
# Dataset whose config.json is read below for the shared hyper-parameters.
input_base_path = '/kaggle/input/99-jigsaw-fold1-xlm-roberta-large-best/'

# One dataset per fold; each is globbed for *.h5 weight files in the next cell.
input_base_path_1 = '/kaggle/input/99-jigsaw-pseudo-fold1-xlm-roberta-large-best3/'
input_base_path_2 = '/kaggle/input/100-jigsaw-pseudo-fold2-xlm-roberta-large-best3/'
input_base_path_3 = '/kaggle/input/101-jigsaw-pseudo-fold3-xlm-roberta-large-best3/'
input_base_path_4 = '/kaggle/input/102-jigsaw-pseudo-fold4-xlm-roberta-large-best3/'
input_base_path_5 = '/kaggle/input/103-jigsaw-pseudo-fold5-xlm-roberta-large-best3/'

with open(input_base_path + 'config.json') as config_file:
    config = json.load(config_file)

# Bare expression so the notebook renders the loaded settings.
config

{'MAX_LEN': 192,
 'BATCH_SIZE': 128,
 'EPOCHS': 4,
 'LEARNING_RATE': 1e-05,
 'ES_PATIENCE': None,
 'base_model_path': '/kaggle/input/jigsaw-transformers/XLM-RoBERTa/tf-xlm-roberta-large-tf_model.h5',
 'config_path': '/kaggle/input/jigsaw-transformers/XLM-RoBERTa/xlm-roberta-large-config.json'}

In [5]:
vocab_path = input_base_path_1 + 'vocab.json'
merges_path = input_base_path_1 + 'merges.txt'

# Collect every .h5 weight file from the five fold datasets, ordered by path.
model_path_list = sorted(
    weights_path
    for fold_base_path in (
        input_base_path_1,
        input_base_path_2,
        input_base_path_3,
        input_base_path_4,
        input_base_path_5,
    )
    for weights_path in glob.glob(fold_base_path + '*.h5')
)

print('Models to predict:')
print(*model_path_list, sep = "\n")

Models to predict:
/kaggle/input/100-jigsaw-pseudo-fold2-xlm-roberta-large-best3/model_pseudo.h5
/kaggle/input/101-jigsaw-pseudo-fold3-xlm-roberta-large-best3/model_pseudo.h5
/kaggle/input/102-jigsaw-pseudo-fold4-xlm-roberta-large-best3/model_pseudo.h5
/kaggle/input/103-jigsaw-pseudo-fold5-xlm-roberta-large-best3/model_pseudo.h5
/kaggle/input/99-jigsaw-pseudo-fold1-xlm-roberta-large-best3/model_pseudo.h5


# Model

In [6]:
module_config = XLMRobertaConfig.from_pretrained(config['config_path'], output_hidden_states=False)

def model_fn(MAX_LEN):
    """Build the XLM-RoBERTa classifier used for inference.

    The network takes two int32 inputs of length ``MAX_LEN`` (``input_ids`` and
    ``attention_mask``), runs them through the pretrained backbone loaded from
    ``config['base_model_path']``, takes the hidden state at sequence position 0
    and feeds it to a single sigmoid unit.

    Args:
        MAX_LEN: Sequence length of the two input tensors.

    Returns:
        A Keras ``Model`` mapping ``[input_ids, attention_mask]`` to a
        ``(batch, 1)`` sigmoid output.
    """
    input_ids = layers.Input(shape=(MAX_LEN,), dtype=tf.int32, name='input_ids')
    attention_mask = layers.Input(shape=(MAX_LEN,), dtype=tf.int32, name='attention_mask')

    base_model = TFXLMRobertaModel.from_pretrained(config['base_model_path'], config=module_config)
    # Take the first output by index rather than tuple-unpacking: unpacking
    # requires the backbone to return exactly two elements and breaks when it
    # returns a dict-like ModelOutput (iteration yields keys) or extra outputs.
    # Integer indexing works for both the tuple and the ModelOutput form.
    backbone_outputs = base_model({'input_ids': input_ids, 'attention_mask': attention_mask})
    last_hidden_state = backbone_outputs[0]
    # Hidden state of the first token of every sequence.
    cls_token = last_hidden_state[:, 0, :]

    output = layers.Dense(1, activation='sigmoid', name='output')(cls_token)

    model = Model(inputs=[input_ids, attention_mask], outputs=output)

    return model

# Make predictions

In [7]:
# Number of test samples (the name is historical; these are text samples).
NUM_TEST_IMAGES = x_test.shape[1]
# Running average of the per-model predictions, one column per sample.
test_preds = np.zeros((NUM_TEST_IMAGES, 1))

# Fail fast: with no weight files the loop below would never run and an
# all-zero prediction array would silently end up in the submission.
if not model_path_list:
    raise FileNotFoundError('No model weights (*.h5) found in the configured input paths')

ensemble_size = len(model_path_list)

for model_path in model_path_list:
    # The TPU system is re-initialised before each model is built
    # (presumably to release the previous model's TPU memory -- confirm).
    tf.tpu.experimental.initialize_tpu_system(tpu)
    print(model_path)
    with strategy.scope():
        model = model_fn(config['MAX_LEN'])
        model.load_weights(model_path)

    # Each model contributes an equal share, so the sum is the ensemble mean.
    test_preds += model.predict(get_test_dataset(x_test, config['BATCH_SIZE'], AUTO)) / ensemble_size

/kaggle/input/100-jigsaw-pseudo-fold2-xlm-roberta-large-best3/model_pseudo.h5
/kaggle/input/101-jigsaw-pseudo-fold3-xlm-roberta-large-best3/model_pseudo.h5
/kaggle/input/102-jigsaw-pseudo-fold4-xlm-roberta-large-best3/model_pseudo.h5
/kaggle/input/103-jigsaw-pseudo-fold5-xlm-roberta-large-best3/model_pseudo.h5
/kaggle/input/99-jigsaw-pseudo-fold1-xlm-roberta-large-best3/model_pseudo.h5


# Test set predictions

In [8]:
# Start from the competition's sample file, overwrite its 'toxic' column with
# the ensemble predictions and save the result next to the notebook.
submission = pd.read_csv('/kaggle/input/jigsaw-multilingual-toxic-comment-classification/sample_submission.csv')
submission['toxic'] = test_preds
submission.to_csv('submission.csv', index=False)

# Summary statistics first, then the first ten rows.
display(submission.describe(), submission.head(10))

Unnamed: 0,id,toxic
count,63812.0,63812.0
mean,31905.5,0.2011
std,18421.082026,0.293236
min,0.0,0.002105
25%,15952.75,0.006371
50%,31905.5,0.022715
75%,47858.25,0.309005
max,63811.0,0.993803


Unnamed: 0,id,toxic
0,0,0.005758
1,1,0.009946
2,2,0.269383
3,3,0.003955
4,4,0.004368
5,5,0.070531
6,6,0.005725
7,7,0.020432
8,8,0.158615
9,9,0.18412
