## Dependencies

In [1]:
import glob, json
from jigsaw_utility_scripts import *
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from transformers import TFXLMRobertaModel, XLMRobertaConfig

## TPU configuration

In [2]:
strategy, tpu = set_up_strategy()
print("REPLICAS: ", strategy.num_replicas_in_sync)
AUTO = tf.data.experimental.AUTOTUNE

Running on TPU  grpc://10.0.0.2:8470
REPLICAS:  8


# Load data

In [3]:
database_base_path = '/kaggle/input/jigsaw-dataset-split-toxic-roberta-base-192/'
x_test_path = database_base_path + 'x_test.npy'
x_test = np.load(x_test_path)

print('Test samples %d' % len(x_test[0]))

Test samples 63812


# Model parameters

In [4]:
input_base_path = '/kaggle/input/11-jigsaw-train-3fold-xlm-roberta-base/'
with open(input_base_path + 'config.json') as json_file:
    config = json.load(json_file)

config

{'MAX_LEN': 192,
 'BATCH_SIZE': 128,
 'EPOCHS': 2,
 'LEARNING_RATE': 1e-05,
 'ES_PATIENCE': 1,
 'N_FOLDS': 3,
 'base_model_path': '/kaggle/input/jigsaw-transformers/XLM-RoBERTa/tf-xlm-roberta-base-tf_model.h5',
 'config_path': '/kaggle/input/jigsaw-transformers/XLM-RoBERTa/xlm-roberta-base-config.json'}

In [5]:
vocab_path = input_base_path + 'vocab.json'
merges_path = input_base_path + 'merges.txt'
model_path_list = glob.glob(input_base_path + '*.h5')
model_path_list.sort()
print('Models to predict:')
print(*model_path_list, sep = "\n")

Models to predict:
/kaggle/input/11-jigsaw-train-3fold-xlm-roberta-base/model_fold_1.h5
/kaggle/input/11-jigsaw-train-3fold-xlm-roberta-base/model_fold_2.h5
/kaggle/input/11-jigsaw-train-3fold-xlm-roberta-base/model_fold_3.h5


# Model

In [6]:
module_config = XLMRobertaConfig.from_pretrained(config['config_path'], output_hidden_states=False)

def model_fn(MAX_LEN):
    input_ids = layers.Input(shape=(MAX_LEN,), dtype=tf.int32, name='input_ids')
    attention_mask = layers.Input(shape=(MAX_LEN,), dtype=tf.int32, name='attention_mask')
    base_model = TFXLMRobertaModel.from_pretrained(config['base_model_path'], config=module_config)
    sequence_output = base_model({'input_ids': input_ids, 'attention_mask': attention_mask})
    
    last_state = sequence_output[0]
    cls_token = last_state[:, 0, :]
    
    output = layers.Dense(1, activation='sigmoid', name='output')(cls_token)
    
    model = Model(inputs=[input_ids, attention_mask], outputs=output)
    
    return model

# Make predictions

In [7]:
NUM_TEST_IMAGES = len(x_test[0])
test_preds = np.zeros((NUM_TEST_IMAGES, 1))

for model_path in model_path_list:
    tf.tpu.experimental.initialize_tpu_system(tpu)
    print(model_path)
    with strategy.scope():
        model = model_fn(config['MAX_LEN'])
        model.load_weights(model_path)

    test_preds += np.round(model.predict(list(x_test))) / len(model_path_list)

/kaggle/input/11-jigsaw-train-3fold-xlm-roberta-base/model_fold_1.h5
/kaggle/input/11-jigsaw-train-3fold-xlm-roberta-base/model_fold_2.h5
/kaggle/input/11-jigsaw-train-3fold-xlm-roberta-base/model_fold_3.h5


# Test set predictions

In [8]:
submission = pd.read_csv('/kaggle/input/jigsaw-multilingual-toxic-comment-classification/sample_submission.csv')
submission['toxic'] = test_preds
submission.to_csv('submission.csv', index=False)
submission.head(10)

Unnamed: 0,id,toxic
0,0,0.0
1,1,0.0
2,2,0.666667
3,3,0.0
4,4,0.0
5,5,0.333333
6,6,0.0
7,7,0.666667
8,8,0.333333
9,9,1.0
