## Dependencies

In [1]:
import glob
import numpy as np
import pandas as pd
from transformers import TFDistilBertModel
from tokenizers import BertWordPieceTokenizer
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input, Dropout, GlobalAveragePooling1D

In [2]:
# Datasets
def get_test_dataset():
    """Build the batched, prefetching tf.data pipeline over the test inputs.

    Reads the notebook-level globals `x_test`, `BATCH_SIZE` and `AUTO` at
    call time, so they must be defined before this function is invoked.

    Returns:
        A `tf.data.Dataset` of `x_test` slices grouped into batches of
        `BATCH_SIZE`, with prefetching controlled by `AUTO`.
    """
    return (
        tf.data.Dataset.from_tensor_slices(x_test)
        .batch(BATCH_SIZE)
        .prefetch(AUTO)
    )

## TPU configuration

In [3]:
# Try to locate a TPU; a ValueError from the resolver is treated as
# "no TPU available" and recorded as `tpu = None`.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if not tpu:
    # No TPU: use whatever distribution strategy is currently active.
    strategy = tf.distribute.get_strategy()
else:
    # Connect to the TPU cluster and initialise it before building the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

print("REPLICAS: ", strategy.num_replicas_in_sync)

Running on TPU  grpc://10.0.0.2:8470
REPLICAS:  8


# Load data

In [4]:
# Read only the 'id' and 'content' columns of the competition test set.
test = pd.read_csv(
    '/kaggle/input/jigsaw-multilingual-toxic-comment-classification/test.csv',
    usecols=['id', 'content'],
)

# Report the row count and preview the first rows.
print('Test samples %d' % len(test))
display(test.head())

Test samples 63812


Unnamed: 0,id,content
0,0,Doctor Who adlı viki başlığına 12. doctor olar...
1,1,"Вполне возможно, но я пока не вижу необходимо..."
2,2,"Quindi tu sei uno di quelli conservativi , ..."
3,3,Malesef gerçekleştirilmedi ancak şöyle bir şey...
4,4,:Resim:Seldabagcan.jpg resminde kaynak sorunu ...


# Model parameters

In [5]:
# Token sequence length used for tokenizer padding/truncation and the model input.
MAX_LEN = 512
# Global batch size: 64 examples for each replica of the strategy.
BATCH_SIZE = strategy.num_replicas_in_sync * 64

# Pretrained DistilBERT assets: weights, config and vocabulary.
base_model_path = '/kaggle/input/diltilbert-base-ml-cased-huggingface/distilbert-base-multilingual-cased-tf_model.h5'
config_path = '/kaggle/input/diltilbert-base-ml-cased-huggingface/distilbert-base-multilingual-cased-config.json'
vocab_path = '/kaggle/input/diltilbert-base-ml-cased-huggingface/bert-base-multilingual-cased-vocab.txt'

# Every .h5 weight file in the training output directory, in sorted order.
model_path_list = sorted(glob.glob('/kaggle/input/1-jigsaw-train-distilbert-ml-cased-toxic/' + '*.h5'))
print('Models to predict:')
print("\n".join(model_path_list))

Models to predict:
/kaggle/input/1-jigsaw-train-distilbert-ml-cased-toxic/model.h5


## Tokenizer

In [6]:
# Cased WordPiece tokenizer built from the vocabulary file (no lowercasing).
tokenizer = BertWordPieceTokenizer(vocab_path, lowercase=False)
# Every encoding is padded and truncated to MAX_LEN; the two settings are
# configured independently of each other.
tokenizer.enable_padding(max_length=MAX_LEN)
tokenizer.enable_truncation(max_length=MAX_LEN)

## Build TF datasets

In [7]:
# Tokenize every test comment in one batch and keep each encoding's token ids.
# The Series is converted straight to a list; the former identity
# `.apply(lambda x : x)` only added a redundant pass over the data.
x_test = [encoding.ids for encoding in tokenizer.encode_batch(test['content'].tolist())]

# Let tf.data choose the prefetch buffer size.
AUTO = tf.data.experimental.AUTOTUNE

# Model

In [8]:
def model_fn():
    """Assemble the DistilBERT-based binary classifier used for inference.

    The network takes a `(MAX_LEN,)` int32 input named 'input_word_ids',
    runs it through the pretrained DistilBERT loaded from `base_model_path`
    with `config_path`, average-pools the first element of the encoder
    output over the sequence axis, applies dropout (rate 0.25) and ends in
    a single sigmoid unit named 'output'.

    Returns:
        An uncompiled `tf.keras` `Model` mapping token ids to one sigmoid score.
    """
    # Layers are created in a fixed order: input, encoder, pooling, dropout, head.
    token_ids = Input(shape=(MAX_LEN,), dtype=tf.int32, name='input_word_ids')
    encoder = TFDistilBertModel.from_pretrained(base_model_path, config=config_path)
    hidden_states = encoder(token_ids)[0]

    pooled = GlobalAveragePooling1D()(hidden_states)
    pooled = Dropout(0.25)(pooled)
    score = Dense(1, activation='sigmoid', name='output')(pooled)

    return Model(inputs=token_ids, outputs=score)

# Make predictions

In [9]:
# Number of test rows (historical name; the inputs are text comments).
NUM_TEST_IMAGES = len(test)
# Accumulator for the averaged predictions, one column per sample.
test_preds = np.zeros((NUM_TEST_IMAGES, 1))

# With no weight files the loop below would never run and `test_preds`
# would stay all zeros, silently yielding a meaningless submission.
if not model_path_list:
    raise FileNotFoundError('No model weight files found in model_path_list')

# Loop-invariant work: the model count and the test pipeline are the same
# for every model, so build them once.
n_models = len(model_path_list)
test_dataset = get_test_dataset()

for model_path in model_path_list:
    print(model_path)
    ### Model
    # Build the network and load its weights under the distribution strategy.
    with strategy.scope():
        model = model_fn()
        model.load_weights(model_path)

    # Equal-weight average: each model contributes 1 / n_models of its output.
    test_preds += model.predict(test_dataset) / n_models

/kaggle/input/1-jigsaw-train-distilbert-ml-cased-toxic/model.h5


# Test set predictions

In [10]:
# Start from the sample submission so the output keeps its columns.
submission = pd.read_csv('/kaggle/input/jigsaw-multilingual-toxic-comment-classification/sample_submission.csv')
# Keep as many rows as there are predictions; take an explicit copy so the
# column assignment below writes to an independent frame, not a row slice.
submission = submission.iloc[:len(test_preds)].copy()
# `test_preds` is a 2-D (N, 1) array; flatten it to 1-D so the column
# assignment does not rely on pandas accepting a single-column matrix.
submission['toxic'] = test_preds.ravel()
submission.to_csv('submission.csv', index=False)
submission.head(10)

Unnamed: 0,id,toxic
0,0,0.365365
1,1,0.078546
2,2,0.320815
3,3,0.23403
4,4,0.17568
5,5,0.301331
6,6,0.123409
7,7,0.705944
8,8,0.228006
9,9,0.543985
