## Dependencies

In [1]:
import gc
import numpy as np
import pandas as pd
from tokenizers import BertWordPieceTokenizer

## Parameters

In [2]:
# Token length used for both truncation and padding of every encoded text.
MAX_LEN = 512

# Pretrained model assets (model config and WordPiece vocabulary).
# NOTE(review): config_path is not referenced by the cells shown in this notebook.
config_path = '/kaggle/input/diltilbert-base-ml-cased-huggingface/distilbert-base-multilingual-cased-config.json'
vocab_path = '/kaggle/input/diltilbert-base-ml-cased-huggingface/bert-base-multilingual-cased-vocab.txt'

# File paths
# Base names of the arrays written with np.save, which appends ".npy" to a
# file name that does not already end with it.
x_train_toxic_path, y_train_toxic_path = 'x_train_toxic', 'y_train_toxic'
x_valid_path, y_valid_path = 'x_valid', 'y_valid'
x_test_path = 'x_test'

## Tokenizer

In [3]:
# Build a WordPiece tokenizer from the vocabulary file at `vocab_path`;
# lowercase=False keeps the original casing of the input text.
tokenizer = BertWordPieceTokenizer(vocab_path, lowercase=False)
# Cut encodings longer than MAX_LEN tokens down to MAX_LEN.
tokenizer.enable_truncation(max_length=MAX_LEN)
# Pad shorter encodings to MAX_LEN so every encoding has the same length.
# NOTE(review): the keyword accepted by enable_padding has differed between
# `tokenizers` releases -- confirm `max_length` against the installed version.
tokenizer.enable_padding(max_length=MAX_LEN)

## Train set (toxic)

In [4]:
# Read only the comment text and its toxicity label from the training CSV.
train_toxic = pd.read_csv(
    "/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-toxic-comment-train.csv",
    usecols=['comment_text', 'toxic'],
)
print('Train samples %d' % len(train_toxic))
display(train_toxic.head())

# Labels as a float32 column vector with one row per comment.
y_train_toxic = np.asarray(train_toxic['toxic'], dtype=np.float32).reshape(-1, 1)
# Token ids of every comment, produced by the tokenizer in a single batch call.
x_train_toxic = [
    encoding.ids
    for encoding in tokenizer.encode_batch(train_toxic['comment_text'].tolist())
]

# Save
np.save(x_train_toxic_path, x_train_toxic)
np.save(y_train_toxic_path, y_train_toxic)

# Sanity check: both arrays should report the same number of samples.
print('x_train samples %d' % len(x_train_toxic))
print('y_train samples %d' % len(y_train_toxic))

Train samples 223549


Unnamed: 0,comment_text,toxic
0,Explanation\nWhy the edits made under my usern...,0
1,D'aww! He matches this background colour I'm s...,0
2,"Hey man, I'm really not trying to edit war. It...",0
3,"""\nMore\nI can't make any real suggestions on ...",0
4,"You, sir, are my hero. Any chance you remember...",0


x_train samples 223549
y_train samples 223549


## Validation set

In [5]:
# Release the training-split objects before loading the next split.
# A plain `del` raises NameError when this cell is re-run (or run without the
# training cell), so drop the names only if they are still defined.
for _stale_name in ('train_toxic', 'x_train_toxic', 'y_train_toxic'):
    globals().pop(_stale_name, None)
gc.collect()

# Read only the comment text and its toxicity label from the validation CSV.
valid = pd.read_csv('/kaggle/input/jigsaw-multilingual-toxic-comment-classification/validation.csv', 
                    usecols=['comment_text', 'toxic'])
print('Validation samples %d' % len(valid))
display(valid.head())

# Token ids of every comment, produced by the tokenizer in a single batch call.
x_valid = [x.ids for x in tokenizer.encode_batch(valid['comment_text'].tolist())]
# Labels as a float32 column vector; -1 lets NumPy infer the row count from
# the labels themselves instead of from the token list.
y_valid = valid['toxic'].astype(np.float32).values.reshape(-1, 1)

# Save
np.save(x_valid_path, x_valid)
np.save(y_valid_path, y_valid)

# Sanity check: both arrays should report the same number of samples.
print('x_valid samples %d' % len(x_valid))
print('y_valid samples %d' % len(y_valid))

Validation samples 8000


Unnamed: 0,comment_text,toxic
0,Este usuario ni siquiera llega al rango de ...,0
1,Il testo di questa voce pare esser scopiazzato...,0
2,Vale. Sólo expongo mi pasado. Todo tiempo pasa...,1
3,Bu maddenin alt başlığı olarak uluslararası i...,0
4,Belçika nın şehirlerinin yanında ilçe ve belde...,0


x_valid samples 8000
y_valid samples 8000


## Test set

In [6]:
# Release the validation-split objects before loading the test split.
# A plain `del` raises NameError when this cell is re-run (or run without the
# validation cell), so drop the names only if they are still defined.
for _stale_name in ('valid', 'x_valid', 'y_valid'):
    globals().pop(_stale_name, None)
gc.collect()

# The test CSV stores its text in the `content` column; no labels are read.
test = pd.read_csv('/kaggle/input/jigsaw-multilingual-toxic-comment-classification/test.csv', 
                   usecols=['content'])
print('Test samples %d' % len(test))
display(test.head())

# Token ids of every text, produced by the tokenizer in a single batch call.
x_test = [x.ids for x in tokenizer.encode_batch(test['content'].tolist())]

# Save
np.save(x_test_path, x_test)

print('x_test samples %d' % len(x_test))

Test samples 63812


Unnamed: 0,content
0,Doctor Who adlı viki başlığına 12. doctor olar...
1,"Вполне возможно, но я пока не вижу необходимо..."
2,"Quindi tu sei uno di quelli conservativi , ..."
3,Malesef gerçekleştirilmedi ancak şöyle bir şey...
4,:Resim:Seldabagcan.jpg resminde kaynak sorunu ...


x_test samples 63812
