## Dependencies

In [1]:
import gc
import numpy as np
import pandas as pd
from tokenizers import BertWordPieceTokenizer

# Parameters

In [2]:
MAX_LEN = 512
base_path = '/kaggle/input/bert-base-ml-cased-huggingface/bert_base_cased/'
config_path = base_path + 'bert-base-multilingual-cased-config.json'
vocab_path = base_path + 'bert-base-multilingual-cased-vocab.txt'

# File paths
x_train_bias_path = 'x_train_bias'
y_train_bias_path = 'y_train_bias'

## Tokenizer

In [3]:
tokenizer = BertWordPieceTokenizer(vocab_path, lowercase=False)
tokenizer.enable_truncation(max_length=MAX_LEN)
tokenizer.enable_padding(max_length=MAX_LEN)

# Train set (bias)

In [4]:
data_bias_size = 1902194
chuncksize = 100000
for i in range(data_bias_size // chuncksize // 2):
    print((i * chuncksize), '--------------------------------------------')
    train_bias = pd.read_csv("/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-unintended-bias-train.csv", 
                             usecols=['comment_text', 'toxic'], nrows=chuncksize, skiprows=range(1, i * chuncksize))
    print('Train samples %d' % len(train_bias))
    display(train_bias.head())

    x_train_bias = [x.ids for x in tokenizer.encode_batch(train_bias['comment_text'].tolist())]
    y_train_bias = train_bias['toxic'].astype(np.float32).values.reshape(len(train_bias), 1)

    # Save
    np.save(x_train_bias_path + '_pt%d' % i, x_train_bias)
    np.save(y_train_bias_path + '_pt%d' % i, y_train_bias)

    print('x_train samples %d' % len(x_train_bias))
    print('y_train samples %d' % len(y_train_bias))

0 --------------------------------------------
Train samples 100000


Unnamed: 0,comment_text,toxic
0,"This is so cool. It's like, 'would you want yo...",0.0
1,Thank you!! This would make my life a lot less...,0.0
2,This is such an urgent design problem; kudos t...,0.0
3,Is this something I'll be able to install on m...,0.0
4,haha you guys are a bunch of losers.,0.893617


x_train samples 100000
y_train samples 100000
100000 --------------------------------------------
Train samples 100000


Unnamed: 0,comment_text,toxic
0,"I should add, I don't like going to Refuges or...",0.0
1,"NONE. The congress makes the laws, and the Ex...",0.0
2,Is there any research to tell us whether oxybe...,0.0
3,"OT, You are definitely on to something with yo...",0.0
4,Not once does anyone mention that the shooter ...,0.4


x_train samples 100000
y_train samples 100000
200000 --------------------------------------------
Train samples 100000


Unnamed: 0,comment_text,toxic
0,I'm guessing you don't know West High.. It's c...,0.3
1,Goodbye Norma Jean....,0.0
2,"It won't matter, Mr Walker gotta go.",0.0
3,It turns out that African Americans don't want...,0.4
4,"In search of Solution ?, what kind of solution...",0.0


x_train samples 100000
y_train samples 100000
300000 --------------------------------------------
Train samples 100000


Unnamed: 0,comment_text,toxic
0,In Saskatchewan....there were several announce...,0.0
1,Little old lady evangelicals in Alaska were pa...,0.4
2,"""No, America needs resentment management."" Ve...",1.0
3,"""A poll on who Alaskans were going to vote for...",0.0
4,Welcome to the world of everyone who didn't vo...,0.0


x_train samples 100000
y_train samples 100000
400000 --------------------------------------------
Train samples 100000


Unnamed: 0,comment_text,toxic
0,Interesting that the opinion piece uses the te...,0.0
1,Well sure the possibility occurred to me. Just...,0.5
2,It always amazes me how people can be manipula...,0.0
3,"Embraer suck it up. China, US and the European...",0.485714
4,The headline doesn't even mention that the tru...,0.0


x_train samples 100000
y_train samples 100000
500000 --------------------------------------------
Train samples 100000


Unnamed: 0,comment_text,toxic
0,Russians beat em down pretty good,0.0
1,I agree. I have noticed that when indigenous p...,0.0
2,"You lost me at ""my relatives own half of Nebra...",0.676056
3,You dirty dirty socialists and your utopias.,0.815385
4,yeah but why risk thousands over one butt hurt...,0.44


x_train samples 100000
y_train samples 100000
600000 --------------------------------------------
Train samples 100000


Unnamed: 0,comment_text,toxic
0,'\n…matching the comments from those who screa...,0.0
1,Good...to protect the environment the EPA peop...,0.0
2,Interesting,0.0
3,You know what I really don't care what people ...,0.0
4,If port Townsend was smart it would lower its ...,0.0


x_train samples 100000
y_train samples 100000
700000 --------------------------------------------
Train samples 100000


Unnamed: 0,comment_text,toxic
0,Straw poll for all the churchgoers out there: ...,0.3
1,My funds were also great under President Obama...,0.0
2,"Dear Mr President, Donald Trump, FYI...I have ...",0.166667
3,There is a huge difference. Trudeau's polici...,0.0
4,The majority of the court in Morgentaler did n...,0.3


x_train samples 100000
y_train samples 100000
800000 --------------------------------------------
Train samples 100000


Unnamed: 0,comment_text,toxic
0,Congrats Betty. Keep up the good work for a gr...,0.0
1,There is intelligence at parliament hill? Wow,0.0
2,The old boy is 78 years old and probably is bo...,0.0
3,We can't afford this project. Our GDP in 2015...,0.0
4,Technology has made wage slaves of many who ar...,0.0


x_train samples 100000
y_train samples 100000
