## Dependencies

In [1]:
import gc
import numpy as np
import pandas as pd
from tokenizers import BertWordPieceTokenizer

# Parameters

In [2]:
MAX_LEN = 512
config_path = '/kaggle/input/diltilbert-base-ml-cased-huggingface/distilbert-base-multilingual-cased-config.json'
vocab_path = '/kaggle/input/diltilbert-base-ml-cased-huggingface/bert-base-multilingual-cased-vocab.txt'

# File paths
x_train_bias_path = 'x_train_bias'
y_train_bias_path = 'y_train_bias'

## Tokenizer

In [3]:
tokenizer = BertWordPieceTokenizer(vocab_path, lowercase=False)
tokenizer.enable_truncation(max_length=MAX_LEN)
tokenizer.enable_padding(max_length=MAX_LEN)

# Train set (bias)

In [4]:
data_bias_size = 1902194
chuncksize = 100000
for i in range((data_bias_size // chuncksize // 2 ), (data_bias_size // chuncksize)):
    print((i * chuncksize), '--------------------------------------------')
    train_bias = pd.read_csv("/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-unintended-bias-train.csv", 
                             usecols=['comment_text', 'toxic'], nrows=chuncksize, skiprows=range(1, i * chuncksize))
    print('Train samples %d' % len(train_bias))
    display(train_bias.head())

    x_train_bias = [x.ids for x in tokenizer.encode_batch(train_bias['comment_text'].tolist())]
    y_train_bias = train_bias['toxic'].astype(np.float32).values.reshape(len(train_bias), 1)

    # Save
    np.save(x_train_bias_path + '_pt%d' % i, x_train_bias)
    np.save(y_train_bias_path + '_pt%d' % i, y_train_bias)

    print('x_train samples %d' % len(x_train_bias))
    print('y_train samples %d' % len(y_train_bias))

900000 --------------------------------------------
Train samples 100000


Unnamed: 0,comment_text,toxic
0,An assertion without definition. Specifically...,0.0
1,"If corruption reigns supreme, Christy is a sho...",0.3
2,I don't think you could have possibly read the...,0.0
3,"I'm curious why you say ""fake news""? I expect ...",0.0
4,OMG. The things that greed will lead to. Nex...,0.0


x_train samples 100000
y_train samples 100000
1000000 --------------------------------------------
Train samples 100000


Unnamed: 0,comment_text,toxic
0,He didn't come forward... he is a flight risk....,0.0
1,"Forget the Desmond-Reynolds-Parra rotation, pu...",0.0
2,"akgen, could the word you're looking for be ""c...",0.5
3,Another proud gun owner who is a total ignoram...,0.4
4,"""Ours cost over $700 million each for reasons ...",0.7


x_train samples 100000
y_train samples 100000
1100000 --------------------------------------------
Train samples 100000


Unnamed: 0,comment_text,toxic
0,This is the fallout immediately after new gove...,0.0
1,The GOP doesn't offer change. One need only l...,0.1
2,Seems like your nothing more than a plant for ...,0.0
3,I sympathize with the council. Governments are...,0.0
4,"Oh I see now - Harper chose ""aboriginal"" as a ...",0.166667


x_train samples 100000
y_train samples 100000
1200000 --------------------------------------------
Train samples 100000


Unnamed: 0,comment_text,toxic
0,"We just paid out a $10,500,000 jackpot to a co...",0.0
1,Look it up on google yourself. You are an adu...,0.030303
2,My imagination just went to the Munchkins of O...,0.166667
3,What a dumb idea. If you are going to feed th...,0.833333
4,"well, dang.\nI'm glad the airport cops are ok.",0.0


x_train samples 100000
y_train samples 100000
1300000 --------------------------------------------
Train samples 100000


Unnamed: 0,comment_text,toxic
0,"since the 80s, we have accepted a total collap...",0.166667
1,"William M. Cox, MD - So, reading between the l...",0.0
2,"""Terence Holmes, human resources generalist at...",0.0
3,It will be interesting to see what Trump suppo...,0.3
4,"The monthly anti-Israel, pro-Palestine, pro-Ir...",0.1


x_train samples 100000
y_train samples 100000
1400000 --------------------------------------------
Train samples 100000


Unnamed: 0,comment_text,toxic
0,So Alceste thinks that it is OK for the police...,0.0
1,"Regardless of the mode of transportation, impa...",0.0
2,Rail Mafia is organized crime. This has actual...,0.3
3,"Rik, \nOur government isn't perfect and we do...",0.0
4,"On the other hand, Torbayguy, if only 35% are ...",0.6


x_train samples 100000
y_train samples 100000
1500000 --------------------------------------------
Train samples 100000


Unnamed: 0,comment_text,toxic
0,Now that NK has launched a second missile over...,0.5
1,I 100% agree with your assessment. Sakic is as...,0.0
2,I'd prefer my child to be using pot over alcoh...,0.0
3,"Perhaps just a war on real estate agents, brok...",0.0
4,Hilarious! Done better like abandoning he...,0.0


x_train samples 100000
y_train samples 100000
1600000 --------------------------------------------
Train samples 100000


Unnamed: 0,comment_text,toxic
0,A sensible manager would eliminate full time p...,0.0
1,I'm not participating in this conversation bec...,0.0
2,Sad news. That means half of these demonstrabl...,0.166667
3,Van Brocklin is not right. This is an ER Nurs...,0.0
4,The Liberals' solution is always more taxes. I...,0.0


x_train samples 100000
y_train samples 100000
1700000 --------------------------------------------
Train samples 100000


Unnamed: 0,comment_text,toxic
0,So anyone with testosterone has a predispositi...,0.540984
1,All indications are that Mueller's Team is not...,0.0
2,Trump is a draft dodger and a coward.,0.784314
3,Why would you eliminate ESL and bilingual prog...,0.166667
4,now why is any sort of disagreeing with a pers...,0.1


x_train samples 100000
y_train samples 100000
1800000 --------------------------------------------
Train samples 100000


Unnamed: 0,comment_text,toxic
0,Is this pro-iranian view illustrate what it is...,0.0
1,You could not be more wrong. The absolute vast...,0.0
2,None of this is in a court of law. They are ...,0.0
3,People have been killing people since the dawn...,0.2
4,"Actually, there is no proof that any Democrat ...",0.183673


x_train samples 100000
y_train samples 100000
