In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import torch.utils.data
import torch.nn.functional as F
import numpy as np
import pandas as pd
from tqdm import tqdm
import os
import warnings
from pytorch_pretrained_bert import BertTokenizer, BertForSequenceClassification, BertAdam
from pytorch_pretrained_bert import BertConfig
device = torch.device('cuda')

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [2]:
def convert_lines(example, max_seq_length,tokenizer):
    max_seq_length -=2
    all_tokens = []
    longer = 0
    for text in tqdm(example):
        tokens_a = tokenizer.tokenize(text)
        if len(tokens_a)>max_seq_length:
            tokens_a = tokens_a[:max_seq_length]
            longer += 1
        one_token = tokenizer.convert_tokens_to_ids(["[CLS]"]+tokens_a+["[SEP]"])+[0] * (max_seq_length - len(tokens_a))
        all_tokens.append(one_token)
    return np.array(all_tokens)

In [3]:
MAX_SEQUENCE_LENGTH = 220
SEED = 1234
BATCH_SIZE = 32
BERT_MODEL_PATH = './datas/uncased_L-12_H-768_A-12'

np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

EPOCHS = 1
TOXICITY_COLUMN = 'target'
bert_config = BertConfig('./datas/bert_config.json')
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_PATH, do_lower_case=True)

In [4]:
train_df = pd.read_csv("./datas/train.csv").sample(10000)
print('train %d records' % len(train_df))
train_df['comment_text'] = train_df['comment_text'].astype(str)

valid_df = pd.read_csv("./datas/train_multi.csv").sample(10000)
valid_df['comment_text'] = valid_df['comment_text'].astype(str)
print('valid %d records' % len(valid_df))

train_seqs = convert_lines(train_df["comment_text"].fillna("DUMMY_VALUE"), MAX_SEQUENCE_LENGTH, tokenizer)
valid_seqs = convert_lines(valid_df["comment_text"].fillna("DUMMY_VALUE"), MAX_SEQUENCE_LENGTH, tokenizer)
train_df=train_df.fillna(0)
identity_columns = [
    'male', 'female', 'homosexual_gay_or_lesbian', 'christian', 'jewish',
    'muslim', 'black', 'white', 'psychiatric_or_mental_illness']
y_columns=['target']

train_df = train_df.drop(['comment_text'],axis=1)
train_df['target']=(train_df['target']>=0.5).astype(float)

valid_df = valid_df.fillna(0)
valid_df = valid_df.drop(['comment_text'],axis=1)
valid_df['target'] = (valid_df['toxic'] == 1) | (valid_df['severe_toxic'] == 1)
valid_df['target'] = valid_df['target'] | (valid_df['obscene'] == 1)
valid_df['target'] = valid_df['target'] | (valid_df['threat'] == 1)
valid_df['target'] = valid_df['target'] | (valid_df['insult'] == 1)
valid_df['target'] = valid_df['target'] | (valid_df['identity_hate'] == 1)
valid_df['target'] = valid_df['target'].astype(float)

train 10000 records


  1%|▏         | 146/10000 [00:00<00:06, 1448.47it/s]

valid 10000 records


100%|██████████| 10000/10000 [00:06<00:00, 1575.15it/s]
100%|██████████| 10000/10000 [00:08<00:00, 1198.00it/s]


In [11]:
X = train_seqs[:]
y = train_df['target'].values[:]
valid_X = valid_seqs[:]
valid_y = valid_df['target'].values[:]
print(X.shape)
print(y.shape)
print(valid_X.shape)
print(valid_y.shape)
X = np.concatenate((X, valid_X), axis=1)
y = np.concatenate((y, valid_y), axis=0)
print(X.shape)
print(y.shape)

(10000, 220)
(10000,)
(10000, 220)
(10000,)
(10000, 440)
(20000,)
