In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
import re
import string

import random
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.optim import AdamW

from transformers import AutoTokenizer, AutoModel, AutoConfig, DataCollatorWithPadding
from datasets import Dataset, DatasetDict, load_from_disk

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch.cuda.get_device_name(0)

'NVIDIA GeForce 940MX'

In [3]:
# set random
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed_val)

### Dataset

In [4]:
# dataset
train = pd.read_csv('../data/raw/jigsaw/train.csv')
test = pd.read_csv('../data/raw/jigsaw/test.csv')\
        .merge(pd.read_csv('../data/raw/jigsaw/test_labels.csv'),
                on='id',
                how='inner')

In [5]:
# clean
def rm_ip_address(text):
    return re.sub(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', '', text)

def rm_link(text):
    return re.sub(r'https?://\S+|www\.\S+', '', text)

def rm_emoji(text):
    emojis = re.compile(
        '['
        u'\U0001F600-\U0001F64F'  # emoticons
        u'\U0001F300-\U0001F5FF'  # symbols & pictographs
        u'\U0001F680-\U0001F6FF'  # transport & map symbols
        u'\U0001F1E0-\U0001F1FF'  # flags (iOS)
        u'\U00002702-\U000027B0'
        u'\U000024C2-\U0001F251'
        ']+',
        flags=re.UNICODE
    )
    return emojis.sub(r'', text)

def rm_middle_dot(text):
    return re.sub(r'(?<=\w)\.(?=\w+)', '', text)

def rm_middle_spaces(text):
    return re.sub(r'(?<=\w)\s(?=\w+)', '', text)

def clean_pipeline(text):
    no_ip_address = rm_ip_address(text)
    no_link = rm_link(no_ip_address)
    no_emoji = rm_emoji(no_link)
    no_mid_dots = rm_middle_dot(no_emoji)

    return no_mid_dots

In [6]:
def is_toxic(row):
    return 1 if row.sum() > 1 else -1 if row.sum() < 0 else 0

train['is_toxic'] = train.iloc[:, 2:].apply(is_toxic, axis=1)
test['is_toxic'] = test.iloc[:, 2:].apply(is_toxic, axis=1)

In [7]:
train['comment_clean'] = train.comment_text.apply(clean_pipeline)
test['comment_clean'] = test.comment_text.apply(clean_pipeline)

In [8]:
train_df = train[['comment_clean', 'is_toxic']]
test_df = test[['comment_clean', 'is_toxic']]

In [9]:
ganbert_train = train_df.sample(frac=0.01)
unlabeled = train_df.drop(ganbert_train.index)

ganbert_train.shape, unlabeled.shape, test_df.shape

((1596, 2), (157975, 2), (153164, 2))

### Network

In [10]:
# BERT
model_name = 'bert-base-uncased'

tokenizer = AutoTokenizer.from_pretrained(model_name)
bertmodel = AutoModel.from_pretrained(model_name)
config = AutoConfig.from_pretrained(model_name)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:
# GAN architecture

class Generator(nn.Module):
    def __init__(self, noise_size=100, hidden_size=512, output_size=512, dropout_rate=0.1):
        super(Generator, self).__init__()

        ############
        # GENERATOR
        # ----------
        # architecture taken from the paper consist of single hidden layers activated with LeakyReLU
        # and dropout layer with drop rate 0.1
        # takes random noise with size 100 and output size is 768
        ############
        self.gen = nn.Sequential(
            nn.Linear(noise_size, hidden_size),
            nn.LeakyReLU(0.2),
            nn.Dropout(dropout_rate),
            nn.Linear(hidden_size, output_size)
        )

    def forward(self, x):
        x = self.gen(x)

        return x


class Discriminator(nn.Module):
    def __init__(self, input_size=512, hidden_size=512, n_classes=2, dropout_rate=0.1) -> None:
        super(Discriminator, self).__init__()

        ###############
        # DISCRIMINATOR
        # -------------
        # architecture taken from the paper and was similar to generator but followed by a softmax layer
        # for final prediction we will have size n_class + 1 as we need to treat fake samples as individual label
        # note that in discriminator's logit
        ###############
        self.disc = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.LeakyReLU(0.2),
            nn.Dropout(dropout_rate),            
        )
        
        self.logit = nn.Linear(hidden_size, n_classes+1)

        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        x = self.disc(x)
        logit = self.logit(x)
        output = self.softmax(logit)

        # return last hidden state, logit, and probabilities
        return x, logit, output

### Tokenize

In [12]:
def get_examples(inputs, labels, label_rate=0.02, balance=False):
    examples = {'text': [], 'label': []}

    total_labeled = len(np.where(labels > -1)[0])
    label_rate = total_labeled / len(inputs)

    for idx, (text, label) in enumerate(zip(inputs, labels)):
        if balance:
            if np.random.randn() > label_rate:
                label = -1
            else:
                repeat_num = int(1/label_rate)
                repeat_num = int(np.log2(repeat_num)) + 1
                if repeat_num < 0:
                    repeat_num = 0
                
                # replicate data
                for _ in range(0, repeat_num):                    
                    examples['text'].append(text)
                    examples['label'].append(label)
        
        examples['text'].append(text)
        examples['label'].append(label)
    
    return examples

In [13]:
# merge labeled and unlabeled dataset
all_texts = ganbert_train.comment_clean.tolist() + unlabeled.comment_clean.tolist()
all_labels = np.array(ganbert_train.is_toxic.tolist() + [-1 for _ in range(len(unlabeled))])

In [14]:
trainset = get_examples(all_texts, all_labels, balance=True)
testset = get_examples(test_df.comment_clean.tolist(), test_df.is_toxic.to_numpy(), balance=False)

In [15]:
dataset = DatasetDict({
    'train': Dataset.from_dict(trainset),
    'test': Dataset.from_dict(testset)
})

dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 722427
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 153164
    })
})

In [16]:
def tokenize_data(batch, seq_length=256):
    texts = batch['text']

    tokenized = tokenizer(
        texts,
        truncation=True,
        max_length=seq_length,
        return_overflowing_tokens=True
    )

    sample_map = tokenized.pop('overflow_to_sample_mapping')
    for k, v in batch.items():
        tokenized[k] = [v[i] for i in sample_map]

    return tokenized

In [17]:
dataset_tokenized = dataset.map(tokenize_data, batched=True).remove_columns(['text'])
dataset_tokenized.set_format('torch')



  0%|          | 0/723 [00:00<?, ?ba/s]

  0%|          | 0/154 [00:00<?, ?ba/s]

In [18]:
dataset_tokenized.save_to_disk('../data/processed/ganbert')

### Load dataset directly

In [19]:
dataset_tokenized = load_from_disk('../data/processed/ganbert')

In [20]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [21]:
trainloader = DataLoader(dataset_tokenized['train'], shuffle=True, batch_size=16, collate_fn=data_collator)
valloader = DataLoader(dataset_tokenized['test'], shuffle=True, batch_size=16, collate_fn=data_collator)

In [22]:
dataset_tokenized['train']

Dataset({
    features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 798358
})

### Training

In [23]:
# hyperparameters
learning_rate = 2e-5
noise_size = 100
hidden_size = 256
total_epoch = 10
warmup_proportion = 0.1
epsilon = 1e-8
print_each_step = 32

In [24]:
# init model
gen = Generator(noise_size, hidden_size=hidden_size).to(device)
disc = Discriminator(input_size=512, hidden_size=hidden_size, n_classes=2).to(device)
bertmodel = bertmodel.to(device)

In [25]:
# optimizer
gen_optim = AdamW(gen.parameters())
disc_optim = AdamW(list(bertmodel.parameters()) + list(disc.parameters()))

# scheduler
### warmp is used to reduce the data requests when new data is generating
### Warm-up is a way to reduce the primacy effect of the early training examples.
### Without it, you may need to run a few extra epochs to get the convergence desired
# TODO: Add scheduler

In [26]:
for epoch in tqdm(range(total_epoch)):
    print(f'========== EPOCH {epoch + 1} / {total_epoch} ==========')    

    train_gen_loss = 0
    train_disc_loss = 0

    # training mode
    bertmodel.train()
    gen.train()    
    disc.train()

    # iterate each step
    for step, batch in tqdm(enumerate(trainloader)):
        # report step progress
        print(f'   Batch {step:>5,} of {len(trainloader):>5}')

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)

        bert_out = bertmodel(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)

        print(bert_out)
        break
    break

  0%|          | 0/10 [00:00<?, ?it/s]



0it [00:00, ?it/s]

   Batch     0 of 49898


RuntimeError: CUDA out of memory. Tried to allocate 12.00 MiB (GPU 0; 2.00 GiB total capacity; 1.34 GiB already allocated; 0 bytes free; 1.39 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF