In [4]:
import matplotlib.pyplot as plt  
import numpy as np
import torch
import torch.nn as nn
import pandas as pd
from IPython.display import display, Markdown, Latex, HTML
from transformers import BertTokenizer
from pprint import pprint

In [5]:
def load_data():
    data_pth = 'Data/'
    train_pth = data_pth + 'train.csv'
    test_pth = data_pth + 'test.csv'
    train = pd.read_csv(train_pth)
    test = pd.read_csv(test_pth)
    # print(test)
    return train, test


In [6]:
train_set, test_set = load_data()

In [7]:
# LEGACY
class Attention(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.softmax = nn.Softmax(dim=-1) # TODO anpassen

    def forward(self, Q:torch.Tensor, K:torch.Tensor, V:torch.Tensor):
        assert(Q.shape[-1] == K.shape[-2])
        assert(K.shape[-1] == V.shape[-2])
        d_k = Q.shape[-1]
        # matmul is done between last 2 dimension, the rest is batch!
        Z = Q @ K.T  / d_k **.5
        A = self.softmax(Z) 
        res = A @ V
        return res

class AttentionHead(nn.Module):
    def __init__(self, d_model:int, d_k:int, d_v:int) -> None:
        super().__init__()
        W_Q = torch.randn(d_model, d_k, dtype=torch.float32, requires_grad=True)
        W_K = torch.randn(d_model, d_k, dtype=torch.float32, requires_grad=True)
        W_V = torch.randn(d_model, d_v, dtype=torch.float32, requires_grad=True)
        self.W_Q = nn.Parameter(W_Q) 
        self.W_K = nn.Parameter(W_K) 
        self.W_V = nn.Parameter(W_V) 

        self.attention = Attention()

    def forward(self, Q:torch.Tensor, K:torch.Tensor, V:torch.Tensor):
        Q_proj = Q @ self.W_Q
        K_proj = K @ self.W_K
        V_proj = V @ self.W_V
        return self.attention(Q_proj, K_proj, V_proj)

class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads: int, d_model: int, d_k: int, d_v: int, p_dropout:float) -> None:
        super().__init__()
        attention_heads = [AttentionHead(d_model=d_model, d_k = d_k, d_v = d_v) for _ in range(num_heads)]
        self.attention_heads = nn.ModuleList(attention_heads)
        self.W_O = torch.randn(num_heads*d_v, d_model, dtype=torch.float32, requires_grad=True)
    
    def forward(self, Q:torch.Tensor, K:torch.Tensor, V:torch.Tensor):
        heads = torch.cat([attention_head(Q, K, V) for attention_head in self.attention_heads])
        return heads @ self.W_O

class FFN(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.linear1 = nn.Linear()
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear()
    
    def forward(self, x):
        x = self.relu(self.linear1(x))
        return self.linear2(x)

class EncoderBlock(nn.Module):
    def __init__(self, num_heads: int, d_model: int) -> None:
        super().__init__()
        self.mha = MultiHeadAttention(num_heads = num_heads, d_model = d_model, d_k = d_model, d_v = d_model)
        self.batchnorm1 = nn.BatchNorm1d()
        self.ffn = FFN()
        self.batchnorm2 = nn.BatchNorm1d()
        self.d_model = d_model

    def forward(self, input_seq: torch.Tensor):
        assert(input_seq.shape[1] == self.d_model)
        x = self.mha(input_seq, input_seq, input_seq)
        x = self.batchnorm1(x) + input_seq
        y = self.ffn(x)
        y = self.batchnorm2(y) + x
        return y

In [8]:

def train_val_split(df: pd.DataFrame, train_frac):
    out = df.copy()
    ind = df.index
    perm = np.random.permutation(ind)
    split = int(len(perm) * train_frac)
    perm_train, perm_val = perm[:split], perm[split:]
    train, val  = out.iloc[perm_train], out.iloc[perm_val]
    return train, val

# print(pd.__version__)
# print(train_set)
train, val = train_val_split(train_set, train_frac=0.8)

# print((train))
# print((val))
assert(len(train) + len(val) == len(train_set))
val_to_show = val.drop(['id'], axis=1).iloc[:5]
train_to_show = train.drop(['id'], axis=1).iloc[:5]
# train_to_show = train.iloc[:5]
# val_to_show.style.format(formatter={'comment_text': '{:<10}'})
# display(HTML(val_to_show.to_html()))
# display(Markdown(val_to_show.to_markdown()))
# df_latex = val_to_show.style.set_properties(**{'text-align': 'left'}).to_latex()
# print(df_latex)
# display(Latex('$' + df_latex + '$'))
val_to_show.style.set_properties(**{'text-align': 'left'})
train_to_show.style.set_properties(**{'text-align': 'left'})



Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
82115,Bot error OrphanBot applied an incorrect tag to an image. See — the image had tag already on it; the bot tagged it with . Why is the bot misidentifying licensed images? — The image has only the simple assertion that it is in the public domain. It needs source information so that it's possible for this to be verified.,0,0,0,0,0,0
38292,"""  Reply Thank you for your note. Apart from the issue of the external link mentioned above, the text you refer to was removed because it made an unreferenced claim (""""One of the most important museums in Barcelona"""") which needs a reliable and independent source, and also because it makes specific mention of the contractor, which is a) possibly promotional and b) possibly not relevant to the article on Barcelona. Regards, """,0,0,0,0,0,0
28694,"""  Image:PA_writing.jpg Thanks for uploading Image:PA_writing.jpg. I notice the 'image' page specifies that the image is being used under fair use, but its use in Wikipedia articles fails our first fair use criterion in that it illustrates a subject for which a freely licensed image could reasonably be found or created that provides substantially the same information. If you believe this image is not replaceable, please:  Go to the image description page and edit it to add {{Replaceable fair use disputed}}, without deleting the original Replaceable fair use template.  On the image discussion page, write the reason why this image is not replaceable at all. Alternatively, you can also choose to replace the fair use image by finding a freely licensed image of its subject, requesting that the copyright holder release this (or a similar) image under a free license, or by taking a picture of it yourself. If you have uploaded other fair use media, consider checking that you have specified how these images fully satisfy our fair use criteria. You can find a list of 'image' pages you have edited by clicking on [ this link]. Note that any fair use images which are replaceable by free-licensed alternatives will be deleted one week after they have been uploaded, as described on criteria for speedy deletion. If you have any questions please ask them at the Media copyright questions page. Thank you. ≈talk """,0,0,0,0,0,0
121889,"The distinction I would draw is whether the content is published and accepted. Wikipedia:No original research#What_is_excluded.3F gives a list of seven criteria for determining whether or not something is original research ... but really what it boils down to is whether or not you can find other secondary sources making the same claim. I had never heard of this tomb before this evening so obviously I don't know enough about it to say what is and is not accepted science in this respect. ;) From what you are saying, it sounds like there are two questions - (1) is it appropriate to discuss the symbolism findings in the article and (2) if so, is it appropriate to cite your work. In the case of the former, it depends on whether or not this has become an accepted finding. Is it still a theory or has it gained acceptance? Have other authors picked up on it and do they cite it as accepted truth or as a new theory? The primary reason for the original research policy is that there are constantly new theories coming out about everything - Jimbo (the founder of Wikipedia) originally mentioned physics, but I think it applies to any field. Some theories are good for a term paper or thesis and never see the light of day again. Others become accepted truth over time. Wikipedia is not the place for giving a sounding board to new ideas and theories - we wait until they are accepted. The second question is whether, if this content is to be included, it would be appropriate to cite your book. I believe, as I said before, that you (personally) should not add it, but should let someone else do it - that's just basic journalistic integrity - we as individual editors don't promote our own stuff. Blogs are almost never considered appropriate ... so I really don't think having a link to your blog would be a good idea. Sources on Wikipedia should be peer-reviewed journals, newspapers, etc, where more than one person is responsible for the content. Blogs are generally a bad thing. I hope all this helps ... I know I've rambled a bit. (As a quick side note, please sign messages on talk pages using four tildes - ~~~~ - it will automatically turn into a time/date stamp with your name ... it makes conversations easier to follow.)",0,0,0,0,0,0
75586,"Go fuck yourself, you piece of shit.",1,0,1,0,1,0


In [9]:
from argparse import Namespace
cfg = {'num_encoder_blocks': 12,
       'num_attention_heads': 12,
       'd_model': 768,
       'vocab_size': 30522,
       'max_seq_len': 512, 
       'p_dropout': 0.1, 
       'batchsize': 16,
       'num_target_categories': 6
       }
cfg = Namespace(**cfg)

In [10]:
train.shape

(127656, 8)

In [11]:
def make_batches(dataset, tokenizer, batch_size):
    label_tags = ["toxic", "severe_toxic"	,"obscene"	,"threat"	,"insult"	,"identity_hate"]
    batches = []
    for b in range(0, len(dataset), batch_size):
        ds_batch = dataset.iloc[b:b+batch_size]
        # print(ds_batch)
        if len(ds_batch) == 0: break
        sentences = ds_batch['comment_text'].to_list()
        # max_len = max(len(s) for s in sentences)
        token_dict = tokenizer(sentences, return_tensors='pt', padding='longest', truncation=True)
        labels = ds_batch[label_tags].to_numpy(dtype=int)
        token_dict['labels'] = torch.tensor(labels, dtype=torch.float)
        # print(token_dict)
        batches.append(token_dict)
    return batches


tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
train_batches = make_batches(train.iloc[:256], tokenizer, cfg.batchsize)

print(train_batches[0])
# for i in range(5):
    # print({k: v.shape for (k,v) in train_batches[i].items()})

{'input_ids': tensor([[  101,  9326,  1204,  ...,     0,     0,     0],
        [  101,   107, 20777,  ...,     0,     0,     0],
        [  101,   107, 15065,  ...,     0,     0,     0],
        ...,
        [  101,   155, 10069,  ...,     0,     0,     0],
        [  101,  7102,  1146,  ...,     0,     0,     0],
        [  101,  1130,   124,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([[0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
   

In [12]:

class BertSelfAttention(nn.Module):
    def __init__(self, cfg) -> None:
        super().__init__()
        self.query = nn.Linear(cfg.d_model, cfg.d_model)
        self.key = nn.Linear(cfg.d_model, cfg.d_model)
        self.value = nn.Linear(cfg.d_model, cfg.d_model)
        self.dropout = nn.Dropout(cfg.p_dropout)
        self.num_attention_heads = cfg.num_attention_heads

    def forward(self, seq:torch.Tensor):
        # seq.shape = (batchsize, seq_len, d_model)
        batchsize, seq_len, d_model = seq.shape
        assert(d_model % self.num_attention_heads == 0)
        d_v = d_model // self.num_attention_heads
        querys_proj = self.query(seq).view(batchsize, seq_len, self.num_attention_heads, d_v)
        querys_proj = querys_proj.transpose(1, 2)
        keys_proj = self.key(seq).view(batchsize, seq_len, self.num_attention_heads, d_v)
        keys_proj = keys_proj.transpose(1, 2)
        values_proj = self.value(seq).view(batchsize, seq_len, self.num_attention_heads, d_v)
        values_proj = values_proj.transpose(1, 2)

        #   (batchsize, num_attention_heads, seq_len, d_v) x (batchsize, num_attention_heads, d_v, seq_len)
        # = (batchsize, num_attention_heads, seq_len, seq_len)
        Z = querys_proj @ keys_proj.transpose(-1, -2) / d_v**.5
        p_attention = nn.Softmax(dim=-1)(Z)
        #   (batchsize, num_attention_heads, seq_len, seq_len) x (batchsize, num_attention_heads, seq_len, d_v)
        # = (batchsize, num_attention_heads, seq_len, d_v)
        attention_output = p_attention @ values_proj

        # concatenate the heads along the last axis by reshaping the output to (batchsize, seq_len, d_model) again
        attention_output = attention_output.transpose(1, 2).reshape(batchsize, seq_len, d_v * self.num_attention_heads)
        return attention_output


class BertSelfOutput(nn.Module):
    def __init__(self, cfg) -> None:
        super().__init__()
        self.dense = nn.Linear(cfg.d_model, cfg.d_model)
        self.LayerNorm = nn.LayerNorm((cfg.d_model,), eps=1e-12, elementwise_affine=True)
        self.dropout = nn.Dropout(cfg.p_dropout)
    
    def forward(self, attention_output):
        # attention_output.shape = (batchsize, seq_len, d_model)
        # in which order to apply these things? see below
        return self.dropout(self.LayerNorm(self.dense(attention_output))) + attention_output


class BertAttention(nn.Module):
    def __init__(self, cfg) -> None:
        super().__init__()
        # self.attention = BertSelfAttention(cfg)
        self.self = BertSelfAttention(cfg)
        self.output = BertSelfOutput(cfg)

    def forward(self, seq):
        attention_output = self.self(seq)
        return self.output(attention_output)


class BertIntermediate(nn.Module):
    def __init__(self, cfg) -> None:
        super().__init__()
        self.dense = nn.Linear(cfg.d_model, cfg.d_model * 4)
        self.intermediate_act_fn = nn.GELU()

    def forward(self, seq):
        return self.intermediate_act_fn(self.dense(seq))


class BertOutput(nn.Module):
    def __init__(self, cfg) -> None:
        super().__init__()
        self.dense = nn.Linear(cfg.d_model * 4, cfg.d_model)
        self.LayerNorm = nn.LayerNorm((cfg.d_model,), eps=1e-12, elementwise_affine=True)
        self.dropout = nn.Dropout(cfg.p_dropout)

    def forward(self, seq):
        # in which order to apply dropout, layernorm and residual connection?
        # in Vaswani et al it is layernorm(x + sublayer(x))
        # the order here is from annotated transformer
        return self.dropout(self.LayerNorm(self.dense(seq)))


class BertLayer(nn.Module):
    def __init__(self, cfg) -> None:
        super().__init__()
        self.attention = BertAttention(cfg)
        self.intermediate = BertIntermediate(cfg)
        self.output = BertOutput(cfg)

    def forward(self, seq):
        attention_output = self.attention(seq)
        x = self.intermediate(attention_output)
        x = self.output(x)
        return x + attention_output
        

class BertEncoder(nn.Module):
    def __init__(self, cfg) -> None:
        super().__init__()
        encoder_stack = [BertLayer(cfg) for _ in range(cfg.num_encoder_blocks)]
        self.layer = nn.ModuleList(encoder_stack)

    def forward(self, seq: torch.Tensor):
        encoder_stack = nn.Sequential(*self.layer)
        return encoder_stack(seq)


class BertEmbeddings(nn.Module):
    def __init__(self, cfg) -> None:
        super().__init__()
        self.word_embeddings = nn.Embedding(cfg.vocab_size, cfg.d_model)
        self.position_embeddings = nn.Embedding(cfg.max_seq_len, cfg.d_model)
        self.token_type_embeddings = nn.Embedding(2, cfg.d_model) # only beginning of sentence token and other tokens
        self.LayerNorm = nn.LayerNorm((cfg.d_model,), eps=1e-12, elementwise_affine=True)
        self.dropout = nn.Dropout(cfg.p_dropout)
        self.register_buffer("position_ids", torch.arange(cfg.max_seq_len).expand((1, -1)))
        self.device = cfg.device
    
    # def forward(self, input_ids, token_type_ids):
    def forward(self, input_ids, token_type_ids):
        batchsize, seq_len = input_ids.shape
        position_ids = torch.stack([torch.arange(0, seq_len, dtype=torch.long, device=self.device)] * batchsize)
        input_embeds = self.word_embeddings(input_ids)
        position_embeds = self.position_embeddings(position_ids)
        token_type_embeds = self.token_type_embeddings(token_type_ids)
        # print(f"{input_embeds.shape=}")
        # print(f"{token_type_embeds.shape=}")
        # print(f"{position_embeds.shape=}")
        embeds = input_embeds + position_embeds + token_type_embeds
        embeds = self.LayerNorm(input_embeds)
        embeds = self.dropout(embeds)
        return embeds

class BertPooler(nn.Module):
    def __init__(self, cfg) -> None:
        super().__init__()
        self.dense = nn.Linear(cfg.d_model, cfg.d_model)

class MyBertModel(nn.Module):
    def __init__(self, cfg) -> None:
        super().__init__()
        self.embeddings = BertEmbeddings(cfg)
        self.encoder = BertEncoder(cfg)
        self.pooler = BertPooler(cfg)

    def forward(self, input_ids:torch.Tensor, token_type_ids:torch.Tensor):
        embeds = self.embeddings(input_ids, token_type_ids)
        encoder_output = self.encoder(embeds)
        return encoder_output


class OutputLayer(nn.Module):
    def __init__(self, cfg) -> None:
        super().__init__()
        self.dense1 = nn.Linear(cfg.d_model, cfg.d_model // 2)
        self.act_fn = nn.GELU()
        self.dense2 = nn.Linear(cfg.d_model // 2, cfg.num_target_categories)
        # self.softmax = nn.Softmax(dim=-1)
        self.dropout = nn.Dropout(cfg.p_dropout)
    
    def forward(self, encoder_output:torch.Tensor):
        # encoder_output.shape: (batchsize, seq_len, d_model)
        x = self.dense1(encoder_output[:, 0]) # use only hidden state corresponding to start of sequence token for classification
        x = self.act_fn(x)
        x = self.dropout(x)
        x = self.dense2(x)
        # x = self.softmax(x)
        return x


class ToxicSentimentClassificationModel(nn.Module):
    def __init__(self, cfg, state_dict) -> None:
        super().__init__()
        self.backbone = MyBertModel(cfg)
        self.backbone.load_state_dict(state_dict)
        self.output_layer = OutputLayer(cfg)
    
    def forward(self, input_ids:torch.Tensor, token_type_ids:torch.Tensor, **kwargs):
        backbone_output = self.backbone(input_ids, token_type_ids)
        return self.output_layer(backbone_output)


        

In [13]:
from tqdm.auto import tqdm
from transformers import AdamW
from transformers import get_scheduler
from transformers import BertModel

device = torch.device("cuda") if False and torch.cuda.is_available() else torch.device("cpu")
print(device)
cfg.device = device
pretrained_model = BertModel.from_pretrained('bert-base-uncased')
pretrained_state_dict = pretrained_model.state_dict()
# pprint({k: v.shape for (k, v) in pre_state_dict.items()})
# pre_state_dict_adapted = {'backbone.' + k: v for (k, v) in pre_state_dict.items()}
# pprint({k: v.shape for (k, v) in pre_state_dict_adapted.items()})
model = ToxicSentimentClassificationModel(cfg, pretrained_state_dict)
model.to(device)
print()

cpu


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).





In [14]:

optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 3
num_training_steps = num_epochs * len(train_batches)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

progress_bar = tqdm(range(num_training_steps))

model.train()
loss_fn = nn.BCEWithLogitsLoss()

for epoch in range(num_epochs):
    print(epoch)
    for batch in train_batches:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        # print(f"{outputs.shape=}")
        # print(f"{batch['labels'].shape=}")
        loss = loss_fn(outputs, batch['labels'])
        # print(f"{loss.item()=}")
        print('loss:', loss.item())
        # loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)



48


  0%|          | 0/48 [00:00<?, ?it/s]

0
loss: 0.7023908495903015


  2%|▏         | 1/48 [00:17<13:53, 17.74s/it]

KeyboardInterrupt: 