In [11]:
import torch
from transformers import BertTokenizer, BertForMaskedLM
import numpy as np
import pandas as pd
import pickle
import os
import argparse
import numpy as np
from tqdm import tqdm
from nltk import ngrams
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer


from collections import Counter
c = Counter()


In [12]:
# Restrict this process to the first GPU.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
# Use the GPU when one is available and fall back to the CPU otherwise, so that
# later `.to(device)` calls do not fail on machines without CUDA.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [13]:
# Load the pretrained 'bert-base-uncased' tokenizer and the matching masked-LM model.
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForMaskedLM.from_pretrained(model_name)

KeyboardInterrupt: 

In [None]:
model.to(device)

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

## Making a list of negative and positive words

### Compute toxicity and normalcy salience scores

In [None]:

class NgramSalienceCalculator():
    """Smoothed count ratio ("salience") of n-grams between a toxic and a normal corpus.

    Both corpora are vectorised with the same ``CountVectorizer`` instance, one
    after the other; the vocabulary and column-wise count totals of each corpus
    are stored separately.
    """

    def __init__(self, tox_corpus, norm_corpus, use_ngrams=False):
        # Unigrams by default; uni- to tri-grams when `use_ngrams` is set.
        span = (1, 3) if use_ngrams else (1, 1)
        self.vectorizer = CountVectorizer(ngram_range=span)

        # The toxic statistics are captured before the vectoriser is refitted
        # on the normal corpus.
        self.tox_vocab, self.tox_counts = self._fit_counts(tox_corpus)
        self.norm_vocab, self.norm_counts = self._fit_counts(norm_corpus)

    def _fit_counts(self, corpus):
        """Fit the vectoriser on `corpus`; return (vocabulary, per-feature count totals)."""
        count_matrix = self.vectorizer.fit_transform(corpus)
        return self.vectorizer.vocabulary_, np.sum(count_matrix, axis=0)

    def salience(self, feature, attribute='tox', lmbda=0.5):
        """Return the smoothed count ratio of `feature` for the given style.

        attribute: 'tox' gives (toxic + lmbda) / (normal + lmbda);
                   'norm' gives the reciprocal.
        lmbda: additive smoothing applied to both counts.
        A feature missing from a vocabulary counts as 0.0 occurrences there.
        """
        assert attribute in ['tox', 'norm']

        tox_count = self.tox_counts[0, self.tox_vocab[feature]] if feature in self.tox_vocab else 0.0
        norm_count = self.norm_counts[0, self.norm_vocab[feature]] if feature in self.norm_vocab else 0.0

        smoothed_tox = tox_count + lmbda
        smoothed_norm = norm_count + lmbda
        if attribute == 'tox':
            return smoothed_tox / smoothed_norm
        return smoothed_norm / smoothed_tox


In [None]:
# Paths of the toxic and the normal training corpus.
tox_corpus_path = '../data/train/train_toxic'
norm_corpus_path = '../data/train/train_normal'

# Add the whitespace-separated tokens of every line in both files to the
# Counter `c`.
for fn in (tox_corpus_path, norm_corpus_path):
    with open(fn, 'r') as corpus:
        for line in corpus:
            c.update(line.split())

print(len(c))

88645


In [None]:
# Keep every token that occurs at least once. The original author notes that
# requiring more than one occurrence would roughly halve the vocabulary, but the
# full size is still manageable.
vocab = {word for word, count in c.items() if count > 0}
print(len(vocab))

88645


In [None]:
# Re-read both corpora line by line, replacing every token that is not in
# `vocab` with '<unk>' and re-joining the tokens with single spaces.
with open(tox_corpus_path, 'r') as tox_corpus, open(norm_corpus_path, 'r') as norm_corpus:
    corpus_tox = []
    for line in tox_corpus:
        corpus_tox.append(' '.join(w if w in vocab else '<unk>' for w in line.split()))

    corpus_norm = []
    for line in norm_corpus:
        corpus_norm.append(' '.join(w if w in vocab else '<unk>' for w in line.split()))

In [None]:
# Minimum salience ratio for an n-gram to be written to one of the word lists.
threshold = 4

sc = NgramSalienceCalculator(corpus_tox, corpus_norm, False)
# Every n-gram from either vocabulary. A set union is already duplicate-free,
# so no per-item "seen" check is needed.
seen_grams = set(sc.tox_vocab.keys()).union(set(sc.norm_vocab.keys()))

neg_out_name = '../data/vocab/negative_words.txt'
pos_out_name = '../data/vocab/positive_words.txt'

# NOTE(review): both files are opened in append mode, so re-running this cell
# adds the same words again. Confirm that appending is intended.
with open(neg_out_name, 'a') as neg_out, open(pos_out_name, 'a') as pos_out:
    # Sorted iteration gives a reproducible line order; plain set iteration
    # order can differ between interpreter runs.
    for gram in sorted(seen_grams):
        toxic_salience = sc.salience(gram, attribute='tox')
        polite_salience = sc.salience(gram, attribute='norm')
        if toxic_salience > threshold:
            neg_out.write(f'{gram}\n')
        elif polite_salience > threshold:
            pos_out.write(f'{gram}\n')

## Logistic Regression for evaluating word toxicities

## Fine Tuning the BERT Model

In [None]:
# Read the raw toxic and normal training lines. Each list element keeps its
# trailing newline.
with open("../data/train/train_toxic", 'r') as f:
    toxic = list(f)

with open("../data/train/train_normal", 'r') as f:
    normal = list(f)



In [None]:
len(toxic)

135390

In [None]:
len(normal)

135390

In [None]:
# read a txt in a list, every line a new element
def read_txt(path):
    """Return the lines of the text file at `path`, each with surrounding whitespace stripped."""
    with open(path, 'r') as handle:
        return [raw_line.strip() for raw_line in handle]

# Load the two word lists from the same paths the salience cell writes to
# (neg_out_name / pos_out_name).
negative = read_txt("../data/vocab/negative_words.txt")
positive = read_txt("../data/vocab/positive_words.txt")

In [None]:
# Wrap each list of raw lines in a DataFrame with a single 'text' column.
toxic_df = pd.DataFrame(toxic, columns=['text'])
normal_df = pd.DataFrame(normal, columns=['text'])


In [None]:
device

device(type='cuda', index=0)

In [None]:
normal_df

Unnamed: 0,text
0,"just a comment regarding family trusts , they ..."
1,nor do they conform to the notion that our tit...
2,"yeah , the pers employees and their pensions f..."
3,why risk our marine parks and sea life for a f...
4,not sure what flavor koolaid you drinking but ...
...,...
135385,"i think you , other like minded individuals ge..."
135386,a lot of people will be wondering what they we...
135387,they have endured saber rattling by the us for...
135388,the cbc has been totally silent on reporting a...


In [None]:
# tokenize the text

def masking(df, word_list):
    """Add tokenised, masked and id-encoded versions of ``df['text']`` to `df`.

    Columns written in place:
      tokenized_text      -- output of the global `tokenizer.tokenize`
      masked_text         -- tokens, with every token found in `word_list`
                             replaced by '[MASK]' (other tokens lower-cased)
      masked_encoded_text -- `tokenizer.encode` of masked_text
      encoded_text        -- `tokenizer.encode` of tokenized_text

    Returns the same (mutated) DataFrame.
    """
    # Set membership is O(1); testing against the original list costs
    # O(len(word_list)) for every single token.
    words_to_mask = set(word_list)

    df['tokenized_text'] = df['text'].apply(lambda text: tokenizer.tokenize(text))
    df['masked_text'] = df['tokenized_text'].apply(
        lambda tokens: ['[MASK]' if tok in words_to_mask else tok.lower() for tok in tokens]
    )
    df["masked_encoded_text"] = df['masked_text'].apply(lambda tokens: tokenizer.encode(tokens))
    df["encoded_text"] = df['tokenized_text'].apply(lambda tokens: tokenizer.encode(tokens))
    return df

# Toxic sentences get the negative words masked, normal sentences the positive ones.
toxic_df_masked = masking(toxic_df, negative)
normal_df_masked = masking(normal_df, positive)

In [None]:
# Style label per row: 1 for the toxic frame, 0 for the normal frame.
toxic_df_masked["label"] = 1
normal_df_masked["label"] = 0

In [None]:
normal_df_masked

Unnamed: 0,text,tokenized_text,masked_text,masked_encoded_text,encoded_text,label
0,"just a comment regarding family trusts , they ...","[just, a, comment, regarding, family, trusts, ...","[just, a, comment, regarding, family, trusts, ...","[101, 2074, 1037, 7615, 4953, 2155, 20278, 101...","[101, 2074, 1037, 7615, 4953, 2155, 20278, 101...",0
1,nor do they conform to the notion that our tit...,"[nor, do, they, conform, to, the, notion, that...","[nor, do, they, conform, to, the, notion, that...","[101, 4496, 2079, 2027, 23758, 2000, 1996, 936...","[101, 4496, 2079, 2027, 23758, 2000, 1996, 936...",0
2,"yeah , the pers employees and their pensions f...","[yeah, ,, the, per, ##s, employees, and, their...","[yeah, ,, the, [MASK], ##s, [MASK], and, their...","[101, 3398, 1010, 1996, 103, 2015, 103, 1998, ...","[101, 3398, 1010, 1996, 2566, 2015, 5126, 1998...",0
3,why risk our marine parks and sea life for a f...,"[why, risk, our, marine, parks, and, sea, life...","[why, risk, our, marine, [MASK], and, sea, lif...","[101, 2339, 3891, 2256, 3884, 103, 1998, 2712,...","[101, 2339, 3891, 2256, 3884, 6328, 1998, 2712...",0
4,not sure what flavor koolaid you drinking but ...,"[not, sure, what, flavor, ko, ##ola, ##id, you...","[not, sure, what, flavor, [MASK], ##ola, ##id,...","[101, 2025, 2469, 2054, 14894, 103, 6030, 3593...","[101, 2025, 2469, 2054, 14894, 12849, 6030, 35...",0
...,...,...,...,...,...,...
135385,"i think you , other like minded individuals ge...","[i, think, you, ,, other, like, minded, indivi...","[i, think, you, ,, other, like, minded, indivi...","[101, 1045, 2228, 2017, 1010, 2060, 2066, 1312...","[101, 1045, 2228, 2017, 1010, 2060, 2066, 1312...",0
135386,a lot of people will be wondering what they we...,"[a, lot, of, people, will, be, wondering, what...","[a, lot, of, people, will, be, wondering, what...","[101, 1037, 2843, 1997, 2111, 2097, 2022, 6603...","[101, 1037, 2843, 1997, 2111, 2097, 2022, 6603...",0
135387,they have endured saber rattling by the us for...,"[they, have, endured, saber, rattling, by, the...","[they, have, endured, saber, rattling, by, the...","[101, 2027, 2031, 16753, 25653, 26347, 2011, 1...","[101, 2027, 2031, 16753, 25653, 26347, 2011, 1...",0
135388,the cbc has been totally silent on reporting a...,"[the, cbc, has, been, totally, silent, on, rep...","[the, cbc, has, been, totally, silent, on, rep...","[101, 1996, 13581, 2038, 2042, 6135, 4333, 200...","[101, 1996, 13581, 2038, 2042, 6135, 4333, 200...",0


In [None]:
# Stack the toxic and normal frames under a fresh 0..N-1 index.
# No shuffling happens here: toxic rows come first, normal rows after them.
df = pd.concat([toxic_df_masked, normal_df_masked], ignore_index=True)

In [None]:
# Print the maximum (not the average) length of the masked id sequences.
print(df['masked_encoded_text'].apply(len).max())

72


In [None]:
df

Unnamed: 0,text,tokenized_text,masked_text,masked_encoded_text,encoded_text,label
0,"fuck you , you nixonian twit .\n","[fuck, you, ,, you, nixon, ##ian, t, ##wi, ##t...","[[MASK], you, ,, you, nixon, ##ian, t, ##wi, #...","[101, 103, 2017, 1010, 2017, 11296, 2937, 1056...","[101, 6616, 2017, 1010, 2017, 11296, 2937, 105...",1
1,"just another vicious and trashy antitrump , pr...","[just, another, vicious, and, trash, ##y, anti...","[just, another, vicious, and, [MASK], ##y, ant...","[101, 2074, 2178, 13925, 1998, 103, 2100, 3424...","[101, 2074, 2178, 13925, 1998, 11669, 2100, 34...",1
2,"if you want to talk hypocrites , bauer , im up...","[if, you, want, to, talk, h, ##yp, ##oc, ##rit...","[if, you, want, to, talk, h, ##yp, ##oc, ##rit...","[101, 2065, 2017, 2215, 2000, 2831, 1044, 2257...","[101, 2065, 2017, 2215, 2000, 2831, 1044, 2257...",1
3,that you were shocked by the backlash only fur...,"[that, you, were, shocked, by, the, backlash, ...","[that, you, were, shocked, by, the, backlash, ...","[101, 2008, 2017, 2020, 7135, 2011, 1996, 2574...","[101, 2008, 2017, 2020, 7135, 2011, 1996, 2574...",1
4,you all suck shapiros cock and lick his mother...,"[you, all, suck, shapiro, ##s, cock, and, lick...","[you, all, [MASK], shapiro, ##s, [MASK], and, ...","[101, 2017, 2035, 103, 24630, 2015, 103, 1998,...","[101, 2017, 2035, 11891, 24630, 2015, 10338, 1...",1
...,...,...,...,...,...,...
270775,"i think you , other like minded individuals ge...","[i, think, you, ,, other, like, minded, indivi...","[i, think, you, ,, other, like, minded, indivi...","[101, 1045, 2228, 2017, 1010, 2060, 2066, 1312...","[101, 1045, 2228, 2017, 1010, 2060, 2066, 1312...",0
270776,a lot of people will be wondering what they we...,"[a, lot, of, people, will, be, wondering, what...","[a, lot, of, people, will, be, wondering, what...","[101, 1037, 2843, 1997, 2111, 2097, 2022, 6603...","[101, 1037, 2843, 1997, 2111, 2097, 2022, 6603...",0
270777,they have endured saber rattling by the us for...,"[they, have, endured, saber, rattling, by, the...","[they, have, endured, saber, rattling, by, the...","[101, 2027, 2031, 16753, 25653, 26347, 2011, 1...","[101, 2027, 2031, 16753, 25653, 26347, 2011, 1...",0
270778,the cbc has been totally silent on reporting a...,"[the, cbc, has, been, totally, silent, on, rep...","[the, cbc, has, been, totally, silent, on, rep...","[101, 1996, 13581, 2038, 2042, 6135, 4333, 200...","[101, 1996, 13581, 2038, 2042, 6135, 4333, 200...",0


In [None]:
# Cache the fully preprocessed DataFrame on disk as a pickle file.
df.to_pickle("../data/data.pkl")

In [None]:
# Reload the cached DataFrame (only unpickle files you created yourself).
df = pd.read_pickle("../data/data.pkl")

In [None]:
df

Unnamed: 0,text,tokenized_text,masked_text,masked_encoded_text,encoded_text,label
0,"fuck you , you nixonian twit .\n","[fuck, you, ,, you, nixon, ##ian, t, ##wi, ##t...","[[MASK], you, ,, you, nixon, ##ian, t, ##wi, #...","[101, 103, 2017, 1010, 2017, 11296, 2937, 1056...","[101, 6616, 2017, 1010, 2017, 11296, 2937, 105...",1
1,"just another vicious and trashy antitrump , pr...","[just, another, vicious, and, trash, ##y, anti...","[just, another, vicious, and, [MASK], ##y, ant...","[101, 2074, 2178, 13925, 1998, 103, 2100, 3424...","[101, 2074, 2178, 13925, 1998, 11669, 2100, 34...",1
2,"if you want to talk hypocrites , bauer , im up...","[if, you, want, to, talk, h, ##yp, ##oc, ##rit...","[if, you, want, to, talk, h, ##yp, ##oc, ##rit...","[101, 2065, 2017, 2215, 2000, 2831, 1044, 2257...","[101, 2065, 2017, 2215, 2000, 2831, 1044, 2257...",1
3,that you were shocked by the backlash only fur...,"[that, you, were, shocked, by, the, backlash, ...","[that, you, were, shocked, by, the, backlash, ...","[101, 2008, 2017, 2020, 7135, 2011, 1996, 2574...","[101, 2008, 2017, 2020, 7135, 2011, 1996, 2574...",1
4,you all suck shapiros cock and lick his mother...,"[you, all, suck, shapiro, ##s, cock, and, lick...","[you, all, [MASK], shapiro, ##s, [MASK], and, ...","[101, 2017, 2035, 103, 24630, 2015, 103, 1998,...","[101, 2017, 2035, 11891, 24630, 2015, 10338, 1...",1
...,...,...,...,...,...,...
270775,"i think you , other like minded individuals ge...","[i, think, you, ,, other, like, minded, indivi...","[i, think, you, ,, other, like, minded, indivi...","[101, 1045, 2228, 2017, 1010, 2060, 2066, 1312...","[101, 1045, 2228, 2017, 1010, 2060, 2066, 1312...",0
270776,a lot of people will be wondering what they we...,"[a, lot, of, people, will, be, wondering, what...","[a, lot, of, people, will, be, wondering, what...","[101, 1037, 2843, 1997, 2111, 2097, 2022, 6603...","[101, 1037, 2843, 1997, 2111, 2097, 2022, 6603...",0
270777,they have endured saber rattling by the us for...,"[they, have, endured, saber, rattling, by, the...","[they, have, endured, saber, rattling, by, the...","[101, 2027, 2031, 16753, 25653, 26347, 2011, 1...","[101, 2027, 2031, 16753, 25653, 26347, 2011, 1...",0
270778,the cbc has been totally silent on reporting a...,"[the, cbc, has, been, totally, silent, on, rep...","[the, cbc, has, been, totally, silent, on, rep...","[101, 1996, 13581, 2038, 2042, 6135, 4333, 200...","[101, 1996, 13581, 2038, 2042, 6135, 4333, 200...",0


In [None]:
# Shuffle once with a fixed seed, then cut 60% / 20% / 20% into train, val and test.
# Positional slicing replaces np.split, whose DataFrame support goes through
# DataFrame.swapaxes (deprecated in recent pandas releases).
SPLIT_SEED = 42
shuffled_df = df.sample(frac=1, random_state=SPLIT_SEED)
train_end, val_end = int(.6*len(df)), int(.8*len(df))
# Copies, so that a later column assignment on one split never writes through
# a view of the shuffled frame.
train = shuffled_df.iloc[:train_end].copy()
val = shuffled_df.iloc[train_end:val_end].copy()
test = shuffled_df.iloc[val_end:].copy()

In [None]:
class BERTFineTuneDataset(torch.utils.data.Dataset):
  """Fixed-length tensors for masked-LM fine-tuning, built from a DataFrame.

  Reads the ``encoded_text``, ``masked_encoded_text`` and ``label`` columns of
  ``df``. Every id sequence is right-padded with 0 or truncated so that each
  item has exactly ``maxlen`` entries. ``df`` itself is not modified.

  Each item is a dict with the keys ``encoded_text``, ``masked_encoded_text``,
  ``attention_mask`` (1 where the masked ids are non-zero, else 0) and
  ``labels`` (the row's 0/1 label repeated ``maxlen`` times, int32).
  """

  def __init__(self,df, maxlen = 64):
    super(BERTFineTuneDataset, self).__init__()
    self._maxlen = maxlen

    self._encoded_text = self._to_fixed_length(df['encoded_text'], maxlen)
    self._masked_encoded_text = self._to_fixed_length(df['masked_encoded_text'], maxlen)
    # Attend to every non-zero id; 0 is the padding value used above.
    self._attention_mask = (self._masked_encoded_text != 0).long()

    # One row of `maxlen` identical values per example (1 if label == 1 else 0),
    # built without writing a helper column back into the caller's DataFrame.
    flags = torch.tensor([1 if x == 1 else 0 for x in df['label']], dtype=torch.int32)
    self._labels = flags.unsqueeze(1).repeat(1, maxlen)

  @staticmethod
  def _to_fixed_length(sequences, maxlen):
    """Stack id sequences into a (len(sequences), maxlen) int64 tensor, zero-padded or truncated."""
    out = torch.zeros((len(sequences), maxlen), dtype=torch.long)
    for row, ids in enumerate(sequences):
      ids = list(ids)[:maxlen]
      if ids:
        out[row, :len(ids)] = torch.tensor(ids, dtype=torch.long)
    return out

  def __len__(self):
    return len(self._labels)

  def __getitem__(self, idx):
    # All tensors already have `maxlen` columns; the slices are kept so the
    # returned shapes stay bounded by `maxlen`.
    return {
        'encoded_text': self._encoded_text[idx][:self._maxlen],
        'masked_encoded_text': self._masked_encoded_text[idx][:self._maxlen],
        'attention_mask': self._attention_mask[idx][:self._maxlen],
        'labels': self._labels[idx][:self._maxlen]
    }

In [None]:
# Build one dataset per split and wrap each in a DataLoader with batches of 32.
train_dataset = BERTFineTuneDataset(train)
val_dataset = BERTFineTuneDataset(val)
test_dataset = BERTFineTuneDataset(test)

# NOTE(review): the val and test loaders are shuffled too; shuffling does not
# change an averaged loss, so shuffle=False would also work there.
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=32, shuffle=True)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=True)

In [None]:
# Pickle the three DataLoader objects (including their datasets) to disk.
# Pickle stores the dataset class by reference, so BERTFineTuneDataset must be
# defined in the session before any of these files can be loaded again.
with open("../data/train_dataloader.pkl", 'wb') as f:
    pickle.dump(train_dataloader, f)

with open("../data/val_dataloader.pkl", 'wb') as f:
    pickle.dump(val_dataloader, f)

with open("../data/test_dataloader.pkl", 'wb') as f:
    pickle.dump(test_dataloader, f)

In [None]:
# Reload the pickled training DataLoader.
# The class BERTFineTuneDataset must already be defined in this session,
# otherwise unpickling fails with "Can't get attribute 'BERTFineTuneDataset'".
# pickle.load can execute arbitrary code: only load files you created yourself.
with open("../data/train_dataloader.pkl", 'rb') as f:
    train_dataloader = pickle.load(f)

AttributeError: Can't get attribute 'BERTFineTuneDataset' on <module '__main__'>

In [None]:
# Sanity check: print the 'labels' tensor and the batch size of the first batch only.
for batch in train_dataloader:
    print(batch["labels"])
    print(len(batch["labels"]))
    break

tensor([[0, 0, 0,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1]], dtype=torch.int32)
32


In [None]:
#fine tuning bert model

import torch
from transformers import BertTokenizer, BertForMaskedLM
from transformers import AdamW
from tqdm import tqdm

# Start again from the pretrained weights.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')

# Create the device object once instead of on every batch.
train_device = torch.device('cuda')
model.to(train_device)

model.train()
optimizer = AdamW(model.parameters(), lr=1e-5)

epochs = 5

# Loop over `epochs`; the previous hard-coded range(10) ignored the setting above.
for epoch in range(epochs):
    loop = tqdm(train_dataloader, leave=True)
    for batch in loop:
        optimizer.zero_grad()
        input_ids = batch['masked_encoded_text'].to(train_device)
        attention_mask = batch['attention_mask'].to(train_device)
        # Targets are the unmasked ids.
        # NOTE(review): padding positions (id 0) are part of the targets; the
        # masked-LM loss only skips label -100. Confirm this is intended.
        labels = batch['encoded_text'].to(train_device)
        # The per-sentence 0/1 style label is fed to BERT as token_type_ids.
        segment_ids = batch['labels'].to(train_device)
        outputs = model(input_ids, attention_mask=attention_mask,token_type_ids = segment_ids, labels=labels)
        loss = outputs[0]
        loss.backward()
        optimizer.step()

        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Epoch 0: 100%|██████████| 5078/5078 [13:38<00:00,  6.21it/s, loss=0.0794]
Epoch 1: 100%|██████████| 5078/5078 [13:37<00:00,  6.21it/s, loss=0.0355] 
Epoch 2: 100%|██████████| 5078/5078 [13:38<00:00,  6.20it/s, loss=7.31e-5]
Epoch 3:   3%|▎         | 134/5078 [00:21<13:15,  6.21it/s, loss=0.0346]


KeyboardInterrupt: 

In [14]:
#fine tuning bert model

import os
import torch
from transformers import BertTokenizer, BertForMaskedLM
from transformers import AdamW
from tqdm import tqdm

# Start again from the pretrained weights.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')

# Create the device object once instead of on every batch.
train_device = torch.device('cuda')
model.to(train_device)

model.train()
optimizer = AdamW(model.parameters(), lr=1e-5)

epochs = 5

for epoch in range(epochs):
    loop = tqdm(train_dataloader, leave=True)
    for batch in loop:
        optimizer.zero_grad()
        input_ids = batch['masked_encoded_text'].to(train_device)
        attention_mask = batch['attention_mask'].to(train_device)
        # Targets are the unmasked ids.
        # NOTE(review): padding positions (id 0) are part of the targets; the
        # masked-LM loss only skips label -100. Confirm this is intended.
        labels = batch['encoded_text'].to(train_device)
        # The per-sentence 0/1 style label is fed to BERT as token_type_ids.
        segment_ids = batch['labels'].to(train_device)
        outputs = model(input_ids, attention_mask=attention_mask,token_type_ids = segment_ids, labels=labels)
        loss = outputs[0]
        loss.backward()
        optimizer.step()

        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

    # Checkpoint the whole model after every epoch. Create the target directory
    # first so a missing folder cannot abort the run after an epoch of training.
    path = "../models/bert_detox_ft_"+str(epoch+1)+"epochs.pth"
    os.makedirs(os.path.dirname(path), exist_ok=True)
    torch.save(model, path)
    

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Epoch 0: 100%|██████████| 5078/5078 [14:27<00:00,  5.85it/s, loss=0.0174]
Epoch 1: 100%|██████████| 5078/5078 [14:34<00:00,  5.81it/s, loss=0.0618] 
Epoch 2: 100%|██████████| 5078/5078 [14:23<00:00,  5.88it/s, loss=0.0265] 
Epoch 3: 100%|██████████| 5078/5078 [14:13<00:00,  5.95it/s, loss=0.0249] 
Epoch 4: 100%|██████████| 5078/5078 [14:13<00:00,  5.95it/s, loss=0.

In [None]:
# Save the fine-tuned model in Hugging Face format.
# NOTE(review): the directory name ends in "3" while the training cell above runs
# `epochs = 5`, and it lives under "../model/" whereas the per-epoch checkpoints
# go to "../models/" — confirm which name and directory are intended.
model.save_pretrained("../model/BERT_finetuned3")



In [None]:
# Pickle the whole model object with torch.save.
# NOTE(review): the file name says "3epochs"; verify that it matches the number
# of epochs the in-memory model was actually trained for.
torch.save(model, "../model/bert_detox_ft_3epochs.pth")

In [None]:
# validation code
model.eval()
val_loss = 0
eval_device = torch.device('cuda')
# Evaluation needs no gradients; no_grad stops autograd from recording the
# forward pass, which saves memory and time.
with torch.no_grad():
    for batch in val_dataloader:
        input_ids = batch['masked_encoded_text'].to(eval_device)
        attention_mask = batch['attention_mask'].to(eval_device)
        labels = batch['encoded_text'].to(eval_device)
        # The per-sentence 0/1 style label is fed to BERT as token_type_ids.
        segment_ids = batch['labels'].to(eval_device)
        outputs = model(input_ids, attention_mask=attention_mask,token_type_ids = segment_ids, labels=labels)
        loss = outputs[0]
        val_loss += loss.item()

# Mean of the per-batch losses.
print(val_loss/len(val_dataloader))


In [None]:
# test code
model.eval()
test_loss = 0
eval_device = torch.device('cuda')
# Evaluation needs no gradients; no_grad stops autograd from recording the
# forward pass, which saves memory and time.
with torch.no_grad():
    for batch in test_dataloader:
        input_ids = batch['masked_encoded_text'].to(eval_device)
        attention_mask = batch['attention_mask'].to(eval_device)
        labels = batch['encoded_text'].to(eval_device)
        # The per-sentence 0/1 style label is fed to BERT as token_type_ids.
        segment_ids = batch['labels'].to(eval_device)
        outputs = model(input_ids, attention_mask=attention_mask,token_type_ids = segment_ids, labels=labels)
        loss = outputs[0]
        test_loss += loss.item()

# Mean of the per-batch losses.
print(test_loss/len(test_dataloader))



### Fine-tuning BERT with one randomly masked token per sentence

In [None]:
import random

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Build two new columns: the token list with one random token replaced by
# '[MASK]', and its id encoding.
masked_random_encoding = []
masked_random = []

for tokens in df["tokenized_text"]:
    masked_tokens = tokens.copy()
    # randrange(len) draws uniformly from every position. The former
    # randint(1, len-1) could never pick the first token and raised ValueError
    # for single-token sentences. Empty token lists are left unmasked.
    if masked_tokens:
        masked_tokens[random.randrange(len(masked_tokens))] = "[MASK]"
    masked_random.append(masked_tokens)

    masked_random_encoding.append(tokenizer.encode(masked_tokens))

df["masked_random_encoding"] = masked_random_encoding
df["masked_random"] = masked_random


In [None]:
df

Unnamed: 0,text,tokenized_text,masked_text,masked_encoded_text,encoded_text,label,masked_random_encoding,masked_random
0,"fuck you , you nixonian twit .\n","[fuck, you, ,, you, nixon, ##ian, t, ##wi, ##t...","[[MASK], you, ,, you, nixon, ##ian, t, ##wi, #...","[101, 103, 2017, 1010, 2017, 11296, 2937, 1056...","[101, 6616, 2017, 1010, 2017, 11296, 2937, 105...",1,"[101, 6616, 103, 1010, 2017, 11296, 2937, 1056...","[fuck, [MASK], ,, you, nixon, ##ian, t, ##wi, ..."
1,"just another vicious and trashy antitrump , pr...","[just, another, vicious, and, trash, ##y, anti...","[just, another, vicious, and, [MASK], ##y, ant...","[101, 2074, 2178, 13925, 1998, 103, 2100, 3424...","[101, 2074, 2178, 13925, 1998, 11669, 2100, 34...",1,"[101, 2074, 2178, 13925, 1998, 11669, 2100, 34...","[just, another, vicious, and, trash, ##y, anti..."
2,"if you want to talk hypocrites , bauer , im up...","[if, you, want, to, talk, h, ##yp, ##oc, ##rit...","[if, you, want, to, talk, h, ##yp, ##oc, ##rit...","[101, 2065, 2017, 2215, 2000, 2831, 1044, 2257...","[101, 2065, 2017, 2215, 2000, 2831, 1044, 2257...",1,"[101, 2065, 2017, 2215, 2000, 2831, 1044, 2257...","[if, you, want, to, talk, h, ##yp, ##oc, [MASK..."
3,that you were shocked by the backlash only fur...,"[that, you, were, shocked, by, the, backlash, ...","[that, you, were, shocked, by, the, backlash, ...","[101, 2008, 2017, 2020, 7135, 2011, 1996, 2574...","[101, 2008, 2017, 2020, 7135, 2011, 1996, 2574...",1,"[101, 2008, 2017, 2020, 7135, 2011, 1996, 2574...","[that, you, were, shocked, by, the, backlash, ..."
4,you all suck shapiros cock and lick his mother...,"[you, all, suck, shapiro, ##s, cock, and, lick...","[you, all, [MASK], shapiro, ##s, [MASK], and, ...","[101, 2017, 2035, 103, 24630, 2015, 103, 1998,...","[101, 2017, 2035, 11891, 24630, 2015, 10338, 1...",1,"[101, 2017, 2035, 11891, 24630, 103, 10338, 19...","[you, all, suck, shapiro, [MASK], cock, and, l..."
...,...,...,...,...,...,...,...,...
270775,"i think you , other like minded individuals ge...","[i, think, you, ,, other, like, minded, indivi...","[i, think, you, ,, other, like, minded, indivi...","[101, 1045, 2228, 2017, 1010, 2060, 2066, 1312...","[101, 1045, 2228, 2017, 1010, 2060, 2066, 1312...",0,"[101, 1045, 2228, 2017, 1010, 2060, 2066, 1312...","[i, think, you, ,, other, like, minded, indivi..."
270776,a lot of people will be wondering what they we...,"[a, lot, of, people, will, be, wondering, what...","[a, lot, of, people, will, be, wondering, what...","[101, 1037, 2843, 1997, 2111, 2097, 2022, 6603...","[101, 1037, 2843, 1997, 2111, 2097, 2022, 6603...",0,"[101, 1037, 2843, 1997, 2111, 2097, 2022, 6603...","[a, lot, of, people, will, be, wondering, what..."
270777,they have endured saber rattling by the us for...,"[they, have, endured, saber, rattling, by, the...","[they, have, endured, saber, rattling, by, the...","[101, 2027, 2031, 16753, 25653, 26347, 2011, 1...","[101, 2027, 2031, 16753, 25653, 26347, 2011, 1...",0,"[101, 2027, 2031, 16753, 25653, 26347, 2011, 1...","[they, have, endured, saber, rattling, by, the..."
270778,the cbc has been totally silent on reporting a...,"[the, cbc, has, been, totally, silent, on, rep...","[the, cbc, has, been, totally, silent, on, rep...","[101, 1996, 13581, 2038, 2042, 6135, 4333, 200...","[101, 1996, 13581, 2038, 2042, 6135, 4333, 200...",0,"[101, 1996, 13581, 2038, 2042, 6135, 103, 2006...","[the, cbc, has, been, totally, [MASK], on, rep..."


In [None]:
# Shuffle once with a fixed seed, then cut 60% / 20% / 20% into train, val and test.
# Positional slicing replaces np.split, whose DataFrame support goes through
# DataFrame.swapaxes (deprecated in recent pandas releases).
RAND_SPLIT_SEED = 42
shuffled_rand_df = df.sample(frac=1, random_state=RAND_SPLIT_SEED)
rand_train_end, rand_val_end = int(.6*len(df)), int(.8*len(df))
# Copies, so that a later column assignment on one split never writes through
# a view of the shuffled frame.
train_rand = shuffled_rand_df.iloc[:rand_train_end].copy()
val_rand = shuffled_rand_df.iloc[rand_train_end:rand_val_end].copy()
test_rand = shuffled_rand_df.iloc[rand_val_end:].copy()

# save the data in pickle file (only the training split is written)
with open("../data/train_rand.pkl", 'wb') as f:
    pickle.dump(train_rand, f)

In [None]:
class BERTFineTuneRandDataset(torch.utils.data.Dataset):
  """Fixed-length tensors for masked-LM fine-tuning on randomly masked sentences.

  Reads the ``encoded_text``, ``masked_random_encoding`` and ``label`` columns
  of ``df``. Every id sequence is right-padded with 0 or truncated so that each
  item has exactly ``maxlen`` entries. ``df`` itself is not modified.

  Each item is a dict with the keys ``encoded_text``,
  ``masked_random_encoding``, ``attention_mask`` (1 where the masked ids are
  non-zero, else 0) and ``labels`` (the row's 0/1 label repeated ``maxlen``
  times, int32).
  """

  def __init__(self,df, maxlen = 64):
    super(BERTFineTuneRandDataset, self).__init__()
    self._maxlen = maxlen

    self._encoded_text = self._to_fixed_length(df['encoded_text'], maxlen)
    self._masked_encoded_text = self._to_fixed_length(df['masked_random_encoding'], maxlen)
    # Attend to every non-zero id; 0 is the padding value used above.
    self._attention_mask = (self._masked_encoded_text != 0).long()

    # One row of `maxlen` identical values per example (1 if label == 1 else 0),
    # built without writing a helper column back into the caller's DataFrame.
    flags = torch.tensor([1 if x == 1 else 0 for x in df['label']], dtype=torch.int32)
    self._labels = flags.unsqueeze(1).repeat(1, maxlen)

  @staticmethod
  def _to_fixed_length(sequences, maxlen):
    """Stack id sequences into a (len(sequences), maxlen) int64 tensor, zero-padded or truncated."""
    out = torch.zeros((len(sequences), maxlen), dtype=torch.long)
    for row, ids in enumerate(sequences):
      ids = list(ids)[:maxlen]
      if ids:
        out[row, :len(ids)] = torch.tensor(ids, dtype=torch.long)
    return out

  def __len__(self):
    return len(self._labels)

  def __getitem__(self, idx):
    # All tensors already have `maxlen` columns; the slices are kept so the
    # returned shapes stay bounded by `maxlen`.
    return {
        'encoded_text': self._encoded_text[idx][:self._maxlen],
        'masked_random_encoding': self._masked_encoded_text[idx][:self._maxlen],
        'attention_mask': self._attention_mask[idx][:self._maxlen],
        'labels': self._labels[idx][:self._maxlen]
    }

In [None]:
# Datasets and loaders for the random-mask experiment.
# BERTFineTuneRandDataset reads the 'masked_random_encoding' column; the
# previously used BERTFineTuneDataset reads 'masked_encoded_text' and therefore
# ignored the random masks entirely.
train_dataset_rand = BERTFineTuneRandDataset(train_rand)
val_dataset_rand = BERTFineTuneRandDataset(val_rand)
test_dataset_rand = BERTFineTuneRandDataset(test_rand)

train_dataloader_rand = torch.utils.data.DataLoader(train_dataset_rand, batch_size=32, shuffle=True)
val_dataloader_rand = torch.utils.data.DataLoader(val_dataset_rand, batch_size=32, shuffle=True)
test_dataloader_rand = torch.utils.data.DataLoader(test_dataset_rand, batch_size=32, shuffle=True)



In [None]:
# Pickle the three random-mask DataLoader objects (including their datasets).
# Pickle stores the dataset class by reference, so that class must be defined
# in the session before any of these files can be loaded again.
with open("../data/train_dataloader_rand.pkl", 'wb') as f:
    pickle.dump(train_dataloader_rand, f)

with open("../data/val_dataloader_rand.pkl", 'wb') as f:
    pickle.dump(val_dataloader_rand, f)

with open("../data/test_dataloader_rand.pkl", 'wb') as f:
    pickle.dump(test_dataloader_rand, f)

In [None]:
#fine tuning bert model

import os
import torch
from transformers import BertTokenizer, BertForMaskedLM
from transformers import AdamW
from tqdm import tqdm

# Start again from the pretrained weights.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')

# Create the device object once instead of on every batch.
train_device = torch.device('cuda')
model.to(train_device)

model.train()
optimizer = AdamW(model.parameters(), lr=1e-5)

epochs = 10

for epoch in range(epochs):
    # Train on the random-mask loader. Iterating `train_dataloader` here merely
    # repeated the word-list experiment under a different checkpoint name.
    loop = tqdm(train_dataloader_rand, leave=True)
    for batch in loop:
        optimizer.zero_grad()
        # BERTFineTuneRandDataset names its inputs 'masked_random_encoding',
        # BERTFineTuneDataset names them 'masked_encoded_text'; accept either.
        masked_key = 'masked_random_encoding' if 'masked_random_encoding' in batch else 'masked_encoded_text'
        input_ids = batch[masked_key].to(train_device)
        attention_mask = batch['attention_mask'].to(train_device)
        # Targets are the unmasked ids.
        # NOTE(review): padding positions (id 0) are part of the targets; the
        # masked-LM loss only skips label -100. Confirm this is intended.
        labels = batch['encoded_text'].to(train_device)
        # The per-sentence 0/1 style label is fed to BERT as token_type_ids.
        segment_ids = batch['labels'].to(train_device)
        outputs = model(input_ids, attention_mask=attention_mask,token_type_ids = segment_ids, labels=labels)
        loss = outputs[0]
        loss.backward()
        optimizer.step()

        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

    # Checkpoint the whole model after every epoch. Create the target directory
    # first so a missing folder cannot abort the run after an epoch of training.
    # NOTE(review): this path is relative to "./model/", while the other cells
    # save under "../models/" and "../model/" — confirm the intended location.
    path = "./model/bert_detox_rand_ft_"+str(epoch+1)+"epochs.pth"
    os.makedirs(os.path.dirname(path), exist_ok=True)
    torch.save(model, path)
    