**This notebook contains the code for the CSoNet-2021 paper "Detecting Hate Speech Contents Using Embedding Models".** 

**The resources for this paper are available [here](https://github.com/duonghuuphuc/hate-speech-detection).**

# **Download Dataset**

In [None]:
!mkdir datasets

mkdir: cannot create directory ‘datasets’: File exists


## HASOC-2019

In [None]:
!mkdir ./datasets/HASOC2019
!wget -O ./datasets/HASOC2019/english_dataset.tsv https://raw.githubusercontent.com/socialmediaie/HASOC2019/master/data/raw/training_data/english_dataset.tsv
!wget -O ./datasets/HASOC2019/test_english_dataset.tsv https://raw.githubusercontent.com/socialmediaie/HASOC2019/master/data/raw/test_data_gold/english_data.tsv

mkdir: cannot create directory ‘./datasets/HASOC2019’: File exists
--2021-07-09 03:12:22--  https://raw.githubusercontent.com/socialmediaie/HASOC2019/master/data/raw/training_data/english_dataset.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1223457 (1.2M) [text/plain]
Saving to: ‘./datasets/HASOC2019/english_dataset.tsv’


2021-07-09 03:12:22 (25.1 MB/s) - ‘./datasets/HASOC2019/english_dataset.tsv’ saved [1223457/1223457]

--2021-07-09 03:12:22--  https://raw.githubusercontent.com/socialmediaie/HASOC2019/master/data/raw/test_data_gold/english_data.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.1

## Davidson-2017

In [None]:
!mkdir ./datasets/Davidson2017
!wget -O ./datasets/Davidson2017/labeled_data.csv https://raw.githubusercontent.com/t-davidson/hate-speech-and-offensive-language/master/data/labeled_data.csv

mkdir: cannot create directory ‘./datasets/Davidson2017’: File exists
--2021-07-09 03:12:36--  https://raw.githubusercontent.com/t-davidson/hate-speech-and-offensive-language/master/data/labeled_data.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2546446 (2.4M) [text/plain]
Saving to: ‘./datasets/Davidson2017/labeled_data.csv’


2021-07-09 03:12:36 (33.8 MB/s) - ‘./datasets/Davidson2017/labeled_data.csv’ saved [2546446/2546446]



## HS2-2021

In [None]:
!mkdir ./datasets/HS2-2021

mkdir: cannot create directory ‘./datasets/HS2-2021’: File exists


# **Import Library & Package**

In [None]:
!pip3 install transformers



In [None]:
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler
from tqdm import tqdm
from transformers import  AutoConfig, AutoTokenizer, AutoModelForSequenceClassification, AdamW
from sklearn import metrics
from sklearn.model_selection import KFold, train_test_split

import csv
import copy
import torch
import time
import logging

In [None]:
# Root logger that prints bare messages (no level/timestamp prefix) at INFO and above.
logger = logging.getLogger()
logging.basicConfig(level=logging.INFO, format='%(message)s', filemode='a')
# Set the level explicitly as well as through basicConfig.
logger.setLevel(logging.INFO)

# **Data Preprocess**

In [None]:
def read_tsv(file_path, text_idx, class_idx, delimiter='\t'):
    """Read (text, label) pairs from a delimited text file.

    The first row is treated as a header and skipped. Both extracted fields
    are stripped of surrounding whitespace.

    Args:
        file_path: Path of the file to read.
        text_idx: Column index of the text field.
        class_idx: Column index of the label field.
        delimiter: Field separator passed to ``csv.reader``.

    Returns:
        A list of ``(text, label)`` string tuples; empty when the file has no
        data rows (or no rows at all).

    Raises:
        IndexError: If a non-blank row has fewer columns than the requested
            indices require.
    """
    # newline='' is what the csv module expects so that newlines embedded in
    # quoted fields are parsed correctly; utf-8 avoids depending on the
    # platform's default encoding.
    with open(file_path, 'r', encoding='utf-8', newline='') as f:
        tsv_reader = csv.reader(f, delimiter=delimiter)
        # Skip the header; the default keeps an empty file from raising StopIteration.
        next(tsv_reader, None)
        return [
            (row[text_idx].strip(), row[class_idx].strip())
            for row in tsv_reader
            if row  # csv.reader yields [] for blank lines; skip them
        ]

In [None]:
class BertweetDataset(Dataset):
    """Dataset of pre-tokenized (text, label) samples loaded from a delimited file.

    All samples are read with ``read_tsv`` and tokenized once at construction
    time. Each stored example is a tuple
    ``(input_ids, token_type_ids, attention_mask, label_id)``.

    Args:
        data_file_path: Path of the delimited data file.
        bert_tokenizer: Callable tokenizer accepting ``text``, ``max_length``,
            ``padding`` and ``truncation`` and returning a mapping that
            contains at least ``input_ids`` and ``attention_mask``.
        label_map: Mapping from the raw label string to an integer class id.
        text_idx: Column index of the text field.
        class_idx: Column index of the label field.
        delimiter: Field separator of the data file.
        batch_size: Number of samples tokenized per tokenizer call (this is
            not the DataLoader batch size).
        max_length: Sequences are padded/truncated to this many tokens.
    """

    def __init__(self, data_file_path, bert_tokenizer, label_map, text_idx, class_idx, delimiter='\t', batch_size=32, max_length=128):
        self.file_path = data_file_path
        self.text_idx = text_idx
        self.class_idx = class_idx
        self.delimiter = delimiter
        self.bz = batch_size
        self.max_len = max_length
        self.label_map = label_map
        self.examples = []
        self.create_examples(bert_tokenizer)

    def __len__(self):
        """Return the number of tokenized examples."""
        return len(self.examples)

    def __getitem__(self, index):
        """Return one example as four ``torch.long`` tensors.

        Order: token ids, token type ids, attention mask, label id.
        """
        input_ids, token_type_ids, attention_mask, label_id = self.examples[index]
        return (
            torch.tensor(input_ids, dtype=torch.long),
            torch.tensor(token_type_ids, dtype=torch.long),
            torch.tensor(attention_mask, dtype=torch.long),
            torch.tensor(label_id, dtype=torch.long),
        )

    def create_examples(self, bert_tokenizer):
        """Read the data file, tokenize it in chunks and fill ``self.examples``.

        Raises:
            KeyError: If a label in the file is missing from ``label_map``.
        """
        samples = read_tsv(self.file_path, self.text_idx, self.class_idx, self.delimiter)
        # The context manager closes the progress bar once tokenization is done.
        with tqdm(total=len(samples), position=0) as pbar:
            for start in range(0, len(samples), self.bz):
                batch_samples = samples[start:start + self.bz]
                texts, labels = zip(*batch_samples)
                label_ids = [self.label_map[label] for label in labels]
                encoded_inputs = bert_tokenizer(text=texts, max_length=self.max_len, padding='max_length', truncation='longest_first')
                # Read fields by name rather than by the tokenizer's output
                # order, so the tuple layout used by __getitem__ is guaranteed.
                input_ids = encoded_inputs["input_ids"]
                attention_masks = encoded_inputs["attention_mask"]
                if "token_type_ids" in encoded_inputs:
                    token_type_ids = encoded_inputs["token_type_ids"]
                else:
                    # Tokenizer emits no segment ids: fall back to all-zero segments.
                    token_type_ids = [[0] * len(ids) for ids in input_ids]
                self.examples.extend(zip(input_ids, token_type_ids, attention_masks, label_ids))
                pbar.update(len(batch_samples))

# **Model**

In [None]:
def build_model(model_name, num_labels, task_name, lr, weight_decay, device='cuda'):
    """Create a sequence-classification model and its AdamW optimizer.

    Args:
        model_name: Name or path passed to ``from_pretrained``.
        num_labels: Number of target classes.
        task_name: Stored in the model config as ``finetuning_task``.
        lr: Learning rate for the optimizer.
        weight_decay: Weight decay applied to every parameter except those
            whose name contains ``"bias"`` or ``"LayerNorm.weight"``.
        device: Device the model is moved to before it is returned.

    Returns:
        A ``(model, optimizer)`` tuple.
    """
    config = AutoConfig.from_pretrained(model_name, num_labels=num_labels, finetuning_task=task_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)

    # Split parameters into "decayed" and "exempt from decay" in a single pass.
    exempt_markers = ("bias", "LayerNorm.weight")
    decayed_params, exempt_params = [], []
    for param_name, param in model.named_parameters():
        if any(marker in param_name for marker in exempt_markers):
            exempt_params.append(param)
        else:
            decayed_params.append(param)

    optimizer = AdamW(
        [
            {"params": decayed_params, "weight_decay": weight_decay},
            {"params": exempt_params, "weight_decay": 0.0},
        ],
        lr=lr,
    )
    model.to(device)
    return model, optimizer


In [None]:
def train(model, optimizer, data_iter, device='cuda'):
    """Train ``model`` for one pass over ``data_iter``.

    Args:
        model: Model called with ``input_ids``, ``attention_mask``,
            ``token_type_ids`` and ``labels``; its output must expose
            ``loss`` and ``logits``.
        optimizer: Optimizer stepped once per batch.
        data_iter: Iterable of batches
            ``(input_ids, token_type_ids, attention_mask, label_ids)``.
        device: Device every batch tensor is moved to.

    Returns:
        ``(model, mean_loss_per_batch, (accuracy, f1_macro, f1_weighted),
        elapsed_time_string)``.
    """
    model.train()
    train_bar = tqdm(data_iter, total=len(data_iter), desc='\tTRAIN:', position=0, leave=None)
    train_loss = 0
    train_preds, train_golds = [], []
    start_time = time.time()
    for batch in train_bar:
        # Always move the batch: comparing against the literal "cuda" skipped
        # the transfer for specs such as "cuda:0" or a torch.device object.
        input_ids, token_type_ids, att_masks, label_ids = (t.to(device) for t in batch)
        outputs = model(input_ids=input_ids, attention_mask=att_masks, token_type_ids=token_type_ids, labels=label_ids)
        outputs.loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        train_loss += outputs.loss.item()
        # One argmax over the class axis instead of one .item() call per row.
        train_preds += outputs.logits.argmax(dim=-1).tolist()
        train_golds += label_ids.tolist()
    acc_score, f1_macro, f1_weighted = caculate_score(train_golds, train_preds)
    train_loss = train_loss / len(data_iter)
    return model, train_loss, (acc_score, f1_macro, f1_weighted), get_total_time(start_time)

In [None]:
def eval(model, data_iter, device='cuda'):
    """Evaluate ``model`` on ``data_iter`` without updating its weights.

    NOTE: the name shadows Python's builtin ``eval``; it is kept because the
    experiment cells call it under this name.

    Args:
        model: Model called with ``input_ids``, ``attention_mask``,
            ``token_type_ids`` and ``labels``; its output must expose
            ``loss`` and ``logits``.
        data_iter: Iterable of batches
            ``(input_ids, token_type_ids, attention_mask, label_ids)``.
        device: Device every batch tensor is moved to.

    Returns:
        ``(mean_loss_per_batch, (accuracy, f1_macro, f1_weighted),
        elapsed_time_string)``.
    """
    model.eval()
    eval_bar = tqdm(data_iter, total=len(data_iter), desc='\tEVAL:', position=0, leave=None)
    eval_loss = 0
    eval_preds, eval_golds = [], []
    start_time = time.time()
    # No gradients are needed for evaluation; disabling autograd avoids
    # building the graph and keeps memory usage down.
    with torch.no_grad():
        for batch in eval_bar:
            # Always move the batch: comparing against the literal "cuda"
            # skipped the transfer for specs such as "cuda:0".
            input_ids, token_type_ids, att_masks, label_ids = (t.to(device) for t in batch)
            outputs = model(input_ids=input_ids, attention_mask=att_masks, token_type_ids=token_type_ids, labels=label_ids)
            eval_loss += outputs.loss.item()
            # One argmax over the class axis instead of one .item() call per row.
            eval_preds += outputs.logits.argmax(dim=-1).tolist()
            eval_golds += label_ids.tolist()
    acc_score, f1_macro, f1_weighted = caculate_score(eval_golds, eval_preds)
    eval_loss = eval_loss / len(data_iter)
    return eval_loss, (acc_score, f1_macro, f1_weighted), get_total_time(start_time)

In [None]:
def parse_time(run_time):
    """Format a duration in seconds as ``HH:MM:SS.ss``.

    Args:
        run_time: Non-negative duration in seconds (int or float).

    Returns:
        The duration as a string, e.g. ``"00:18:43.52"``. Hours are not
        capped, so durations of 100 hours or more use three or more digits.
    """
    # Round to whole centiseconds first so the carry propagates into minutes
    # and hours; formatting the float seconds directly could print "60.00".
    total_centis = int(round(run_time * 100))
    hours, rem = divmod(total_centis, 3600 * 100)
    minutes, rem = divmod(rem, 60 * 100)
    seconds, centis = divmod(rem, 100)
    return "{:0>2}:{:0>2}:{:02d}.{:02d}".format(hours, minutes, seconds, centis)

def get_total_time(start_time):
    """Return the time elapsed since ``start_time``, formatted by ``parse_time``.

    Args:
        start_time: A timestamp previously obtained from ``time.time()``.
    """
    return parse_time(time.time() - start_time)

def caculate_score(actuals, predicts):
    """Compute accuracy, macro-F1 and weighted-F1 for a set of predictions.

    Args:
        actuals: Gold class ids.
        predicts: Predicted class ids, aligned with ``actuals``.

    Returns:
        ``(accuracy, f1_macro, f1_weighted)``.
    """
    f1_by_average = {
        average: metrics.f1_score(actuals, predicts, average=average)
        for average in ("macro", "weighted")
    }
    accuracy = metrics.accuracy_score(actuals, predicts)
    return accuracy, f1_by_average["macro"], f1_by_average["weighted"]

# **Experiments**

## HASOC-2019

### Parameters

In [None]:
# --- Task / model identity ---
task_name, model_name = "HASOC", "vinai/bertweet-base"

# --- Dataset options ---
train_file_path = "./datasets/HASOC2019/english_dataset.tsv"
test_file_path = "./datasets/HASOC2019/test_english_dataset.tsv"
output_path = "./outputs"
text_idx, class_idx = 1, 2          # column positions of the text and label fields
delimiter = "\t"
label_maps = dict(NOT=0, HOF=1)     # raw label string -> class id

# --- Experiment options ---
num_exp = 5                         # independent repetitions of the whole k-fold run
kfold = 5
num_epochs = 10
train_batch_size, test_batch_size = 32, 16
max_length = 128                    # tokens per sequence after padding/truncation
weight_decay = 0.0
learning_rate = 1e-5
early_stop = 2                      # consecutive non-improving epochs before stopping

### Training

In [None]:
# Tokenizer with BERTweet's tweet normalization enabled.
tokenizer = AutoTokenizer.from_pretrained(model_name, normalization=True)
# Build the train and test datasets with identical settings; batch_size here
# only controls how many samples are tokenized per tokenizer call.
train_dataset, test_dataset = (
    BertweetDataset(
        path,
        tokenizer,
        label_maps,
        text_idx,
        class_idx,
        delimiter,
        batch_size=train_batch_size,
        max_length=max_length,
    )
    for path in (train_file_path, test_file_path)
)
test_iter = DataLoader(test_dataset, shuffle=True, batch_size=test_batch_size)

emoji is not installed, thus not converting emoticons or emojis into text. Please install emoji: pip3 install emoji
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
100%|██████████| 5852/5852 [00:02<00:00, 2195.22it/s]
100%|██████████| 1153/1153 [00:00<00:00, 2292.13it/s]


In [None]:
# Repeated k-fold experiment on HASOC-2019.
# For every repetition and fold: train a fresh model, keep the checkpoint with
# the best validation macro-F1, stop early when the validation loss has not
# improved for `early_stop` consecutive epochs, then score that checkpoint on
# the separate test set. Per-repetition averages are collected in exp_scores.
exp_scores = {"acc": [], "f1_macro": [], "f1_weighted": [], "exp_time": [], "epoch_time": [], "fold_time": []}
for exp in range(num_exp):
    exp_time = 0
    kf = KFold(n_splits=kfold, shuffle=True)
    avg_scores = {"acc": [], "f1_macro": [], "f1_weighted": []}
    data_idxs = list(range(len(train_dataset)))
    for idx, (train_idx, eval_idx) in enumerate(kf.split(data_idxs)):
        best_epoch, best_loss, best_score = 0, float("inf"), 0
        # Early-stopping patience counter. It must start at 0 for every fold;
        # otherwise a NaN loss in the first epoch hits an undefined (or stale) value.
        counter = 0
        model, optimizer = build_model(model_name, len(set(label_maps.values())), task_name, learning_rate, weight_decay)
        best_model = copy.deepcopy(model)
        train_iter = DataLoader(train_dataset, batch_size=train_batch_size, sampler=SubsetRandomSampler(train_idx))
        eval_iter = DataLoader(train_dataset, batch_size=test_batch_size, sampler=SubsetRandomSampler(eval_idx))
        fold_time = 0
        for epoch in range(num_epochs):
            epoch_time = time.time()
            logger.info(f"Exp: {exp+1} - Fold: {idx+1} - Epoch: {epoch+1}/{num_epochs}")
            model, train_loss, train_score, train_time = train(model, optimizer, train_iter)
            # Only the training pass is timed; evaluation is excluded.
            epoch_time = time.time() - epoch_time
            fold_time += epoch_time
            exp_scores["epoch_time"].append(epoch_time)
            eval_loss, eval_score, eval_time = eval(model, eval_iter)
            logger.info(f"\tEVAL  - Time: {eval_time}; AVG Loss: {eval_loss:.6f}; Accurancy: {eval_score[0]:.4f}; F1_maro: {eval_score[1]:.4f}; F1_weighted: {eval_score[2]:.4f}")
            # Model selection uses validation macro-F1 ...
            if best_score <= eval_score[1]:
                best_model = copy.deepcopy(model)
                best_score = eval_score[1]
                best_epoch = epoch
            # ... while early stopping tracks validation loss.
            if best_loss >= eval_loss:
                best_loss = eval_loss
                counter = 0
            else:
                counter += 1
            if counter >= early_stop:
                break
        exp_time += fold_time
        exp_scores["fold_time"].append(fold_time)
        logger.info(f"Test at epoch {best_epoch+1}:")
        test_loss, test_score, test_time = eval(best_model, test_iter)
        logger.info(f"\tTEST  - Time: {test_time}; AVG Loss: {test_loss:.6f}; Accurancy: {test_score[0]:.4f}; F1_maro: {test_score[1]:.4f}; F1_weighted: {test_score[2]:.4f}")
        avg_scores["acc"].append(test_score[0])
        avg_scores["f1_macro"].append(test_score[1])
        avg_scores["f1_weighted"].append(test_score[2])
    logger.info("Summary:")
    fold_acc = (sum(avg_scores['acc'])/kfold)
    fold_f1_macro = (sum(avg_scores['f1_macro'])/kfold)
    fold_f1_weighted = (sum(avg_scores['f1_weighted'])/kfold)
    logger.info(f"\tAccurancy: {fold_acc:.4f}")
    logger.info(f"\tF1 Macro: {fold_f1_macro:.4f}")
    logger.info(f"\tF1 Weighted: {fold_f1_weighted:.4f}")
    exp_scores["acc"].append(fold_acc)
    exp_scores["f1_macro"].append(fold_f1_macro)
    exp_scores["f1_weighted"].append(fold_f1_weighted)
    exp_scores["exp_time"].append(exp_time)

Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'roberta.pooler.dense.weight', 'lm_head.dense.bias', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: 

### Results

In [None]:
# Report the per-repetition scores with their mean, maximum and minimum.
for metric_label, metric_key in (("Accurancy", "acc"), ("F1 Macro", "f1_macro"), ("F1 Weighted", "f1_weighted")):
    metric_values = exp_scores[metric_key]
    logger.info(f"{metric_label}: {metric_values}")
    logger.info(f"AVG {metric_label}: {(sum(metric_values)/num_exp):.4f} - MAX: {max(metric_values):.4f} - MIN: {min(metric_values):.4f}")

# Report the raw timings and their averages.
for time_label, time_key in (("Exp time", "exp_time"), ("Fold time", "fold_time"), ("Epoch time", "epoch_time")):
    time_values = exp_scores[time_key]
    # Experiment time is averaged over num_exp; fold and epoch times over
    # however many entries were actually recorded.
    time_divisor = num_exp if time_key == "exp_time" else len(time_values)
    logger.info(f"{time_label}: {time_values}")
    logger.info(f"AVG {time_label}: {parse_time(sum(time_values)/time_divisor)}")

Accurancy: [0.8065915004336514, 0.83642671292281, 0.8333044232437121, 0.813009540329575, 0.8286209887250651]
AVG Accurancy: 0.8236 - MAX: 0.8364 - MIN: 0.8066
F1 Macro: [0.7610441573542819, 0.7826131453095382, 0.7837822762730859, 0.770088038643431, 0.7720094081083645]
AVG F1 Macro: 0.7739 - MAX: 0.7838 - MIN: 0.7610
F1 Weighted: [0.8126585132082272, 0.8366496424531341, 0.8355268213513305, 0.8194694700196188, 0.8284189288217293]
AVG F1 Weighted: 0.8265 - MAX: 0.8366 - MIN: 0.8127
Exp time: [1010.7412478923798, 1178.764492034912, 1123.4087042808533, 1124.2814745903015, 1180.3996152877808]
AVG Exp time: 00:18:43.52
Fold time: [224.52934098243713, 168.39970707893372, 168.44618725776672, 280.9198799133301, 168.4461326599121, 168.36570024490356, 280.6077060699463, 280.6714689731598, 224.54959201812744, 224.57002472877502, 224.54648089408875, 224.58594942092896, 280.9306650161743, 168.5754039287567, 224.77020502090454, 224.9931833744049, 224.6971890926361, 224.7620770931244, 224.8769922256469

## Davidson-2017

### Parameters

In [None]:
# --- Task / model identity ---
task_name, model_name = "Davidson", "vinai/bertweet-base"

# --- Dataset options ---
train_file_path = "./datasets/Davidson2017/labeled_data.csv"
output_path = "./outputs"
text_idx, class_idx = 6, 5          # column positions of the text and label fields
delimiter = ","
# Collapse the three source labels into a binary target: "0" and "1" -> 1, "2" -> 0.
label_maps = {
    "0": 1,
    "1": 1,
    "2": 0,
}
test_split = 0.1                    # fraction of samples held out as the test set

# --- Experiment options ---
num_exp = 5                         # independent repetitions of the whole k-fold run
kfold = 5
num_epochs = 10
train_batch_size, test_batch_size = 32, 16
max_length = 128                    # tokens per sequence after padding/truncation
weight_decay = 0.0
learning_rate = 1e-5
early_stop = 2                      # consecutive non-improving epochs before stopping

### Training

In [None]:
# Tokenizer with BERTweet's tweet normalization enabled.
tokenizer = AutoTokenizer.from_pretrained(model_name, normalization=True)
# The whole labelled file is tokenized once; train/validation/test subsets are
# selected from it later by index.
train_dataset = BertweetDataset(
    data_file_path=train_file_path,
    bert_tokenizer=tokenizer,
    label_map=label_maps,
    text_idx=text_idx,
    class_idx=class_idx,
    delimiter=delimiter,
    batch_size=train_batch_size,
    max_length=max_length,
)

emoji is not installed, thus not converting emoticons or emojis into text. Please install emoji: pip3 install emoji
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
100%|██████████| 24783/24783 [00:06<00:00, 3998.27it/s]


In [None]:
# Repeated k-fold experiment on Davidson-2017.
# For every repetition: hold out a random test split, run k-fold
# cross-validation on the remaining samples only, keep the checkpoint with the
# best validation macro-F1 per fold (early stopping on validation loss), and
# score that checkpoint on the held-out test split.
exp_scores = {"acc": [], "f1_macro": [], "f1_weighted": [], "exp_time": [], "epoch_time": [], "fold_time": []}
for exp in range(num_exp):
    exp_time = 0
    kf = KFold(n_splits=kfold, shuffle=True)
    avg_scores = {"acc": [], "f1_macro": [], "f1_weighted": []}
    data_idxs = list(range(len(train_dataset)))
    train_idxs, test_idxs = train_test_split(data_idxs, test_size=test_split, shuffle=True)
    test_iter = DataLoader(train_dataset, batch_size=test_batch_size, sampler=SubsetRandomSampler(test_idxs))
    # Split ONLY the non-test samples into folds. Splitting all of data_idxs
    # would put the held-out test samples into the training folds (leakage).
    for idx, (train_pos, eval_pos) in enumerate(kf.split(train_idxs)):
        # KFold.split yields positions within train_idxs; map them back to
        # dataset indices before sampling.
        train_idx = [train_idxs[i] for i in train_pos]
        eval_idx = [train_idxs[i] for i in eval_pos]
        best_epoch, best_loss, best_score = 0, float("inf"), 0
        # Early-stopping patience counter. It must start at 0 for every fold;
        # otherwise a NaN loss in the first epoch hits an undefined (or stale) value.
        counter = 0
        model, optimizer = build_model(model_name, len(set(label_maps.values())), task_name, learning_rate, weight_decay)
        best_model = copy.deepcopy(model)
        train_iter = DataLoader(train_dataset, batch_size=train_batch_size, sampler=SubsetRandomSampler(train_idx))
        eval_iter = DataLoader(train_dataset, batch_size=test_batch_size, sampler=SubsetRandomSampler(eval_idx))
        fold_time = 0
        for epoch in range(num_epochs):
            epoch_time = time.time()
            logger.info(f"Exp: {exp+1} - Fold: {idx+1} - Epoch: {epoch+1}/{num_epochs}")
            model, train_loss, train_score, train_time = train(model, optimizer, train_iter)
            # Only the training pass is timed; evaluation is excluded.
            epoch_time = time.time() - epoch_time
            fold_time += epoch_time
            exp_scores["epoch_time"].append(epoch_time)
            eval_loss, eval_score, eval_time = eval(model, eval_iter)
            logger.info(f"\tEVAL  - Time: {eval_time}; AVG Loss: {eval_loss:.6f}; Accurancy: {eval_score[0]:.4f}; F1_maro: {eval_score[1]:.4f}; F1_weighted: {eval_score[2]:.4f}")
            # Model selection uses validation macro-F1 ...
            if best_score <= eval_score[1]:
                best_model = copy.deepcopy(model)
                best_score = eval_score[1]
                best_epoch = epoch
            # ... while early stopping tracks validation loss.
            if best_loss >= eval_loss:
                best_loss = eval_loss
                counter = 0
            else:
                counter += 1
            if counter >= early_stop:
                break
        exp_time += fold_time
        exp_scores["fold_time"].append(fold_time)
        logger.info(f"Test at epoch {best_epoch+1}:")
        test_loss, test_score, test_time = eval(best_model, test_iter)
        logger.info(f"\tTEST  - Time: {test_time}; AVG Loss: {test_loss:.6f}; Accurancy: {test_score[0]:.4f}; F1_maro: {test_score[1]:.4f}; F1_weighted: {test_score[2]:.4f}")
        avg_scores["acc"].append(test_score[0])
        avg_scores["f1_macro"].append(test_score[1])
        avg_scores["f1_weighted"].append(test_score[2])
    logger.info("Summary:")
    fold_acc = (sum(avg_scores['acc'])/kfold)
    fold_f1_macro = (sum(avg_scores['f1_macro'])/kfold)
    fold_f1_weighted = (sum(avg_scores['f1_weighted'])/kfold)
    logger.info(f"\tAccurancy: {avg_scores['acc']}")
    logger.info(f"\tAVG Accurancy: {fold_acc:.4f} - MAX: {max(avg_scores['acc']):.4f} - MIN: {min(avg_scores['acc']):.4f}")
    logger.info(f"\tF1 Macro: {avg_scores['f1_macro']}")
    logger.info(f"\tAVG F1 Macro: {fold_f1_macro:.4f} - MAX: {max(avg_scores['f1_macro']):.4f} - MIN: {min(avg_scores['f1_macro']):.4f}")
    logger.info(f"\tF1 Weighted: {avg_scores['f1_weighted']}")
    # Per-fold scores are averaged over the number of folds (kfold), not num_exp.
    logger.info(f"\t AVG F1 Weighted: {fold_f1_weighted:.4f} - MAX: {max(avg_scores['f1_weighted']):.4f} - MIN: {min(avg_scores['f1_weighted']):.4f}")
    exp_scores["acc"].append(fold_acc)
    exp_scores["f1_macro"].append(fold_f1_macro)
    exp_scores["f1_weighted"].append(fold_f1_weighted)
    exp_scores["exp_time"].append(exp_time)

Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: 

### Results

In [None]:
# Report the per-repetition scores with their mean, maximum and minimum.
for metric_label, metric_key in (("Accurancy", "acc"), ("F1 Macro", "f1_macro"), ("F1 Weighted", "f1_weighted")):
    metric_values = exp_scores[metric_key]
    logger.info(f"{metric_label}: {metric_values}")
    logger.info(f"AVG {metric_label}: {(sum(metric_values)/num_exp):.4f} - MAX: {max(metric_values):.4f} - MIN: {min(metric_values):.4f}")

# Report the raw timings and their averages.
for time_label, time_key in (("Exp time", "exp_time"), ("Fold time", "fold_time"), ("Epoch time", "epoch_time")):
    time_values = exp_scores[time_key]
    # Experiment time is averaged over num_exp; fold and epoch times over
    # however many entries were actually recorded.
    time_divisor = num_exp if time_key == "exp_time" else len(time_values)
    logger.info(f"{time_label}: {time_values}")
    logger.info(f"AVG {time_label}: {parse_time(sum(time_values)/time_divisor)}")

Accurancy: [0.9746672045179506, 0.981686163775716, 0.9740217829770069, 0.9752319483662767, 0.9826542960871321]
AVG Accurancy: 0.9777 - MAX: 0.9827 - MIN: 0.9740
F1 Macro: [0.9541054322979561, 0.9678398158699583, 0.9554160866871735, 0.957177434348828, 0.96742380560921]
AVG F1 Macro: 0.9604 - MAX: 0.9678 - MIN: 0.9541
F1 Weighted: [0.9748118138473864, 0.98179100701663, 0.9741551448409707, 0.9754481181821593, 0.9827076341588505]
AVG F1 Weighted: 0.9778 - MAX: 0.9827 - MIN: 0.9742
Exp time: [5235.850959062576, 4282.668512105942, 4047.149932384491, 5238.381111383438, 4755.714826822281]
AVG Exp time: 01:18:31.95
Fold time: [951.7350206375122, 1189.7493391036987, 713.9705059528351, 1190.251636981964, 1190.1444563865662, 1189.5103492736816, 713.7171165943146, 713.7561626434326, 951.6523752212524, 714.0325083732605, 714.0691299438477, 714.1224255561829, 1190.0536375045776, 714.4440062046051, 714.4607331752777, 1190.303902387619, 952.2989089488983, 1190.3329434394836, 952.4688613414764, 952.9764

## HS2-2021

### Parameters

In [None]:
# --- Task / model identity ---
task_name, model_name = "HS2_2021", "vinai/bertweet-base"

# --- Dataset options ---
# NOTE(review): no cell in view downloads this file; it presumably has to be
# placed in ./datasets/HS2-2021 manually before running.
train_file_path = "./datasets/HS2-2021/hs2_2021.csv"
output_path = "./outputs"
text_idx, class_idx = 0, 1          # column positions of the text and label fields
delimiter = ","
label_maps = {
    "0": 0,
    "1": 1,
}
test_split = 0.1                    # fraction of samples held out as the test set

# --- Experiment options ---
num_exp = 5                         # independent repetitions of the whole k-fold run
kfold = 5
num_epochs = 5
train_batch_size, test_batch_size = 32, 16
max_length = 128                    # tokens per sequence after padding/truncation
weight_decay = 0.0
learning_rate = 1e-5
early_stop = 2                      # consecutive non-improving epochs before stopping

### Training

In [None]:
# Tokenizer with BERTweet's tweet normalization enabled.
tokenizer = AutoTokenizer.from_pretrained(model_name, normalization=True)
# The whole labelled file is tokenized once; train/validation/test subsets are
# selected from it later by index.
train_dataset = BertweetDataset(
    data_file_path=train_file_path,
    bert_tokenizer=tokenizer,
    label_map=label_maps,
    text_idx=text_idx,
    class_idx=class_idx,
    delimiter=delimiter,
    batch_size=train_batch_size,
    max_length=max_length,
)

emoji is not installed, thus not converting emoticons or emojis into text. Please install emoji: pip3 install emoji
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
100%|██████████| 31788/31788 [00:09<00:00, 3367.50it/s]


In [None]:
# Repeated k-fold experiment on HS2-2021.
# For every repetition: hold out a random test split, run k-fold
# cross-validation on the remaining samples only, keep the checkpoint with the
# best validation macro-F1 per fold (early stopping on validation loss), and
# score that checkpoint on the held-out test split.
exp_scores = {"acc": [], "f1_macro": [], "f1_weighted": [], "exp_time": [], "epoch_time": [], "fold_time": []}
for exp in range(num_exp):
    exp_time = 0
    kf = KFold(n_splits=kfold, shuffle=True)
    avg_scores = {"acc": [], "f1_macro": [], "f1_weighted": []}
    data_idxs = list(range(len(train_dataset)))
    train_idxs, test_idxs = train_test_split(data_idxs, test_size=test_split, shuffle=True)
    test_iter = DataLoader(train_dataset, batch_size=test_batch_size, sampler=SubsetRandomSampler(test_idxs))
    # Split ONLY the non-test samples into folds. Splitting all of data_idxs
    # would put the held-out test samples into the training folds (leakage).
    for idx, (train_pos, eval_pos) in enumerate(kf.split(train_idxs)):
        # KFold.split yields positions within train_idxs; map them back to
        # dataset indices before sampling.
        train_idx = [train_idxs[i] for i in train_pos]
        eval_idx = [train_idxs[i] for i in eval_pos]
        best_epoch, best_loss, best_score = 0, float("inf"), 0
        # Early-stopping patience counter. It must start at 0 for every fold;
        # otherwise a NaN loss in the first epoch hits an undefined (or stale) value.
        counter = 0
        model, optimizer = build_model(model_name, len(set(label_maps.values())), task_name, learning_rate, weight_decay)
        best_model = copy.deepcopy(model)
        train_iter = DataLoader(train_dataset, batch_size=train_batch_size, sampler=SubsetRandomSampler(train_idx))
        eval_iter = DataLoader(train_dataset, batch_size=test_batch_size, sampler=SubsetRandomSampler(eval_idx))
        fold_time = 0
        for epoch in range(num_epochs):
            epoch_time = time.time()
            logger.info(f"Exp: {exp+1} - Fold: {idx+1} - Epoch: {epoch+1}/{num_epochs}")
            model, train_loss, train_score, train_time = train(model, optimizer, train_iter)
            # Only the training pass is timed; evaluation is excluded.
            epoch_time = time.time() - epoch_time
            fold_time += epoch_time
            exp_scores["epoch_time"].append(epoch_time)
            eval_loss, eval_score, eval_time = eval(model, eval_iter)
            logger.info(f"\tEVAL  - Time: {eval_time}; AVG Loss: {eval_loss:.6f}; Accurancy: {eval_score[0]:.4f}; F1_maro: {eval_score[1]:.4f}; F1_weighted: {eval_score[2]:.4f}")
            # Model selection uses validation macro-F1 ...
            if best_score <= eval_score[1]:
                best_model = copy.deepcopy(model)
                best_score = eval_score[1]
                best_epoch = epoch
            # ... while early stopping tracks validation loss.
            if best_loss >= eval_loss:
                best_loss = eval_loss
                counter = 0
            else:
                counter += 1
            if counter >= early_stop:
                break
        exp_time += fold_time
        exp_scores["fold_time"].append(fold_time)
        logger.info(f"Test at epoch {best_epoch+1}:")
        test_loss, test_score, test_time = eval(best_model, test_iter)
        logger.info(f"\tTEST  - Time: {test_time}; AVG Loss: {test_loss:.6f}; Accurancy: {test_score[0]:.4f}; F1_maro: {test_score[1]:.4f}; F1_weighted: {test_score[2]:.4f}")
        avg_scores["acc"].append(test_score[0])
        avg_scores["f1_macro"].append(test_score[1])
        avg_scores["f1_weighted"].append(test_score[2])
    logger.info("Summary:")
    fold_acc = (sum(avg_scores['acc'])/kfold)
    fold_f1_macro = (sum(avg_scores['f1_macro'])/kfold)
    fold_f1_weighted = (sum(avg_scores['f1_weighted'])/kfold)
    logger.info(f"\tAccurancy: {avg_scores['acc']}")
    logger.info(f"\tAVG Accurancy: {fold_acc:.4f} - MAX: {max(avg_scores['acc']):.4f} - MIN: {min(avg_scores['acc']):.4f}")
    logger.info(f"\tF1 Macro: {avg_scores['f1_macro']}")
    logger.info(f"\tAVG F1 Macro: {fold_f1_macro:.4f} - MAX: {max(avg_scores['f1_macro']):.4f} - MIN: {min(avg_scores['f1_macro']):.4f}")
    logger.info(f"\tF1 Weighted: {avg_scores['f1_weighted']}")
    # Per-fold scores are averaged over the number of folds (kfold), not num_exp.
    logger.info(f"\t AVG F1 Weighted: {fold_f1_weighted:.4f} - MAX: {max(avg_scores['f1_weighted']):.4f} - MIN: {min(avg_scores['f1_weighted']):.4f}")
    exp_scores["acc"].append(fold_acc)
    exp_scores["f1_macro"].append(fold_f1_macro)
    exp_scores["f1_weighted"].append(fold_f1_weighted)
    exp_scores["exp_time"].append(exp_time)

Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: 

### Results

In [None]:
# Report the per-repetition scores with their mean, maximum and minimum.
for metric_label, metric_key in (("Accurancy", "acc"), ("F1 Macro", "f1_macro"), ("F1 Weighted", "f1_weighted")):
    metric_values = exp_scores[metric_key]
    logger.info(f"{metric_label}: {metric_values}")
    logger.info(f"AVG {metric_label}: {(sum(metric_values)/num_exp):.4f} - MAX: {max(metric_values):.4f} - MIN: {min(metric_values):.4f}")

# Report the raw timings and their averages.
for time_label, time_key in (("Exp time", "exp_time"), ("Fold time", "fold_time"), ("Epoch time", "epoch_time")):
    time_values = exp_scores[time_key]
    # Experiment time is averaged over num_exp; fold and epoch times over
    # however many entries were actually recorded.
    time_divisor = num_exp if time_key == "exp_time" else len(time_values)
    logger.info(f"{time_label}: {time_values}")
    logger.info(f"AVG {time_label}: {parse_time(sum(time_values)/time_divisor)}")