In [1]:
import os

# GPU selection and allocator settings must be in the environment BEFORE
# torch (or any library that imports it) queries CUDA; once the CUDA runtime
# has enumerated devices, later changes to CUDA_VISIBLE_DEVICES are ignored
# by this process. Setting them only in a later cell is therefore too late.
os.environ["CUDA_VISIBLE_DEVICES"]="2"
os.environ["PYTORCH_CUDA_ALLOC_CONF"]="max_split_size_mb:512"

import numpy as np
import pandas as pd
import json
from tqdm import tqdm
import pprint

import torch
from datasets import Dataset, DatasetDict
print(f"{torch.__version__ = }")

from transformers import TrainingArguments, Trainer
from transformers import BertTokenizer, BertConfig, BertForSequenceClassification
from transformers import file_utils
print(file_utils.default_cache_path)

from sklearn.metrics import \
    accuracy_score, recall_score, precision_score, f1_score

torch.__version__ = '1.13.1+cu117'
/home/dangkhoadl/.cache/huggingface/hub


In [2]:
# Single seed shared by every random number generator used in this notebook
random_seed = 147

# Pin the process to one GPU and configure the CUDA caching allocator.
# NOTE(review): these only take effect if CUDA has not been queried yet in
# this process; the safe place for them is before `import torch`.
os.environ.update({
    "CUDA_VISIBLE_DEVICES": "2",
    "PYTORCH_CUDA_ALLOC_CONF": "max_split_size_mb:512",
})

# Seed NumPy, the torch CPU generator and every CUDA generator
for seed_fn in (np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_fn(random_seed)

# device = torch.device(
#     "cuda" if torch.cuda.is_available() else "cpu")

# 1. Data

In [3]:
# Read the raw labelled tweets (columns shown in the preview: id, label, tweet)
data = pd.read_csv("data/data.csv", encoding='utf-8')

# Class balance of the raw file, followed by a preview of the first rows
label_counts = data['label'].value_counts()
print(label_counts)
data.head()

0    29720
1     2242
Name: label, dtype: int64


Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [4]:
# Down-sample the majority class (label 0) to 3200 rows; keep every label-1 row
df_0 = data[data['label'] == 0] \
    .sample(n=3200, random_state=random_seed) \
    .reset_index(drop=True)
df_1 = data[data['label'] == 1] \
    .reset_index(drop=True)

# Shuffle the combined frame. The shuffle is seeded explicitly (like the
# down-sampling above) so re-running this cell yields the same row order
# instead of depending on the current state of NumPy's global RNG.
df = pd.concat([df_0, df_1], ignore_index=True) \
    .sample(frac=1, random_state=random_seed) \
    .reset_index(drop=True)

print(df['label'].value_counts())
df.head()

0    3200
1    2242
Name: label, dtype: int64


Unnamed: 0,id,label,tweet
0,1393,1,@user buffalo school districts boots trump all...
1,9689,1,are â¦ #black &amp; feel like the are stompi...
2,18889,0,be happy. be bright. be you.ðð #monday...
3,16067,0,@user rubbing her head in the grass #pepitaes...
4,322,1,"@user ""the dying of the light"" village green/..."


#### Train-Eval-Test Split

In [5]:
from sklearn.model_selection import train_test_split


# Both splits are stratified on the label and seeded with the notebook-wide
# `random_seed` (rather than a repeated literal) so that changing the seed in
# one place changes every stochastic step consistently.

# 70% train+eval / 30% test
train_eval_df, test_df = train_test_split(df,
    test_size=0.3,
    random_state=random_seed,
    stratify=df['label'])

# Of the remaining 70%: 80% train / 20% eval
train_df, eval_df = train_test_split(train_eval_df,
    test_size=0.2,
    random_state=random_seed,
    stratify=train_eval_df['label'])

print(f"{train_df.shape[0] = }")
print(f"{eval_df.shape[0] = }")
print(f"{test_df.shape[0] = }")

train_df.shape[0] = 3047
eval_df.shape[0] = 762
test_df.shape[0] = 1633


# 2. Preprocess data

#### Dataset

In [6]:
# Wrap each pandas split in a `Dataset`, keyed by split name
split_frames = {
    'train': train_df,
    'validation': eval_df,
    'test': test_df,
}
dsets = DatasetDict({
    split_name: Dataset.from_pandas(frame)
    for split_name, frame in split_frames.items()
})

# Drop the '__index_level_0__' column (presumably the pandas index carried
# over by `from_pandas`) so only id / label / tweet remain
dsets = dsets.remove_columns(['__index_level_0__'])

pprint.pprint(dsets)

{'test': Dataset({
    features: ['id', 'label', 'tweet'],
    num_rows: 1633
}),
 'train': Dataset({
    features: ['id', 'label', 'tweet'],
    num_rows: 3047
}),
 'validation': Dataset({
    features: ['id', 'label', 'tweet'],
    num_rows: 762
})}


#### Tokenize

In [7]:
def tokenize_fn(examples, tokenizer, max_length=512):
    """Tokenize a batch of tweets and attach model inputs and labels.

    Args:
        examples: mapping with a 'tweet' entry (list of strings) and a
            'label' entry (list of ints). It is updated in place.
        tokenizer: callable tokenizer; invoked with fixed-length padding and
            truncation and asked to return PyTorch tensors.
        max_length: sequence length every example is padded/truncated to.
            Defaults to 512 (the original hard-coded value); a smaller value
            can be passed for short texts to cut padding.

    Returns:
        The same `examples` mapping with 'input_ids', 'attention_mask',
        'labels' and, when the tokenizer produces them, 'token_type_ids'.
    """
    # X
    encoded = tokenizer(examples['tweet'],
        padding="max_length", truncation=True, max_length=max_length,
        return_tensors="pt",
    )

    examples['input_ids'] = encoded['input_ids']
    examples['attention_mask'] = encoded['attention_mask']
    # Not every tokenizer emits segment ids; only copy them when present
    # instead of failing with a KeyError.
    if 'token_type_ids' in encoded:
        examples['token_type_ids'] = encoded['token_type_ids']

    # y
    examples['labels'] = torch.tensor(
        examples["label"])

    return examples

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


def _tokenize_batch(batch):
    """Adapter for `map`: tokenize one batch with the notebook-level tokenizer."""
    return tokenize_fn(batch, tokenizer)


# Tokenize every split batch-wise; the new columns are added next to the
# original id / label / tweet columns
dsets = dsets.map(_tokenize_batch, batched=True)
pprint.pprint(dsets)

Map:   0%|          | 0/3047 [00:00<?, ? examples/s]

Map:   0%|          | 0/762 [00:00<?, ? examples/s]

Map:   0%|          | 0/1633 [00:00<?, ? examples/s]

{'test': Dataset({
    features: ['id', 'label', 'tweet', 'input_ids', 'attention_mask', 'token_type_ids', 'labels'],
    num_rows: 1633
}),
 'train': Dataset({
    features: ['id', 'label', 'tweet', 'input_ids', 'attention_mask', 'token_type_ids', 'labels'],
    num_rows: 3047
}),
 'validation': Dataset({
    features: ['id', 'label', 'tweet', 'input_ids', 'attention_mask', 'token_type_ids', 'labels'],
    num_rows: 762
})}


In [8]:
# Sanity check: first 50 token ids / mask values and the label of one example
first_example = dsets['train'][0]
print(first_example['input_ids'][:50])
print(first_example['attention_mask'][:50])
print(first_example['labels'])

[101, 1030, 5310, 1030, 5310, 2021, 2298, 2129, 13459, 2027, 2024, 2055, 1001, 7332, 24498, 6824, 2361, 2006, 1996, 4946, 1012, 2029, 2001, 1001, 8275, 1012, 2664, 8568, 1001, 5922, 9453, 2232, 1012, 1001, 10958, 2050, 29649, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
1


# 3. Train BERT from scratch

In [9]:
# BERT configuration with two output classes; the classifier is constructed
# from this config alone (no `from_pretrained` call), in line with the
# "from scratch" section title.
config = BertConfig(num_labels=2)
model = BertForSequenceClassification(config)

#### Eval metrics

In [10]:
def compute_metrics(eval_pred, average='binary'):
    """Compute accuracy, precision, recall and F1 for one evaluation round.

    Args:
        eval_pred: object exposing `.predictions` (per-class scores, arg-maxed
            over axis 1) and `.label_ids` (ground-truth labels).
        average: averaging mode forwarded to the precision / recall / F1
            scorers.

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    y_true = eval_pred.label_ids
    y_hat = np.argmax(eval_pred.predictions, axis=1)

    # precision / recall / f1 share the same call signature; undefined ratios
    # are reported as 0 (zero_division=0)
    averaged_scores = {
        metric_name: scorer(
            y_true=y_true, y_pred=y_hat,
            average=average, zero_division=0)
        for metric_name, scorer in (
            ("precision", precision_score),
            ("recall", recall_score),
            ("f1", f1_score))
    }

    return {
        "accuracy": accuracy_score(
            y_true=y_true, y_pred=y_hat,
            normalize=True),
        **averaged_scores }

#### Optimizer, scheduler, loss function

In [11]:
# AdamW over all model parameters
optimizer = torch.optim.AdamW(
    params=model.parameters(), lr=1e-5, weight_decay=0.01)

# LinearLR with library defaults.
# NOTE(review): because this scheduler is handed to the Trainer explicitly,
# it presumably replaces the warmup/decay schedule derived from
# `warmup_ratio` in TrainingArguments — confirm this is intended.
scheduler = torch.optim.lr_scheduler.LinearLR(optimizer)

# Plain cross-entropy over the classifier logits
criterion = torch.nn.CrossEntropyLoss()
class Custom_Trainer(Trainer):
    """Trainer whose loss is computed with the notebook-level `criterion`."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the loss for one batch.

        Args:
            model: model to run the forward pass with.
            inputs: batch mapping; its "labels" entry is popped (the mapping
                is mutated) and the rest is passed to the model as keywords.
            return_outputs: when True, return `(loss, outputs)` instead of
                just the loss.
            **kwargs: accepted and ignored. Newer transformers releases call
                `compute_loss` with extra keywords (e.g. `num_items_in_batch`);
                without this catch-all the override raises TypeError there.

        Returns:
            The loss tensor, or `(loss, outputs)` if `return_outputs` is True.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)

        loss = criterion(outputs.logits, labels)
        return (loss, outputs) if return_outputs else loss

#### Trainer Args

In [12]:
batch_size = 16
num_workers = 32
num_epochs = 20
output_dir = "exp/bert-from-scratch/"


training_args = TrainingArguments(
    output_dir=output_dir,
    seed=147,
    push_to_hub=False,

    # --- schedule: evaluate / save / log on the same step cadence ---
    evaluation_strategy="steps",
    eval_steps=50,         # Evaluation happens every ... steps
    save_strategy="steps",
    save_steps=50,         # Save checkpoints after ... steps
    save_total_limit=4,   # Only last ... models are saved. Older ones are deleted.
    logging_steps=50,

    # --- optimisation ---
    # NOTE(review): the Trainer below is given its own optimizer/scheduler, so
    # `learning_rate` and `warmup_ratio` here are presumably not what drives
    # the actual learning-rate schedule — confirm.
    learning_rate=1e-5,
    warmup_ratio=0.1,
    num_train_epochs=num_epochs,
    gradient_accumulation_steps=4,

    # --- batching / data loading ---
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    dataloader_num_workers=num_workers,
    eval_accumulation_steps=4,

    # --- model selection ---
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

#### Set up logging

In [13]:
from transformers import EarlyStoppingCallback, TrainerCallback

In [14]:
# Directory the logging callback writes its CSV / JSON files into.
# `exist_ok=True` replaces the separate exists-check: it is a single call and
# cannot fail if the directory appears between the check and the creation.
log_dir = os.path.join(output_dir, "log/")
os.makedirs(log_dir, exist_ok=True)

# from transformers.utils import logging
# logging.set_verbosity_info()

class PrinterCallback(TrainerCallback):
    """Persist the trainer's log history under `log_dir`.

    Training entries go to log_trainset.csv, evaluation entries to
    log_evalset.csv, and the entry carrying "train_runtime" to
    trainer_info.json. Files are rewritten in full on every call.
    """

    def __write_log(self, state):
        """Split `state.log_history` by entry type and write it to disk."""
        train_rows, eval_rows = [], []

        for record in state.log_history:
            # An entry is classified by the first matching key, in this order
            if "loss" in record:
                train_rows.append(record)
                continue
            if "eval_loss" in record:
                eval_rows.append(record)
                continue
            if "train_runtime" in record:
                with open(f"{log_dir}/trainer_info.json", 'w+', encoding='utf-8') as fout:
                    json.dump(record, fout, ensure_ascii=False, indent=4)

        # Each non-empty group becomes a step-ordered CSV
        csv_targets = (
            (train_rows, f"{log_dir}/log_trainset.csv"),
            (eval_rows, f"{log_dir}/log_evalset.csv"),
        )
        for rows, csv_path in csv_targets:
            if not rows:
                continue
            ordered = pd.DataFrame.from_dict(rows)
            ordered = ordered.sort_values("step", ascending=True)
            ordered = ordered.reset_index(drop=True)
            ordered.to_csv(csv_path, index=False)

    def on_evaluate(self, args, state, control, logs=None, **kwargs):
        '''Refresh the log files after each evaluation round'''
        self.__write_log(state)

    def on_train_end(self, args, state, control, logs=None, **kwargs):
        '''Refresh the log files once training has finished'''
        self.__write_log(state)

#### Train

In [15]:
# Trainer with the custom loss, explicit optimizer/scheduler, early stopping
# and the file-logging callback.
trainer = Custom_Trainer(
    model=model,
    args=training_args,
    train_dataset=dsets['train'],
    eval_dataset=dsets['validation'],
    compute_metrics=compute_metrics,
    optimizers=(optimizer, scheduler),
    callbacks=[
        # Stop after 5 consecutive evaluations without improvement of the
        # monitored metric (F1, per `metric_for_best_model`).
        # The threshold is the minimum improvement that counts. F1 lies in
        # [0, 1], so the previous value of 1.0 could never be exceeded and
        # every evaluation was counted as "no improvement". 0.0 (the library
        # default) counts any strict improvement; raise it slightly
        # (e.g. 0.01) to ignore noise-level gains.
        EarlyStoppingCallback(
            early_stopping_patience=5,
            early_stopping_threshold=0.0),
        PrinterCallback()]
)

In [16]:
# Run the training loop configured on `trainer`
trainer.train()

# Save best model
# NOTE(review): this saves whatever model the trainer holds after training;
# it is the best checkpoint only because `load_best_model_at_end=True` is set
# in `training_args` — keep the two in sync.
best_ckpts_path = os.path.join(output_dir, "checkpoint-best")
trainer.save_model(best_ckpts_path)

Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
50,0.6126,0.530887,0.74147,0.791045,0.506369,0.617476
100,0.4444,0.467825,0.766404,0.673469,0.840764,0.747875
150,0.3425,0.429586,0.801837,0.718499,0.853503,0.780204
200,0.2662,0.414967,0.832021,0.796178,0.796178,0.796178


#### Evaluate

In [17]:
# Evaluate on the trainer's eval dataset (dsets['validation']) and show the
# metrics returned by `compute_metrics` along with runtime statistics
eval_res = trainer.evaluate()
pprint.pprint(eval_res)

{'epoch': 20.0,
 'eval_accuracy': 0.8320209973753281,
 'eval_f1': 0.7961783439490446,
 'eval_loss': 0.4149673879146576,
 'eval_precision': 0.7961783439490446,
 'eval_recall': 0.7961783439490446,
 'eval_runtime': 6.3375,
 'eval_samples_per_second': 120.237,
 'eval_steps_per_second': 1.894}


# 4. Inference

In [18]:
# Path of the checkpoint written at the end of training.
# NOTE(review): duplicates os.path.join(output_dir, "checkpoint-best") from
# the training cell — keep the two in sync if `output_dir` changes.
ckpt_fpath = "exp/bert-from-scratch/checkpoint-best"

# Reload the fine-tuned two-class classifier from disk
model = BertForSequenceClassification.from_pretrained(ckpt_fpath,
    num_labels=2)

In [19]:
# Inference
test_trainer = Trainer(model)
preds, _ , _ = test_trainer.predict(dsets['test'])
print(f"{preds.shape = }")

# Prediction
y_preds = np.argmax(preds, axis=1).astype(int)
print(f"{y_preds.shape = }")



preds.shape = (1633, 2)
y_preds.shape = (1633,)


In [20]:
# scores
num_classes = 2
scores = pd.DataFrame(
    data=preds,
    columns=[ f"Class_{c}_score" for c in (np.arange(num_classes)) ])

# Prediction
y_preds = pd.DataFrame(
    data=y_preds,
    columns=[ 'Prediction' ])

# out
test_df = pd.concat(
    [test_df.reset_index(drop=True), scores, y_preds], axis=1)
test_df.head()

Unnamed: 0,id,label,tweet,Class_0_score,Class_1_score,Prediction
0,13539,1,black girl porn teen virgin sex pictures,-1.907104,2.276661,1
1,3168,1,@user .@user what a douchebag. like his dad! ...,-1.815158,1.950696,1
2,30372,0,", 30 rock, tracy jordan",-0.157196,0.207884,1
3,11365,0,back to schoolð¤ð #dnhs ð,2.991613,-3.127951,0
4,22540,0,yes these days they call stuff like this journ...,1.853733,-1.899091,0


In [21]:
def calculate_accuracy(df, num_classes=2, average='binary'):
    """Score a results frame: accuracy, precision, recall and F1.

    Args:
        df: DataFrame with a 'label' column and one 'Class_<c>_score' column
            per class; the predicted class is the arg-max over those columns.
        num_classes: number of 'Class_<c>_score' columns to read.
        average: averaging mode forwarded to precision / recall / F1; must be
            one of None, 'binary', 'micro', 'macro', 'weighted'.

    Returns:
        Tuple `(accuracy, precision, recall, f1)`.
    """
    assert average in [None, 'binary', 'micro', 'macro', 'weighted']

    score_columns = [f'Class_{c}_score' for c in range(num_classes)]
    y_test = df['label'].values
    y_pred = df[score_columns].to_numpy().argmax(axis=1)

    # accuracy, precision, recall, f1-score
    acc = accuracy_score(y_true=y_test, y_pred=y_pred, normalize=True)
    p, r, f1 = (
        scorer(y_true=y_test, y_pred=y_pred, average=average)
        for scorer in (precision_score, recall_score, f1_score))

    return acc, p, r, f1

# Score the held-out test predictions and print one metric per line
acc, p, r, f1 = calculate_accuracy(test_df, num_classes=2)
print(
    f"Test Accuracy: {acc}",
    f"Test Precision: {p}",
    f"Test Recall: {r}",
    f"Test F1 score: {f1}",
    sep="\n")

Test Accuracy: 0.8315982853643601
Test Precision: 0.8015151515151515
Test Recall: 0.7860326894502229
Test F1 score: 0.7936984246061515
