In [1]:
import os
import numpy as np
import pandas as pd
import json
from tqdm import tqdm

import torch
from torch.utils.data import Dataset
print(f"{torch.__version__ = }")

from transformers import TrainingArguments, Trainer
from transformers import BertTokenizer, BertConfig, BertForSequenceClassification
from transformers import file_utils
print(file_utils.default_cache_path)

from sklearn.metrics import \
    accuracy_score, recall_score, precision_score, f1_score

torch.__version__ = '1.13.1+cu117'
/home/dangkhoadl/.cache/huggingface/hub


In [2]:
import random

# Single seed shared by every stochastic step in the notebook.
random_seed = 147

# GPU selection / allocator settings are read when CUDA is first initialised,
# so they are placed in the environment before any torch.cuda call is made.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"

# Seed every RNG in use: Python stdlib, NumPy, and PyTorch (CPU + all GPUs).
random.seed(random_seed)
np.random.seed(random_seed)
torch.manual_seed(random_seed)
torch.cuda.manual_seed_all(random_seed)

# 1. Dataset

In [3]:
# Read the raw labelled tweets and show the class balance before any sampling.
data = pd.read_csv("data.csv")

label_counts = data['label'].value_counts()
print(label_counts)
data.head()

0    29720
1     2242
Name: label, dtype: int64


Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [4]:
# Down-sample the majority class (label 0) to 3200 rows; keep every label-1 row.
df_0 = data[data['label'] == 0] \
    .sample(n=3200, random_state=random_seed) \
    .reset_index(drop=True)
df_1 = data[data['label'] == 1] \
    .reset_index(drop=True)

# Shuffle the combined frame. The shuffle is seeded explicitly so that
# re-running this cell yields the same row order (and therefore the same
# downstream splits) regardless of the global NumPy RNG state.
df = pd.concat([df_0, df_1], ignore_index=True) \
    .sample(frac=1, random_state=random_seed) \
    .reset_index(drop=True)

print(df['label'].value_counts())
df.head()

0    3200
1    2242
Name: label, dtype: int64


Unnamed: 0,id,label,tweet
0,1393,1,@user buffalo school districts boots trump all...
1,9689,1,are â¦ #black &amp; feel like the are stompi...
2,18889,0,be happy. be bright. be you.ðð #monday...
3,16067,0,@user rubbing her head in the grass #pepitaes...
4,322,1,"@user ""the dying of the light"" village green/..."


#### Train-Eval-Test Split

In [5]:
from sklearn.model_selection import train_test_split


# Hold out 30% of the balanced data as the test set, stratified on the label.
# The notebook-wide `random_seed` is used instead of repeating its literal
# value so the split follows the seed if it is ever changed.
train_eval_df, test_df = train_test_split(df,
    test_size=0.3,
    random_state=random_seed,
    stratify=df['label'])

# Split the remaining 70% into train (80%) and eval (20%), again stratified.
train_df, eval_df = train_test_split(train_eval_df,
    test_size=0.2,
    random_state=random_seed,
    stratify=train_eval_df['label'])

# The three parts must partition `df` exactly.
assert len(train_df) + len(eval_df) + len(test_df) == len(df)

print(f"{len(train_df) = }")
print(f"{len(eval_df) = }")
print(f"{len(test_df) = }")

len(train_df) = 3047
len(eval_df) = 762
len(test_df) = 1633


# 2. Preprocess data

#### Tokenize

In [6]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Both splits are encoded with identical settings: pad to the longest tweet
# in the split, truncate anything beyond 512 tokens.
tokenize_kwargs = dict(padding=True, truncation=True, max_length=512)

train_texts = train_df['tweet'].tolist()
eval_texts = eval_df['tweet'].tolist()
X_train_tokenized = tokenizer(train_texts, **tokenize_kwargs)
X_val_tokenized = tokenizer(eval_texts, **tokenize_kwargs)

# Quick look at the structure of the encoded training split.
print(f"{X_train_tokenized.keys() = }")
print(f"{type(X_train_tokenized['input_ids']) = }")
print(f"{torch.tensor(X_train_tokenized['input_ids']).shape = }")
print(f"{X_train_tokenized['input_ids'][0][:20] = }")

X_train_tokenized.keys() = dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
type(X_train_tokenized['input_ids']) = <class 'list'>
torch.tensor(X_train_tokenized['input_ids']).shape = torch.Size([3047, 73])
X_train_tokenized['input_ids'][0][:20] = [101, 1030, 5310, 1030, 5310, 2021, 2298, 2129, 13459, 2027, 2024, 2055, 1001, 7332, 24498, 6824, 2361, 2006, 1996, 4946]


#### Dataset

In [7]:
class SentimentAnalysis_Dset(Dataset):
    """Map-style dataset pairing tokenizer encodings with class labels.

    Args:
        encodings: Mapping with at least an ``'input_ids'`` entry holding one
            token-id sequence per sample. ``'attention_mask'`` and
            ``'token_type_ids'`` are used when present.
        labels: Sequence with one label per sample, aligned with ``encodings``.

    Raises:
        ValueError: If the number of labels differs from the number of
            encoded samples.
    """

    # Model inputs that not every tokenizer emits; included only when present.
    _OPTIONAL_KEYS = ('attention_mask', 'token_type_ids')

    def __init__(self, encodings, labels):
        super().__init__()

        n_samples = len(encodings["input_ids"])
        if n_samples != len(labels):
            # Fail at construction time rather than with an IndexError (or a
            # silently ignored tail of labels) in the middle of training.
            raise ValueError(
                f"encodings hold {n_samples} samples but {len(labels)} labels were given")

        self._encodings = encodings
        self._labels = labels

    def __len__(self):
        """Return the number of encoded samples."""
        return len(self._encodings["input_ids"])

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, including ``'labels'``."""
        item = {'input_ids': torch.tensor(self._encodings['input_ids'][idx])}
        for key in self._OPTIONAL_KEYS:
            if key in self._encodings:
                item[key] = torch.tensor(self._encodings[key][idx])
        item['labels'] = torch.tensor(self._labels[idx])
        return item

In [8]:
import pprint

# Wrap the tokenized train / eval splits together with their labels.
train_labels = train_df['label'].tolist()
eval_labels = eval_df['label'].tolist()

train_dset = SentimentAnalysis_Dset(X_train_tokenized, train_labels)
eval_dset = SentimentAnalysis_Dset(X_val_tokenized, eval_labels)

# Show one sample to confirm the tensors the model will receive.
pprint.pprint(train_dset[0])

{'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0]),
 'input_ids': tensor([  101,  1030,  5310,  1030,  5310,  2021,  2298,  2129, 13459,  2027,
         2024,  2055,  1001,  7332, 24498,  6824,  2361,  2006,  1996,  4946,
         1012,  2029,  2001,  1001,  8275,  1012,  2664,  8568,  1001,  5922,
         9453,  2232,  1012,  1001, 10958,  2050, 29649,   102,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0]),
 'labels': tensor(1),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0,

# 3. Train BERT from scratch

In [9]:
# Randomly initialised BERT (no pretrained weights) with a 2-class head.
# The embedding table size and padding id are taken from the tokenizer used
# to encode the data, so token ids can never index past the embedding matrix
# if the tokenizer is swapped or extended.
config = BertConfig(
    num_labels=2,
    vocab_size=len(tokenizer),
    pad_token_id=tokenizer.pad_token_id)
model = BertForSequenceClassification(config=config)

#### Training arguments

In [10]:
batch_size = 32
eval_steps = 100  # evaluate (and checkpoint) every this many optimizer steps

# Checkpoint and log directories; exist_ok avoids the check-then-create race.
output_dir = "ckpts/bert-from-scratch"
os.makedirs(output_dir, exist_ok=True)

log_dir = "log/"
os.makedirs(log_dir, exist_ok=True)

training_args = TrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="steps",
    save_strategy="steps",
    eval_steps=eval_steps,
    # Checkpoint at every evaluation. Left unset, save_steps defaults to 500,
    # so only every fifth evaluation would have a checkpoint on disk that
    # load_best_model_at_end could restore.
    save_steps=eval_steps,
    # Bound disk usage now that checkpoints are written more often; the best
    # checkpoint is retained when load_best_model_at_end is enabled.
    save_total_limit=2,
    learning_rate=1e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    dataloader_num_workers=32,
    eval_accumulation_steps=4,
    gradient_accumulation_steps=4,
    num_train_epochs=35,
    load_best_model_at_end=True,
    warmup_ratio=0.1,
    logging_steps=100,
    metric_for_best_model="eval_f1",
    greater_is_better=True,  # higher F1 is better
    seed=147,
    push_to_hub=False,
)

#### Optimizer, scheduler, loss function

In [11]:
# Optimizer over all model parameters. The learning rate is read from
# `training_args` so it is declared in exactly one place.
optimizer = torch.optim.AdamW(model.parameters(),
    lr=training_args.learning_rate,
    weight_decay=0.01)

# LinearLR with its default settings written out: the LR ramps from lr/3 up
# to lr over the first 5 scheduler steps and then stays constant.
# NOTE(review): this scheduler is handed to the Trainer explicitly, so the
# `warmup_ratio` set in TrainingArguments presumably has no effect — confirm
# whether a longer warmup / decay schedule was intended.
scheduler = torch.optim.lr_scheduler.LinearLR(
    optimizer=optimizer,
    start_factor=1.0 / 3,
    end_factor=1.0,
    total_iters=5)

# Loss applied to the classifier logits inside Custom_Trainer.compute_loss.
criterion = torch.nn.CrossEntropyLoss()
class Custom_Trainer(Trainer):
    """Trainer that computes the loss with the module-level `criterion`."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run a forward pass without labels and score the logits with `criterion`.

        Args:
            model: The model being trained / evaluated.
            inputs: Batch dict containing a ``"labels"`` entry plus the model inputs.
            return_outputs: When true, return ``(loss, outputs)`` instead of ``loss``.
            num_items_in_batch: Accepted for compatibility with newer
                ``transformers`` releases that pass it; not used here.

        Returns:
            The loss tensor, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        labels = inputs["labels"]
        # Build a label-free copy instead of popping from the caller's dict, so
        # the batch is left intact and the model does not compute its own loss.
        model_inputs = {key: value for key, value in inputs.items() if key != "labels"}
        outputs = model(**model_inputs)

        loss = criterion(outputs.logits, labels)
        return (loss, outputs) if return_outputs else loss

#### Eval metrics

In [12]:
def compute_metrics(eval_pred, average='binary'):
    """Compute accuracy, precision, recall and F1 for one evaluation pass.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores, argmax
            taken over axis 1) and ``label_ids`` (ground-truth labels).
        average: Averaging mode forwarded to precision / recall / F1.

    Returns:
        Dict with keys ``"accuracy"``, ``"precision"``, ``"recall"``, ``"f1"``.
    """
    y_true = eval_pred.label_ids
    y_pred = np.argmax(eval_pred.predictions, axis=1)

    # Arguments shared by the three averaged metrics; zero_division=0 makes an
    # undefined score evaluate to 0.
    averaged_kwargs = dict(
        y_true=y_true, y_pred=y_pred,
        average=average, zero_division=0)

    return {
        "accuracy": accuracy_score(y_true=y_true, y_pred=y_pred, normalize=True),
        "precision": precision_score(**averaged_kwargs),
        "recall": recall_score(**averaged_kwargs),
        "f1": f1_score(**averaged_kwargs),
    }

#### Train

In [13]:
from transformers import EarlyStoppingCallback

# Stop once the tracked metric has failed to improve for 3 evaluations in a row.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# Assemble the trainer from the pieces built in the previous cells.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dset,
    eval_dataset=eval_dset,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, scheduler),
    callbacks=[early_stopping],
)
trainer = Custom_Trainer(**trainer_kwargs)

In [14]:
# Run the training loop, then write the trainer's current model to `output_dir`
# (the same directory the inference section later loads from).
trainer.train()
trainer.save_model(output_dir)

Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
100,0.5596,0.505925,0.753281,0.652174,0.859873,0.741758
200,0.3369,0.397671,0.817585,0.769231,0.796178,0.782473
300,0.3123,0.682287,0.759843,0.637895,0.964968,0.768061
400,0.2594,0.62698,0.792651,0.676471,0.952229,0.791005
500,0.1475,0.442011,0.853018,0.834437,0.802548,0.818182
600,0.1013,0.599431,0.835958,0.769231,0.859873,0.81203
700,0.0959,0.691772,0.821522,0.731771,0.894904,0.805158
800,0.1018,0.752626,0.822835,0.723192,0.923567,0.811189


#### Evaluate

In [15]:
# Evaluate on the trainer's eval dataset (`eval_dset`) and pretty-print the
# resulting metrics dict.
eval_res = trainer.evaluate()
pprint.pprint(eval_res)

{'epoch': 33.33,
 'eval_accuracy': 0.8530183727034121,
 'eval_f1': 0.8181818181818182,
 'eval_loss': 0.44201064109802246,
 'eval_precision': 0.8344370860927153,
 'eval_recall': 0.802547770700637,
 'eval_runtime': 2.8278,
 'eval_samples_per_second': 269.467,
 'eval_steps_per_second': 8.487}


#### Log

In [16]:
def _export_log(records, csv_name):
    """Sort log records by step, write them to `log_dir/csv_name`, return the frame."""
    log_df = pd.DataFrame(records) \
        .sort_values("step", ascending=True) \
        .reset_index(drop=True)
    log_df.to_csv(os.path.join(log_dir, csv_name), index=False)
    return log_df


# Split the trainer's log history into training-loss entries, evaluation
# entries, and the final run summary (the entry carrying "train_runtime").
train_log, eval_log = [], []
for entry in trainer.state.log_history:
    if "loss" in entry:
        train_log.append(entry)
    elif "eval_loss" in entry:
        eval_log.append(entry)
    elif "train_runtime" in entry:
        summary_path = os.path.join(log_dir, "trainer_info.json")
        with open(summary_path, 'w', encoding='utf-8') as fout:
            json.dump(entry, fout, ensure_ascii=False, indent=4)

# Each frame is only created when the corresponding log is non-empty.
if train_log:
    train_log_df = _export_log(train_log, "log_trainset.csv")

if eval_log:
    eval_log_df = _export_log(eval_log, "log_evalset.csv")

# 4. Inference

In [17]:
# Directory holding the saved model; same path as `output_dir` used for training.
ckpt_fpath = "ckpts/bert-from-scratch"

# Rebind `model` to the weights loaded back from disk, with a 2-class head.
model = BertForSequenceClassification.from_pretrained(ckpt_fpath,
    num_labels=2)

In [18]:
# Encode the held-out tweets with the same tokenizer settings used for training.
test_texts = test_df['tweet'].tolist()
X_test_tokenized = tokenizer(
    test_texts, padding=True, truncation=True, max_length=512)

# Wrap encodings and labels in the dataset class.
test_labels = test_df['label'].tolist()
test_dset = SentimentAnalysis_Dset(
    encodings=X_test_tokenized,
    labels=test_labels)

# Run the reloaded model over the test set; keep only the raw class scores.
test_trainer = Trainer(model)
prediction_output = test_trainer.predict(test_dset)
preds = prediction_output.predictions
print(f"{preds.shape = }")

# Predicted class = index of the highest score in each row.
y_preds = preds.argmax(axis=1).astype(int)
print(f"{y_preds.shape = }")

preds.shape = (1633, 2)
y_preds.shape = (1633,)


In [19]:
# Per-class raw scores, one column per class.
num_classes = 2
score_columns = [f"Class_{c}_score" for c in range(num_classes)]
scores = pd.DataFrame(
    data=preds,
    columns=score_columns)

# Predicted class as a single-column frame. This also works when the cell is
# re-run and `y_preds` is already the frame produced by a previous run.
y_preds = pd.DataFrame(
    data=y_preds,
    columns=['Prediction'])

# Attach scores and predictions to the test rows. Columns left over from a
# previous run of this cell are dropped first, so re-running the cell does not
# append duplicate score / prediction columns to `test_df`.
test_df = pd.concat(
    [
        test_df
            .drop(columns=score_columns + ['Prediction'], errors='ignore')
            .reset_index(drop=True),
        scores,
        y_preds,
    ],
    axis=1)
test_df.head()

Unnamed: 0,id,label,tweet,Class_0_score,Class_1_score,Prediction
0,13539,1,black girl porn teen virgin sex pictures,-2.998579,3.328954,1
1,3168,1,@user .@user what a douchebag. like his dad! ...,-2.366719,2.525227,1
2,30372,0,", 30 rock, tracy jordan",-2.324242,2.505891,1
3,11365,0,back to schoolð¤ð #dnhs ð,2.7786,-3.102725,0
4,22540,0,yes these days they call stuff like this journ...,1.534112,-1.644753,0


In [20]:
def calculate_accuracy(df, num_classes=2, average='binary'):
    """Score predictions stored in a results frame.

    The predicted class of each row is the argmax over its
    ``Class_{c}_score`` columns and is compared with the ``label`` column.

    Args:
        df: Frame with a ``label`` column and one ``Class_{c}_score`` column
            for every ``c`` in ``range(num_classes)``.
        num_classes: Number of score columns to read.
        average: Averaging mode forwarded to precision / recall / F1; one of
            ``None``, ``'binary'``, ``'micro'``, ``'macro'``, ``'weighted'``.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``.
    """
    assert average in [None, 'binary', 'micro', 'macro', 'weighted']
    y_test = df['label'].values
    y_pred = df.loc[:, [f'Class_{c}_score' for c in range(num_classes)]] \
        .to_numpy().argmax(axis=1)

    # accuracy, precision, recall, f1-score
    acc = accuracy_score(
        y_true=y_test,
        y_pred=y_pred, normalize=True)

    # zero_division=0 matches compute_metrics: an undefined score is reported
    # as 0 without emitting a warning.
    p = precision_score(
        y_true=y_test,
        y_pred=y_pred, average=average, zero_division=0)

    r = recall_score(
        y_true=y_test,
        y_pred=y_pred, average=average, zero_division=0)

    f1 = f1_score(
        y_true=y_test,
        y_pred=y_pred, average=average, zero_division=0)

    return acc, p, r, f1

# Score the test-set predictions and report each metric on its own line.
acc, p, r, f1 = calculate_accuracy(test_df, num_classes=2)

test_report = (
    ("Accuracy", acc),
    ("Precision", p),
    ("Recall", r),
    ("F1 score", f1),
)
for metric_name, metric_value in test_report:
    print(f"Test {metric_name}: {metric_value}")

Test Accuracy: 0.8493570116350275
Test Precision: 0.8505747126436781
Test Recall: 0.7696879643387816
Test F1 score: 0.8081123244929797
