In [1]:
import torch
from torch.utils.data import DataLoader
import csv
from tqdm.notebook import tqdm

# !jupyter nbextension enable --py widgetsnbextension

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}")

Device: cuda


In [3]:
import pandas as pd

train_data_path = "/home/liefe/Downloads/bbc-text.csv"
df = pd.read_csv(train_data_path)

df.head()

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [4]:
pd.unique(df.category)

array(['tech', 'business', 'sport', 'entertainment', 'politics'],
      dtype=object)

In [5]:
df.category.value_counts()

sport            511
business         510
politics         417
tech             401
entertainment    386
Name: category, dtype: int64

In [6]:
df2 = pd.get_dummies(df["category"])
df2

Unnamed: 0,business,entertainment,politics,sport,tech
0,0,0,0,0,1
1,1,0,0,0,0
2,0,0,0,1,0
3,0,0,0,1,0
4,0,1,0,0,0
...,...,...,...,...,...
2220,1,0,0,0,0
2221,0,0,1,0,0
2222,0,1,0,0,0
2223,0,0,1,0,0


In [7]:
data = pd.concat([df, df2], axis=1)
data

Unnamed: 0,category,text,business,entertainment,politics,sport,tech
0,tech,tv future in the hands of viewers with home th...,0,0,0,0,1
1,business,worldcom boss left books alone former worldc...,1,0,0,0,0
2,sport,tigers wary of farrell gamble leicester say ...,0,0,0,1,0
3,sport,yeading face newcastle in fa cup premiership s...,0,0,0,1,0
4,entertainment,ocean s twelve raids box office ocean s twelve...,0,1,0,0,0
...,...,...,...,...,...,...,...
2220,business,cars pull down us retail figures us retail sal...,1,0,0,0,0
2221,politics,kilroy unveils immigration policy ex-chatshow ...,0,0,1,0,0
2222,entertainment,rem announce new glasgow concert us band rem h...,0,1,0,0,0
2223,politics,how political squabbles snowball it s become c...,0,0,1,0,0


In [8]:
pd.factorize(df["category"])

(array([0, 1, 2, ..., 3, 4, 2]),
 Index(['tech', 'business', 'sport', 'entertainment', 'politics'], dtype='object'))

In [9]:
df["category"] = pd.factorize(df["category"])[0]
df

Unnamed: 0,category,text
0,0,tv future in the hands of viewers with home th...
1,1,worldcom boss left books alone former worldc...
2,2,tigers wary of farrell gamble leicester say ...
3,2,yeading face newcastle in fa cup premiership s...
4,3,ocean s twelve raids box office ocean s twelve...
...,...,...
2220,1,cars pull down us retail figures us retail sal...
2221,4,kilroy unveils immigration policy ex-chatshow ...
2222,3,rem announce new glasgow concert us band rem h...
2223,4,how political squabbles snowball it s become c...


In [10]:
from datasets import Dataset

ds = Dataset.from_pandas(df)
ds

Dataset({
    features: ['category', 'text'],
    num_rows: 2225
})

In [11]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Get HF tokenizer, let's try this one:
tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")

In [12]:
# Set up tokenizer to encode whole dataset
ds = ds.map(lambda example: tokenizer(example["text"], padding="max_length", truncation=True),
            batched=True,
            remove_columns=["text"])
print(ds[0])

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00,  6.63ba/s]

{'category': 0, 'input_ids': [0, 18724, 499, 11, 5, 1420, 9, 5017, 19, 184, 8870, 1743, 1437, 29051, 239, 12, 38835, 326, 15597, 1437, 8, 1778, 569, 638, 268, 1375, 88, 5, 1207, 929, 1437, 5, 169, 82, 1183, 30016, 40, 28, 26396, 430, 11, 292, 107, 1437, 86, 4, 1437, 14, 16, 309, 7, 41, 3827, 2798, 61, 4366, 23, 5, 1013, 2267, 8917, 311, 11, 5573, 5030, 16306, 7, 2268, 141, 209, 92, 4233, 40, 913, 65, 9, 84, 5548, 375, 9452, 4, 19, 5, 201, 981, 5, 2904, 1437, 8864, 8, 97, 1383, 40, 28, 2781, 7, 5017, 1241, 184, 4836, 1437, 149, 6129, 1437, 7595, 1437, 9146, 29, 451, 1437, 8, 11451, 544, 4898, 7, 760, 5351, 8, 15295, 2110, 4, 1437, 65, 9, 5, 144, 3244, 12, 9006, 4233, 9, 740, 293, 34, 57, 1778, 8, 1081, 569, 638, 268, 36, 417, 37032, 8, 181, 37032, 322, 209, 278, 12, 8766, 7644, 1437, 101, 5, 201, 579, 326, 9697, 8, 5, 1717, 330, 579, 6360, 2744, 467, 1437, 1157, 82, 7, 638, 1437, 1400, 1437, 310, 1437, 13787, 8, 556, 2508, 30016, 8864, 77, 51, 236, 4, 1437, 5700, 1437, 5, 806, 2386, 13,




In [13]:
len(ds[0]["input_ids"])

512

Thus the input has been truncated to max=512

In [14]:
# Remove unnecessary columns/can also set above
ds = ds.rename_column("category", "labels")
print(ds[0])

{'labels': 0, 'input_ids': [0, 18724, 499, 11, 5, 1420, 9, 5017, 19, 184, 8870, 1743, 1437, 29051, 239, 12, 38835, 326, 15597, 1437, 8, 1778, 569, 638, 268, 1375, 88, 5, 1207, 929, 1437, 5, 169, 82, 1183, 30016, 40, 28, 26396, 430, 11, 292, 107, 1437, 86, 4, 1437, 14, 16, 309, 7, 41, 3827, 2798, 61, 4366, 23, 5, 1013, 2267, 8917, 311, 11, 5573, 5030, 16306, 7, 2268, 141, 209, 92, 4233, 40, 913, 65, 9, 84, 5548, 375, 9452, 4, 19, 5, 201, 981, 5, 2904, 1437, 8864, 8, 97, 1383, 40, 28, 2781, 7, 5017, 1241, 184, 4836, 1437, 149, 6129, 1437, 7595, 1437, 9146, 29, 451, 1437, 8, 11451, 544, 4898, 7, 760, 5351, 8, 15295, 2110, 4, 1437, 65, 9, 5, 144, 3244, 12, 9006, 4233, 9, 740, 293, 34, 57, 1778, 8, 1081, 569, 638, 268, 36, 417, 37032, 8, 181, 37032, 322, 209, 278, 12, 8766, 7644, 1437, 101, 5, 201, 579, 326, 9697, 8, 5, 1717, 330, 579, 6360, 2744, 467, 1437, 1157, 82, 7, 638, 1437, 1400, 1437, 310, 1437, 13787, 8, 556, 2508, 30016, 8864, 77, 51, 236, 4, 1437, 5700, 1437, 5, 806, 2386, 13, 2

In [15]:
# Convert to torch tensors.
# Strange things happen if you forget this, like time major ds
ds.set_format(type="torch", device=device)
print(ds[0])

{'labels': tensor(0, device='cuda:0'), 'input_ids': tensor([    0, 18724,   499,    11,     5,  1420,     9,  5017,    19,   184,
         8870,  1743,  1437, 29051,   239,    12, 38835,   326, 15597,  1437,
            8,  1778,   569,   638,   268,  1375,    88,     5,  1207,   929,
         1437,     5,   169,    82,  1183, 30016,    40,    28, 26396,   430,
           11,   292,   107,  1437,    86,     4,  1437,    14,    16,   309,
            7,    41,  3827,  2798,    61,  4366,    23,     5,  1013,  2267,
         8917,   311,    11,  5573,  5030, 16306,     7,  2268,   141,   209,
           92,  4233,    40,   913,    65,     9,    84,  5548,   375,  9452,
            4,    19,     5,   201,   981,     5,  2904,  1437,  8864,     8,
           97,  1383,    40,    28,  2781,     7,  5017,  1241,   184,  4836,
         1437,   149,  6129,  1437,  7595,  1437,  9146,    29,   451,  1437,
            8, 11451,   544,  4898,     7,   760,  5351,     8, 15295,  2110,
            

In [16]:
ds = ds.train_test_split(.1)

In [17]:
ds

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 2002
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 223
    })
})

In [18]:
dataloaders = {partition: DataLoader(ds[partition], batch_size=8, shuffle=True) for partition in ds.keys()}
dataloaders

{'train': <torch.utils.data.dataloader.DataLoader at 0x7fe9f477f850>,
 'test': <torch.utils.data.dataloader.DataLoader at 0x7fe9f47f99c0>}

In [19]:
for batch in dataloaders["train"]:
    print(batch.values())
    break

dict_values([tensor([3, 4, 0, 0, 2, 4, 0, 2], device='cuda:0'), tensor([[    0,   687,  4440,  ...,     1,     1,     1],
        [    0,  2413, 12778,  ...,  3825,    11,     2],
        [    0,  4291,  2607,  ...,    16, 11190,     2],
        ...,
        [    0,  4651,  2767,  ...,    32,   103,     2],
        [    0, 36436, 20654,  ...,     1,     1,     1],
        [    0,  4651, 24773,  ...,    35, 28041,     2]], device='cuda:0'), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1]], device='cuda:0')])


In [20]:
model = AutoModelForSequenceClassification.from_pretrained("distilroberta-base", num_labels=5)
model.to(device)

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.out_proj.bias

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

In [21]:
from transformers import AdamW

# Set weight decay on params
names_params = list(model.named_parameters())
no_decay = ("bias", "gamma", "beta")
optimizer_grouped_params = (
    dict(params=(p for n, p in names_params if not any(nd in n for nd in no_decay)),
         weight_decay=0.01),
    dict(params=(p for n, p in names_params if any(nd in n for nd in no_decay)),
         weight_decay=0.0)
)

lr = 5e-5
optimizer = AdamW(optimizer_grouped_params, lr=lr, correct_bias=False)



In [22]:
# from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# def compute_metrics(p):    
#     pred, labels = p
#     pred = np.argmax(pred, axis=1)
#     accuracy = accuracy_score(y_true=labels, y_pred=pred)
#     recall = recall_score(y_true=labels, y_pred=pred)
#     precision = precision_score(y_true=labels, y_pred=pred)
#     f1 = f1_score(y_true=labels, y_pred=pred)    
#     return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1} 


In [23]:
from transformers import Trainer, TrainingArguments
import numpy as np

torch.cuda.empty_cache()

args = TrainingArguments(output_dir=".",
                         per_device_train_batch_size=8,
                         per_device_eval_batch_size=8,
                         dataloader_pin_memory=False,
                         evaluation_strategy="epoch",
                         gradient_accumulation_steps=3,
                         learning_rate=5e-5,
                         warmup_steps=1000,
                         optim="adamw_hf")

trainer = Trainer(model=model,
                  args=args,
                  train_dataset=ds["train"],
                  eval_dataset=ds["test"],
                  #compute_metrics=compute_metrics
                 )
trainer.train()


***** Running training *****
  Num examples = 2002
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 24
  Gradient Accumulation steps = 3
  Total optimization steps = 249


Epoch,Training Loss,Validation Loss
0,No log,1.521323
1,No log,0.098699
2,No log,0.018713


***** Running Evaluation *****
  Num examples = 223
  Batch size = 8
***** Running Evaluation *****
  Num examples = 223
  Batch size = 8
***** Running Evaluation *****
  Num examples = 223
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=249, training_loss=0.7996157604049009, metrics={'train_runtime': 189.3994, 'train_samples_per_second': 31.711, 'train_steps_per_second': 1.315, 'total_flos': 794317016616960.0, 'train_loss': 0.7996157604049009, 'epoch': 2.99})

In [24]:
# trainer.evaluate(ds["test"], metric_key_prefix="eval_acc")

In [25]:
out = trainer.predict(ds["test"])

***** Running Prediction *****
  Num examples = 223
  Batch size = 8


In [27]:
out

PredictionOutput(predictions=array([[-1.454209  ,  5.1654735 , -1.1436566 , -1.540897  , -1.344166  ],
       [ 0.45798662,  4.688294  , -1.6425015 , -1.6239082 , -1.738363  ],
       [-1.383109  , -1.576714  ,  5.087396  , -1.1475655 , -0.9978528 ],
       ...,
       [-1.2850131 , -0.9422651 , -1.4965856 , -1.3963528 ,  4.7641954 ],
       [-0.9424531 , -1.325981  , -1.2116297 ,  4.647848  , -1.041074  ],
       [ 5.0915403 , -1.0789853 , -1.3039528 , -1.0373774 , -1.2336445 ]],
      dtype=float32), label_ids=array([1, 1, 2, 0, 0, 3, 1, 1, 4, 3, 4, 2, 1, 4, 2, 4, 3, 1, 0, 3, 3, 3,
       2, 1, 4, 1, 1, 0, 0, 0, 1, 3, 2, 0, 2, 1, 1, 0, 4, 0, 3, 3, 1, 1,
       3, 2, 0, 3, 1, 1, 1, 2, 1, 2, 2, 1, 1, 4, 2, 3, 0, 0, 2, 3, 4, 4,
       4, 2, 4, 1, 4, 1, 3, 2, 3, 3, 0, 3, 3, 1, 3, 1, 0, 1, 3, 2, 0, 0,
       1, 0, 2, 4, 3, 4, 3, 3, 0, 1, 1, 3, 2, 1, 4, 1, 1, 1, 1, 3, 2, 3,
       4, 2, 4, 2, 2, 3, 2, 2, 4, 3, 0, 3, 2, 0, 4, 2, 1, 4, 0, 3, 2, 3,
       2, 1, 4, 2, 4, 1, 3, 1, 1, 4, 1, 1, 4

In [28]:
y_pred, y_true, _  = out

In [29]:
y_pred

array([[-1.454209  ,  5.1654735 , -1.1436566 , -1.540897  , -1.344166  ],
       [ 0.45798662,  4.688294  , -1.6425015 , -1.6239082 , -1.738363  ],
       [-1.383109  , -1.576714  ,  5.087396  , -1.1475655 , -0.9978528 ],
       ...,
       [-1.2850131 , -0.9422651 , -1.4965856 , -1.3963528 ,  4.7641954 ],
       [-0.9424531 , -1.325981  , -1.2116297 ,  4.647848  , -1.041074  ],
       [ 5.0915403 , -1.0789853 , -1.3039528 , -1.0373774 , -1.2336445 ]],
      dtype=float32)

In [30]:
y_pred = np.argmax(y_pred, axis=1)
y_pred

array([1, 1, 2, 0, 0, 3, 1, 1, 4, 3, 4, 2, 1, 4, 2, 4, 3, 1, 0, 3, 3, 3,
       2, 1, 4, 1, 1, 0, 0, 0, 1, 3, 2, 0, 2, 1, 1, 0, 4, 0, 3, 3, 1, 1,
       3, 2, 0, 3, 1, 1, 1, 2, 1, 2, 2, 1, 1, 4, 2, 3, 0, 0, 2, 3, 4, 4,
       4, 2, 4, 1, 4, 1, 3, 2, 3, 3, 0, 3, 3, 1, 3, 1, 0, 1, 3, 2, 0, 0,
       1, 0, 2, 4, 3, 4, 3, 3, 0, 1, 1, 3, 2, 1, 4, 1, 1, 1, 1, 3, 2, 3,
       4, 2, 4, 2, 2, 3, 2, 2, 4, 3, 0, 3, 2, 0, 4, 2, 1, 4, 0, 3, 2, 3,
       2, 1, 4, 2, 4, 1, 3, 1, 1, 4, 1, 1, 4, 1, 2, 4, 4, 0, 1, 1, 4, 2,
       2, 0, 1, 3, 2, 4, 2, 0, 4, 0, 1, 2, 3, 4, 4, 3, 1, 4, 1, 1, 1, 4,
       4, 1, 1, 3, 1, 2, 4, 0, 2, 2, 1, 3, 0, 2, 0, 2, 1, 1, 2, 0, 3, 1,
       0, 3, 2, 0, 1, 4, 2, 0, 3, 4, 0, 0, 1, 4, 2, 0, 4, 1, 3, 2, 0, 1,
       4, 3, 0])

In [31]:
from sklearn.metrics import classification_report, confusion_matrix

confusion_matrix(y_true, y_pred)

array([[37,  0,  0,  0,  0],
       [ 0, 59,  0,  0,  0],
       [ 0,  1, 44,  0,  0],
       [ 0,  0,  0, 42,  0],
       [ 0,  0,  0,  0, 40]])

In [33]:
classification_report(y_true, y_pred)

'              precision    recall  f1-score   support\n\n           0       1.00      1.00      1.00        37\n           1       0.98      1.00      0.99        59\n           2       1.00      0.98      0.99        45\n           3       1.00      1.00      1.00        42\n           4       1.00      1.00      1.00        40\n\n    accuracy                           1.00       223\n   macro avg       1.00      1.00      1.00       223\nweighted avg       1.00      1.00      1.00       223\n'