## Here is a script of training a text classifying model for the Kaggle competition ["Natural Language Processing with Disaster Tweets"](https://www.kaggle.com/competitions/nlp-getting-started/overview).
## Actual score with this approach is 0.81887.

In [None]:
!pip install datasets evaluate transformers[sentencepiece] torch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.9.0-py3-none-any.whl (462 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m462.8/462.8 KB[0m [31m29.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 KB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers[sentencepiece]
  Downloading transformers-4.26.0-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m97.5 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash
  Downloading xxhash-3.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (213 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m213.0/213.0 KB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.2.0
  Downl

In [None]:
import os
import numpy as np
import pandas as pd

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from datasets import load_dataset

#The dataset is provided on the competition's page
train_dataset = load_dataset("csv", data_files='/content/drive/MyDrive/nlp-getting-started/train.csv')



Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-b3cde6cc715c9812/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-b3cde6cc715c9812/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
from transformers import AutoTokenizer

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [None]:
import torch
from datasets import DatasetDict

#Within the tokenization function all the text, keywords, and location data
#is collected into full_text string and tokenized.
#The target column is used as label for classification.
def tokenization(raw):
    location = ""
    if raw['location'] is not None:
        location = '\n{}'.format(raw['location'])
    keyword = ""
    if raw['keyword'] is not None:
        keyword = '\n{}'.format(raw['keyword'])
    full_text = '{}{}{}'.format(raw['text'], location, keyword)
    result = raw
    result['input_ids'] = torch.tensor(tokenizer(full_text, truncation=True, padding="max_length")["input_ids"])
    result['attention_mask'] = torch.tensor(tokenizer(full_text, truncation=True, padding="max_length")["attention_mask"])
    if 'target' in result:
      result['label'] = int(result['target'])
    return result

tokenized_train = train_dataset["train"].map(tokenization)

dataset = DatasetDict({'train': tokenized_train})
dataset = dataset['train'].train_test_split()

  0%|          | 0/7613 [00:00<?, ?ex/s]

In [None]:
from transformers import DataCollatorWithPadding, TrainingArguments, AutoModelForSequenceClassification

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

#The best results for the training are gotten with 5 warmup steps and
#0.005–0.01 weight decay.
training_args = TrainingArguments(
    output_dir="test_trainer",
    overwrite_output_dir=True,
    evaluation_strategy="steps",
    eval_steps=2,
    logging_steps=2,
    num_train_epochs=12,
    per_device_train_batch_size=2,
    save_steps=2,
    save_total_limit=12,
    gradient_accumulation_steps=512,
    warmup_steps=5,
    weight_decay=0.01,
)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [None]:
import evaluate

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

In [11]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: keyword, id, target, text, location. If keyword, id, target, text, location are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 5709
  Num Epochs = 12
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 1024
  Gradient Accumulation steps = 512
  Total optimization steps = 60
  Number of trainable parameters = 66955010
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy
2,2.3922,2.260211,0.506303
4,1.9922,1.480731,0.496324
6,1.5714,0.959324,0.573004
8,0.817,0.679405,0.630777
10,0.6375,0.617675,0.678046
12,0.7571,0.587382,0.704832
14,0.5505,0.550896,0.732668
16,0.6477,0.507438,0.768908
18,0.4704,0.476905,0.780987
20,0.4514,0.475489,0.785189


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: keyword, id, target, text, location. If keyword, id, target, text, location are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1904
  Batch size = 8
Saving model checkpoint to test_trainer/checkpoint-2
Configuration saved in test_trainer/checkpoint-2/config.json
Model weights saved in test_trainer/checkpoint-2/pytorch_model.bin
tokenizer config file saved in test_trainer/checkpoint-2/tokenizer_config.json
Special tokens file saved in test_trainer/checkpoint-2/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: keyword, id, target, text, location. If keyword, id, target, text, location are not expected by `DistilBert

Step,Training Loss,Validation Loss,Accuracy
2,2.3922,2.260211,0.506303
4,1.9922,1.480731,0.496324
6,1.5714,0.959324,0.573004
8,0.817,0.679405,0.630777
10,0.6375,0.617675,0.678046
12,0.7571,0.587382,0.704832
14,0.5505,0.550896,0.732668
16,0.6477,0.507438,0.768908
18,0.4704,0.476905,0.780987
20,0.4514,0.475489,0.785189


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: keyword, id, target, text, location. If keyword, id, target, text, location are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1904
  Batch size = 8
Saving model checkpoint to test_trainer/checkpoint-28
Configuration saved in test_trainer/checkpoint-28/config.json
Model weights saved in test_trainer/checkpoint-28/pytorch_model.bin
tokenizer config file saved in test_trainer/checkpoint-28/tokenizer_config.json
Special tokens file saved in test_trainer/checkpoint-28/special_tokens_map.json
Deleting older checkpoint [test_trainer/checkpoint-4] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: keyword, id, target

TrainOutput(global_step=60, training_loss=0.5857906828324, metrics={'train_runtime': 4251.7691, 'train_samples_per_second': 16.113, 'train_steps_per_second': 0.014, 'total_flos': 8997053249316864.0, 'train_loss': 0.5857906828324, 'epoch': 11.9})

In [12]:
#The checkpoint with the least validation lost is taken
path = 'test_trainer/checkpoint-42'
result_model = AutoModelForSequenceClassification.from_pretrained(path, num_labels=2)

loading configuration file test_trainer/checkpoint-42/config.json
Model config DistilBertConfig {
  "_name_or_path": "test_trainer/checkpoint-42",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "finetuning_task": "sst-2",
  "hidden_dim": 3072,
  "id2label": {
    "0": "NEGATIVE",
    "1": "POSITIVE"
  },
  "initializer_range": 0.02,
  "label2id": {
    "NEGATIVE": 0,
    "POSITIVE": 1
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.26.0",
  "vocab_size": 30522
}

loading weights file test_trainer/checkpoint-42/pytorch_model.bin
All model checkpoint weights were used when in

In [13]:
test_dataset = load_dataset("csv", data_files='/content/drive/MyDrive/nlp-getting-started/test.csv')



Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-e3ed2d34aa40735c/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-e3ed2d34aa40735c/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [14]:
from transformers import pipeline

#All the texts from the test dataset is classified using pipeline.
#Target is marked with 0 and 1 according to the competition conditions.
def class_predicted(text):
  classif = pipeline("text-classification", model=result_model, tokenizer=tokenizer, device=0)
  result = classif(text)
  if result[0]['label'] == 'POSITIVE':
    return 1
  return 0

dataset_predicted = test_dataset.map(lambda x: {"target" : class_predicted(x["text"])})

  0%|          | 0/3263 [00:00<?, ?ex/s]

In [15]:
#Only 'id' and 'target' columns are saved as a result table.
dataset_predicted['train'].remove_columns(["keyword", "location", "text"]).to_csv("/content/drive/MyDrive/nlp-getting-started/result.csv", index=False)

Creating CSV from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

22746