In this notebook, we train the English language text-only classification models for Humanitarian Classification Task

We install libraries needed for the task.

In [1]:
!pip install tweet-preprocessor
!pip install transformers
!pip install ftfy



We use the [CrisisMMD](https://arxiv.org/abs/1805.00713) dataset comprising of 7216 Twitter posts (images + text) that are categorized into 5 humanitarian categories. We store the dataset (eg filename for DEV set : `task_humanitarian_text_img_dev.tsv`) on a Google Drive at the (relative) path `/content/drive/My Drive/crisis_bert/data/` and mount the drive at this path. We are assuming that this notebook is stored at the path /content/drive/My Drive/crisis_bert/code

In [2]:
from google.colab import drive
drive.mount('/content/drive')
import os
os.chdir('/content/drive/My Drive/crisis_bert/code')
!pwd

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/My Drive/crisis_bert/code


For each of the train, test and dev files, we preprocess the data and and pickle and store the preprocessed files at `/content/drive/My Drive/crisis_bert/data`

In [3]:
import numpy as np
import pandas as pd
import os
import sys
import pickle as pkl
import preprocessor as tp
import ftfy

def FormatText(filename):
    data_list = []
    data = pd.read_csv(filename, sep = '\t')

    # Clean the "tweet_text" column
    tp.set_options(tp.OPT.URL, tp.OPT.EMOJI, tp.OPT.SMILEY, tp.OPT.RESERVED)
    data["tweet_text"] = data["tweet_text"].apply(lambda x: tp.clean(x))
    data["tweet_text"] = data["tweet_text"].apply(lambda x : ftfy.fix_text(x))
    data["tweet_text"] = data["tweet_text"].str.replace(r'\\n',' ', regex=True) 
    data["tweet_text"] = data["tweet_text"].str.replace(r"\'t", " not")
    data["tweet_text"] = data["tweet_text"].str.strip()
    data["tweet_text"] = data["tweet_text"].str.replace("#","")
    data["tweet_text"] = data["tweet_text"].str.replace("@","")
    tweet_id = data['tweet_id'].to_list()
    image_id = data['image_id'].to_list()
    tweet_text = data['tweet_text'].to_list()
    tweet_text = [str(x) for x in tweet_text]

    label = data['label'].to_list()
    alignment = data['label_text_image'].to_list()
    for a_var in range(len(tweet_id)):
        data_point = {}
        if alignment[a_var] == 'Positive':
            data_point['tweet_id'] = tweet_id[a_var]
            data_point['image_id'] = image_id[a_var]
            data_point['tweet_text'] = tweet_text[a_var]
            data_point['label'] = label[a_var] #these labels are not final, need to be updated in a downstream script
            data_list.append(data_point)
    return data_list

folderpath = '../data/'
filenames = ['task_humanitarian_text_img_dev.tsv', 'task_humanitarian_text_img_test.tsv', 'task_humanitarian_text_img_train.tsv']

for a_file in filenames:
    data = FormatText(folderpath + a_file)
    source_text = [x['tweet_text'] for x in data]
    for a_var in range(len(data)):
        data[a_var]['tweet_text'] = [source_text[a_var]]
    with open(folderpath + a_file.split('.')[0] + '.pkl', 'wb') as f:
        pkl.dump(data, f)



We use these pickled files to convert the preprocessed data to a suitable input for building and training the model.

In [8]:
import os, sys, re

from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

import torch
import pickle as pkl
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import BertForMaskedLM, BertTokenizer

NUM_LABELS = 5

class DatasetFormatting(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

def AlignFormatLabels(list, labels):
    aligned_list = []
    for item in list:
        if item == 'vehicle_damage':
            aligned_list.append('infrastructure_and_utility_damage')
        elif item == 'missing_or_found_people' or item == 'injured_or_dead_people':
            aligned_list.append('affected_individuals')
        else:
            aligned_list.append(item)
    final_labels = []
    for item in aligned_list:
        final_labels.append(labels.index(item))
    return final_labels

def GetData(folder):
    filenames_prefix = 'task_humanitarian_text_img_'
    with open(folder + filenames_prefix + 'train.pkl', 'rb') as f:
        train_data = pkl.load(f)
    with open(folder + filenames_prefix + 'dev.pkl', 'rb') as f:
        val_data = pkl.load(f)
    with open(folder + filenames_prefix + 'test.pkl', 'rb') as f:
        test_data = pkl.load(f)
    train_texts = []
    train_labels = []
    train_ids = []
    for a_point in train_data:
        # print(a_point)
        train_ids.append(a_point['tweet_id'])
        train_texts.append(a_point['tweet_text'])
        train_labels.append(a_point['label'])
    val_texts = []
    val_labels = []
    val_ids = []
    for a_point in val_data:
        val_ids.append(a_point['tweet_id'])
        val_texts.append(a_point['tweet_text'])
        val_labels.append(a_point['label'])
    test_texts = []
    test_labels = []
    test_ids = []
    for a_point in test_data:
        test_ids.append(a_point['tweet_id'])
        test_texts.append(a_point['tweet_text'])
        test_labels.append(a_point['label'])
    my_labels = ['other_relevant_information', 'affected_individuals', 'rescue_volunteering_or_donation_effort', 'infrastructure_and_utility_damage', 'not_humanitarian']
    
    train_texts = [x[0] for x in train_texts]
    val_texts = [x[0] for x in val_texts]
    test_texts = [x[0] for x in test_texts]
    train_labels = AlignFormatLabels(train_labels, my_labels)
    val_labels = AlignFormatLabels(val_labels, my_labels)
    test_labels = AlignFormatLabels(test_labels, my_labels)
    return train_texts, train_labels, train_ids, val_texts, val_labels, val_ids, test_texts, test_labels, test_ids

pickle_folder_path = '../data/'
train_texts, train_labels, train_ids, val_texts, val_labels, val_ids, test_texts, test_labels, test_ids = GetData(pickle_folder_path)

In [9]:
import pandas as pd 
train_df = pd.DataFrame({'text': train_texts,'label': train_labels})
eval_df = pd.DataFrame({'text': val_texts,'label': val_labels})
test_df = pd.DataFrame({'text': test_texts,'label': test_labels})
train_df.head(5)

Unnamed: 0,text,label
0,KAKEnews: California wildfires destroy more th...,3
1,KAKEnews: California wildfires destroy more th...,3
2,KAKEnews: California wildfires destroy more th...,3
3,TheAtlantic: Photos of California's destructiv...,3
4,Why California's wildfires are worse in the fall.,0


In [6]:
! pip install --upgrade pandas simpletransformers



In [11]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
import sklearn

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')

train_encodings = tokenizer(train_texts, truncation=True, padding=True)

val_encodings = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

train_dataset = DatasetFormatting(train_encodings, train_labels)
val_dataset = DatasetFormatting(val_encodings, val_labels)
test_dataset = DatasetFormatting(test_encodings, test_labels)

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/411 [00:00<?, ?B/s]

In [14]:
def compute_metrics(pred):
    labels = pred.label_ids
    print(pred.predictions)
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

For demo purposes, we train only for 2 epochs.

In [16]:
#comment this line to enable logging of weights and biases 
#(you might need a wandb account and an API key for this!)
os.environ["WANDB_DISABLED"] = "true"

training_args = TrainingArguments(
    output_dir='./results_',          # output directory
    num_train_epochs=2 ,             # total number of training epochs
    per_device_train_batch_size=8,  # batchx size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    # learning_rate = 5e-5
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    save_steps = 500,
    evaluation_strategy='epoch'
)

model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=NUM_LABELS)

model.config.output_hidden_states = True

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    tokenizer = tokenizer,
    # compute_metrics=compute_metrics,
    eval_dataset=val_dataset             # evaluation dataset
)

# Training
trainer.train()
# trainer.train()
trainer.save_model('trained_models')


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0

Epoch,Training Loss,Validation Loss
1,1.0218,0.992202
2,0.5654,0.817367


Saving model checkpoint to ./results_/checkpoint-500
Configuration saved in ./results_/checkpoint-500/config.json
Model weights saved in ./results_/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./results_/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./results_/checkpoint-500/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 998
  Batch size = 16
Saving model checkpoint to ./results_/checkpoint-1000
Configuration saved in ./results_/checkpoint-1000/config.json
Model weights saved in ./results_/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./results_/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./results_/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to ./results_/checkpoint-1500
Configuration saved in ./results_/checkpoint-1500/config.json
Model weights saved in ./results_/checkpoint-1500/pytorch_model.bin
tokenizer config file saved in ./results_/checkpoint-1500/token

In [17]:
trainer.save_model('trained_models')
torch.save(model.state_dict(), 'trained_models_new/trained_model.pt')

Saving model checkpoint to trained_models
Configuration saved in trained_models/config.json
Model weights saved in trained_models/pytorch_model.bin
tokenizer config file saved in trained_models/tokenizer_config.json
Special tokens file saved in trained_models/special_tokens_map.json


In [18]:
from tqdm import tqdm
device = "cuda:0"
model = model.to(device)

embeddings_train = {}
embeddings_val = {}
embeddings_test = {}
for i in tqdm(range(0,len(train_ids))):
    input = torch.tensor(tokenizer.encode(train_texts[i])).unsqueeze(0).to(device)
    outputs = model(input)
    embedding = torch.mean(outputs.hidden_states[-1], 1, True).cpu()
    embedding = embedding.detach().numpy()
    embedding = np.reshape(embedding, (embedding.shape[0], embedding.shape[-1])) 
    embeddings_train[train_ids[i]] = embedding

for i in tqdm(range(0,len(val_ids))):
    input = torch.tensor(tokenizer.encode(val_texts[i])).unsqueeze(0).to(device)
    outputs = model(input)
    embedding = torch.mean(outputs.hidden_states[-1], 1, True).cpu()
    embedding = embedding.detach().numpy()
    embedding = np.reshape(embedding, (embedding.shape[0], embedding.shape[-1])) 
    embeddings_val[val_ids[i]] = embedding

for i in tqdm(range(0,len(test_ids))):
    input = torch.tensor(tokenizer.encode(test_texts[i])).unsqueeze(0).to(device)
    outputs = model(input)
    embedding = torch.mean(outputs.hidden_states[-1], 1, True).cpu()
    embedding = embedding.detach().numpy()
    embedding = np.reshape(embedding, (embedding.shape[0], embedding.shape[-1])) 
    embeddings_test[test_ids[i]] = embedding



100%|██████████| 6126/6126 [00:43<00:00, 140.24it/s]
100%|██████████| 998/998 [00:07<00:00, 139.72it/s]
100%|██████████| 955/955 [00:06<00:00, 139.03it/s]


In [19]:
import pickle
with open('embeddings_train.pickle', 'wb') as handle:
    pickle.dump(embeddings_train, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('embeddings_val.pickle', 'wb') as handle:
    pickle.dump(embeddings_val, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('embeddings_test.pickle', 'wb') as handle:
    pickle.dump(embeddings_test, handle, protocol=pickle.HIGHEST_PROTOCOL)


In [20]:
# Evaluation
trainer.evaluate()
#Testing
probs, _, metrics = trainer.predict(test_dataset)
# print(type(probs[0]))
pred_labels = probs[0].argmax(-1)
precision, recall, f1, _ = precision_recall_fscore_support(test_labels, pred_labels, average='macro')
acc = accuracy_score(test_labels, pred_labels)
test_dict = {'accuracy': acc,
'f1': f1,
'precision': precision,
'recall': recall}
print(test_dict)
print(metrics)

***** Running Evaluation *****
  Num examples = 998
  Batch size = 16


***** Running Prediction *****
  Num examples = 955
  Batch size = 16


{'accuracy': 0.7015706806282722, 'f1': 0.48216447110445754, 'precision': 0.4931934873651304, 'recall': 0.4774533753611768}
{'test_loss': 0.8699346780776978, 'test_runtime': 3.9618, 'test_samples_per_second': 241.049, 'test_steps_per_second': 15.144}


  _warn_prf(average, modifier, msg_start, len(result))
