In this notebook, we train the Hindi language text-only classification models for Humanitarian Classification Task

We install libraries needed for the task.

In [1]:
!pip install tweet-preprocessor
!pip install transformers
!pip install ftfy



We use the [CrisisMMD](https://arxiv.org/abs/1805.00713) dataset comprising of 7216 Twitter posts (images + text) that are categorized into 5 humanitarian categories. We store the dataset (eg filename for DEV set : `task_humanitarian_text_img_dev.tsv`) on a Google Drive at the (relative) path `/content/drive/My Drive/crisis_bert/data/` and mount the drive at this path. We are assuming that this notebook is stored at the path /content/drive/My Drive/crisis_bert/code

In [2]:
from google.colab import drive
drive.mount('/content/drive')
import os
os.chdir('/content/drive/My Drive/crisis_bert/code')
!pwd

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/My Drive/crisis_bert/code


For each of the train, test and dev files, we preprocess the data and and pickle and store the preprocessed files at `/content/drive/My Drive/crisis_bert/data`

In [3]:
import numpy as np
import pandas as pd
import os
import sys
import pickle as pkl
import preprocessor as tp
import ftfy

def FormatText(filename):
    data_list = []
    data = pd.read_csv(filename, sep = '\t')

    # Clean the "tweet_text" column
    tp.set_options(tp.OPT.URL, tp.OPT.EMOJI, tp.OPT.SMILEY, tp.OPT.RESERVED)
    data["tweet_text"] = data["tweet_text"].apply(lambda x: tp.clean(x))
    data["tweet_text"] = data["tweet_text"].apply(lambda x : ftfy.fix_text(x))
    data["tweet_text"] = data["tweet_text"].str.replace(r'\\n',' ', regex=True) 
    data["tweet_text"] = data["tweet_text"].str.replace(r"\'t", " not")
    data["tweet_text"] = data["tweet_text"].str.strip()
    data["tweet_text"] = data["tweet_text"].str.replace("#","")
    data["tweet_text"] = data["tweet_text"].str.replace("@","")
    tweet_id = data['tweet_id'].to_list()
    image_id = data['image_id'].to_list()
    tweet_text = data['tweet_text'].to_list()
    tweet_text = [str(x) for x in tweet_text]

    label = data['label'].to_list()
    alignment = data['label_text_image'].to_list()
    for a_var in range(len(tweet_id)):
        data_point = {}
        if alignment[a_var] == 'Positive':
            data_point['tweet_id'] = tweet_id[a_var]
            data_point['image_id'] = image_id[a_var]
            data_point['tweet_text'] = tweet_text[a_var]
            data_point['label'] = label[a_var] #these labels are not final, need to be updated in a downstream script
            data_list.append(data_point)
    return data_list

folderpath = '../data/'
filenames = ['task_humanitarian_text_img_dev.tsv', 'task_humanitarian_text_img_test.tsv', 'task_humanitarian_text_img_train.tsv']

for a_file in filenames:
    data = FormatText(folderpath + a_file)
    source_text = [x['tweet_text'] for x in data]
    for a_var in range(len(data)):
        data[a_var]['tweet_text'] = [source_text[a_var]]
    with open(folderpath + a_file.split('.')[0] + '.pkl', 'wb') as f:
        pkl.dump(data, f)



We use these pickled files to convert the preprocessed data to a suitable input for building and training the model.

In [4]:
import os, sys, re
import torch
import pickle as pkl
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import BertForMaskedLM, BertTokenizer

NUM_LABELS = 5

class DatasetFormatting(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

def AlignFormatLabels(list, labels):
    aligned_list = []
    for item in list:
        if item == 'vehicle_damage':
            aligned_list.append('infrastructure_and_utility_damage')
        elif item == 'missing_or_found_people' or item == 'injured_or_dead_people':
            aligned_list.append('affected_individuals')
        else:
            aligned_list.append(item)
    final_labels = []
    for item in aligned_list:
        final_labels.append(labels.index(item))
    return final_labels

def GetData(folder):
    filenames_prefix = 'task_humanitarian_text_img_'
    with open(folder + filenames_prefix + 'train.pkl', 'rb') as f:
        train_data = pkl.load(f)
    with open(folder + filenames_prefix + 'dev.pkl', 'rb') as f:
        val_data = pkl.load(f)
    with open(folder + filenames_prefix + 'test.pkl', 'rb') as f:
        test_data = pkl.load(f)
    train_texts = []
    train_labels = []
    train_ids = []
    for a_point in train_data:
        # print(a_point)
        train_ids.append(a_point['tweet_id'])
        train_texts.append(a_point['tweet_text'])
        train_labels.append(a_point['label'])
    val_texts = []
    val_labels = []
    val_ids = []
    for a_point in val_data:
        val_ids.append(a_point['tweet_id'])
        val_texts.append(a_point['tweet_text'])
        val_labels.append(a_point['label'])
    test_texts = []
    test_labels = []
    test_ids = []
    for a_point in test_data:
        test_ids.append(a_point['tweet_id'])
        test_texts.append(a_point['tweet_text'])
        test_labels.append(a_point['label'])
    my_labels = ['other_relevant_information', 'affected_individuals', 'rescue_volunteering_or_donation_effort', 'infrastructure_and_utility_damage', 'not_humanitarian']
    
    train_texts = [x[0] for x in train_texts]
    val_texts = [x[0] for x in val_texts]
    test_texts = [x[0] for x in test_texts]
    train_labels = AlignFormatLabels(train_labels, my_labels)
    val_labels = AlignFormatLabels(val_labels, my_labels)
    test_labels = AlignFormatLabels(test_labels, my_labels)
    return train_texts, train_labels, train_ids, val_texts, val_labels, val_ids, test_texts, test_labels, test_ids

pickle_folder_path = '../data/'
train_texts, train_labels, train_ids, val_texts, val_labels, val_ids, test_texts, test_labels, test_ids = GetData(pickle_folder_path)

In [5]:
import pandas as pd 
train_df = pd.DataFrame({'text': train_texts,'label': train_labels})
eval_df = pd.DataFrame({'text': val_texts,'label': val_labels})
test_df = pd.DataFrame({'text': test_texts,'label': test_labels})
train_df.head(5)

Unnamed: 0,text,label
0,KAKEnews: California wildfires destroy more th...,3
1,KAKEnews: California wildfires destroy more th...,3
2,KAKEnews: California wildfires destroy more th...,3
3,TheAtlantic: Photos of California's destructiv...,3
4,Why California's wildfires are worse in the fall.,0


In [6]:
! pip install --upgrade pandas simpletransformers



In [7]:
# set use_cuda=False on CPU-only platforms
from simpletransformers.classification import ClassificationModel
import sklearn

model = ClassificationModel('electra', 'monsoon-nlp/hindi-bert', num_labels=NUM_LABELS, use_cuda=True, args={
    'reprocess_input_data': True,
    'use_cached_eval_features': False,
    'overwrite_output_dir': True,
    'num_train_epochs': 1,
    'output_dir' : './results_',          # output directory
    'train_batch_size' : 8,  # batchx size per device during training
    'eval_batch_size' : 16,   # batch size for evaluation
    'warmup_steps' : 500,                # number of warmup steps for learning rate scheduler
    # learning_rate = 5e-5
    'weight_decay' : 0.01,               # strength of weight decay
    'logging_dir' : './logs',            # directory for storing logs
    'logging_steps' : 10,
    'save_steps' : 500,
    'evaluation_strategy' : 'epoch'
})
model.train_model(train_df, acc=sklearn.metrics.accuracy_score)

Downloading:   0%|          | 0.00/572 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/56.2M [00:00<?, ?B/s]

Some weights of the model checkpoint at monsoon-nlp/hindi-bert were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monsoon-nlp/hindi-bert and are newly initialized: ['classifier.dense.bias', 'classifie

Downloading:   0%|          | 0.00/181 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/593k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/6126 [00:00<?, ?it/s]



Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/766 [00:00<?, ?it/s]

(766, 1.3769921153083482)

In [8]:
result, model_outputs, wrong_predictions = model.eval_model(eval_df, acc=sklearn.metrics.accuracy_score)

  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


  0%|          | 0/998 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/63 [00:00<?, ?it/s]

In [9]:
#note that order of target_names must be aligned with 0,1,2,3,4

predictions, raw_outputs = model.predict(test_texts)
from sklearn.metrics import classification_report
target_names=['other_relevant_information', 'affected_individuals', 'rescue_volunteering_or_donation_effort', 'infrastructure_and_utility_damage', 'not_humanitarian']
print(classification_report(test_labels, predictions,target_names=target_names))

  0%|          | 0/955 [00:00<?, ?it/s]

  0%|          | 0/60 [00:00<?, ?it/s]

                                        precision    recall  f1-score   support

            other_relevant_information       0.00      0.00      0.00       235
                  affected_individuals       0.00      0.00      0.00         9
rescue_volunteering_or_donation_effort       0.00      0.00      0.00       126
     infrastructure_and_utility_damage       0.00      0.00      0.00        81
                      not_humanitarian       0.53      1.00      0.69       504

                              accuracy                           0.53       955
                             macro avg       0.11      0.20      0.14       955
                          weighted avg       0.28      0.53      0.36       955



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
