In [None]:
!pip install tweet-preprocessor
!pip install transformers
!pip install ftfy

In [None]:
# Mount Google Drive in the Colab runtime and make the project folder the working
# directory; the later cells open their files through relative './' paths.
from google.colab import drive
# NOTE(review): `_mount` is a private helper of google.colab.drive - confirm it is
# used on purpose rather than the public `drive.mount`.
drive._mount('/content/drive/', force_remount=True)
import os
os.chdir('/content/drive/My Drive/ColabFiles/MultimodalLanguageDisparity')
!pwd

In [None]:
#!pip install torch==1.4.0
import torch
# torch.__version__

In [None]:
# Display the name of CUDA device 0; show a message instead of raising when the
# runtime has no CUDA device.
torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CUDA is not available'

In [None]:
import numpy as np
import pandas as pd
import os
import sys
import pickle as pkl
import preprocessor as tp
import ftfy

def FormatText(filename):
    """Read a tab-separated annotation file and return its cleaned, aligned rows.

    The "tweet_text" column is cleaned in this order: ``tp.clean`` (with the URL,
    EMOJI, SMILEY and RESERVED options set), ``ftfy.fix_text``, literal ``\\n``
    sequences replaced by a space, ``'t`` replaced by `` not``, surrounding
    whitespace stripped, and ``#`` / ``@`` characters removed. The text is finally
    lower-cased.

    Args:
        filename: Path of a TSV file with the columns ``tweet_id``, ``image_id``,
            ``tweet_text``, ``label`` and ``label_text_image``.

    Returns:
        A list of dicts with the keys ``tweet_id``, ``image_id``, ``tweet_text``
        and ``label``, one per row whose ``label_text_image`` equals ``'Positive'``.
    """
    data = pd.read_csv(filename, sep='\t')

    # Clean the "tweet_text" column
    tp.set_options(tp.OPT.URL, tp.OPT.EMOJI, tp.OPT.SMILEY, tp.OPT.RESERVED)
    text = data["tweet_text"].apply(tp.clean).apply(ftfy.fix_text)
    # `regex` is spelled out on every call: the default of Series.str.replace
    # differs between pandas versions, and the "'t" pattern below only matches
    # when it is interpreted as a regular expression.
    text = text.str.replace(r'\\n', ' ', regex=True)
    text = text.str.replace(r"\'t", " not", regex=True)
    text = text.str.strip()
    text = text.str.replace("#", "", regex=False)
    text = text.str.replace("@", "", regex=False)

    rows = zip(
        data['tweet_id'].to_list(),
        data['image_id'].to_list(),
        text.to_list(),
        data['label'].to_list(),
        data['label_text_image'].to_list(),
    )
    # Keep only the rows whose text/image alignment is annotated as 'Positive'.
    return [
        {
            'tweet_id': tweet_id,
            'image_id': image_id,
            'tweet_text': str(tweet_text).lower(),
            'label': label,
        }
        for tweet_id, image_id, tweet_text, label, alignment in rows
        if alignment == 'Positive'
    ]

folderpath = './'
!pwd
filenames = ['task_humanitarian_text_img_dev.tsv', 'task_humanitarian_text_img_test.tsv', 'task_humanitarian_text_img_train.tsv']

for a_file in filenames:
    data = FormatText(folderpath + a_file)
    source_text = [x['tweet_text'] for x in data]
    for a_var in range(len(data)):
        data[a_var]['tweet_text'] = [source_text[a_var]]
    with open(folderpath + a_file.split('.')[0] + '.pkl', 'wb') as f:
        pkl.dump(data, f)

In [None]:
#@title
import os, sys, re
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
import torch
import pickle as pkl
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

NUM_LABELS = 5

class DatasetFormatting(torch.utils.data.Dataset):
    """Dataset that pairs tokenizer encodings with their labels.

    ``encodings`` is a mapping from field name to a per-example sequence and
    ``labels`` is a per-example sequence; item ``idx`` is a dict holding every
    encoding field plus a ``'labels'`` entry, all converted to tensors.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is defined by the labels.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

def appendCaptions(data):
    """Append an image caption to the text of every data point, in place.

    Captions are read from 'final_captions_all.pkl' and looked up by
    ``image_id + '.jpg'``, falling back to ``image_id + '.png'``. Only the part
    of the caption before the first '<br>' is appended, separated by a space.

    Args:
        data: List of dicts with an ``image_id`` and a one-element
            ``tweet_text`` list.

    Returns:
        The same ``data`` list, mutated.

    Raises:
        KeyError: If neither the '.jpg' nor the '.png' key exists.
    """
    with open('final_captions_all.pkl', 'rb') as f:
        captions = pkl.load(f)

    for a_point in data:
        image_id = a_point['image_id']
        try:
            caption = captions[image_id + '.jpg']
        except KeyError:
            caption = captions[image_id + '.png']
        first_caption = caption.split('<br>')[0]
        a_point['tweet_text'][0] = a_point['tweet_text'][0] + ' ' + first_caption
    return data

def AppendText(data):
    """Append extra text to the tweet text of every matching data point, in place.

    The extra text is read from './image_text_kw.pkl', a pickled list of dicts
    with ``tweet_id`` and ``text`` keys (per the original comments it is used for
    evaluating the closest image's caption / the rule-based models). Tweet ids are
    compared as strings; every matching entry is appended, in file order,
    separated by a space.

    Args:
        data: List of dicts with a ``tweet_id`` and a one-element ``tweet_text``
            list.

    Returns:
        The same ``data`` list, mutated.
    """
    # The file name previously read 'image_text_kw,pkl' (a comma where the
    # extension dot belongs), which cannot match a '.pkl' file.
    with open('./image_text_kw.pkl', 'rb') as f:
        extra_text = pkl.load(f)

    # Index the extra entries by stringified tweet id once, instead of rebuilding
    # the full id list and rescanning it for every data point.
    entries_by_id = {}
    for elt in extra_text:
        entries_by_id.setdefault(str(elt['tweet_id']), []).append(elt)

    for a_point in data:
        for elt in entries_by_id.get(str(a_point['tweet_id']), ()):
            a_point['tweet_text'][0] = a_point['tweet_text'][0] + ' ' + elt['text']
    return data

def AlignFormatLabels(list, labels):
    """Map raw label names to integer indices into ``labels``.

    'vehicle_damage' is folded into 'infrastructure_and_utility_damage', and
    'missing_or_found_people' / 'injured_or_dead_people' into
    'affected_individuals'; every other name is kept as is.

    Args:
        list: Iterable of raw label names.
        labels: Ordered list of the final label names.

    Returns:
        A list with the position of each (merged) label inside ``labels``.

    Raises:
        ValueError: If a (merged) label is not present in ``labels``.
    """
    merged_into = {
        'vehicle_damage': 'infrastructure_and_utility_damage',
        'missing_or_found_people': 'affected_individuals',
        'injured_or_dead_people': 'affected_individuals',
    }
    return [labels.index(merged_into.get(item, item)) for item in list]

def GetData(folder):
    """Load the pickled train/dev/test splits and unpack them into parallel lists.

    Reads ``task_humanitarian_text_img_{train,dev,test}.pkl`` from ``folder``
    (which is concatenated as a prefix), prints the first record of each split,
    and converts the labels to indices with ``AlignFormatLabels``.

    Args:
        folder: Path prefix of the directory holding the pickles.

    Returns:
        A 9-tuple ``(train_texts, train_labels, train_ids, val_texts, val_labels,
        val_ids, test_texts, test_labels, test_ids)``. Texts are the first element
        of each record's ``tweet_text`` list; labels are integer indices.
    """
    filenames_prefix = 'task_humanitarian_text_img_'
    my_labels = ['affected_individuals', 'infrastructure_and_utility_damage', 'not_humanitarian', 'other_relevant_information', 'rescue_volunteering_or_donation_effort']

    def _load(split):
        # One pickle per split, named <prefix><split>.pkl.
        with open(folder + filenames_prefix + split + '.pkl', 'rb') as f:
            return pkl.load(f)

    def _unpack(points):
        # Split a list of records into (texts, label indices, tweet ids).
        ids = [a_point['tweet_id'] for a_point in points]
        texts = [a_point['tweet_text'][0] for a_point in points]
        raw_labels = [a_point['label'] for a_point in points]
        return texts, AlignFormatLabels(raw_labels, my_labels), ids

    train_data = _load('train')
    val_data = _load('dev')
    test_data = _load('test')

    for split_data in (train_data, val_data, test_data):
        print(split_data[0])

    return (*_unpack(train_data), *_unpack(val_data), *_unpack(test_data))

pickle_folder_path = './'
(
    train_texts, train_labels, train_ids,
    val_texts, val_labels, val_ids,
    test_texts, test_labels, test_ids,
) = GetData(pickle_folder_path)

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-multilingual-cased')

# Tokenize every split with the same truncation / padding settings.
train_encodings, val_encodings, test_encodings = (
    tokenizer(split_texts, truncation=True, padding=True)
    for split_texts in (train_texts, val_texts, test_texts)
)

# Wrap encodings and labels of each split into a torch dataset.
train_dataset, val_dataset, test_dataset = (
    DatasetFormatting(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
        (test_encodings, test_labels),
    )
)

In [None]:
def compute_metrics(pred):
    """Compute accuracy and macro-averaged precision / recall / F1.

    Args:
        pred: Object exposing ``label_ids`` (gold labels) and ``predictions``.
            ``predictions`` is either the logits array itself or a tuple/list
            whose first element is the logits array (as happens when the model
            also returns hidden states).

    Returns:
        Dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    labels = pred.label_ids
    predictions = pred.predictions
    # Index [0] only when the logits are wrapped in a tuple/list; indexing a bare
    # logits array would pick its first row instead.
    logits = predictions[0] if isinstance(predictions, (tuple, list)) else predictions
    preds = logits.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [None]:
# Hyper-parameters for fine-tuning the text classifier.
training_args = TrainingArguments(
    output_dir='./results_',         # output directory
    num_train_epochs=5,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    learning_rate=5e-5,              # the comma after this argument was missing (SyntaxError)
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    save_steps=500,
    # NOTE(review): newer transformers releases appear to rename this argument to
    # `eval_strategy` - confirm against the installed version.
    evaluation_strategy='steps'
)

# Here you can specify the BERT model you want to use to train the text-only classifier.
# As an additional consideration, you want to use the same BERT model configs as those
# you use for training the POINTER model.
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-multilingual-cased',
    num_labels=NUM_LABELS,
)
# Make the model return its hidden states as well (read later to build embeddings).
model.config.output_hidden_states = True

trainer = Trainer(
    model=model,                      # the instantiated Transformers model to be trained
    args=training_args,               # training arguments, defined above
    train_dataset=train_dataset,      # training dataset
    eval_dataset=val_dataset,         # evaluation dataset
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Training
trainer.train()

In [None]:
# Save the trainer's model under the 'finetuned_bert_model' path.
trainer.save_model('finetuned_bert_model')

In [None]:
from tqdm import tqdm
import numpy as np

# Use the first GPU when one is visible, otherwise fall back to the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
model = model.to(device)
# Inference mode: without this, dropout can stay active after training and make
# the extracted embeddings non-deterministic.
model.eval()


def _embed_texts(texts, ids):
    """Return ``{id: embedding}`` for parallel lists of texts and ids.

    Each embedding is the mean over the token axis of the model's last hidden
    state, stored as a numpy array reshaped to ``(1, hidden_size)``. When an id
    occurs more than once, the last occurrence wins.
    """
    embeddings = {}
    # No gradients are needed for feature extraction.
    with torch.no_grad():
        for i in tqdm(range(0, len(ids))):
            # Truncate like the training-time tokenization does, so over-long
            # inputs do not exceed the model's maximum sequence length.
            token_ids = tokenizer.encode(texts[i], truncation=True)
            input_ids = torch.tensor(token_ids).unsqueeze(0).to(device)
            outputs = model(input_ids)
            embedding = torch.mean(outputs.hidden_states[-1], 1, True).cpu()
            embedding = embedding.detach().numpy()
            embedding = np.reshape(embedding, (embedding.shape[0], embedding.shape[-1]))
            embeddings[ids[i]] = embedding
    return embeddings


embeddings_train = _embed_texts(train_texts, train_ids)
embeddings_val = _embed_texts(val_texts, val_ids)
embeddings_test = _embed_texts(test_texts, test_ids)



In [None]:
import pickle

# Persist the three embedding dictionaries, one pickle file per split.
for pickle_path, split_embeddings in (
    ('./embeddings_train_crisis.pickle', embeddings_train),
    ('./embeddings_val_crisis.pickle', embeddings_val),
    ('./embeddings_test_crisis.pickle', embeddings_test),
):
    with open(pickle_path, 'wb') as handle:
        pickle.dump(split_embeddings, handle, protocol=pickle.HIGHEST_PROTOCOL)


In [None]:
import pickle

# Load the embedding dictionaries back from disk.
# The paths previously carried an extra 'aaa_' prefix that no cell of this notebook
# writes, so a fresh "Restart & Run All" failed here; they now point at the files
# produced by the cell that pickles `embeddings_train` / `_val` / `_test`.
# NOTE: pickle.load executes arbitrary code - only load files you created yourself.
with open('./embeddings_train_crisis.pickle', 'rb') as handle:
    embeddings_train_dict = pickle.load(handle)
with open('./embeddings_val_crisis.pickle', 'rb') as handle:
    embeddings_val_dict = pickle.load(handle)
with open('./embeddings_test_crisis.pickle', 'rb') as handle:
    embeddings_test_dict = pickle.load(handle)

In [None]:
# Number of distinct keys in the validation embedding dictionary.
len(embeddings_val_dict)

In [None]:
def _stack_embeddings(embedding_dict):
    """Return ``(matrix, ids)``: the dict's embeddings stacked row-wise and its keys."""
    ids = list(embedding_dict.keys())
    stacked = np.array([embedding_dict[key] for key in ids])
    # Keep only the first and last axis, dropping any axes in between.
    stacked = stacked.reshape((stacked.shape[0], stacked.shape[-1]))
    return stacked, ids


train_text_emb, train_text_id = _stack_embeddings(embeddings_train_dict)
test_text_emb, test_text_id = _stack_embeddings(embeddings_test_dict)
val_text_emb, val_text_id = _stack_embeddings(embeddings_val_dict)

print(train_text_emb.shape)
for id_list in (train_text_id, test_text_id, val_text_id):
    print(id_list[:5])

In [None]:
# Evaluation on the trainer's evaluation dataset. The result is printed explicitly:
# a bare expression in the middle of a cell is not displayed.
val_metrics = trainer.evaluate()
print(val_metrics)

# Testing
probs, _, metrics = trainer.predict(test_dataset)
# The predictions are a tuple/list (logits first) when the model also returns
# hidden states, and the bare logits array otherwise.
logits = probs[0] if isinstance(probs, (tuple, list)) else probs
pred_labels = logits.argmax(-1)
precision, recall, f1, _ = precision_recall_fscore_support(test_labels, pred_labels, average='macro')
acc = accuracy_score(test_labels, pred_labels)
test_dict = {'accuracy': acc,
             'f1': f1,
             'precision': precision,
             'recall': recall}
print(test_dict)
print(metrics)

In [None]:
from transformers import DistilBertModel, DistilBertConfig

# Replace `tokenizer` and `model` with versions loaded from the 'trained_models' path.
# NOTE(review): the training section saves its model to 'finetuned_bert_model', not
# 'trained_models' - confirm which directory is meant to be loaded here.
tokenizer = DistilBertTokenizer.from_pretrained('trained_models', local_files_only = True)
model = DistilBertForSequenceClassification.from_pretrained('trained_models', local_files_only = True)

In [None]:
# Evaluation / testing of the model currently bound to `model`.
# A transformers model has no `.predict` method (the previous `model.predict(...)`
# raised AttributeError); dataset-level prediction goes through a Trainer, so the
# model is wrapped in one that is used for inference only.
eval_trainer = Trainer(
    model=model,
    args=TrainingArguments(output_dir='./results_', per_device_eval_batch_size=16),
    compute_metrics=compute_metrics,
)
probs, _, metrics = eval_trainer.predict(test_dataset)
# The predictions are a tuple/list (logits first) when the model also returns
# hidden states, and the bare logits array otherwise.
logits = probs[0] if isinstance(probs, (tuple, list)) else probs
pred_labels = logits.argmax(-1)
precision, recall, f1, _ = precision_recall_fscore_support(test_labels, pred_labels, average='macro')
acc = accuracy_score(test_labels, pred_labels)
test_dict = {'accuracy': acc,
             'f1': f1,
             'precision': precision,
             'recall': recall}
print(test_dict)
print(metrics)