In [3]:
import numpy as np
import pandas as pd
import json
from tqdm.auto import tqdm
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import LongformerTokenizer, LongformerModel, AdamW, get_linear_schedule_with_warmup
import pytorch_lightning as pl
from pytorch_lightning.metrics.functional import accuracy, f1, auroc
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger

In [4]:
def get_data(datapath):
    """Load a JSON Lines file into a list of parsed records.

    Args:
        datapath: Path to a text file holding one JSON document per line.

    Returns:
        list: One parsed object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Explicit UTF-8 keeps parsing independent of the platform's default
    # encoding. Whitespace-only lines (e.g. a trailing blank line at the end
    # of the file) are skipped instead of crashing json.loads.
    with open(datapath, encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]

In [5]:
def prepare_class_data(data):
    """Convert parsed documents into a multi-label classification table.

    Each document becomes one row holding its id, its text and one 0/1
    indicator per incident class. A class is set to 1 when any of the
    document's templates mentions it.

    Args:
        data: Iterable of dicts with the keys ``'docid'``, ``'doctext'`` and
            ``'templates'``; every template is a dict with an
            ``'incident_type'`` string.

    Returns:
        pandas.DataFrame: Columns ``DocID, Text, Attack, Kidnapping, Bombing,
        Robbery, Arson, Forced`` with one row per document, indexed 0..n-1.
        The indicator columns hold integers.

    Raises:
        KeyError: If an incident type (or one component of a combined type)
            is not one of the six known classes.
    """
    label_names = ['Attack', 'Kidnapping', 'Bombing', 'Robbery', 'Arson', 'Forced']
    class_dict = {'attack': 0,
                  'kidnapping': 1,
                  'bombing': 2,
                  'robbery': 3,
                  'arson': 4,
                  'forced work stoppage': 5}

    # Collect plain rows and build the frame once; assigning to
    # `frame.loc[len(frame)]` inside the loop re-allocates on every document.
    rows = []
    for doc in data:
        class_list = [0] * len(class_dict)
        for template in doc['templates']:
            # Combined types such as 'bombing / attack' flag every component,
            # whatever the order or pairing.
            for incident_type in template['incident_type'].split('/'):
                class_list[class_dict[incident_type.strip()]] = 1
        rows.append([doc['docid'], doc['doctext']] + class_list)

    return pd.DataFrame(rows, columns=['DocID', 'Text'] + label_names)

In [6]:
# Read each JSON-lines split and turn it into a multi-label table.
train_data = prepare_class_data(data=get_data(datapath='../gtt_data/train.json'))
dev_data = prepare_class_data(data=get_data(datapath='../gtt_data/dev.json'))
test_data = prepare_class_data(data=get_data(datapath='../gtt_data/test.json'))

In [7]:
# Everything after the two leading columns (DocID, Text) is a label column.
label_columns = list(train_data.columns[2:])
sentences = train_data['Text']
labels = train_data[label_columns]

In [8]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)

In [9]:
# Tokenizer and bare encoder are both loaded from the same pretrained checkpoint.
# NOTE(review): `model` is reassigned to a MucTagger in a later cell and is not
# used in between, so this encoder load looks redundant — confirm before removing.
# NOTE(review): confirm `do_lower_case` has any effect for this tokenizer.
tokenizer = LongformerTokenizer.from_pretrained(
    'allenai/longformer-base-4096',
    do_lower_case=True,
)
model = LongformerModel.from_pretrained(
    'allenai/longformer-base-4096',
)

In [10]:
# Token count of every training text, special tokens included.
token_counts = [
    len(tokenizer.encode(sent, add_special_tokens=True))
    for sent in sentences
]

# Longest encoded document; 0 when there are no texts at all.
max_len = max(token_counts, default=0)

print('Max sentence length: ', max_len)

Max sentence length:  1886


In [11]:
class MucDataset(Dataset):
    """Torch dataset that tokenizes one document per item for multi-label training.

    Args:
        data: Frame with a ``'Text'`` column plus one numeric 0/1 column per label.
        tokenizer: Tokenizer exposing ``encode_plus``.
        max_token_len: Length every encoded text is padded or truncated to.
        label_columns: Names of the label columns. When omitted, every column
            after the first two of ``data`` is used, which is how the
            notebook derives its module-level label list.
    """

    def __init__(self, data: pd.DataFrame, tokenizer: "LongformerTokenizer",
                 max_token_len: int = 2000, label_columns=None):
        self.tokenizer = tokenizer
        self.data = data
        self.max_token_len = max_token_len
        # Keep the label names on the instance so the dataset does not depend
        # on a notebook-level global being defined (and unchanged) at call time.
        if label_columns is None:
            label_columns = data.columns[2:]
        self.label_columns = list(label_columns)

    def __len__(self):
        """Return the number of documents."""
        return len(self.data)

    def __getitem__(self, index: int):
        """Return the raw text, encoded inputs and float label vector of one row.

        Returns:
            dict: ``text`` (str), ``input_ids`` and ``attention_mask``
            (flattened tensors from the tokenizer) and ``labels``
            (float32 tensor with one entry per label column).
        """
        data_row = self.data.iloc[index]

        text = data_row['Text']
        # Convert to a float32 array explicitly. Handing the Series itself to
        # torch.FloatTensor makes torch read it through integer __getitem__,
        # which pandas deprecates for a Series with a non-integer index.
        label_values = data_row[self.label_columns].to_numpy(dtype=np.float32)

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_token_len,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt')

        return dict(text=text, input_ids=encoding["input_ids"].flatten(),
                    attention_mask=encoding["attention_mask"].flatten(),
                    labels=torch.tensor(label_values, dtype=torch.float32))

In [12]:
class MucDataModule(pl.LightningDataModule):
    """Lightning data module wrapping the train/dev/test frames in ``MucDataset``s.

    Args:
        train_df: Frame used for training.
        dev_df: Frame used for validation.
        test_df: Frame used for testing.
        tokenizer: Tokenizer forwarded to every dataset.
        batch_size: Batch size of all three loaders.
        max_token_len: Padding/truncation length forwarded to every dataset.
        num_workers: Worker processes per DataLoader. Defaults to 12, the
            value that was previously hard-coded in each loader.
    """

    def __init__(self, train_df, dev_df, test_df, tokenizer, batch_size=8, max_token_len=128,
                 num_workers=12):
        super().__init__()
        self.batch_size = batch_size
        self.train_df = train_df
        self.dev_df = dev_df
        self.test_df = test_df
        self.tokenizer = tokenizer
        self.max_token_len = max_token_len
        self.num_workers = num_workers

    def _make_dataset(self, frame):
        """Build a MucDataset over ``frame`` with this module's tokenizer settings."""
        return MucDataset(frame, self.tokenizer, self.max_token_len)

    def _make_loader(self, dataset, shuffle=False):
        """Build a DataLoader with this module's batch size and worker count."""
        return DataLoader(dataset,
                          batch_size=self.batch_size,
                          shuffle=shuffle,
                          num_workers=self.num_workers)

    def setup(self, stage=None):
        """Create the three datasets; ``stage`` is accepted but not used."""
        self.train_dataset = self._make_dataset(self.train_df)
        self.dev_dataset = self._make_dataset(self.dev_df)
        self.test_dataset = self._make_dataset(self.test_df)

    def train_dataloader(self):
        """Return the shuffled training loader."""
        return self._make_loader(self.train_dataset, shuffle=True)

    def val_dataloader(self):
        """Return the validation loader (dev split, not shuffled)."""
        return self._make_loader(self.dev_dataset)

    def test_dataloader(self):
        """Return the test loader (not shuffled)."""
        return self._make_loader(self.test_dataset)

In [18]:
class MucTagger(pl.LightningModule):
    """Longformer encoder with a linear multi-label classification head.

    Args:
        n_classes: Number of output labels (width of the linear head).
        n_training_steps: Total number of optimizer steps given to the linear
            learning-rate schedule. When ``None`` no scheduler is configured.
        n_warmup_steps: Warmup steps of the schedule; ``None`` is treated as 0.
    """

    def __init__(self, n_classes: int, n_training_steps=None, n_warmup_steps=None):
        super().__init__()
        # MODEL_NAME is a notebook-level constant and must be defined before
        # the first instance is created.
        self.model = LongformerModel.from_pretrained(MODEL_NAME, return_dict=True)
        self.classifier = nn.Linear(self.model.config.hidden_size, n_classes)
        self.n_training_steps = n_training_steps
        self.n_warmup_steps = n_warmup_steps
        # NOTE(review): the loss is computed on sigmoid outputs; consider
        # nn.BCEWithLogitsLoss on the raw logits for better numerical stability.
        self.criterion = nn.BCELoss()

    def forward(self, input_ids, attention_mask, labels=None):
        """Return ``(loss, probabilities)`` for a batch.

        ``loss`` is the BCE loss against ``labels`` when they are given and
        the integer 0 otherwise. ``probabilities`` are the sigmoid outputs of
        the classification head applied to the encoder's ``pooler_output``.
        """
        output = self.model(input_ids, attention_mask=attention_mask)
        output = self.classifier(output.pooler_output)
        output = torch.sigmoid(output)
        loss = 0
        if labels is not None:
            loss = self.criterion(output, labels)
        return loss, output

    def _shared_step(self, batch, stage):
        """Run the forward pass on ``batch`` and log the loss as ``{stage}_loss``."""
        labels = batch["labels"]
        loss, outputs = self(batch["input_ids"], batch["attention_mask"], labels)
        self.log(f"{stage}_loss", loss, prog_bar=True, logger=True)
        return loss, outputs, labels

    def training_step(self, batch, batch_idx):
        """Return the loss plus predictions and labels for the epoch-end metrics."""
        loss, outputs, labels = self._shared_step(batch, "train")
        # Detach the predictions: they are only used for metrics at epoch end,
        # and keeping them attached would hold every step's autograd graph in
        # memory until the epoch finishes.
        return {"loss": loss, "predictions": outputs.detach(), "labels": labels}

    def validation_step(self, batch, batch_idx):
        """Log and return the validation loss."""
        loss, _, _ = self._shared_step(batch, "val")
        return loss

    def test_step(self, batch, batch_idx):
        """Log and return the test loss."""
        loss, _, _ = self._shared_step(batch, "test")
        return loss

    def training_epoch_end(self, outputs):
        """Log one training F1 score per label to the experiment logger.

        Label names come from the notebook-level ``label_columns`` list.
        """
        # Join the per-step batches along the batch dimension in one call.
        labels = torch.cat([out["labels"].detach().cpu() for out in outputs]).int()
        predictions = torch.cat([out["predictions"].detach().cpu() for out in outputs])

        for i, name in enumerate(label_columns):
            class_f1 = f1(predictions[:, i], labels[:, i], num_classes=2)
            self.logger.experiment.add_scalar(f"{name}_f1/Train", class_f1, self.current_epoch)

    def configure_optimizers(self):
        """Return AdamW, plus a per-step linear warmup schedule when step counts are known."""
        optimizer = AdamW(self.parameters(), lr=2e-5)

        # The constructor defaults leave the step counts at None; without a
        # total step count there is nothing to schedule against, so train
        # with the plain optimizer instead of failing inside the scheduler.
        if self.n_training_steps is None:
            return optimizer

        scheduler = get_linear_schedule_with_warmup(optimizer,
                                                    num_warmup_steps=self.n_warmup_steps or 0,
                                                    num_training_steps=self.n_training_steps)

        return dict(optimizer=optimizer, lr_scheduler=dict(scheduler=scheduler, interval='step'))

In [19]:
# Training hyper-parameters.
N_EPOCHS = 1
BATCH_SIZE = 1
MODEL_NAME = 'allenai/longformer-base-4096'
# NOTE(review): the token-length scan above printed a maximum of 1886 tokens
# for the training texts, and the dataset tokenizes with truncation enabled,
# so 512 cuts longer documents — confirm this is intended.
MAX_TOKEN_COUNT = 512

# Round up: the training DataLoader keeps the last partial batch, so a floor
# division would under-count the steps whenever BATCH_SIZE does not divide the
# number of training documents, and the linear schedule would end early.
steps_per_epoch = (len(train_data) + BATCH_SIZE - 1) // BATCH_SIZE
total_training_steps = steps_per_epoch * N_EPOCHS
# Fixed warmup length; a fraction of the run (total_training_steps // 5) was
# the alternative considered earlier.
warmup_steps = 20

In [20]:
# Data module over the three prepared splits.
data_module = MucDataModule(
    train_df=train_data,
    dev_df=dev_data,
    test_df=test_data,
    tokenizer=tokenizer,
    batch_size=BATCH_SIZE,
    max_token_len=MAX_TOKEN_COUNT,
)

# Classifier with one output per label column.
model = MucTagger(
    n_classes=len(label_columns),
    n_training_steps=total_training_steps,
    n_warmup_steps=warmup_steps,
)

# Keep only the checkpoint with the lowest validation loss.
checkpoint_callback = ModelCheckpoint(
    dirpath="checkpoints",
    filename="best-checkpoint",
    save_top_k=1,
    verbose=True,
    monitor="val_loss",
    mode="min",
)

logger = TensorBoardLogger("lightning_logs", name="muc")

# Stop training once val_loss has not improved within the configured patience.
early_stopping_callback = EarlyStopping(monitor='val_loss', patience=2)

In [21]:
# Register the ModelCheckpoint through `callbacks`: handing a callback instance
# to `checkpoint_callback=` is deprecated in PyTorch Lightning. It remains
# reachable afterwards as `trainer.checkpoint_callback`.
trainer = pl.Trainer(
    logger=logger,
    callbacks=[checkpoint_callback, early_stopping_callback],
    max_epochs=N_EPOCHS,
    gpus=1,
    progress_bar_refresh_rate=30,
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores


In [22]:
# Train the model; the data module supplies the training and validation loaders.
trainer.fit(model, datamodule=data_module)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name       | Type            | Params
-----------------------------------------------
0 | model      | LongformerModel | 148 M 
1 | classifier | Linear          | 4.6 K 
2 | criterion  | BCELoss         | 0     
-----------------------------------------------
148 M     Trainable params
0         Non-trainable params
148 M     Total params
594.656   Total estimated model params size (MB)


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Epoch 0, global step 1299: val_loss reached 0.18524 (best 0.18524), saving model to "/der/notebooks/checkpoints/best-checkpoint-v3.ckpt" as top 1





1

In [24]:
# Reload the best checkpoint recorded by the trainer's checkpoint callback.
# n_classes is passed again here when rebuilding the module.
best_checkpoint_path = trainer.checkpoint_callback.best_model_path
trained_model = MucTagger.load_from_checkpoint(
    best_checkpoint_path,
    n_classes=len(label_columns),
)

# Put the model in evaluation mode and freeze it before running predictions.
trained_model.eval()
trained_model.freeze()

In [25]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
trained_model = trained_model.to(device)

# NOTE: despite its name this dataset wraps the *test* split.
val_dataset = MucDataset(
  test_data,
  tokenizer,
  max_token_len=MAX_TOKEN_COUNT
)

predictions = []
labels = []

# Run inference without autograd bookkeeping, so the cell does not depend on
# the model having been frozen by another cell. Each prediction is moved to
# the CPU right away instead of collecting GPU tensors until the end.
with torch.no_grad():
    for item in tqdm(val_dataset):
        _, prediction = trained_model(item["input_ids"].unsqueeze(dim=0).to(device),
                                      item["attention_mask"].unsqueeze(dim=0).to(device))
        predictions.append(prediction.flatten().cpu())
        labels.append(item["labels"].int())

# One row per test document, one column per label.
predictions = torch.stack(predictions)
labels = torch.stack(labels)

HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))




In [27]:
# Decision threshold applied to the predicted probabilities.
THRESHOLD = 0.5
lower, upper = 0, 1

y_true = labels.numpy()
# Map every probability above the threshold to 1 and everything else to 0.
y_pred = np.where(predictions.numpy() > THRESHOLD, upper, lower)

report = classification_report(
    y_true,
    y_pred,
    target_names=label_columns,
    zero_division=0,
)
print(report)

              precision    recall  f1-score   support

      Attack       0.87      0.60      0.71        89
  Kidnapping       0.75      0.69      0.72        13
     Bombing       0.83      0.83      0.83        48
     Robbery       0.00      0.00      0.00         1
       Arson       0.00      0.00      0.00         3
      Forced       0.00      0.00      0.00         3

   micro avg       0.84      0.65      0.73       157
   macro avg       0.41      0.35      0.38       157
weighted avg       0.81      0.65      0.71       157
 samples avg       0.50      0.45      0.46       157



In [28]:
# Show the thresholded 0/1 predictions (one row per document, one column per label).
print(y_pred)

[[0 0 0 0 0 0]
 [1 0 0 0 0 0]
 [1 0 0 0 0 0]
 ...
 [0 0 0 0 0 0]
 [1 0 0 0 0 0]
 [0 0 0 0 0 0]]


In [29]:
# Show the raw per-label probabilities the thresholded predictions were derived from.
print(predictions)

tensor([[0.1116, 0.0171, 0.0329, 0.0147, 0.0167, 0.0097],
        [0.8915, 0.0396, 0.0394, 0.0242, 0.0364, 0.0133],
        [0.8710, 0.0360, 0.0302, 0.0202, 0.0297, 0.0109],
        ...,
        [0.2861, 0.3104, 0.0158, 0.0198, 0.0249, 0.0102],
        [0.7578, 0.0227, 0.0258, 0.0150, 0.0213, 0.0086],
        [0.2670, 0.0125, 0.0768, 0.0100, 0.0198, 0.0058]])
