In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
import os
import random
import logging
import time
import numpy as np

# create custom dataset class for arvix classification dataset
class ArvixDataset(Dataset):
    def __init__(self, path, tokenizer, model_config, mode='train', max_len=4096):

        self.dictCls2Idx = {
            "cs.AI": 0,
            "cs.cv": 1,
            "cs.IT": 2,
            "cs.PL": 3,
            "math.AC": 4,
            "math.ST": 5,
            "cs.CE": 6, 
            "cs.DS": 7,
            "cs.NE": 8,
            "cs.SY": 9 , 
            "math.GR": 10
        }
        self.Idx2dictCls = {}
        self.dataset = []
        self.labels  = []
        self.tokenizer = tokenizer
        self.max_len = max_len

        for sub in self.dictCls2Idx:
            label_index = self.dictCls2Idx[sub]
            subfolder = os.path.join(path,sub)
            self.Idx2dictCls[label_index] = sub

            files = sorted([f for f in os.listdir(subfolder) if os.path.isfile(os.path.join(subfolder,f))])
            random.seed(1234)
            random.shuffle(files)

            if mode == "train":
                file_index = [i for i in range(model_config["train_size"])]
            elif mode == "validation":
                file_index = [i for i in range(model_config["train_size"], model_config["train_size"] + model_config["val_size"])]
            elif mode == "test":
                file_index = [i for i in range(model_config["train_size"] + model_config["val_size"], model_config["train_size"] + model_config["val_size"] + model_config["test_size"])]

            for i in file_index:
                f = files[i]
                fname = os.path.join(subfolder,f)
                self.dataset.append(fname)
                self.labels.append(label_index)
        

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        label = self.labels[idx]
        data_path = self.dataset[idx]
        data = self.read_txt(data_path)
        encoded_data = self.tokenizer.encode(data, truncation=True, padding="max_length", max_length=self.max_len)
        att_mask = torch.ones(len(encoded_data), dtype=torch.long)
        att_mask[0] = 2
        sample = {"Text": torch.tensor(encoded_data), 
                  "Attention": att_mask, 
                  "Label": torch.Tensor([label])}
        return sample

    def read_txt(self, file_path):
        with open(file_path, 'r') as file:
            text = file.read().replace('\n', '')
        return text

In [2]:
import torch
from longformer.longformer import Longformer, LongformerConfig
from transformers import LongformerConfig, LongformerModel
from longformer.sliding_chunks import pad_to_window_size
from transformers import RobertaForMaskedLM, RobertaTokenizerFast
import requests
import tarfile
from tqdm import tqdm

In [3]:
logging.basicConfig(filename=os.path.join('./logs', 'longformer_nopretrained.log'),
                    format='%(asctime)s %(levelname)-8s %(message)s',
                    datefmt='%m-%d %H:%M', level=logging.INFO, filemode='w')

logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

In [4]:
# training parameters
model_config = {}

model_config["train_size"] = 2300
model_config["val_size"] = 100
model_config["test_size"] = 100

model_config['lr'] = 1e-6
model_config['window_size'] = 64
model_config['batch_size'] = 2
model_config['max_len'] = 4096
model_config["datapath"] = "./Long-document-dataset/"
model_config["weight_path"] = "./no_pretrain_weight/"
model_config["num_epoch"] = 20
model_config["weight_name"] = "e4_model.pt"
model_config["longformer_lr"] = 1e-6
model_config["linear_lr"] = 1e-6
model_config["gamma"] = 0.8
device = torch.device('cuda:1') if torch.cuda.is_available() else torch.device('cpu')

In [5]:
config = LongformerConfig.from_pretrained('longformer-base-4096/') 
config.attention_mode = 'sliding_chunks'
config.attention_window = [model_config['window_size']] * 12

In [6]:
class LongformerClassifier(torch.nn.Module):
    
    def __init__(self, config, pretrain=False, in_features=768, out_features=11):
        super(LongformerClassifier, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.pretrain = pretrain
        self.config = config

        if self.pretrain:
            self.longformer = Longformer.from_pretrained('longformer-base-4096/', config=self.config)
        else:
            self.longformer = LongformerModel(self.config)

        self.linear = torch.nn.Linear(in_features=self.in_features, out_features=self.out_features)
    
    def forward(self, input_ids, attention_mask):
        x = self.longformer(input_ids=input_ids, attention_mask=attention_mask)[0]
        x = self.linear(x[:, 0])
        return x

In [7]:
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base', model_max_length=model_config["max_len"])
train_dataset = ArvixDataset(model_config["datapath"], tokenizer, model_config, mode="train", max_len=model_config["max_len"])
val_dataset = ArvixDataset(model_config["datapath"], tokenizer, model_config, mode="validation", max_len=model_config["max_len"])

In [8]:
train_dataloader = DataLoader(train_dataset, batch_size=model_config['batch_size'], shuffle=True, collate_fn=None)
val_dataloader = DataLoader(val_dataset, batch_size=model_config['batch_size'], shuffle=False, collate_fn=None)
data = next(iter(train_dataloader))

In [9]:
model = LongformerClassifier(config, pretrain=False, in_features=768, out_features=11).to(device)

if model_config["weight_name"] is not None:
    file_name = os.path.join(model_config["weight_path"], model_config["weight_name"])
    model.load_state_dict(torch.load(file_name))
    print(f"Load model weight from file {model_config['weight_name']}")

#optimizer = torch.optim.AdamW(model.parameters(), lr = model_config["lr"])
optimizer = torch.optim.AdamW([
    {'params': model.longformer.parameters(), 'lr': model_config["longformer_lr"]},
    {'params': model.linear.parameters(), 'lr': model_config["linear_lr"]}])

loss_fn = torch.nn.CrossEntropyLoss().to(device)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=model_config["gamma"], last_epoch=-1)

Load model weight from file e4_model.pt


In [10]:
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [None]:
for epoch in tqdm(range(model_config["num_epoch"])):
    logger.info("in epoch:" + str(round))
    total_train_loss = 0
    model.train()
    current_lr = scheduler.get_last_lr()
    logger.info(f"Current Learning rate: {current_lr}")
    print(f"Current Learning rate for longformer: {current_lr[0]}, for linear layer: {current_lr[1]}")
    for step, data in enumerate(train_dataloader):
        
        start = time.time()
        input_ids = data["Text"].to(device)
        attention_mask = data["Attention"].to(device)
        label = data["Label"].to(device)
        optimizer.zero_grad()  

        outputs = model(input_ids, attention_mask=attention_mask)

        loss = loss_fn(outputs, label.squeeze(1).long())
        total_train_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        end = time.time()

        if(step % 10 == 0):
            logger.info(f"Loss after {step} step: {loss} Time: {end-start}")
            print(f"Loss after {step} step: {loss}")

    scheduler.step()
    
    avg_train_loss = total_train_loss / len(train_dataloader)
    logger.info("Average training loss: {0:.2f}".format(avg_train_loss))
    print("Average training loss: {0:.2f}".format(avg_train_loss))

    # save model weight
    print("Saving model weight...")

    if not os.path.exists(model_config['weight_path']):
        os.makedirs(model_config['weight_path'])
    
    weight_file_name = f"{model_config['weight_path']}/e{epoch+5}_model.pt"
    torch.save(model.state_dict(), weight_file_name)
        
    print("")
    print("Running Validation...")

    # Put the model in evaluation mode-
    model.eval()

    # Tracking variables 
    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0

        # Evaluate data for one epoch
    for step, data in enumerate(val_dataloader):
        
        input_ids = data["Text"].to(device)
        attention_mask = data["Attention"].to(device)
        label = data["Label"].to(device)

        with torch.no_grad():        
            outputs = model(input_ids, attention_mask=attention_mask)
            
        # Accumulate the validation loss.
        loss = loss_fn(outputs, label.squeeze(1).long())
        total_eval_loss += loss.item()

        # Move logits and labels to CPU
        logits = outputs.detach().cpu().numpy()
        label_ids = label.to('cpu').numpy()

        # Calculate the accuracy for this batch of test sentences, and
        # accumulate it over all batches.
        total_eval_accuracy += flat_accuracy(logits, label_ids)

    # Report the final accuracy for this validation run.
    avg_val_accuracy = total_eval_accuracy / len(val_dataloader)
    logger.info("  Accuracy: {0:.2f}".format(avg_val_accuracy))
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

    # Calculate the average loss over all of the batches.
    avg_val_loss = total_eval_loss / len(val_dataloader)
    
    logger.info("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation Loss: {0:.2f}".format(avg_val_loss))

print("")
print("Training complete!")

  0%|          | 0/20 [00:00<?, ?it/s]

Current Learning rate for longformer: 1e-06, for linear layer: 1e-06
Loss after 0 step: 0.8813285827636719
Loss after 10 step: 0.6113165616989136
Loss after 20 step: 1.02433180809021
Loss after 30 step: 2.0124762058258057
Loss after 40 step: 0.734704315662384
Loss after 50 step: 0.4309670925140381
Loss after 60 step: 1.262691855430603
Loss after 70 step: 0.8471678495407104
Loss after 80 step: 1.9378206729888916
Loss after 90 step: 0.3210113048553467
Loss after 100 step: 2.408802032470703
Loss after 110 step: 0.9046907424926758
Loss after 120 step: 0.5376856923103333
Loss after 130 step: 0.162942573428154
Loss after 140 step: 0.8467928767204285
Loss after 150 step: 0.24727031588554382
Loss after 160 step: 0.7980748414993286


In [11]:
import pandas as pd
import numpy as np

class Calculator:
    def __init__(self, num_class=11):
        self.num_class = num_class
        self.dictIdx2Cls = {
            0: "cs.AI",
            1: "cs.cv",
            2: "cs.IT",
            3: "cs.PL",
            4: "math.AC",
            5: "math.ST",
            6: "cs.CE", 
            7: "cs.DS",
            8: "cs.NE",
            9: "cs.SY", 
            10: "math.GR"
        }

    def init_metrics(self):
        class_list = [i for i in range(self.num_class)]
        val_list = [0] * self.num_class

        self.TP = dict(zip(class_list, val_list))
        self.positive_pred = dict(zip(class_list, val_list))
        self.positive_label = dict(zip(class_list, val_list))

        self.precision = dict(zip(class_list, val_list))
        self.recall = dict(zip(class_list, val_list))
        self.f1 = dict(zip(class_list, val_list))

    def update_result(self, preds, labels):
        preds_flat = np.argmax(preds, axis=1).flatten()
        labels_flat = labels.flatten()

        for i in range(self.num_class):

            this_pred = np.array([1 if pred == i else 0 for pred in preds_flat])
            this_label = np.array([1 if label == i else 0 for label in labels_flat])

            self.TP[i] += np.sum(this_pred * this_label)
            self.positive_pred[i] += np.sum(this_pred)
            self.positive_label[i] += np.sum(this_label)

    def get_overall_performance(self):

        precision = sum(self.TP.values()) / sum(self.positive_pred.values())
        recall = sum(self.TP.values()) / sum(self.positive_label.values())
        f1 = (2 * sum(np.array(list(result_calculator.precision.values())) * np.array(list(result_calculator.recall.values())))) / (sum(self.precision.values()) + sum(self.recall.values()))
        # accuracy = sum(self.correct.values()) / sum(self.total.values())
        total = sum(self.positive_label.values())

        return ["overall", total, precision, recall, f1]

    def get_metrics(self):

        for i in range(self.num_class):

            self.precision[i] = (self.TP[i] / self.positive_pred[i]) if self.positive_pred[i] else 0
            self.recall[i] = (self.TP[i] / self.positive_label[i]) if self.positive_label[i] else 0
            self.f1[i] = (2.0 * self.precision[i] * self.recall[i] / (self.precision[i] + self.recall[i])) if (self.precision[i] + self.recall[i]) else 0
            # self.accuracy[i] = self.correct[i] / self.total[i] if self.total[i] else 0
     
        result_dict = {
            "Class": self.dictIdx2Cls.values(),
            "Sample Size": self.positive_label.values(),
            # "Accuracy": self.accuracy.values(),
            "Precision": self.precision.values(),
            "Recall": self.recall.values(),
            "F1": self.f1.values()
        }

        result_df = pd.DataFrame(result_dict)
        result_df.loc[len(result_df.index)] = self.get_overall_performance()

        return result_df


In [12]:
model = LongformerClassifier(config, pretrain=False, in_features=768, out_features=11).to(device)
model.load_state_dict(torch.load("no_pretrain_weight/e8_model.pt"))
print(f"Load model weight from file")

Load model weight from file


In [13]:
test_dataset = ArvixDataset(model_config["datapath"], tokenizer, model_config, mode="test", max_len=model_config["max_len"])
test_dataloader = DataLoader(val_dataset, batch_size=model_config['batch_size'], shuffle=False, collate_fn=None)

In [14]:
total_test_accuracy = 0

result_calculator = Calculator(num_class=11)
result_calculator.init_metrics()

model.eval()

for step, data in enumerate(tqdm(test_dataloader)):
    start = time.time()
    input_ids = data["Text"].to(device)
    attention_mask = data["Attention"].to(device)
    label = data["Label"].to(device)

    with torch.no_grad():        
        outputs = model(input_ids, attention_mask=attention_mask)
        
    # Accumulate the validation loss.
    # loss = loss_fn(outputs, label.squeeze(1).long())
    # total_eval_loss += loss.item()

    # Move logits and labels to CPU
    logits = outputs.detach().cpu().numpy()
    label_ids = label.to('cpu').numpy()

    # Calculate the metrics for this batch of test sentences, and
    # accumulate it over all batches.
    result_calculator.update_result(logits, label_ids)

    # Calculate the accuracy for this batch of test sentences, and
    # accumulate it over all batches.
    total_test_accuracy += flat_accuracy(logits, label_ids)
    end = time.time()
    print(end-start)

# Report the final accuracy for this validation run.
avg_test_accuracy = total_test_accuracy / len(test_dataloader)
print("")
print("Test  Accuracy: {0:.3f}".format(avg_val_accuracy))

# Report the final metrics for this test run.
result_df = result_calculator.get_metrics()


  0%|          | 1/550 [00:00<06:04,  1.51it/s]

0.6144452095031738


  0%|          | 2/550 [00:01<05:39,  1.61it/s]

0.5484049320220947


  1%|          | 3/550 [00:01<05:34,  1.63it/s]

0.5475468635559082


  1%|          | 4/550 [00:02<05:41,  1.60it/s]

0.5512967109680176


  1%|          | 5/550 [00:03<05:35,  1.62it/s]

0.5561578273773193


  1%|          | 6/550 [00:03<05:32,  1.64it/s]

0.5475430488586426


  1%|▏         | 7/550 [00:04<05:29,  1.65it/s]

0.5576865673065186


  1%|▏         | 8/550 [00:04<05:24,  1.67it/s]

0.5498690605163574


  2%|▏         | 9/550 [00:05<05:24,  1.67it/s]

0.5560493469238281


  2%|▏         | 10/550 [00:06<05:26,  1.65it/s]

0.5494823455810547


  2%|▏         | 11/550 [00:06<05:29,  1.63it/s]

0.5531208515167236


  2%|▏         | 12/550 [00:07<05:29,  1.63it/s]

0.5611796379089355


  2%|▏         | 13/550 [00:07<05:29,  1.63it/s]

0.553046464920044


  3%|▎         | 14/550 [00:08<05:30,  1.62it/s]

0.5494897365570068


  3%|▎         | 15/550 [00:09<05:28,  1.63it/s]

0.5526571273803711


  3%|▎         | 16/550 [00:09<05:28,  1.63it/s]

0.5634887218475342


  3%|▎         | 17/550 [00:10<05:23,  1.65it/s]

0.5566298961639404


  3%|▎         | 18/550 [00:10<05:19,  1.66it/s]

0.5580112934112549


  3%|▎         | 18/550 [00:11<05:37,  1.57it/s]


KeyboardInterrupt: 

In [24]:
avg_test_accuracy

0.7020890099909166

In [28]:
df = result_calculator.get_metrics()

In [29]:
df.sort_values(by="Class")

Unnamed: 0,Class,Sample Size,Precision,Recall,F1
0,cs.AI,100,0.339623,0.36,0.349515
6,cs.CE,100,0.636364,0.42,0.506024
7,cs.DS,100,0.75,0.63,0.684783
2,cs.IT,100,0.956522,0.66,0.781065
8,cs.NE,100,0.444444,0.56,0.495575
3,cs.PL,100,0.631579,0.96,0.761905
9,cs.SY,100,0.740385,0.77,0.754902
1,cs.cv,100,0.732143,0.82,0.773585
4,math.AC,100,0.952941,0.81,0.875676
10,math.GR,100,0.863636,0.95,0.904762


In [31]:
df.to_csv('nopretrain.csv')