In [22]:
import os
import argparse
import numpy as np
import torch
from transformers import AutoTokenizer
from torch.utils.data import Dataset
import os
import evaluate
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

The code below is the same as in `train.py`, except that the dataset stores token counts rather than the tokens themselves.

In [2]:
# Root folders of the two splits; each one is expected to contain a
# 'judgement' and a 'summary' sub-folder (see the os.listdir calls below).
train_data, test_data = (
    '../data/dataset/IN-Abs/train-data',
    '../data/dataset/IN-Abs/test-data',
)

In [4]:
# Name of the pretrained checkpoint; it is passed to AutoTokenizer.from_pretrained
# to load the tokenizer used for counting tokens.
model_checkpoint = "t5-small"

In [15]:
# Listing the training and testing files.
# os.listdir returns names in arbitrary order, so the listings are sorted to make
# the file order (and therefore any n_samples subset) reproducible across runs.
train_files = {
    "text": sorted(os.listdir(train_data+'/judgement')),
    "summary": sorted(os.listdir(train_data+'/summary'))
}
test_files = {
    "text": sorted(os.listdir(test_data+'/judgement')),
    "summary": sorted(os.listdir(test_data+'/summary'))
}

print("Number of train set:", len(train_files["text"]))
print("Number of test set", len(test_files["text"]))


# Loading the tokenizer for the chosen checkpoint.
# The seq2seq model itself is not loaded in this section; only the tokenizer is used.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Setting Dataset transformer
class SummaryDataset(Dataset):
    """
        Token-count view over paired judgement/summary text files.

        For every document only the number of tokens produced by
        ``tokenizer.tokenize`` is stored (not the tokens themselves), so that
        length statistics of the dataset can be computed.
    """

    def __init__(self, dataset_files, tokenizer, max_length=None, n_samples=None, dstype="train"):
        """
            Read each judgement and its summary and record their token counts.

            dataset_files - Dict whose "text" entry lists the file names to read; the
                            same name is opened under both the 'judgement' and the
                            'summary' sub-folder of the split directory.
            tokenizer     - Object providing ``tokenize(text)`` and ``model_max_length``.
            max_length    - Stored in ``self.max_length``; falls back to
                            ``tokenizer.model_max_length`` when None. It is not used
                            to truncate anything in this class.
            n_samples     - Optional cap on the number of files read. None reads every
                            file; a value <= 0 reads none.
            dstype        - "train" reads from the module-level ``train_data`` folder,
                            any other value reads from ``test_data``.
        """
        self.texts = []   # token count of each judgement file
        self.labels = []  # token count of each summary file
        self.max_length = tokenizer.model_max_length if max_length is None else max_length

        base_dir = train_data if dstype == "train" else test_data

        file_names = list(dataset_files["text"])
        if n_samples is not None:
            # Slicing (instead of counting and testing `count == n_samples`) also
            # honours n_samples <= 0, which previously loaded the whole dataset.
            file_names = file_names[:max(n_samples, 0)]

        for file in file_names:
            self.texts.append(self._count_tokens(os.path.join(base_dir, 'judgement', file), tokenizer))
            # The summary shares its file name with the judgement it belongs to
            self.labels.append(self._count_tokens(os.path.join(base_dir, 'summary', file), tokenizer))

    @staticmethod
    def _count_tokens(path, tokenizer):
        """Return how many tokens ``tokenizer.tokenize`` yields for the file at ``path``."""
        with open(path, 'r') as f:
            # NOTE(review): stripped lines are joined without a separator, so the last
            # word of a line fuses with the first word of the next one. Left unchanged
            # so the counts stay comparable with earlier runs - confirm against train.py.
            text = ''.join(line.strip() for line in f)
        return len(tokenizer.tokenize(text))

    def __len__(self):
        """Number of judgement/summary pairs that were read."""
        return len(self.labels)

    def __getitem__(self, idx):
        """
            Return the token counts of the idx-th judgement/summary pair.

            The stored items are plain integers, so they are returned as-is rather
            than being indexed like tokenizer encodings (which raised a TypeError).
        """
        return {"input_ids_length": self.texts[idx], "labels_length": self.labels[idx]}


# Loading and tokenizing the training split: with n_samples left at its default,
# every file listed in train_files["text"] is read and tokenized.
train_dataset = SummaryDataset(train_files, tokenizer, dstype="train")

Number of train set: 7030
Number of test set 100


Token indices sequence length is longer than the specified maximum sequence length for this model (6132 > 512). Running this sequence through the model will result in indexing errors


## Token lengths of the input text (judgements)

In [20]:
# Token counts of the judgement (input) files, one row per file that was read
train_df = pd.DataFrame({"input_ids_length": train_dataset.texts})

# Each entry pairs the printed label with the statistic applied to the column;
# they are evaluated and printed one after another, in this order.
input_length_report = (
    ("Maximum context length in dataset:", max),
    ("Minimum context length in dataset:", min),
    ("Average context length in dataset:", lambda column: column.mean()),
)
for report_label, statistic in input_length_report:
    print(report_label, statistic(train_df['input_ids_length']))

Maximum context length in dataset: 182927
Minimum context length in dataset: 267
Average context length in dataset: 5800.730014224751


## Token lengths of the summary text

In [47]:
# Token counts of the summary files. Note that `train_df` is rebound here and that
# the column name and printed labels are reused unchanged, although the values are
# the lengths of the summaries (train_dataset.labels), not of the input text.
train_df = pd.DataFrame({"input_ids_length": train_dataset.labels})

# Label / statistic pairs, evaluated and printed one after another in this order
summary_length_report = (
    ("Maximum context length in dataset:", max),
    ("Minimum context length in dataset:", min),
    ("Average context length in dataset:", lambda column: column.mean()),
)
for report_label, statistic in summary_length_report:
    print(report_label, statistic(train_df['input_ids_length']))

Maximum context length in dataset: 38134
Minimum context length in dataset: 0
Average context length in dataset: 1149.129587482219
