In [4]:
import json
from datasets import load_dataset
from config import (train_data_path, 
                    test_data_path, 
                    )

def get_fd(
        train_data_path,
        test_data_path,
        val_data_path = ""
        ):
    """Open the train and test data files for reading.

    Args:
        train_data_path: Path of the training data file.
        test_data_path: Path of the test data file.
        val_data_path: Accepted for signature compatibility; currently unused.

    Returns:
        A ``(train_fd, test_fd)`` tuple of open text-mode file objects.
        The caller owns both handles and is responsible for closing them.

    Raises:
        OSError: If either file cannot be opened.
    """
    # The handles are later fed to json.load; JSON text is UTF-8, so do not
    # depend on the platform default encoding.
    train_fd = open(train_data_path, encoding="utf-8")
    try:
        test_fd = open(test_data_path, encoding="utf-8")
    except Exception:
        # Do not leak the first handle when the second open fails.
        train_fd.close()
        raise

    return (
        train_fd,
        test_fd
    )

def train_test_split(
        train_fd,
        test_fd
        ):
    """Parse the JSON content of the two given file objects.

    Args:
        train_fd: Readable file object containing the training JSON.
        test_fd: Readable file object containing the test JSON.

    Returns:
        A ``(train_data, test_data)`` tuple with the decoded JSON values.
    """
    # Decode in the same order as the arguments: train first, then test.
    return tuple(json.load(handle) for handle in (train_fd, test_fd))

def load_data(
        local = 1,
        train_data_path = "",
        test_data_path = ""
        ):
    """Load the QASPER data either from the Hugging Face hub or from local JSON.

    Args:
        local: When equal to ``1`` (the default) the data is read from the
            two local paths; any other value loads ``"allenai/qasper"``
            through ``load_dataset``.
        train_data_path: Path of the local training JSON file (local mode).
        test_data_path: Path of the local test JSON file (local mode).

    Returns:
        The object returned by ``load_dataset("allenai/qasper")`` when
        ``local != 1``; otherwise a ``(train_data, test_data)`` tuple of
        decoded JSON values.
    """
    if local != 1:
        return load_dataset("allenai/qasper")

    train_fd, test_fd = get_fd(
        train_data_path,
        test_data_path
    )

    # Close both handles once parsing is done (or if it fails) instead of
    # leaving them open for the lifetime of the kernel.
    with train_fd, test_fd:
        train_data, test_data = train_test_split(
            train_fd,
            test_fd
        )

    return (
        train_data,
        test_data
    )


# local = 0 takes the `local != 1` branch of load_data, i.e. the dataset is
# obtained via load_dataset("allenai/qasper") rather than from local files.
qasper = load_data(local = 0)
# Show the loaded object so the available splits and features are visible.
print(qasper)


Found cached dataset qasper (C:/Users/DELL/.cache/huggingface/datasets/allenai___qasper/qasper/0.3.0/2bfcd239e581ab83f9ab7b76a82e42c6bcf574a13246ae6cc5a6c357c35f96f9)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'abstract', 'full_text', 'qas', 'figures_and_tables'],
        num_rows: 888
    })
    validation: Dataset({
        features: ['id', 'title', 'abstract', 'full_text', 'qas', 'figures_and_tables'],
        num_rows: 281
    })
    test: Dataset({
        features: ['id', 'title', 'abstract', 'full_text', 'qas', 'figures_and_tables'],
        num_rows: 416
    })
})


In [7]:
# preprocessing

def modify_dataset(dataset):
    """Flatten a QASPER-style split into one record per (question, answer).

    Each element of ``dataset`` is expected to expose
    ``entry['qas']['question']`` (a sequence of questions) and
    ``entry['qas']['answers']`` (a parallel sequence whose items hold an
    ``'answer'`` list of annotations).

    Args:
        dataset: An indexable, sized collection of entries in that layout.

    Returns:
        A list of dicts with keys ``"context"`` (the annotation's
        ``evidence``), ``"question"`` and ``"answer"`` (a dict with keys
        ``unanswerable``, ``extractive_spans``, ``yes_no`` and
        ``abstractive``, the latter taken from ``free_form_answer``).
        Annotations whose evidence is empty are dropped.
    """
    new_dataset = list()

    for i in range(len(dataset)):
        # Fetch the entry once: indexing the dataset again for every field
        # repeats the same row lookup many times per annotation.
        qas = dataset[i]['qas']
        answers_per_question = qas['answers']

        for j, _question in enumerate(qas['question']):
            for annotation in answers_per_question[j]['answer']:
                _context = annotation['evidence']

                # Records without any supporting evidence are skipped.
                if len(_context) == 0:
                    continue

                new_dataset.append({
                    "context": _context,
                    "question": _question,
                    "answer": {
                        "unanswerable": annotation['unanswerable'],
                        "extractive_spans": annotation['extractive_spans'],
                        "yes_no": annotation['yes_no'],
                        "abstractive": annotation['free_form_answer']
                    }
                })

        # Report progress only after the entry has actually been processed.
        print(f"Entry: {i} Completed")

    return new_dataset



In [8]:

# Flatten every split with modify_dataset, announcing each one beforehand.
print("Running for Train")
new_qasper_train = modify_dataset(qasper["train"])

print("Running for Test")
new_qasper_test = modify_dataset(qasper["test"])

print("Running for Validation")
new_qasper_validation = modify_dataset(qasper["validation"])

# Report the number of flattened records per split: train, test, validation.
print(", ".join(
    str(len(split))
    for split in (new_qasper_train, new_qasper_test, new_qasper_validation)
))


Running for Train
Entry: 0 Completed
Entry: 1 Completed
Entry: 2 Completed
Entry: 3 Completed
Entry: 4 Completed
Entry: 5 Completed
Entry: 6 Completed
Entry: 7 Completed
Entry: 8 Completed
Entry: 9 Completed
Entry: 10 Completed
Entry: 11 Completed
Entry: 12 Completed
Entry: 13 Completed
Entry: 14 Completed
Entry: 15 Completed
Entry: 16 Completed
Entry: 17 Completed
Entry: 18 Completed
Entry: 19 Completed
Entry: 20 Completed
Entry: 21 Completed
Entry: 22 Completed
Entry: 23 Completed
Entry: 24 Completed
Entry: 25 Completed
Entry: 26 Completed
Entry: 27 Completed
Entry: 28 Completed
Entry: 29 Completed
Entry: 30 Completed
Entry: 31 Completed
Entry: 32 Completed
Entry: 33 Completed
Entry: 34 Completed
Entry: 35 Completed
Entry: 36 Completed
Entry: 37 Completed
Entry: 38 Completed
Entry: 39 Completed
Entry: 40 Completed
Entry: 41 Completed
Entry: 42 Completed
Entry: 43 Completed
Entry: 44 Completed
Entry: 45 Completed
Entry: 46 Completed
Entry: 47 Completed
Entry: 48 Completed
Entry: 49 Co

In [17]:
import pandas as pd

def shuffler(dataset, random_state = None):
    """Return the records of ``dataset`` as a row-shuffled DataFrame.

    Args:
        dataset: Anything ``pd.DataFrame`` accepts, e.g. a list of dicts.
        random_state: Optional seed (or random state object) forwarded to
            ``DataFrame.sample``. Pass a fixed value to get a reproducible
            order; the default ``None`` keeps the shuffle unseeded.

    Returns:
        A ``pd.DataFrame`` holding every input row in shuffled order. The
        original positional index is preserved on each row.
    """
    df = pd.DataFrame(dataset)
    # frac = 1 samples every row without replacement, i.e. a permutation.
    return df.sample(frac = 1, random_state = random_state)

# Shuffle the three splits in order (train, test, validation); each name is
# rebound to the shuffled DataFrame returned by shuffler.
new_qasper_train, new_qasper_test, new_qasper_validation = (
    shuffler(split)
    for split in (new_qasper_train, new_qasper_test, new_qasper_validation)
)

# Preview the first shuffled training rows as the cell output.
new_qasper_train.head()

    

Unnamed: 0,context,question,answer
1057,[Table TABREF44 shows that consistent nucleus ...,How much improvement is gained from the propos...,"{'unanswerable': False, 'extractive_spans': []..."
845,[The results are shown in Table 1 . The RelNet...,What are the relative improvements observed ov...,"{'unanswerable': False, 'extractive_spans': ['..."
2058,[Table TABREF32 shows the perplexity of the mo...,How is language modelling evaluated?,"{'unanswerable': False, 'extractive_spans': ['..."
883,[We had two human annotators who were trained ...,What is the size of the dataset?,"{'unanswerable': False, 'extractive_spans': ['..."
1981,[FLOAT SELECTED: Table 3. MOS naturalness resu...,Does DCA or GMM-based attention perform better...,"{'unanswerable': False, 'extractive_spans': []..."


In [18]:
# Turn each shuffled DataFrame back into a plain list of rows. Every row
# becomes a positional list in column order, so the column names are dropped.
new_qasper_train = new_qasper_train.to_numpy().tolist()
new_qasper_test = new_qasper_test.to_numpy().tolist()
new_qasper_validation = new_qasper_validation.to_numpy().tolist()

print(len(new_qasper_train))

# Bundle the metadata, the per-split sizes and the rows into one document.
qasper_final = dict(
    date = "20th June 2023",
    author = ["Saurabh Tiwari", "Preetam Pati"],
    length = dict(
        train_data = len(new_qasper_train),
        test_data = len(new_qasper_test),
        validation_data = len(new_qasper_validation),
    ),
    data = dict(
        train_data = new_qasper_train,
        test_data = new_qasper_test,
        validation_data = new_qasper_validation,
    ),
)

# Persist the bundle as JSON in the working directory.
with open(r"new_qasper.json", "w") as json_out:
    json.dump(qasper_final, json_out)

2308


In [19]:
# Tallies over the training rows. Each row is [context, question, answer].
count1 = count2 = count3 = count4 = count5 = count6 = 0

for entry in new_qasper_train:
    answer = entry[2]

    no_spans = len(answer['extractive_spans']) == 0
    no_abstractive = answer['abstractive'] == ""

    # count1: rows whose context list is empty.
    count1 += int(len(entry[0]) == 0)
    # count2: rows flagged as unanswerable.
    count2 += int(answer['unanswerable'] == True)
    # count3 / count4: rows without extractive spans / without free-form text.
    count3 += int(no_spans)
    count4 += int(no_abstractive)
    # count5: answerable rows that nevertheless carry no answer of any kind.
    if no_abstractive and no_spans and answer['yes_no'] == None and answer['unanswerable'] == False:
        count5 += 1
    # count6: rows that carry a yes/no answer.
    count6 += int(answer['yes_no'] != None)

for label, value in (
    ("Empty Context", count1),
    ("Unanswerable", count2),
    ("Extractive Empty", count3),
    ("Abstractive Empty", count4),
    ("yes_no", count6),
    ("Not answered", count5),
):
    print(f"{label} - {value}")


Empty Context - 0
Unanswerable - 0
Extractive Empty - 946
Abstractive Empty - 1688
yes_no - 326
Not answered - 0


In [1]:
import json
from config import (
  new_qasper_data_path
)

class Qasper_Dataset: 
  """In-memory wrapper around the flattened QASPER JSON document.

  The document is expected to hold a ``'data'`` mapping with the splits
  ``'train_data'``, ``'test_data'`` and ``'validation_data'``; every split is
  a list of rows whose element at index 2 is the answer dict with the keys
  ``'extractive_spans'``, ``'yes_no'`` and ``'abstractive'``.
  """

  # Class-level placeholder; __init__ rebinds it per instance.
  qasper_data = dict()

  # Split names processed by multi_class_labels, in processing order.
  _SPLITS = ("train_data", "test_data", "validation_data")

  def __init__(self, new_qasper_data_path):
    """Load the JSON document stored at ``new_qasper_data_path``."""
    # The context manager closes the file as soon as parsing finishes.
    with open(new_qasper_data_path, encoding="utf-8") as new_qasper_fd:
      self.qasper_data = json.load(new_qasper_fd)

  def __len__(self):
    """Return the number of top-level keys of the document.

    Note: this is NOT the number of examples; it is ``len`` of the loaded
    top-level dict.
    """
    return len(self.qasper_data)

  def __data__(self):
    """Return the underlying document (the same object, not a copy)."""
    return self.qasper_data

  @staticmethod
  def _binarize_answer(answer):
    """Replace the three answer fields of ``answer`` by presence flags, in place."""
    # Raw extractive_spans is a list; a bool means this record has already
    # been converted. Skipping it keeps repeated calls from failing on
    # len(bool) or flipping already-converted yes_no flags.
    if isinstance(answer['extractive_spans'], bool):
      return

    answer['extractive_spans'] = len(answer['extractive_spans']) != 0
    # Any non-None yes_no value (True or False) counts as a yes/no answer.
    answer['yes_no'] = answer['yes_no'] is not None
    answer['abstractive'] = answer['abstractive'] != ""

  def multi_class_labels(self):
    """Convert the answer fields of every split into boolean labels.

    For each row of the train, test and validation splits the answer dict
    is updated in place:

    * ``extractive_spans`` -> ``True`` when the span list is non-empty
    * ``yes_no`` -> ``True`` when the value is not ``None``
    * ``abstractive`` -> ``True`` when the text is not ``""``

    Calling the method again leaves already converted rows untouched.

    Returns:
      The mutated document (the same object as ``self.qasper_data``).
    """
    for split_name in self._SPLITS:
      for row in self.qasper_data['data'][split_name]:
        self._binarize_answer(row[2])

    return self.qasper_data
  
  def __savedata__(self, output_path = r"qasper_classification.json"):
    """Write the current document as JSON.

    Args:
      output_path: Destination file; defaults to
        ``qasper_classification.json`` in the working directory.
    """
    with open(output_path, "w") as outfile:
      json.dump(self.qasper_data, outfile)

def caller_func(new_qasper_data_path):
  """Load the dataset file, print its length and labels, then save it.

  Args:
    new_qasper_data_path: Path of the JSON file handed to Qasper_Dataset.
  """
  dataset = Qasper_Dataset(new_qasper_data_path)

  # len() dispatches to Qasper_Dataset.__len__.
  print(len(dataset))

  labelled = dataset.multi_class_labels()
  print(labelled)

  dataset.__savedata__()


# Run caller_func on the path imported from config (new_qasper_data_path).
caller_func(new_qasper_data_path)

# print(len(new_qasper_data['data']['train_data']))

4
