In [2]:
import json

# Root folder of the movies dataset: the split files (`train.jsonl`, `test.jsonl`,
# `val.jsonl`) and the `docs/` folder of review texts are read from under it.
# NOTE(review): relative path, presumably a mounted Google Drive in Colab — confirm.
movies_path = 'drive/MyDrive/google_colab_data/movies'

def parse_data(file_path, docs_dir=None):
    """Load one dataset split from a JSON-lines annotation file.

    Each non-blank line of ``file_path`` is parsed as a JSON object. For every
    object the function:

    * rewrites ``classification`` to ``1`` when it equals ``"POS"`` and to
      ``0`` otherwise,
    * reads the document named by ``annotation_id`` from ``docs_dir`` and
      stores its text under ``content`` with newlines replaced by spaces.

    Args:
        file_path: Path to the ``.jsonl`` annotation file.
        docs_dir: Directory holding one text file per ``annotation_id``.
            Defaults to ``f"{movies_path}/docs"`` (module-level ``movies_path``).

    Returns:
        A list of annotation dicts, in file order.

    Raises:
        OSError: If the annotation file or a referenced document cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``annotation_id`` or ``classification``.
    """
    if docs_dir is None:
        docs_dir = f"{movies_path}/docs"

    data = []
    with open(file_path, 'r', encoding='utf-8') as annotations_file:
        for line in annotations_file:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue

            annotation = json.loads(line)
            doc_id = annotation["annotation_id"]
            annotation["classification"] = 1 if annotation['classification'] == "POS" else 0

            # A distinct handle name keeps the outer file being iterated
            # from being rebound by this inner `with`.
            with open(f"{docs_dir}/{doc_id}", 'r', encoding='utf-8') as doc_file:
                annotation['content'] = doc_file.read().replace('\n', ' ')
            data.append(annotation)
    return data


# Annotation files for the three dataset splits
train_file_path = f'{movies_path}/train.jsonl'
test_file_path = f'{movies_path}/test.jsonl'
val_file_path = f'{movies_path}/val.jsonl'

# Parse every split (train, then test, then val) into a list of annotation dicts
train_data, test_data, val_data = (
    parse_data(split_path)
    for split_path in (train_file_path, test_file_path, val_file_path)
)

# Display the text of one training review as a sanity check
train_data[7]['content']

'that \'s exactly how long the movie felt to me . there were n\'t even nine laughs in nine months . it \'s a terrible mess of a movie starring a terrible mess of a man , mr . hugh grant , a huge dork . it \'s not the whole oral - sex / prostitution thing ( referring to grant , not me ) that bugs me , it \'s the fact that grant is annoying . not just adam sandler - annoying , we \'re talking jim carrey - annoying . since when do eye flutters and nervous smiles pass for acting ? but , on the other hand , since when do really bad slapstick ( a fistfight in the delivery room culminating in grant \'s head in joan cusack \'s lap -- a scene he paid $ 60 to have included in the movie ) and obscene double entendres ( robin williams , the obstetrician , tells grant \'s pregnant girlfriend she has " a big pussy , " referring of course to the size of the cat hairs on her coat , but nonetheless , grant paid $ 60 to have the exchange included in the movie ) pass for comedy ? nine months is a predict

In [3]:
def print_example(data, index, print_content=True, print_classification=True, print_rationales=True):
    """Pretty-print one example of a parsed split to stdout.

    Args:
        data: Sequence of annotation dicts with ``classification``,
            ``evidences`` and ``content`` keys.
        index: Position of the example to show.
        print_content: Whether to print the review text.
        print_classification: Whether to print the boxed sentiment label.
        print_rationales: Whether to print the first entry of every evidence group.
    """
    print(f'Retrieving Training Example [{index}].................\n')

    # All three fields are read up front, regardless of which sections are printed.
    example = data[index]
    label = example['classification']
    rationale_groups = example['evidences']
    text = example['content']

    if print_content:
        print(f'Review content:\n{text}\n')

    if print_classification:
        # A truthy label is reported as positive, anything falsy as negative.
        tag = "- POS" if label else "- NEG"
        print('----------------------------',
              '\n| Sentiment class:',
              label,
              tag,
              '|', '\n----------------------------')

    if not print_rationales:
        return

    print('\nHuman rationales / Supporting Evidence:')
    for group in rationale_groups:
        print('     - ', group[0])

def get_content(data, index):
    """Return the ``content`` field (review text) of ``data[index]``."""
    return data[index]['content']

def get_classes(data, index):
    """Return the ``classification`` field of ``data[index]`` wrapped in a torch tensor."""
    label = data[index]['classification']
    return torch.tensor(label)

def get_annotations(data, index):
    """Return a new list holding the entries of the ``evidences`` field of ``data[index]``.

    The list is a shallow copy: the outer list is fresh, its elements are the
    same objects stored in the example.
    """
    return list(data[index]['evidences'])

# Number of examples in each split
train_size, test_size, val_size = len(train_data), len(test_data), len(val_data)

print(f'Dataset split: {train_size} training examples')
print(f'               {test_size} test examples\n')

# Show one training example in full: text, label and rationales
print_example(train_data, 505)

Dataset split: 1600 training examples
               199 test examples

Retrieving Training Example [505].................

Review content:
well , what are you going to expect ? it 's a movie about a big snake that eats people . that 's what i should have been thinking when i viewed this film , because maybe then i would have enjoyed myself more . instead , i ended up wishing a giant snake would come along and eat me , too . anaconda is about a documentary film crew sailing down a south american river . led by anthropologists dr . steven cale ( eric stolz ) and terri flores ( jennifer lopez ) , the crew is attempting to locate a lost tribe of natives . along the way , they find poacher paul sarone ( jon voight ) , and become unwillingly embroiled in his quest to capture the elusive anaconda . to simply say that this is the world 's largest snake would n't be doing it justice , since the anaconda in this movie is at least two feet wide . if this is n't a good reason to avoid picking up 

In [18]:
def encode_annotations(reviews, evidences, seq_len=None):
  """Encode rationale spans as fixed-width 0/1 masks, one row per review.

  For review ``i``, every evidence group in ``evidences[i]`` contributes the
  span ``[start_token, end_token)`` of its first entry; those positions are
  set to 1 in row ``i``. A span that runs past the end of the row is clipped
  to the row width, and a span that starts outside the row is ignored.

  Args:
    reviews: Sequence of reviews; only its length is used.
    evidences: ``evidences[i]`` is an iterable of evidence groups, each a
      sequence whose first item is a dict with integer ``start_token`` and
      ``end_token`` keys.
    seq_len: Width of each mask. Defaults to the module-level ``max_length``.

  Returns:
    A float64 tensor of shape ``(len(reviews), seq_len)``.

  NOTE(review): the token indices are used as-is as mask positions. If they
  index whitespace tokens of the document rather than tokenizer word pieces,
  the mask will not line up with ``input_ids`` — confirm against the dataset.
  """
  if seq_len is None:
    seq_len = max_length

  # Allocate the whole batch once instead of stacking per-review arrays later.
  encoded = np.zeros((len(reviews), seq_len))
  for index in range(len(reviews)):
    for group in evidences[index]:
      span = group[0]  # only the first entry of each evidence group is encoded
      start = span['start_token']
      end = span['end_token']
      # `end` is an exclusive bound, so end == seq_len is still a valid span;
      # numpy slicing clips any `end` beyond the row width.
      if 0 <= start < seq_len:
        encoded[index, start:end] = 1

  return torch.from_numpy(encoded)

In [24]:
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from typing import List, Dict, Union
from transformers import BertTokenizerFast

# Shared fast BERT tokenizer; the collate function further down reuses this instance.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
# Truncation length handed to the tokenizer, and the width of each encoded evidence mask.
max_length = 512
# Tokenized batches are easy to produce on the fly, so the dataset stores raw
# texts and leaves tokenization to the collate function.
# This is somewhat slow; if it turns out to be the bottleneck, pre-tokenize
# everything up front and only batch/pad at load time.
class SST2TransformerDataset(Dataset):
  """Map-style dataset of (label, raw text, evidence) triples.

  Args:
    labels: Per-example labels; converted to an int64 tensor.
    texts: Per-example raw texts, indexable by position.
    evidences: Per-example evidence encodings, indexable by position.
  """

  def __init__(self, labels=None, texts=None, evidences=None):
    self.y = torch.tensor(labels, dtype=torch.long)
    self.texts = texts
    self.evidences = evidences

  def __len__(self):
    """Number of examples, taken from the first dimension of the label tensor."""
    return self.y.size(0)

  def __getitem__(self, idx):
    """Return the example at ``idx`` as a dict with keys ``y``, ``text``, ``evidences``."""
    return {
      'y': self.y[idx],
      'text': self.texts[idx],
      'evidences': self.evidences[idx],
    }


def SST2_transformer_collate(batch:List[Dict[str, Union[torch.Tensor,str]]]):
  """Collate dataset items into one tokenized, padded batch.

  Args:
    batch: List of dicts with keys ``y`` (label), ``text`` (raw string) and
      ``evidences`` (tensor), as produced by ``SST2TransformerDataset``.

  Returns:
    Dict with:
      ``y``: labels gathered into a single tensor,
      ``input_ids`` / ``attention_mask``: tokenizer output for the texts,
        padded within the batch and truncated to ``max_length``,
      ``evidences``: the per-example evidence tensors stacked along a new
        first dimension.

  NOTE(review): the evidence rows keep whatever width the dataset stored,
  while ``input_ids`` is only padded within the batch, so the two may differ
  in their last dimension — confirm downstream code accounts for that.
  """
  y_batch = torch.tensor([example['y'] for example in batch])

  # Stack the tensors directly: avoids the numpy round-trip and the slow
  # list-of-ndarrays path of torch.tensor().
  evidence_batch = torch.stack([example['evidences'] for example in batch])

  # Reuse the module-level tokenizer; calling it directly is the current API
  # for encoding a batch of texts.
  tokenized_batch = tokenizer([example['text'] for example in batch],
                              return_tensors='pt',
                              padding=True,
                              max_length=max_length,
                              truncation=True)

  return {
      'y':y_batch,
      'input_ids':tokenized_batch['input_ids'],
      'attention_mask':tokenized_batch['attention_mask'],
      'evidences':evidence_batch
  }

def _prepare_split(split_data):
  """Extract texts, float labels, raw evidences and encoded evidence masks from one split.

  Returns a 4-tuple ``(texts, labels, evidences, encoded_evidences)`` built
  with ``get_content``, ``get_classes``, ``get_annotations`` and
  ``encode_annotations`` over every example of ``split_data``.
  """
  indices = range(len(split_data))
  texts = [get_content(split_data, i) for i in indices]
  labels = [float(get_classes(split_data, i)) for i in indices]
  spans = [get_annotations(split_data, i) for i in indices]
  return texts, labels, spans, encode_annotations(texts, spans)


# One helper call per split replaces three copies of the same four lines.
reviews, classes, evidences, encoded_evidences = _prepare_split(train_data)

reviews_test, classes_test, evidences_test, encoded_evidences_test = _prepare_split(test_data)

reviews_val, classes_val, evidences_val, encoded_evidences_val = _prepare_split(val_data)




batch_size = 10
# Settings shared by all three loaders; only `shuffle` differs per split.
_loader_kwargs = dict(batch_size=batch_size, collate_fn=SST2_transformer_collate)

# Training split: reshuffled every epoch.
train_dataset = SST2TransformerDataset(labels=classes, texts=reviews, evidences=encoded_evidences)
train_dataloader = DataLoader(train_dataset, shuffle=True, **_loader_kwargs)

# The `dev_*` objects wrap the *test* split.
dev_dataset = SST2TransformerDataset(labels=classes_test, texts=reviews_test, evidences=encoded_evidences_test)
dev_dataloader = DataLoader(dev_dataset, shuffle=False, **_loader_kwargs)

# Validation split.
val_dataset = SST2TransformerDataset(labels=classes_val, texts=reviews_val, evidences=encoded_evidences_val)
val_dataloader = DataLoader(val_dataset, shuffle=False, **_loader_kwargs)

In [25]:
# Destination files for the saved train / test / val split data.
train_data_path = 'drive/MyDrive/google_colab_data/train_data_loader.pt'
test_data_path = 'drive/MyDrive/google_colab_data/test_data_loader.pt'
val_data_path = 'drive/MyDrive/google_colab_data/val_data_loader.pt'

def save_dataloader(dataloader_x, dataloader_y, encoded_evidences, dataloader, path, shuffle):
  """Persist a split's data and its DataLoader settings with ``torch.save``.

  Args:
    dataloader_x: Inputs of the split; stored under ``'input'``.
    dataloader_y: Labels of the split; stored under ``'Y_star'``.
    encoded_evidences: Evidence encodings; stored under ``'evidence'``.
    dataloader: Object whose ``batch_size`` and ``num_workers`` attributes
      are recorded under ``'dataloader_params'``.
    path: Destination file. Missing parent directories are created.
    shuffle: Shuffle flag to record; passed in explicitly by the caller.
  """
  import os  # local import keeps this block self-contained

  # Create the *parent* directory of the target file. The previous shell call
  # made a directory literally named "path" instead.
  parent_dir = os.path.dirname(path)
  if parent_dir:
    os.makedirs(parent_dir, exist_ok=True)

  # Save the data (input, Y_star) and DataLoader parameters
  save_data = {
      'input': dataloader_x,
      'Y_star': dataloader_y,
      'evidence': encoded_evidences,
      'dataloader_params': {
          'batch_size': dataloader.batch_size,
          'shuffle': shuffle,  # Keep track of shuffle manually
          'num_workers': dataloader.num_workers,
      }
  }
  torch.save(save_data, path)

# Persist every split (train, then test, then val) with its loader settings.
for _split_args in (
    (reviews, classes, encoded_evidences, train_dataloader, train_data_path, True),
    (reviews_test, classes_test, encoded_evidences_test, dev_dataloader, test_data_path, False),
    (reviews_val, classes_val, encoded_evidences_val, val_dataloader, val_data_path, False),
):
  save_dataloader(*_split_args)