In [None]:
# Imports and installations
%pip install transformers[torch]

from transformers import pipeline
from google.colab import drive
import torch
from transformers import BartTokenizer, BartForConditionalGeneration, BartConfig
import json

Collecting accelerate>=0.21.0 (from transformers[torch])
  Downloading accelerate-0.30.0-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.4/302.4 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->transformers[torch])
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch->transformers[torch])
  Using cached nvidia_cublas_cu

In [None]:
# Pick the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
cuda_available = torch.cuda.is_available()
device = torch.device("cuda" if cuda_available else "cpu")

# Report which device was chosen (GPU index 0 is the one that is named).
if cuda_available:
    print('GPU:', torch.cuda.get_device_name(0))
else:
    print('Using CPU.')

GPU: NVIDIA L4


In [None]:
# Mount Google Drive at /content/drive; the checkpoint and dataset paths used
# by the later cells all live under this mount point.
drive.mount("/content/drive")

In [None]:
# Initialize BART tokenizer, configuration and model from the finetuned checkpoint.
# The checkpoint directory is named once so the tokenizer, config and weights
# are guaranteed to come from the same checkpoint.
CHECKPOINT_DIR = "/content/drive/MyDrive/cpsc_477/training_2/checkpoint-9000"

tokenizer = BartTokenizer.from_pretrained(CHECKPOINT_DIR)
config = BartConfig.from_pretrained(CHECKPOINT_DIR)
# Define model using loaded configuration
model = BartForConditionalGeneration.from_pretrained(CHECKPOINT_DIR, config=config)

In [None]:
# Extract articles from dataset
def get_articles(jsonl_file_path="/content/drive/MyDrive/cpsc_477/data/copy_PLOS_val.jsonl"):
    """
    Read the 'article' field of every record in a JSON Lines file.

    Args:
    - jsonl_file_path (str): Path to the JSONL file (one JSON object per line).
      Defaults to the dataset path this notebook originally hard-coded.

    Returns:
    - list: The value stored under the 'article' key of each record, in file order.

    Raises:
    - json.JSONDecodeError: If a non-blank line is not valid JSON.
    - KeyError: If a record has no 'article' key.
    """
    articles = []
    with open(jsonl_file_path, "r", encoding="utf-8") as f:
        for line in f:
            # Blank lines (e.g. a trailing newline at the end of the file) are
            # not records; json.loads would raise on them, so skip them.
            if not line.strip():
                continue
            # Load the JSON object on this line and keep only its article text
            article_object = json.loads(line)
            articles.append(article_object['article'])
    return articles

In [None]:
# Write summaries to a file for inference, one per line
def write_summaries_to_file(summaries, output_file_path):
    """
    Write summaries to a text file, exactly one summary per line.

    Leading/trailing whitespace is stripped from each summary. Line breaks
    inside a summary are replaced by a single space so that line i of the
    output always corresponds to summaries[i].

    Args:
    - summaries (list): List of summaries (strings).
    - output_file_path (str): Path to the output text file (overwritten if it exists).
    """
    with open(output_file_path, "w", encoding="utf-8") as f:
        for summary in summaries:
            # Flatten embedded line breaks; otherwise one summary would occupy
            # several lines and shift every summary after it.
            pieces = (piece.strip() for piece in summary.splitlines())
            single_line = " ".join(piece for piece in pieces if piece)
            f.write(single_line + "\n")

In [None]:
# Main code to extract summaries from our model

# Run generation on the device selected earlier (GPU when available). Without
# this the model stays on the CPU even though a GPU was detected.
model.to(device)

content = get_articles() # load in articles
summaries = []
for text in content:
  # Tokenize (truncated to 1024 tokens) and place the ids on the model's device
  inputs = tokenizer.encode(text, max_length=1024, truncation=True, return_tensors='pt').to(device)
  # Beam-search generation of a 100-300 token summary
  pred = model.generate(inputs, max_length=300, min_length=100, num_beams=4, early_stopping=True)
  # Decode generated summary tokens into text
  summary = tokenizer.decode(pred[0], skip_special_tokens=True)
  # Any empty or whitespace-only summary means the model failed; write a
  # placeholder so the output file keeps one non-empty line per article.
  if not summary.strip():
    summary = "None."
  summaries.append(summary)

# Once we have all summaries, write them to a text file
write_summaries_to_file(summaries, "/content/drive/MyDrive/cpsc_477/data/bart_summaries.txt")