In [None]:
# Installations and imports
%pip install transformers[torch]

from transformers import AutoTokenizer, pipeline
from google.colab import drive
import numpy as np
import torch
import json
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize



In [None]:
# Select the compute device: CUDA when torch can see a GPU, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    # Report the name of GPU 0 so the run log records which accelerator was used
    print('GPU:', torch.cuda.get_device_name(0))
else:
    print('Using CPU.')

GPU: NVIDIA L4


In [None]:
# Mount Google Drive at /content/drive so the dataset and output paths under
# /content/drive/MyDrive used later in this notebook can be opened.
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Compute and return summaries, only taking the first 458 (about 1/3) of instances due to runtime
def get_summaries(jsonl_file_path, max_articles=458, num_sentences=7,
                  summarizer=None, sentence_splitter=None):
    """
    Generate extractive summaries for the articles in a JSON Lines file.

    Each non-blank line of the file is parsed as a JSON object whose
    'article' field holds the article text. The text is split into
    sentences and handed to the summarizer, which is called with the
    "count" strategy to extract `num_sentences` sentences. The extracted
    sentences are joined with single spaces to form the summary.

    Args:
    - jsonl_file_path (str): Path to the input .jsonl file (UTF-8).
    - max_articles (int or None): Stop after this many articles; the default
      of 458 is kept short due to runtime. Pass None to process the whole file.
    - num_sentences (int): Number of sentences to request per summary.
    - summarizer (callable or None): Called as
      summarizer({"sentences": [...]}, strategy="count", strategy_args=n);
      element 0 of its result must be the extracted sentences. Defaults to
      the notebook-level `summarization_pipeline`.
    - sentence_splitter (callable or None): Maps article text to a list of
      sentences. Defaults to NLTK's `sent_tokenize`.

    Returns:
    - list of str: One summary per processed article, in file order.
    """
    # Resolve defaults at call time so the function can be defined before the
    # pipeline cell has run, and so either dependency can be swapped out.
    if summarizer is None:
        summarizer = summarization_pipeline
    if sentence_splitter is None:
        sentence_splitter = sent_tokenize

    summaries = []
    with open(jsonl_file_path, "r", encoding="utf-8") as f:
        for line in f:
            # Check the cap before doing any (expensive) inference
            if max_articles is not None and len(summaries) >= max_articles:
                break
            # Tolerate blank lines (e.g. a trailing newline) instead of
            # failing inside json.loads
            if not line.strip():
                continue

            # Only the article text is needed from the JSON object
            article_text = json.loads(line)['article']
            # Extraction works on sentences, so tokenize the article first
            sentences = sentence_splitter(article_text)
            if not sentences:
                # Nothing to extract: emit an empty summary so that output
                # index i still corresponds to article i
                summaries.append("")
                continue

            # Element 0 of the output is the extracted sentences,
            # element 1 is the indices of those sentences
            output = summarizer({"sentences": sentences},
                                strategy="count", strategy_args=num_sentences)
            summaries.append(" ".join(output[0]).strip())
    return summaries

In [None]:
# Write generated summaries to file, one per line
def write_summaries_to_file(summaries, output_file_path):
    """
    Write summaries to a text file, exactly one summary per line.

    Whitespace inside each summary (including embedded newlines) is collapsed
    to single spaces and leading/trailing whitespace is removed, so that line
    i of the output always corresponds to summaries[i]. An empty summary is
    written as an empty line.

    Args:
    - summaries (list): List of summaries (strings).
    - output_file_path (str): Path to the output text file (overwritten).
    """
    with open(output_file_path, "w", encoding="utf-8") as f:
        for summary in summaries:
            # split()/join normalizes every whitespace run; a plain strip()
            # would leave internal "\n" characters that break the
            # one-summary-per-line layout
            f.write(" ".join(summary.split()) + "\n")

In [None]:
# Initialize pipeline for summarization, using tokenizer from model.
# NOTE: trust_remote_code=True runs Python code shipped in the model repository;
# consider pinning a reviewed `revision` if reproducibility/safety matters.
summarization_pipeline = pipeline("summarization",
                                  model="NotXia/pubmedbert-bio-ext-summ",
                                  tokenizer = AutoTokenizer.from_pretrained("NotXia/pubmedbert-bio-ext-summ"),
                                  trust_remote_code=True,
                                  # Place the model on the device selected earlier in the
                                  # notebook; it was previously computed but never passed in
                                  device=device)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Input articles (JSON Lines) and destination text file, both on the mounted Drive
validation_jsonl_path = "/content/drive/MyDrive/cpsc_477/data/copy_PLOS_val.jsonl"
summaries_output_path = "/content/drive/MyDrive/cpsc_477/data/extractive_bert_summaries_start.txt"

# Generate the extractive summaries for the validation articles
validation_summaries = get_summaries(validation_jsonl_path)
# Save them, one per line, for later inference/evaluation
write_summaries_to_file(validation_summaries, summaries_output_path)