In [None]:
# Download necessary nltk word sets
import nltk
# Fetch every NLTK resource the notebook relies on. 'punkt_tab' is the
# tokenizer data that recent NLTK releases load for sent_tokenize /
# word_tokenize; 'punkt' is still downloaded for older releases.
for resource in ('punkt', 'punkt_tab', 'wordnet', 'stopwords'):
	nltk.download(resource)

In [None]:
# Imports
import re
import json
from google.colab import drive
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize

In [None]:
# Mount Google Drive at /content/drive so the data files referenced below
# (under /content/drive/MyDrive/...) can be read and written
drive.mount('/content/drive')

In [None]:
# Clean our article
def clean_article(article):
	"""Turn raw article text into a list of normalized sentences.

	Anything enclosed in square brackets (citations) is removed, the text is
	split into sentences, and each sentence is lowercased, tokenized into
	words and lemmatized.

	Args:
	- article (str): Raw article text.

	Returns:
	- list: One string per sentence, made of the space-joined lemmatized tokens.
	"""
	lemmatizer = WordNetLemmatizer()
	# Strip bracketed citations before splitting into sentences
	without_citations = re.sub(r'\[[^\]]*\]','',article)
	return [
		' '.join(
			# Each token is lowercased again before lemmatizing
			lemmatizer.lemmatize(token.lower())
			for token in word_tokenize(sentence.lower())
		)
		for sentence in sent_tokenize(without_citations)
	]

In [None]:
# Calculate word frequencies dict from content
def init_freq_dict(content):
	"""Count occurrences of every significant word across the sentences.

	A word is significant when it is neither an English stop word nor one of
	the punctuation tokens listed below.

	Args:
	- content (list): Sentences (strings) to tokenize and count.

	Returns:
	- dict: Maps each significant word to the number of times it occurs.
	"""
	# Build the exclusion set once; it does not change between tokens, so
	# rebuilding it for every word is wasted work
	ignored = set(stopwords.words('english')).union({',','.',';','%',')','(','``'})
	frequency = {}
	for sentence in content:
		for word in word_tokenize(sentence): # tokenize sentence into words
			if word not in ignored:
				# Start at 0 for unseen words, then add this occurrence
				frequency[word] = frequency.get(word, 0) + 1
	return frequency

In [None]:
# Calculate scores for each sentence based on word frequency and position
def get_score(content, frequency_dictionary):
    """Score each sentence from the count and spread of its significant words.

    A word is significant when it is not an English stop word, not one of the
    listed punctuation tokens, and is a key of ``frequency_dictionary``.
    For a sentence with ``k`` significant words whose first and last
    positions are ``span`` tokens apart, the score is ``k ** 2 / span``
    (or ``k ** 2`` when ``span`` is zero). Sentences without significant
    words score 0.

    Args:
    - content (list): Sentences (strings) to score.
    - frequency_dictionary (dict): Words considered significant (keys).

    Returns:
    - dict: Maps each sentence's position in ``content`` to its score.
    """
    # Loop-invariant exclusion set, built once instead of once per token
    ignored = set(stopwords.words('english')).union({',', '.', ';', '%', ')', '(', '``'})
    sentence_score = {}
    # enumerate() gives every sentence its own key; looking the position up
    # with content.index() would make duplicate sentences share one key
    for sentence_idx, sentence in enumerate(content):
        word_list = word_tokenize(sentence)  # tokenize sentence into words
        # Record the actual position of every significant token; list.index()
        # would report the first occurrence for each repeat of a word
        index_list = [
            position
            for position, word in enumerate(word_list)
            if word not in ignored and word in frequency_dictionary
        ]
        score = 0
        if index_list:
            significant_count = len(index_list)
            # Positions are collected in ascending order, so the ends are min and max
            span = index_list[-1] - index_list[0]
            if span != 0:
                score = significant_count ** 2 / span
            else:
                # Single significant token: avoid dividing by zero
                score = significant_count ** 2
        sentence_score[sentence_idx] = score
    return sentence_score

In [None]:
# Generate summary based on sentence scores
def summarize(sentence_scores,content,threshold):
	"""Build a summary from the highest-scoring sentences.

	Args:
	- sentence_scores (dict): Maps sentence index to score.
	- content (list): Sentences to pull the summary text from, indexed by
	  the keys of ``sentence_scores``.
	- threshold (int): The top ``threshold - 1`` sentences are kept.

	Returns:
	- str: The selected sentences in descending score order, each followed
	  by a single space; empty when nothing is selected.
	"""
	# Clamp at zero: a non-positive limit would otherwise act as a negative
	# slice bound and keep every sentence except the lowest-ranked ones
	limit = max(threshold - 1, 0)
	# Sort sentence indices by score, highest first, and keep the top ones
	top_indices = sorted(sentence_scores,key=sentence_scores.get,reverse=True)[:limit]
	# Extract the chosen sentences from content
	return "".join(content[index] + " " for index in top_indices)

In [None]:
# Extract articles from the jsonl file to generate summaries from
def get_articles(jsonl_file_path="/content/drive/MyDrive/cpsc_477/data/copy_PLOS_val.jsonl"):
	"""Read the article text of every record in a JSON Lines file.

	Args:
	- jsonl_file_path (str): Path to the .jsonl file; defaults to the
	  validation set on the mounted Drive.

	Returns:
	- list: The 'article' value of each record, in file order.

	Raises:
	- json.JSONDecodeError: If a non-blank line is not valid JSON.
	- KeyError: If a record has no 'article' key.
	"""
	articles = []
	# Read the jsonl file, one JSON object per line
	with open(jsonl_file_path, "r", encoding="utf-8") as f:
		for line in f:
			# Skip blank lines (e.g. a stray empty line at the end of the
			# file) instead of letting json.loads fail on them
			if not line.strip():
				continue
			# Keep only the article text of the record
			articles.append(json.loads(line)['article'])
	return articles

In [None]:
# Write summaries to a text file for inference
def write_summaries_to_file(summaries, output_file_path):
    """
    Save the summaries to a text file, one summary per line.

    Surrounding whitespace is stripped from each summary before it is
    written, and the file is overwritten if it already exists.

    Args:
    - summaries (list): List of summaries.
    - output_file_path (str): Path to the output text file.
    """
    with open(output_file_path, "w", encoding="utf-8") as out_file:
        out_file.writelines(text.strip() + "\n" for text in summaries)

# Count lines in file to make sure it is the right amount for inference
def count_lines_in_file(file_path):
    """
    Return how many lines a text file contains.

    The file is read as UTF-8 and iterated line by line, so a final line
    without a trailing newline still counts.

    Args:
    - file_path (str): Path to the text file.

    Returns:
    - int: Number of lines in the file.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        return sum(1 for _ in handle)

In [None]:
# Main function for summarization
def run_summarization(output_file_path="/content/drive/MyDrive/cpsc_477/data/luhn_summaries3.txt"):
	"""Summarize every article and write the summaries to a text file.

	For each article returned by get_articles(): clean it, build a word
	frequency table, keep the 300 most frequent words, score the sentences
	against them, and join the top-scoring sentences into a one-line summary.
	An article that yields no text gets the placeholder "None.".

	Args:
	- output_file_path (str): Where the summaries are written, one per line;
	  defaults to the summaries file on the mounted Drive.
	"""
	articles = get_articles() # extract articles for summarization
	summaries = []
	for content in articles:
		cleaned_content = clean_article(content) # clean articles
		# clean_article removes bracketed citations before splitting into
		# sentences. Split the same citation-free text here so that index i
		# names the same sentence in both lists; splitting the raw text can
		# produce a different number of sentences.
		original_sentences = sent_tokenize(re.sub(r'\[[^\]]*\]','',content))
		# One summary sentence per 60 article sentences. summarize() keeps
		# threshold - 1 sentences, so use at least 2 to always select one.
		threshold = max(len(cleaned_content)//60, 2)
		frequency_dictionary = init_freq_dict(cleaned_content) # initialize our freq dict
		# Keep only the 300 most frequent words, most frequent first
		top_words = sorted(frequency_dictionary,key=frequency_dictionary.get,reverse=True)[:300]
		sorted_dictionary = {key: frequency_dictionary[key] for key in top_words}
		sentence_scores = get_score(cleaned_content,sorted_dictionary) # calculate sentence scores based on freq
		summary = summarize(sentence_scores,original_sentences,threshold)
		# Each summary must stay on one line of the output file. Replace line
		# breaks with a space so the words on either side are not glued together.
		summary = re.sub(r'[\r\n]+', ' ', summary).strip()
		# A blank summary means nothing could be extracted from the article
		if not summary:
			summary = "None."
		summaries.append(summary)

	# Write summaries to text file for inference, one per line
	write_summaries_to_file(summaries, output_file_path)
	# Count the lines just written and report if they do not match the summaries
	num_lines = count_lines_in_file(output_file_path)
	if num_lines != len(summaries):
		print("Warning: expected", len(summaries), "lines but the file has", num_lines)

In [None]:
# Run the whole pipeline: summarize every article and write the summaries
# file to Drive (requires the Drive mount and the function cells above to have run)
run_summarization()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
dict_keys(['lay_summary', 'article', 'headings', 'keywords', 'id'])
1376
1376
Number of lines in the file: 1376
