In [None]:
# Get required nltk resources:
# - 'wordnet' backs WordNetLemmatizer and 'stopwords' backs stopwords.words('english').
# - 'punkt' holds the sentence/word tokenizer models for older NLTK releases;
#   recent NLTK releases load 'punkt_tab' instead, so both are fetched to keep
#   sent_tokenize/word_tokenize working regardless of the installed version.
import nltk
for resource in ('wordnet', 'punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

In [None]:
# Imports
import re
import os
import json
from google.colab import drive
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize,word_tokenize

In [None]:
# Mount Google Drive at /content/drive; the input and output paths used by
# run_summarization live under this mount point
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Function to read in our jsonl files
def read_jsonl_file(file_path):
    """
    Read a JSON Lines file into a list of parsed records.

    Args:
    - file_path (str): Path to a UTF-8 text file holding one JSON document per line.

    Returns:
    - list: One parsed object per non-blank line, in file order.

    Raises:
    - json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding, which would garble or reject non-ASCII article text.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Blank lines (typically a trailing one at the end of the file)
            # carry no record and would make json.loads raise.
            if line.strip():
                data.append(json.loads(line))
    return data

# Function to collect the Python types of the field values in our jsonl
# records, to understand their structure
def get_field_types(articles):
    """
    Collect the Python types of every field value across the parsed records.

    Args:
    - articles (list): Parsed records, each a dict mapping field name to value.

    Returns:
    - set: The distinct value types found in any record.
    """
    return {type(value) for record in articles for value in record.values()}

In [None]:
# Clean our sentences using lemmatizer
def clean(sentences):
    """
    Normalize sentences: lowercase, keep letters only, drop stopwords, lemmatize.

    Args:
    - sentences (list): Raw sentence strings.

    Returns:
    - list: One cleaned string per input sentence, in the same order. The
      remaining words are joined by single spaces; a sentence with no
      remaining words becomes ''.
    """
    lemmatizer = WordNetLemmatizer()
    # Build the stopword set once. Rebuilding it inside the word filter
    # re-creates the whole set for every single word of every sentence.
    stop_words = set(stopwords.words('english'))
    cleaned_sentences = []
    for sentence in sentences:
        # Every character that is not an ASCII letter becomes a space, so
        # split() yields purely alphabetic lowercase words.
        words = re.sub(r'[^a-zA-Z]', ' ', sentence.lower()).split()
        kept = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
        cleaned_sentences.append(' '.join(kept))
    return cleaned_sentences

In [None]:
# Calculate the initial word probabilities from our cleaned sentences
def probs_init(sentences):
    """
    Build the initial word-weight table for an article.

    Args:
    - sentences (list): Cleaned sentence strings.

    Returns:
    - dict: Maps each token other than '.' to its occurrence count divided
      by the number of distinct tokens.
    """
    # Sentences are joined with '. ' before tokenizing; the resulting '.'
    # tokens are left out of the counts below.
    tokens = word_tokenize('. '.join(sentences))
    # NOTE(review): the divisor is the number of DISTINCT tokens (including
    # '.' when present), not the total token count, so the values do not sum
    # to 1. Confirm this is intended before changing it: generate_summary
    # compares these values against its probability_threshold.
    distinct_total = len(set(tokens))
    counts = {}
    for token in tokens:
        if token == '.':
            continue
        counts[token] = counts.get(token, 0) + 1
    return {token: count / distinct_total for token, count in counts.items()}

In [None]:
# Iteratively update our probabilities
def probs_update(probability_dict, word):
    """
    Square the stored probability of a word, in place.

    Args:
    - probability_dict (dict): Maps word -> probability; modified in place.
    - word (str): Word whose probability should be squared.

    Returns:
    - dict: The same probability_dict object. It is left untouched when the
      word is missing or its stored value is falsy (e.g. 0).
    """
    current = probability_dict.get(word)
    if not current:
        return probability_dict
    probability_dict[word] = current ** 2
    return probability_dict

In [None]:
# Calculate average sentence weights in order to choose the highest-weight sentence
def sent_weights_avg(sentences, probability_dict):
    """
    Score each sentence by the average probability of its words.

    Args:
    - sentences (list): Cleaned sentences, either space-separated strings (as
      produced by clean) or pre-tokenized sequences of words.
    - probability_dict (dict): Maps word -> probability.

    Returns:
    - dict: Maps sentence index -> average word probability. Empty sentences
      get no entry; words missing from probability_dict contribute 0.
    """
    sentence_weights = {}
    for index, sentence in enumerate(sentences):
        if len(sentence) == 0:
            continue
        # Iterating a string yields characters, not words, so a string has to
        # be split first. clean() joins words with single spaces, so split()
        # recovers exactly those words.
        words = sentence.split() if isinstance(sentence, str) else list(sentence)
        if not words:
            # Whitespace-only string: keep the index present, with zero weight.
            sentence_weights[index] = 0.0
            continue
        total = sum(probability_dict.get(word, 0) for word in words)
        sentence_weights[index] = total / len(words)
    return sentence_weights


In [None]:
# Generate summary by looking at words that meet threshold
# Then choose highest weighted sentence
def generate_summary(sentence_weights, probability_dict, cleaned_article, tokenized_article, summary_length=7, probability_threshold=0.05):
    """
    Greedily build an extractive summary from the highest-probability words.

    Repeatedly takes the word with the highest current probability, appends
    the highest-weighted not-yet-used sentence containing it, and then squares
    the probability of every word of that sentence via probs_update.

    Args:
    - sentence_weights (dict): Maps sentence index -> weight.
    - probability_dict (dict): Maps word -> probability. Not modified.
    - cleaned_article (list): Cleaned sentence strings, index-aligned with
      tokenized_article.
    - tokenized_article (list): Original sentence strings.
    - summary_length (int): Maximum number of sentences in the summary.
    - probability_threshold (float): Words whose probability is below this
      value are not used to pick sentences.

    Returns:
    - str: The chosen sentences in selection order, each followed by a single
      space; '' if no sentence qualified.
    """
    # Work on a copy: words are deleted and probabilities squared below, and
    # the caller's dictionary should not be emptied as a side effect.
    probabilities = dict(probability_dict)
    # Tokenize every cleaned sentence once instead of on every loop iteration.
    sentence_tokens = [word_tokenize(sentence) for sentence in cleaned_article]
    sentence_vocab = [set(tokens) for tokens in sentence_tokens]

    chosen = []
    used = set()
    # summary_length is a sentence count: the loop advances once per selected
    # sentence, not once per word of it.
    while len(chosen) < summary_length and probabilities:
        top_word = max(probabilities, key=probabilities.get)
        if probabilities[top_word] < probability_threshold:
            # top_word is the maximum, so every remaining word is below the
            # threshold as well and nothing more can be selected.
            break
        candidates = [
            index for index, vocab in enumerate(sentence_vocab)
            if index not in used and top_word in vocab
        ]
        if not candidates:
            # No unused sentence contains this word; drop it so that the next
            # most probable word gets its turn.
            del probabilities[top_word]
            continue
        # max() returns the first maximal element, so ties go to the earliest
        # sentence in the article.
        best = max(candidates, key=lambda index: sentence_weights.get(index, 0.0))
        used.add(best)
        chosen.append(tokenized_article[best])
        for word in sentence_tokens[best]:
            probabilities = probs_update(probabilities, word)
    return "".join(sentence + " " for sentence in chosen)

In [None]:
# Write our generated summaries to a file for inference, one per line
def write_summaries_to_file(summaries, output_file_path):
    """
    Write summaries to a text file, exactly one summary per line.

    Args:
    - summaries (list): List of summaries.
    - output_file_path (str): Path to the output text file.
    """
    with open(output_file_path, "w", encoding="utf-8") as f:
        for summary in summaries:
            # split()/join trims both ends and turns every whitespace run,
            # including a newline embedded in a sentence, into a single space,
            # so line i of the file always corresponds to summaries[i].
            f.write(" ".join(summary.split()) + "\n")

In [None]:
# Main function for summarization
def run_summarization(input_path='/content/drive/My Drive/PLOS_val_copy.jsonl',
                      output_path="/content/drive/MyDrive/sumbasic_summaries4.txt",
                      required_length=7):
    """
    Summarize every article of a jsonl file and write the summaries to a text file.

    Args:
    - input_path (str): Path to the jsonl file; every record must have an
      'article' field holding the article text.
    - output_path (str): Path of the text file that receives one summary per line.
    - required_length (int): Passed to generate_summary as summary_length.
      Defaults to 7 because PLOS expert summaries are 7 sentences each.
    """
    # Load in our articles from the jsonl file
    articles = read_jsonl_file(input_path)
    summaries = []
    for record in articles:
        article_content = record['article']
        tokenized_article = sent_tokenize(article_content)  # split article into sentences
        cleaned_article = clean(tokenized_article)  # normalize each sentence
        probability_dict = probs_init(cleaned_article)  # word probabilities from cleaned sentences
        sentence_weights = sent_weights_avg(cleaned_article, probability_dict)  # per-sentence weights
        summary = generate_summary(sentence_weights, probability_dict, cleaned_article, tokenized_article, required_length)
        # Make sure summary is not empty, otherwise algorithm failed for that
        # article; the placeholder keeps the output aligned one line per article
        if not summary.strip():
            summary = "None."
        summaries.append(summary.strip())

    # Once all summaries are generated, write to text file for inference
    write_summaries_to_file(summaries, output_path)

In [None]:
# Run the whole pipeline: load the articles, summarize each one, and write
# the summaries to the output file
run_summarization()