# Framework Configuration
Load Libraries - Load GPT Data

In [1]:
#@title Load Libraries

# Standard library imports
import os
import random
import re
import string
import warnings
import locale

def getpreferredencoding(do_setlocale=True):
    return "UTF-8"
locale.getpreferredencoding = getpreferredencoding

warnings.filterwarnings("ignore")

# Third-party library imports
import numpy as np
import pandas as pd
import time
import nltk

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from scipy.special import rel_entr

In [11]:
#@title Stop/Stem Downloads

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

if input("Do you want to run the NLTK Downloads? (y/n): ") == "y":
  nltk.download('punkt')
  nltk.download('punkt_tab')
  nltk.download('stopwords')

if input("Do you want to run Stop/Stem Words Example? (y/n): ") == "y":
  stemmer = PorterStemmer()
  stop_words = set(stopwords.words('english'))

  sen = "The dancers from Denmark dance a traditional dance"
  sen_list = sen.split()
  print(sen_list)

  no_stop_list = [w for w in sen_list if not w in stop_words]
  print(no_stop_list)

  stemmed_list = [stemmer.stem(w) for w in no_stop_list]
  print(stemmed_list)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Connor\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Connor\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Connor\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
#@title Load Subreddit Data

import pandas as pd

drive_path = "../corpus_data_downloads/data/master_corpus.xlsx"
df = pd.read_excel(drive_path)

                 corpus_name conv_id  res_id question_text  \
0           subreddit-corpus   oj8g6       1    DUMMY DATA   
1           subreddit-corpus   oj8g6       2    DUMMY DATA   
2           subreddit-corpus   oj8g6       3    DUMMY DATA   
3           subreddit-corpus   oj8g6       4    DUMMY DATA   
4           subreddit-corpus  1dwy4n       1    DUMMY DATA   
...                      ...     ...     ...           ...   
494124  subreddit-Chesapeake  5jlk0e       4    DUMMY DATA   
494125  subreddit-Chesapeake  5ks187       2    DUMMY DATA   
494126  subreddit-Chesapeake  5ks187       3    DUMMY DATA   
494127  subreddit-Chesapeake  5ks187       4    DUMMY DATA   
494128  subreddit-Chesapeake  8xe4l2       1    DUMMY DATA   

                                              answer_text  
0       I've been highly considering moving to Corpus ...  
1       Just stay out of the west and north parts of t...  
2                                You'll probably survive.  
3       The ver

# Model Configurations

In [4]:
#@title KL Divergence Formula

import numpy as np
import time
from scipy.special import rel_entr

# Formula for P's divergence from Q: KL(P || Q) = – sum x in X P(x) * log(Q(x) / P(x))
# If we are attempting to approximate an unknown probability distribution, then the target probability distribution from data is P, and Q is our approximation of the distribution.

def kl_archive_divergence (arc_dict, arc_word_num, sen_dict, sen_word_num):
  if sen_word_num == 0:
      return 1000

  total_sum = 0
  for word in sen_dict:
      a = arc_dict[word] / arc_word_num
      b = sen_dict[word] / sen_word_num
      val = a * np.log(a / b)
      total_sum += val

  total_sum = total_sum / sen_word_num
  return total_sum

def kl_archive_divergence_op(arc_dict, arc_word_num, sen_dict, sen_word_num):
  if sen_word_num == 0:
      return 1000

  arc_probs = np.array([arc_dict.get(word, 0) / arc_word_num for word in sen_dict])
  sen_probs = np.array([sen_dict[word] / sen_word_num for word in sen_dict])

  kl_values = rel_entr(arc_probs, sen_probs)
  total_kl_divergence = np.sum(kl_values) / sen_word_num

  return total_kl_divergence

if input("Do you want to run NLP KL-D example? (y/n): ") == "y":

  np.random.seed(0)
  num_words = 10000
  arc_words = [f'word{i}' for i in range(num_words)]
  sen_words = [f'word{i}' for i in range(num_words)]

  arc_frequencies = np.random.randint(1, 100, size=num_words)
  sen_frequencies = np.random.randint(1, 100, size=num_words)

  arc_dict = dict(zip(arc_words, arc_frequencies))
  sen_dict = dict(zip(sen_words, sen_frequencies))

  arc_word_num = sum(arc_frequencies)
  sen_word_num = sum(sen_frequencies)

  start_time = time.time()
  kl_div = kl_archive_divergence(arc_dict, arc_word_num, sen_dict, sen_word_num)
  end_time = time.time()
  print("KL Divergence Time:", end_time - start_time)

  start_time_op = time.time()
  kl_div_op = kl_archive_divergence_op(arc_dict, arc_word_num, sen_dict, sen_word_num)
  end_time_op = time.time()
  print("Optimized KL Divergence Time:", end_time_op - start_time_op)

  print("KL Divergence:", kl_div)
  print("Optimized KL Divergence:", kl_div_op)

if input("Do you want to run Probability Distribution KL Divergence Example? (y/n): ") == "y":

  def kl_ex(p, q):
    return sum(p[i] * np.log(p[i] / q[i]) for i in range(len(p)))

  p = [0.10, 0.15, 0.05, 0.20, 0.25, 0.10, 0.05, 0.10]
  q = [0.15, 0.10, 0.20, 0.10, 0.10, 0.05, 0.20, 0.10]
  print(kl_ex(p, q))
  print(np.sum(rel_entr(p, q)))

In [5]:
#@title Sentence Class

class Sentence:
  def __init__(self, sen, id):
    self.og_sen = sen
    self.og_num_words = 0
    self.filt_sen = ""
    self.filt_num_words = 0
    self.word_freq_dict = {}
    self.in_summary = False
    self.id = id
    if self.og_sen[-1] not in string.punctuation:
        self.og_sen = self.og_sen + "."

  def get_sen(self):
    return self.og_sen

  def get_og_num_words(self):
    return self.og_num_words

  def get_filt_num_words(self):
    return self.filt_num_words

  def get_dict(self):
    return self.word_freq_dict

  def add_to_summary(self):
    self.in_summary = True

  def is_in_summary(self):
    return self.in_summary

  def get_id(self):
    return self.id

  def filter(self):
    self.filt_sen = self.og_sen.translate(str.maketrans('', '', string.punctuation))

    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    word_list = self.filt_sen.split()
    self.og_num_words = len(word_list)
    filt_word_list = [w for w in word_list if w not in stop_words]
    self.filt_num_words = len(filt_word_list)

    self.word_freq_dict = {}
    for word in filt_word_list:
      stem_word = stemmer.stem(word)
      self.word_freq_dict[stem_word] = self.word_freq_dict.get(stem_word, 0) + 1

    return self.word_freq_dict

  def __repr__(self):
    return (f"Sentence(id={self.id!r}, og_sen={self.og_sen!r}, "
            f"og_num_words={self.og_num_words!r}, filt_sen={self.filt_sen!r}, "
            f"filt_num_words={self.filt_num_words!r}, word_freq_dict={self.word_freq_dict!r}, "
            f"in_summary={self.in_summary!r})")

if input("Do you want to run Sentence Example? (y/n): ") == "y":

  sentence = "His name was Jack Jack Black which was crazy since his mother was named Jackie Jack Black"
  id = 1

  sentence_data = Sentence(sentence, id)
  filtered_dict = sentence_data.filter()

  print("Original Sentence:", sentence_data.get_sen())
  print("Original Word Count:", sentence_data.get_og_num_words())
  print("Filtered Word Count:", sentence_data.get_filt_num_words())
  print("Stemmed Dictionary:", sentence_data.get_dict())
  print("Is in Summary:", sentence_data.is_in_summary())
  print("ID:", sentence_data.get_id())

  sentence_data.add_to_summary()
  print("Is in Summary (after adding):", sentence_data.is_in_summary())

In [None]:
#@title Archive Class

class Archive:  # Gets reviews, parses by sentence, gets KL scores, builds summary. Has 3 levels of affecting redundancy
  def __init__(self, responses, redun_level, lim_type):
    if responses is not None:
        self.df = responses
    self.sen_list = []
    self.total_words = 0
    self.min_words = 20
    self.full_dictionary = {}
    self.num_filter_words = 0
    self.kl_list = []
    self.summary = []
    self.summary_word_num = 0
    self.summary_limit = 250  # UPDATE SUMMARY LENGTH HERE
    self.kl_counter = 0
    self.min_sen_length = 5
    self.max_sen_length = 29
    self.redun_level = redun_level
    self.doc = None
    self.rev_count = 0
    self.lim_type = lim_type
    self.has_positive = False

  def run_summarization(self):
    self.load_sentences()
    if self.check_summarization():
      self.kl_divergence()
      self.build_summary()
      return True
    return False

  def check_summarization(self):
    if self.total_words > self.min_words:
      return True
    else:
      return False

  def set_doc(self, doc):
    self.doc = doc

  def set_summary_limit(self, num_responses):
    if self.lim_type == "250":
        self.summary_limit = 250
    if self.lim_type == "percent":
        self.summary_limit = int(len(self.sen_list) * .15)
    if self.lim_type == "rev_count":
        self.summary_limit = int(num_responses / 5) * 25

  def set_summary_limit_doc(self):
    self.summary_limit = int(self.total_words * .8)  # *1 for full match. * .8 to trim size

  def get_summary_word_num(self):
    return self.summary_word_num

  def get_full_dictionary(self):
    return self.full_dictionary

  def get_word_counts(self):
    return self.total_words, self.num_filter_words

  def get_sen_list(self):
    return self.sen_list

  def get_lim_type(self):
    return self.lim_type

  def get_has_positive(self):
    return self.has_positive

  def kl_divergence(self):  # Calculates each sentence's KL_score, appends to kl_list in respective order
    self.kl_list.clear()
    for i in range(len(self.sen_list)):
        if not self.sen_list[i].is_in_summary():
            self.kl_list.append(kl_archive_divergence_op(
                self.full_dictionary,
                self.num_filter_words,
                self.sen_list[i].word_freq_dict,
                self.sen_list[i].filt_num_words))
            if 0 < self.kl_list[-1] < 100:
                self.has_positive = True
        else:
            self.kl_list.append(100)  # 100 when aiming for min KL (our default), -100 when aiming for max KL
    self.kl_counter += 1

  def get_kl(self):
    return self.kl_list

  def remove_redundancy(self, index):  # After sentence is added to summary, remove its filtered word occurrences from the archive dictionary
    self.sen_list[index].add_to_summary()  # Flags sentence as used
    sen_dict = self.sen_list[index].get_dict()
    for word in sen_dict:  # Nothing happens if redun_level = 0
        if self.redun_level == 2:  # New idea, bigger redundancy removal by dividing the remaining frequency by the used sentence frequency 10 / 3 = 3.3333
            self.full_dictionary[word] = self.full_dictionary[word] / sen_dict[word]
            self.num_filter_words -= sen_dict[word]
        if self.redun_level == 1:  # OG idea, just reduces the archive frequency by the used sentence frequency 10 - 3 = 7
            self.full_dictionary[word] -= sen_dict[word]
            self.num_filter_words -= sen_dict[word]

  def print_sen_and_kl(self):
    for i in range(len(self.sen_list)):
        print(self.kl_list[i], "\t", self.sen_list[i].get_sen())

  def build_summary(self):
    if self.lim_type == "percent":
        measure = "sentences"
    else:
        measure = "words"
    print("Building Summary. \tlimitType =", self.lim_type, " summary limit = ", self.summary_limit, measure, "\n")
    counter = 0
    while counter < self.summary_limit:
        min_val = min(self.kl_list)
        max_index = self.kl_list.index(min_val)
        if min_val == 100:  # Breaks while loop if we're out of sentences
            break
        if self.sen_list[max_index].get_og_num_words() >= self.min_sen_length \
                and self.sen_list[max_index].get_og_num_words() <= self.max_sen_length:  # Only accepts 5 <= Sentence Length <= 29
            if len(self.summary) > 0 and self.sen_list[max_index].get_sen() == self.summary[-1]:  # Ignore duplicate sentences
                dummy = 0
            else:
                self.summary.append(self.sen_list[max_index].get_sen())  # Add sentence to summary
                self.summary_word_num += self.sen_list[max_index].get_og_num_words()  # Add to total word count
        self.remove_redundancy(max_index)
        self.kl_divergence()
        if self.lim_type == "percent":
            counter = len(self.summary)
        else:
            counter = self.summary_word_num

  def print_summary_as_paragraph(self):
    return ' '.join(self.summary)

  def print_summary_by_lines(self):
    print("Here is the summary by lines: , \t ", self.summary_word_num, " words")
    print()
    for sen in self.summary:
        print(sen)
    return self.summary

  def get_summary(self):
    return self.summary

  def load_sentences(self):
    self.total_words = 0
    rev_counter = 0
    for i in self.df.index:
        rev = self.df.at[i, "answer_text"]  # Parses out review info

        if not isinstance(rev, str):
          continue  # Skip non-string entries (e.g., NaN)

        temp_list = nltk.tokenize.sent_tokenize(rev)  # Splits review into sentences
        for j in range(len(temp_list)):
            sen = Sentence(temp_list[j], rev_counter)  # Creates Sentence
            self.sen_list.append(sen)  # Adds Sentence to list
            self.total_words += len(temp_list[j].split())
            sen_dict = self.sen_list[-1].filter()  # Returns Sentence's filtered dictionary

            for word in sen_dict:  # Add sentence filter dictionary to archive filter dictionary
                if word in self.full_dictionary:
                    self.full_dictionary[word] += sen_dict[word]
                else:
                    self.full_dictionary[word] = sen_dict[word]
                self.num_filter_words += sen_dict[word]
        rev_counter += 1
    self.set_summary_limit(rev_counter)
    self.rev_count = rev_counter

  def load_sentences_doc(self, doc):  # Input is just one line/paragraph.
    self.total_words = 0
    rev_counter = 0
    self.doc = doc
    temp_list = nltk.tokenize.sent_tokenize(self.doc)  # Splits review into sentences
    for j in range(len(temp_list)):
        sen = Sentence(temp_list[j], rev_counter)  # Creates Sentence
        self.sen_list.append(sen)  # Adds Sentence to list
        self.total_words += len(temp_list[j].split())
        sen_dict = self.sen_list[-1].filter()  # Returns Sentence's filtered dictionary
        for word in sen_dict:  # Add sentence filter dictionary to archive filter dictionary
            if word in self.full_dictionary:
                self.full_dictionary[word] += sen_dict[word]
            else:
                self.full_dictionary[word] = sen_dict[word]
            self.num_filter_words += sen_dict[word]
    rev_counter += 1
    self.set_summary_limit_doc()

# Summarization of Subreddit Conversations or Responses

In [None]:
def save_file(summary, arc, val, inp):
  if inp == "conv":
    filename = f"../corpus_summaries/corpus_conversation_summaries/conv_summary_id_{val}.txt"
  else:
    filename = f"../corpus_summaries/corpus_response_summaries/res_summary_id_{val}.txt"

  with open(filename, 'w', encoding='utf-8') as writefile:
    writefile.write("Original Response: \n")
    for item in arc.get_sen_list():
      writefile.write(item.get_sen() + "\n")
    writefile.write("\nSummary: \n")
    for j in range(len(summary)):
      writefile.write(summary[j] + "\n")
  print("Saved:", filename)

while True:
  inp = input("Do you want a summary for each conversation or for each response? (conv/res)")
  if inp == "conv":
    id = "conv_id"
    break
  elif inp == "res":
    id = "res_id"
    break
  else:
    print("Invalid input. Please enter 'conv' or 'res'.")

i = 1

for item in df[f"{id}"].unique():
  revs = df.loc[df[f"{id}"] == item]   #  // gets a set of reviews for a given dataset
  no_red = 0
  n_grams = 2  #used for the unique-ngram redundancy checker

  print(f"\n\n~~~ Extractor for {id} = {item} ~~~")

  arc = Archive(revs, no_red, "percent");  #use "250" , "percent" , or "rev_count"
  arc.min_words = 100 if inp == "conv" else 0
  arc.load_sentences()

  if arc.total_words > arc.min_words:
    arc.kl_divergence()
    arc.build_summary()

    summary = arc.print_summary_by_lines()
    print()
    save_file(summary, arc, str(i), inp)
    i += 1
  else:
    print("No summary was generated.")
    i += 1



~~~ Extractor for conv_id = oj8g6 ~~~
Building Summary. 	limitType = percent  summary limit =  1 sentences 

Here is the summary by lines: , 	  7  words

I haven't had any problems with crime.

Saved: ./corpus_summaries/corpus_conversation_summaries/conv_summary_id_1.txt


~~~ Extractor for conv_id = 1dwy4n ~~~
No summary was generated.


~~~ Extractor for conv_id = 1ebrkd ~~~
No summary was generated.


~~~ Extractor for conv_id = 1ebnwu ~~~
Building Summary. 	limitType = percent  summary limit =  0 sentences 

Here is the summary by lines: , 	  0  words


Saved: ./corpus_summaries/corpus_conversation_summaries/conv_summary_id_4.txt


~~~ Extractor for conv_id = 1f7vu4 ~~~
No summary was generated.


~~~ Extractor for conv_id = 1f7923 ~~~
No summary was generated.


~~~ Extractor for conv_id = 1ffjhy ~~~
No summary was generated.


~~~ Extractor for conv_id = qo1cp ~~~
No summary was generated.


~~~ Extractor for conv_id = qvd9c ~~~
No summary was generated.


~~~ Extractor for con

TypeError: expected string or bytes-like object, got 'float'