# Roget's Thesaurus Mapper

## Initialization

In [1]:
import _pickle as cPickle
import os
from collections import defaultdict
import os.path

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
nltk.download('stopwords')
import pandas as pd
import string

_file_path = '/content/drive/MyDrive/CSE498R/rogets_text/'

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [16]:
class PyRoget(object):

    def __init__(self):
        """Load every pickled lookup table from the ``_file_path`` directory.

        Attributes set (as used by the other methods of this class):
            word_categories_dict: word -> list of base category codes
            category_word_dict: base category code -> words in that category
            category_code_dict: lower-case category name -> base category code
            node_code_category_dict: node code -> node (category) name
            category_node_code_dict: upper-case node name -> node code
            node_parent_dict: node code -> parent node code
            code_category_dict: base category code -> category name
        """

        def _unpickle(filename):
            # NOTE: unpickling can execute arbitrary code; only load trusted
            # data files from ``_file_path``.
            with open(os.path.join(_file_path, filename), 'rb') as handle:
                return cPickle.load(handle, encoding='utf-8')

        self.word_categories_dict = _unpickle('thes_dict.txt')
        self.category_word_dict = _unpickle('thes_cat.txt')
        self.category_code_dict = _unpickle('cat_num.txt')
        self.node_code_category_dict = _unpickle('node_codes.txt')
        self.category_node_code_dict = _unpickle('code_nodes.txt')
        self.node_parent_dict = _unpickle('full_childparent.txt')
        # TODO: fix capitalization?
        self.code_category_dict = _unpickle('num_cat.txt')

    def add_custom_words(self, fid='add_words.txt'):
        ''' Load a file of words and connections into THES_DICT '''
        try:
            type(self.added_words) is list
        except:
            self.added_words = []
        with open(fid, 'rb') as f:
            for line in f:
                wd_pair = line.strip().split(',')
                if wd_pair[0] not in self.word_categories_dict:
                    self.word_categories_dict[wd_pair[0]] = [wd_pair[1], ]
                    self.added_words.append(wd_pair[0])
                else:
                    if wd_pair[1] not in self.word_categories_dict[wd_pair[0]]:
                        self.word_categories_dict[wd_pair[0]].append(
                            wd_pair[1])

        self.added_words = list(set(self.added_words))
        print("Words Added to THES_DICT:")
        print(' '.join(self.added_words))

    def extract_words(self, text, lower=True):
        """Removes punctuation from text and returns a list of words

        Arguments:
            text {str} -- text from which to extract words

        Keyword Arguments:
            lower {bool} -- case of words extracted (default: {True})

        Returns:
            list of str -- extracted words
        """

        puncMarks = [',', '.', '?', '!', ':', ';',
                     '\'', '\"', '(', ')', '[', ']', '-']
        for item in puncMarks:
            text = text.replace(item, '')
        if lower:
            lowerText = text.lower()
        textWords = lowerText.split()
        return textWords

    def categorize_word(self, word, levels=0):
        """Gets all base categories of a word, returns list of (cateogry, code)

        Arguments:
            word {str} -- word to categorize

        Keyword Arguments:
            levels {int} -- how many levels above base category (default: {0})

        Returns:
            list of tuple of str or None -- list of (category, code) tuples or
                None, if not found
        """

        if word in self.word_categories_dict:
            cats = [(self.code_category_dict[x], x) for x
                    in self.word_categories_dict[word]]
        else:
            return None
        for _ in range(levels):
            for i, cat in enumerate(cats):
                if cat and cat[1] not in {'A', 'B', 'C', 'D', 'E', 'F'}:
                    node = self.node_parent_dict[cat[1]]
                    cats[i] = (self.node_code_category_dict[node], node)

        return cats

    def categorize_words(self, text, levels=0):
        """Get all categories of a text or list of words

        Arguments:
            text {str | list of str} -- text to categorize

        Keyword Arguments:
            levels {int} -- how many levels up to classify (default: {0})

        Returns:
            list of tuple of str or None -- list of (category, code) tuples or
                None, if not found
        """

        if type(text) == str:
            wordlist = self.extract_words(text)
#             print(wordlist)
        else:
            wordlist = [x.lower() for x in text if x.isalpha()]
#             print(wordlist)
        categorized = [self.categorize_word(x, levels=levels) for x
                       in wordlist if x in self.word_categories_dict]
#         print(categorized)
        return [y for x in categorized for y in x if y] or None


    def get_category_freqs(self, text_list, levels=0):
        '''returns dict of word category frequencies of levels=n for a text,
        throwing out words not in the thesaurus'''
        frequency_counts = defaultdict(lambda: 0)
        bagofwords = self.categorize_words(text_list, levels=levels)
#         print(bagofwords)
        if bagofwords is None:
            return None
        good_cats = [x for x in bagofwords if x]
        for x in good_cats:
            frequency_counts[x] += 1
        return frequency_counts

    def get_all_category_freqs(self, text_list):
        freqlist = self.get_category_freqs(text_list)
        fulldict = {x: 0 for x in self.code_category_dict.keys()}
        for key in freqlist.keys():
            if key[1] in fulldict:
                fulldict[key[1]] = freqlist[key]
        newdict = {(self.code_category_dict[x], x):
                   fulldict[x] for x in fulldict.keys()}
        return sorted(newdict.items(), key=lambda x: x[0][1])

    def get_category_hierarchies(self, word):
        '''returns full hierarchical paths for word
        from base categories to parent node "WORDS" (code '0');
        also returns distance of each node from word/base category;
        return format tuple: (distance,node)'''
        all_cats = [x for x in self.word_categories_dict[word]]
        syn_paths = []
        for cat in all_cats:
            path = []
            counter = 0
            path.append((counter, self.code_category_dict[cat]))
            node = cat
            while True:
                counter += 1
                parent = self.node_parent_dict[node]
                path.append((counter, self.node_code_category_dict[parent]))
                node = parent
                if parent not in self.node_parent_dict.keys():
                    break
            syn_paths.append(path)
        return syn_paths

    def get_words(self, category):
        '''Returns all words in given base category (accepts code or category name)'''
        if category in self.code_category_dict:
            return (self.code_category_dict[category],
                    self.category_word_dict[category])
        elif category.lower() in self.category_code_dict:
            code = self.category_code_dict[category.lower()]
            return(category.upper(), self.category_word_dict[code])
        return []

    def get_all_related_words(self, word):
        '''given word, return all other words in word's categories'''
        cats = [x for x in self.word_categories_dict[word]]
        return tuple((self.code_category_dict[cat],
                      self.category_word_dict[cat]) for cat in cats)

    def distance_to_node(self, word1, node):
        '''given word and node, return distance (in nodes) from base category to node;
        if node not in path to "WORDS" node, distance equals sum of word's and node's
        path to "WORDS"'''
        if node in self.node_code_category_dict:
            node = self.node_code_category_dict[node]
        wordcats = [x[0] for x in self.categorize_word(word1)]
        word1 = word1.lower()
        distances = []
        paths = self.get_category_hierarchies(word1)
        for path in paths:
            for tup in path:
                if node in tup:
                    distances.append(tup[0])
        if distances == []:
            for cat in wordcats:
                pathlength1 = self.get_category_distance(cat, 'WORDS')
                pathlength2 = self.get_category_distance(node, 'WORDS')
                distances.append(pathlength1 + pathlength2)
        dist = min(distances)
        return dist

    def get_shared_nodes(self, word1, word2):
        '''given two words, returns tuples containing all shared nodes
        and minimum distance between the two words via that node;
        output format tuple: (distance,node)'''
        word1 = word1.lower()
        word2 = word2.lower()
        paths1 = self.get_category_hierarchies(word1)
        paths2 = self.get_category_hierarchies(word2)
        nodes1 = []
        nodes2 = []
        path_lengths = []
        for path in paths1:
            for node in path:
                nodes1.append(node[1])
        for path in paths2:
            for node in path:
                nodes2.append(node[1])
        common_nodes = list(set(nodes1).intersection(set(nodes2)))
        for node in common_nodes:
            distances1 = []
            distances2 = []
            for path in paths1:
                for n in path:
                    if node in n:
                        distances1.append(n[0])
            for path in paths2:
                for n in path:
                    if node in n:
                        distances2.append(n[0])
            path_length = min(distances1) + min(distances2)
            path_lengths.append((path_length, node))
        path_lengths.sort(key=lambda x: x[0])
        return path_lengths

    def get_word_distance(self, word1, word2):
        '''returns minimum distance between two words as int'''
        word1 = word1.lower()
        word2 = word2.lower()
        paths1 = self.get_category_hierarchies(word1)
        paths2 = self.get_category_hierarchies(word2)
        nodes1 = []
        nodes2 = []
        path_lengths = []
        for path in paths1:
            for node in path:
                nodes1.append(node[1])
        for path in paths2:
            for node in path:
                nodes2.append(node[1])
        common_nodes = list(set(nodes1).intersection(set(nodes2)))
        for node in common_nodes:
            distances1 = []
            distances2 = []
            for path in paths1:
                for n in path:
                    if node in n:
                        distances1.append(n[0])
            for path in paths2:
                for n in path:
                    if node in n:
                        distances2.append(n[0])
            path_length = min(distances1) + min(distances2)
            path_lengths.append(path_length)
        distance = min(path_lengths)
        return distance

    def get_category_hierarchy(self, category):
        '''returns path from node to parent node "WORDS"
        and distance from given node to each node in path;
        output format list of tuples: [(distance,node),...]'''
        if category == '0':
            return [(0, 'WORDS')]
        elif category.upper() == 'WORDS':
            return [(0, 'WORDS')]
        elif category.lower() in self.node_code_category_dict.keys():
            cat = category.lower()
        elif category.upper() in self.node_code_category_dict.values():
            cat = self.category_node_code_dict[category.upper()]
        # else:
            # NOTE: MAY NEED RETURN THAT DOESN'T MESS UP DISTANCE CALCULATIONS IN NODE CLUSTRING ALGORITHM BELOW
            # return [(0,'WORDS')]
        path = []
        counter = 0
        path.append((counter, self.node_code_category_dict[cat]))
        node = cat
        while True:
            counter += 1
            parent = self.node_parent_dict[node]
            path.append((counter, self.node_code_category_dict[parent]))
            node = parent
            if parent not in self.node_parent_dict.keys():
                break
        return path

    def get_category_distance(self, category1, category2):
        '''return minimum distance between two categories as int'''
        paths1 = self.get_category_hierarchy(category1)
        paths2 = self.get_category_hierarchy(category2)
        nodes1 = []
        nodes2 = []
        path_lengths = []
        for path in paths1:
            nodes1.append(path[1])
        for path in paths2:
            nodes2.append(path[1])
        common_nodes = list(set(nodes1).intersection(set(nodes2)))
        for node in common_nodes:
            distances1 = []
            distances2 = []
            for n in paths1:
                if node in n:
                    distances1.append(n[0])
            for n in paths2:
                if node in n:
                    distances2.append(n[0])
            path_length = min(distances1) + min(distances2)
            path_lengths.append(path_length)
        distance = min(path_lengths)
        return distance

    def cluster_by_categories(self, wlist, verbose=False, N=0):
        '''returns all nodes that minimize aggregate distance to all words in wordlist;
        output format list of tuples: (node,aggregate distance,average distance per word);
        verbose flag triggers running results; N flag determines number of nearest nodes printed'''
        notwords = []
        wordlist = []
        for word in wlist:
            if word not in self.word_categories_dict.keys():
                notwords.append(self.categorize_word(word))
            else:
                wordlist.append(word)
        word_basecats = []
        if verbose:
            print("excluded words:", notwords)
        node_distances = []
        nodelist = self.node_parent_dict.keys()
        for ndx, node in enumerate(nodelist):
            dists = [self.distance_to_node(word, node) for word in wordlist]
            if verbose:
                print(dists)
            aggdist = sum(dists)
            node_entry = (node, aggdist)
            node_distances.append(node_entry)
            avg_node_distance = aggdist / float(len(wordlist))
            if verbose:
                print(ndx, node, aggdist, avg_node_distance)
        node_distances = sorted(node_distances, key=lambda x: x[1])
        node_distances_named = [(node, self.node_code_category_dict[node], dist, (float(
            dist) / len(wordlist))) for (node, dist) in node_distances]
        distlist = [x[1] for x in node_distances]
        mindist = min(distlist)
        mindist_nodes = [(node, self.node_code_category_dict[node], dist, (float(
            dist) / len(wordlist))) for (node, dist) in node_distances if dist == mindist]
        if N > 0:
            return node_distances_named[:N]
        else:
            return mindist_nodes

    def get_all_categories_batch_by_folder(self, folder, csv=False):
        '''accepts name of folder containing only files'''
        import pandas as pd
        import numpy as np
        from os import listdir
        flist = listdir(folder)[1:]
        headlist = [x[:-4] for x in flist]
        with open(folder + flist[0], 'r') as thefile:
            reader = thefile.read()
            thingy = self.get_all_category_freqs(reader)
            indexlist = [x[0] for x in thingy]
        newarray = []
        for f in flist:
            with open(folder + f, 'r') as current_file:
                text = current_file.read()
            freqs = self.get_all_category_freqs(text)
            newarray.append([x[1] for x in freqs])
        nparray = np.array(newarray, dtype=int)
        nparray = np.transpose(nparray)
        df = pd.DataFrame(nparray, index=indexlist, columns=headlist)
        if csv:
            df.to_csv(folder + 'summary.csv')
        return df

# Module-level mapper instance used by the utility functions below; building
# it loads every pickled lookup table from `_file_path`.
r = PyRoget() # main code

## Utilities

In [11]:
!pip install datasets --quiet

In [12]:
def remove_punctuation(text):
  """Return `text` with every character of `string.punctuation` deleted."""
  return text.translate(str.maketrans('', '', string.punctuation))

In [13]:
def get_category_label(categories):
  """Return the label (first element) of each category tuple.

  `categories` is a list of (label, code) tuples or None; None is passed
  through unchanged so "no categories found" stays distinguishable from an
  empty list.
  """
  # Identity check: None is a singleton and `==` can be overridden.
  if categories is None:
    return categories
  return [category[0] for category in categories]

def get_rogets_categories_by_level(word_list, max_levels=5):
  """Map every word to its Roget category labels at each hierarchy level.

  Uses the module-level `r` mapper. For each word (lower-cased) the categories
  are looked up at levels 0 .. max_levels - 1.

  Args:
    word_list: iterable of words (strings).
    max_levels: number of hierarchy levels to collect per word (default 5).

  Returns:
    One list per word, each holding `max_levels` entries; an entry is a list
    of category labels, or None when nothing was found at that level.
  """
  rogets_categories_by_level = []

  for word in word_list:
    lowered = word.lower()  # hoisted: identical for every level
    categories_in_level = [
        get_category_label(r.categorize_words(lowered, level))
        for level in range(max_levels)
    ]
    rogets_categories_by_level.append(categories_in_level)

  return rogets_categories_by_level

In [14]:
def custom_flatten_array(array):
  """Flatten a four-level nested list into one flat list.

  The third-level entries may be None (no categories at that level); those
  are skipped. All other third-level entries are iterables whose items are
  appended in order.
  """
  flattened_array = []

  for a1 in array:
    for a2 in a1:
      for a3 in a2:
        # Identity check: None is a singleton and `==` can be overridden.
        if a3 is None:
          continue
        flattened_array.extend(a3)

  return flattened_array

## Implementation

In [6]:
# Using HC3
from google.colab import userdata
from datasets import load_dataset

# Load the 'all' configuration of the Hello-SimpleAI/HC3 dataset. The access
# token is read from the Colab secret named 'HC3'.
# NOTE(review): trust_remote_code=True presumably allows the dataset's own
# loading script to run here -- confirm the source is trusted.
dataset = load_dataset(
    "Hello-SimpleAI/HC3",
    name='all',
    token=userdata.get('HC3'),
    trust_remote_code=True,
    )

Downloading data:   0%|          | 0.00/39.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/24322 [00:00<?, ? examples/s]

In [7]:
# Display the train split's summary (feature names and row count).
dataset['train']

Dataset({
    features: ['id', 'question', 'human_answers', 'chatgpt_answers', 'source'],
    num_rows: 24322
})

In [8]:
# Counter for the optional early exit below (only used by the commented-out
# lines at the end of the loop).
break_at = 0

# questions: one entry per dataset row.
# *_word_corpora: one list of words per individual answer, so a row with
# several answers contributes several entries.
questions = []
human_answer_word_corpora = []
chatgpt_answer_word_corpora = []

# NOTE(review): human_len is not read anywhere else in this view of the
# notebook -- confirm whether it is still needed.
human_len = len(dataset['train']['human_answers'])
for datapoint in dataset['train']:
  question = datapoint['question']
  human_answers = datapoint['human_answers']
  chatgpt_answers = datapoint['chatgpt_answers']

  questions.append(question)

  # Strip punctuation, then split on whitespace to get the answer's words.
  for human_answer in human_answers:
    cleaned_answer = remove_punctuation(human_answer)
    words = cleaned_answer.split()
    human_answer_word_corpora.append(words)

  for chatgpt_answer in chatgpt_answers:
    cleaned_answer = remove_punctuation(chatgpt_answer)
    words = cleaned_answer.split()
    chatgpt_answer_word_corpora.append(words)

  # early break (uncomment to process only the first 5 rows while debugging)
  # break_at += 1
  # if break_at == 5:
  #   break

# Show the first question and the first tokenized answer of each kind.
print(questions[0])
print(human_answer_word_corpora[0])
print(chatgpt_answer_word_corpora[0])

Why is every book I hear about a " NY Times # 1 Best Seller " ? ELI5 : Why is every book I hear about a " NY Times # 1 Best Seller " ? Should n't there only be one " # 1 " best seller ? Please explain like I'm five.
['Basically', 'there', 'are', 'many', 'categories', 'of', 'Best', 'Seller', 'Replace', 'Best', 'Seller', 'by', 'something', 'like', 'Oscars', 'and', 'every', 'best', 'seller', 'book', 'is', 'basically', 'an', 'oscar', 'winning', 'book', 'May', 'not', 'have', 'won', 'the', 'Best', 'film', 'but', 'even', 'if', 'you', 'won', 'the', 'best', 'director', 'or', 'best', 'script', 'you', 're', 'still', 'an', 'oscar', 'winning', 'film', 'Same', 'thing', 'for', 'best', 'sellers', 'Also', 'IIRC', 'the', 'rankings', 'change', 'every', 'week', 'or', 'something', 'like', 'that', 'Some', 'you', 'might', 'not', 'be', 'best', 'seller', 'one', 'week', 'but', 'you', 'may', 'be', 'the', 'next', 'week', 'I', 'guess', 'even', 'if', 'you', 'do', 'nt', 'stay', 'there', 'for', 'long', 'you', 'still'

In [9]:
# Sanity check: number of questions and of tokenized human / ChatGPT answers.
print(len(questions))
print(len(human_answer_word_corpora))
print(len(chatgpt_answer_word_corpora))

24322
58546
26903


In [17]:
# One entry per answer: the per-word, per-level Roget category labels
# produced by get_rogets_categories_by_level.
human_answer_rogets_categories = []
chatgpt_answer_rogets_categories = []

print("questions:", len(questions))
print("human words:", len(human_answer_word_corpora))
print("chatgpt words:", len(chatgpt_answer_word_corpora))

# Map every human answer's word list to its Roget categories.
for human_answer_word_corpus in human_answer_word_corpora:
  human_answer_rogets_category = get_rogets_categories_by_level(human_answer_word_corpus)
  human_answer_rogets_categories.append(human_answer_rogets_category)

# Same mapping for every ChatGPT answer.
for chatgpt_answer_word_corpus in chatgpt_answer_word_corpora:
  chatgpt_answer_rogets_category = get_rogets_categories_by_level(chatgpt_answer_word_corpus)
  chatgpt_answer_rogets_categories.append(chatgpt_answer_rogets_category)

# Every answer must have produced exactly one mapping entry.
print(len(human_answer_rogets_categories), len(chatgpt_answer_rogets_categories))
if len(human_answer_word_corpora) == len(human_answer_rogets_categories) and len(chatgpt_answer_word_corpora) == len(chatgpt_answer_rogets_categories):
  print("Successfully mapped")

questions: 24322
human words: 58546
chatgpt words: 26903
58546 26903
Successfully mapped


In [18]:
print(len(human_answer_rogets_categories), len(chatgpt_answer_rogets_categories))

# Each *_rogets_categories list holds one entry per answer. An entry holds one item per word, and each item is a list of 5 elements (Roget levels 0-4), each either a list of category labels or None.

58546 26903


In [21]:
import numpy as np
import math

def split_array_into_chunks(arr, num_chunks):
    """Split a sequence into consecutive chunks of near-equal size.

    The chunk size is ``ceil(len(arr) / num_chunks)``, so the result holds at
    most ``num_chunks`` chunks (possibly fewer) and only the last chunk may be
    shorter than the others.

    Args:
        arr: sliceable sequence to split.
        num_chunks: desired number of chunks; must be at least 1.

    Returns:
        List of slices of ``arr``; an empty list for empty input.

    Raises:
        ValueError: if ``num_chunks`` is smaller than 1.
    """
    if num_chunks < 1:
        raise ValueError("num_chunks must be a positive integer")
    if len(arr) == 0:
        # A chunk size of 0 would make range() raise "arg 3 must not be zero".
        return []
    chunk_size = math.ceil(len(arr) / num_chunks)
    return [arr[i:i + chunk_size] for i in range(0, len(arr), chunk_size)]

import json

def save_list_chunks_json(list_data, num_chunks, base_filename):
    """Write ``list_data`` to disk as a series of JSON chunk files.

    The data is split with ``split_array_into_chunks`` and chunk ``i`` is
    written to ``<base_filename>_chunk_<i>.json``; a confirmation line is
    printed for every file.
    """
    pieces = split_array_into_chunks(list_data, num_chunks)
    for index, piece in enumerate(pieces):
        target = f"{base_filename}_chunk_{index}.json"
        with open(target, 'w', encoding='utf-8') as out:
            json.dump(piece, out)
        print(f"Saved chunk {index} to {target}")


# Persist the human-answer mapping as up to 10 JSON files named
# chunks_chunk_<i>.json.
# NOTE(review): only the human-answer results are written by this cell; the
# ChatGPT results are not saved here.
large_array = human_answer_rogets_categories

save_list_chunks_json(large_array, 10, "chunks")


Saved chunk 0 to chunks_chunk_0.json
Saved chunk 1 to chunks_chunk_1.json
Saved chunk 2 to chunks_chunk_2.json
Saved chunk 3 to chunks_chunk_3.json
Saved chunk 4 to chunks_chunk_4.json
Saved chunk 5 to chunks_chunk_5.json
Saved chunk 6 to chunks_chunk_6.json
Saved chunk 7 to chunks_chunk_7.json
Saved chunk 8 to chunks_chunk_8.json
Saved chunk 9 to chunks_chunk_9.json


In [27]:
import gc

# Release large objects that are no longer needed. The commented-out `del`
# lines are optional candidates; only `dataset` is actually deleted here.
# NOTE: human_answer_rogets_categories and chatgpt_answer_rogets_categories
# are still read by a later cell in this file.
# del human_answer_rogets_categories
# del chatgpt_answer_rogets_categories
# del human_answer_word_corpora
# del chatgpt_answer_word_corpora
# del human_answers
# del chatgpt_answers
del dataset

# Run a garbage-collection pass; the cell output is gc.collect()'s return value.
gc.collect()


136

In [22]:
import json

def read_list_chunks_json(num_of_chunk, base_filename):
    """Load one chunk file written by the chunked JSON export.

    Reads ``<base_filename>_chunk_<num_of_chunk>.json`` and returns its
    top-level items as a new list.
    """
    source = f"{base_filename}_chunk_{num_of_chunk}.json"
    with open(source, 'r', encoding='utf-8') as handle:
        return list(json.load(handle))

# Usage: load only the first saved chunk (chunk 0) back into memory.
reconstructed_list = read_list_chunks_json(0, "chunks")


In [32]:
# One bucket per position in a word's per-level category list
# (position 0 -> level_one, ..., position 4 -> level_five).
level_one, level_two, level_three, level_four, level_five = [], [], [], [], []
_level_buckets = (level_one, level_two, level_three, level_four, level_five)

for word_array in reconstructed_list:
  for cat in word_array:
    for i, labels in enumerate(cat):
      # Skip levels without labels and any position beyond the fifth.
      if labels is None or i >= len(_level_buckets):
        continue
      _level_buckets[i].append(labels)




In [33]:
# Spot-check: print the second entry collected in each level bucket.
print(level_one[1])
print(level_two[1])
print(level_three[1])
print(level_four[1])
print(level_five[1])

['GREATNESS', 'MULTITUDE']
['QUANTITY BY COMPARISON WITH A STANDARD', 'INDETERMINATE NUMBER']
['COMPARATIVE QUANTITY', 'NUMBER']
['QUANTITY', 'WORDS EXPRESSING ABSTRACT RELATIONS']
['WORDS EXPRESSING ABSTRACT RELATIONS', 'WORDS EXPRESSING ABSTRACT RELATIONS']


In [34]:
import pandas as pd
from collections import Counter
from itertools import chain

def count_unique_words_2d(arrays_of_strings):
    """Count whitespace-separated words across a 2D collection of strings.

    Returns a DataFrame with the columns 'Word' and 'Count', ordered by
    descending count and re-indexed from 0.
    """
    # Join all strings of all rows with spaces, then split into words.
    tokens = ' '.join(chain.from_iterable(arrays_of_strings)).split()
    tally = Counter(tokens)
    table = pd.DataFrame(tally.items(), columns=['Word', 'Count'])
    return table.sort_values(by='Count', ascending=False).reset_index(drop=True)

In [None]:
# Flatten the nested per-answer / per-word / per-level label lists into one
# flat list of category labels for each source.
concatenated_human_answer_rogets_categories = custom_flatten_array(human_answer_rogets_categories)
concatenated_chatgpt_answer_rogets_categories = custom_flatten_array(chatgpt_answer_rogets_categories)

In [None]:
import numpy as np
import pandas as pd

# Frequency of every category label across all human answers.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
pd.Series(concatenated_human_answer_rogets_categories).value_counts()