#### Hella Swag
#### Research paper: 
#### Copy of data: 

In [47]:
#!pip install import-ipynb

In [48]:
import sys
sys.path.append('../Modules')
sys.path.append('../Modules/Processors from Prof')

from Packages import *
import contextlib


# Import Jupyter Notebook files
with contextlib.redirect_stdout(None):
    #ToDo import the correct data processor: JSON or CSV
    from ipynb.fs.full.Json_Processor import *
    from ipynb.fs.full.CSV_Processor import *
    from ipynb.fs.full.Reading_Level import *
    from ipynb.fs.full.Word_Processing import *
    from ipynb.fs.full.Utilities import *
    from Wordcloud import *

In [49]:
def get_random_question(questions):
    '''
    Pick a single question at random.
    @param questions list: a non-empty list of dictionaries, each a question.
    @return: The randomly selected question as a dictionary
    '''
    selected = random.choice(questions)
    return selected

In [50]:
def print_question(question):
    '''
    Write a question's input and target to standard output, one per line.
    @param question dictionary: The question to be printed; must have the keys "input" and "target"
    @return: None
    '''
    # (label, key) pairs in the order they are printed
    fields = (("Input:", "input"), ("Target:", "target"))
    for label, key in fields:
        print(label, question[key])

In [51]:
def get_json_files(path):
    '''
    List the JSON files found at a location.
    @param path string: The location handed to read_directory_contents()
    @return list: The entries returned by read_directory_contents() whose names end with ".json"
    '''
    # Match the full ".json" extension: a bare "json" suffix would also accept
    # names such as "notjson" that are not JSON files.
    return [name for name in read_directory_contents(path) if name.endswith(".json")]

In [52]:
def load_questions(benchmark_name, question_path):
    '''
    Load the questions from the JSON file(s) in a benchmark's question directory.
    @param benchmark_name string: Name of the benchmark; used as the leading directory of the question location
    @param question_path string: The location of the question file(s), appended to the benchmark name
    @return dictionary: key "questions" is whatever the processor's read_json() returns (presumably a list of dictionaries, one per question). Key "json_files" is a list of the file names that were processed
    '''
    # Both the file listing and the processor work from the same directory
    question_dir = benchmark_name + "/" + question_path
    json_files = get_json_files(question_dir)

    # The output file name handed to the processor is fixed
    processor = Big_Bench_Json_Processor(question_dir, json_files, 'output.json')
    return {
        "questions": processor.read_json(),
        "json_files": json_files,
    }

In [53]:
def build_text_from_questions(questions, write_to = None):
    '''
    Build one long text string from all the questions.
    Each question contributes " <input> <target>", so the result starts with a space
    whenever there is at least one question and is empty when there are none.
    This logic will vary based on the architecture of the benchmark questions.
    @param questions list: list of dictionaries, one dictionary per question. Each needs the keys "input" and "target"; the values are converted with str().
    @param write_to string: File path to write the text string. Default is None, which skips writing.
    @return String: The text string
    '''
    # A single join is linear in the total text size; growing the string with
    # += inside a loop can degrade to quadratic time on large benchmarks.
    text = "".join(
        " " + str(question["input"]) + " " + str(question["target"])
        for question in questions
    )
    if write_to is not None:
        write_string_to_file(text, write_to)

    return text

In [54]:
def compute_readability_indices(benchmark_name, questions, verbose = True):
    '''
    Compute readability values for a benchmark using our Reading_Level class
    @param benchmark_name string: Name of the benchmark; forwarded to Reading_Level.compute_readability_indices()
    @param questions list: A list of dictionaries, one dictionary per benchmark question
    @param verbose bool: True if this function should print the computed values (two decimal places)
    @return dictionary: key is the readability metric, value is the corresponding score
    '''
    indices = Reading_Level.compute_readability_indices(
        benchmark_name, build_text_from_questions(questions)
    )

    # Nothing to report: hand the scores straight back
    if not verbose:
        return indices

    print("Readability Indices:")
    for metric, value in indices.items():
        print("{}: {:.2f}".format(metric, value))
    return indices

In [55]:
def generate_wordcloud(benchmark_name, questions):
    '''
    Build a word cloud from the combined text of a benchmark's questions
    @param benchmark_name string: Name of the benchmark; forwarded to Wordcloud.generate01()
    @param questions list: List of dictionaries, each dictionary is a question from the benchmark
    @return None
    '''
    # Words handed to the word cloud as its myStopwords argument
    extra_stopwords = {"Numerical", "options"}
    corpus = build_text_from_questions(questions)

    Wordcloud().generate01(benchmark_name, corpus, myStopwords=extra_stopwords)

In [56]:
def compute_word_frequency(benchmark_name, questions, verbose = True, min_percentage = 1.0):
    '''
    Compute the word frequencies in the benchmark questions and write them to
    <benchmark_name>/results/word_frequency.txt
    @param benchmark_name string: Name of the benchmark; selects the directory the results file is written under
    @param questions list: List of dictionaries, each dictionary is a question from the benchmark
    @param verbose bool: True if the function should print the word frequencies
    @param min_percentage float: the smallest percentage that should be printed. Defaults to 1.0
    @return tuple (dictionary, float): ({word:# of times that word appears over all questions}, total words). The dictionary is ordered from most to least frequent.
    '''
    import os  # local import so this function does not depend on the star-imports above

    text = build_text_from_questions(questions)

    word_frequency, count = Word_Processing.compute_word_frequency(text)
    # Most frequent words first; a dict keeps this insertion order
    sorted_word_frequency = dict(sorted(word_frequency.items(), key=lambda item: item[1], reverse = True))
    count = float(count)

    # Build the path with os.path.join: a hard-coded ".\\...\\" path only works
    # on Windows and becomes a single odd file name elsewhere.
    output_path = os.path.join(".", benchmark_name, "results", "word_frequency.txt")
    write_dict_to_file(sorted_word_frequency, output_path, length = 100, denominator = count)
    if verbose:
        print("Word Frequency:")
        for key, occurrences in sorted_word_frequency.items():
            percentage = (occurrences / count) * 100
            if percentage >= min_percentage:
                print(key, ":", occurrences, ",", '{0:.2f}'.format(percentage))

    return (sorted_word_frequency, count)

In [57]:
def compute_longest_words(benchmark_name, questions, verbose = True):
    '''
    Compute the longest words appearing across all the questions and write them to
    <benchmark_name>/results/longest_words.txt
    @param benchmark_name string: Name of the benchmark; selects the directory the results file is written under
    @param questions list: List of dictionaries, each dictionary is a question from the benchmark
    @param verbose bool: True if the function should print the 25 longest words
    @return tuple (dictionary, float): (the dictionary returned by Word_Processing.compute_longest_words() reordered from longest to shortest word, the count it returned as a float)
    '''
    import os  # local import so this function does not depend on the star-imports above

    text = build_text_from_questions(questions)

    word_length, count = Word_Processing.compute_longest_words(text, min_length = 12)
    # Order by the length of the word itself (the key), longest first
    sorted_word_lengths = dict(sorted(word_length.items(), key=lambda item: len(item[0]), reverse = True))
    count = float(count)
    if verbose:
        print("Longest Words:")
        for key in list(sorted_word_lengths)[0:25]:
            value = sorted_word_lengths[key]
            print(len(key), ", ", key, ":", value, ",", '{0:.2f}'.format((value / count) * 100))

    # Write the longest words to a text file. os.path.join keeps the path valid
    # on every OS; a hard-coded ".\\...\\" path only works on Windows.
    output_path = os.path.join(".", benchmark_name, "results", "longest_words.txt")
    write_dict_keys_to_file(sorted_word_lengths, output_path, 100)

    return (sorted_word_lengths, count)

In [58]:
def find_word(questions, word):
    '''
    Thin wrapper that delegates to Word_Processing.find_word().
    @param questions: The benchmark questions to search; passed through unchanged
    @param word: The word to look for; passed through unchanged
    @return: Whatever Word_Processing.find_word() returns (presumably the questions containing the word -- TODO confirm)
    '''
    return Word_Processing.find_word(questions, word)

In [59]:
def find_missing_words(benchmark_name, questions, verbose = True, very_verbose = False):
    """
    Find words not in the English dictionary. Numbers are ignored in this function.
    Two files are written under <benchmark_name>/results/: questions_as_text.txt
    (the combined question text) and words_not_in_dictionary.txt (the missing words).
    @param benchmark_name string: Name of the benchmark; selects the directory the result files are written under
    @param questions list: The questions to be processed, one dictionary per question
    @param verbose bool: If true, print the number of missing words and the first 10 of them. Default to True
    @param very_verbose bool: If true, print the questions containing the first 10 words that were not found in the dictionary. Default to False.

    @return dictionary: The unique words. key and value are both the unique word
    """
    import os  # local imports so this function does not depend on the star-imports above
    from itertools import islice

    # os.path.join keeps the paths valid on every OS; hard-coded ".\\...\\"
    # paths only work on Windows.
    results_dir = os.path.join(".", benchmark_name, "results")

    text = build_text_from_questions(questions, os.path.join(results_dir, "questions_as_text.txt"))
    print("Text built...")
    english = Word_Processing.load_dictionary()
    print("Dictionary loaded...")

    words = Word_Processing.split_text(text)
    print("Text split...")
    words_not_found = dict()
    print("Processing word list...")
    for word in words:
        try:
            # Anything float() can parse is a number and is skipped.
            float(word)
        except ValueError:
            # Catch only the parse failure: a bare except would also swallow
            # KeyboardInterrupt and make this long loop impossible to cancel.
            # The dictionary lookup is done in upper case.
            if word.upper() not in english:
                words_not_found[word] = word
    # Write all the missing words to a text file
    write_dict_keys_to_file(
        words_not_found,
        os.path.join(results_dir, "words_not_in_dictionary.txt"),
        length = len(words_not_found),
    )

    if verbose:
        print(len(words_not_found), "words not in dictionary")
        print("First 10 words not in dictionary:")
        for key in islice(words_not_found, 10):
            print(key)
    if very_verbose:
        print("First 10 words not in dictionary and the questions those words appear in:")
        for key in islice(words_not_found, 10):
            print(key)
            print(key , "found in", find_word(questions, key))

    return words_not_found

In [60]:
def demo():
    '''
    Run every analysis in this notebook against the HellaSwag training questions.
    Reads HellaSwag/data/hellaswag_train.json, then reports missing words, one random
    question, readability indices, a word cloud, word frequencies and the longest words.
    @return: None
    '''
    benchmark_name = "HellaSwag"
    input_file_path = 'HellaSwag/data/hellaswag_train.json'
    output_file_path = 'HellaSwag/data/output.json'

    json_processor = HellaSwag_Json_Processor(input_file_path, output_file_path)
    questions = json_processor.read_data()

    # Only one file is read here. The earlier second summary line used an
    # undefined name (json_files) and raised NameError, so it has been removed.
    print(len(questions), "questions read from", input_file_path)

    random_question = get_random_question(questions)

    # Called for its printed report and the files it writes; the return value is not needed here
    find_missing_words(benchmark_name, questions)

    print("Random question:")
    print_question(random_question)

    compute_readability_indices(benchmark_name, questions)

    generate_wordcloud(benchmark_name, questions)

    compute_word_frequency(benchmark_name, questions)

    compute_longest_words(benchmark_name, questions)

In [61]:
# Run the full HellaSwag analysis when this cell executes.
demo()

FileNotFoundError: [Errno 2] No such file or directory: 'HellaSwag/data/hellaswag_train.jsonH'

In [None]:
# if __name__ == "__main__":
#     demo()

FileNotFoundError: [Errno 2] No such file or directory: 'data/hellaswag_train.json'