#### HellaSwag

In [1]:
#!pip install import-ipynb

In [2]:
import sys
sys.path.append('../Modules')
sys.path.append('../Modules/Processors from Prof')

from Packages import *
import contextlib


# Import Jupyter Notebook files
with contextlib.redirect_stdout(None):
    #ToDo import the correct data processor: JSON or CSV
    from ipynb.fs.full.Json_Processor import *
    from ipynb.fs.full.CSV_Processor import *
    from ipynb.fs.full.Reading_Level import *
    from ipynb.fs.full.Word_Processing import *
    from ipynb.fs.full.Utilities import *
    from Wordcloud import *

In [3]:
def analyze_questions(questions):
    '''
    Summarize a list of questions.
    @param questions list: a list of dictionaries, each a question with an "activity_label" key.
    @return dictionary: {"activity_labels": set of the distinct activity labels seen}
    '''
    # A set comprehension de-duplicates the labels in a single pass.
    distinct_labels = {entry["activity_label"] for entry in questions}
    return {"activity_labels": distinct_labels}

In [4]:
def get_random_question(questions, source_id = None):
    '''
    Extract a random question 
    @param questions list: a list of dictionaries, each a question. 
    @param source_id str: text that must occur in the question's "source_id" value. Defaults to None for don't care.  
    @return dictionary: The random question as a dictionary
    @raise ValueError: if source_id is given but no question's "source_id" contains it
    '''
    if source_id is None:
        return random.choice(questions)

    # Filter up front rather than re-drawing until a match appears: a retry loop
    # never terminates when no question matches the requested source.
    candidates = [question for question in questions if source_id in question["source_id"]]
    if not candidates:
        raise ValueError(f"No question has a source_id containing {source_id!r}")
    return random.choice(candidates)

In [5]:
def print_question(question):
    '''
    Print a question
    @param question dictionary: The question to be printed. Must contain the keys
                                "activity_label", "ctx_a", "ctx_b", "source_id" and "endings";
                                "label" is optional.
    @return: None
    '''
    print("Activity Label:", question["activity_label"])
    print("Full Context:", question["ctx_a"])
    print("Source_id:", question["source_id"])
    # Only a missing "label" key is expected here; any other error should surface
    # instead of being reported as "not provided".
    try:
        print("Label (correct answer, zero-based):", question["label"])
    except KeyError:
        print("Label: not provided, probably because this is the test dataset")
    print("Endings:")
    # Each candidate is shown as ctx_b followed by the ending, prefixed with its zero-based index.
    for idx, ending in enumerate(question["endings"]):
        print(idx, ":", question["ctx_b"] + " " + ending)
    
    #for key in question.keys():
    #    print(key,":",question[key])

In [6]:
def build_text_from_questions(questions):
    '''
    Build one long text string from all the questions.
    This logic will vary based on the architecture of the benchmark questions.
    @param questions list: list of dictionaries, one dictionary per question. Each must
                           provide the keys "ctx_a", "ctx_b" and "endings".
    @return String: The text string. Every piece is preceded by a single space, so a
                    non-empty result starts with a space; an empty list gives "".
    '''

    # dict_keys(['ind', 'activity_label', 'ctx_a', 'ctx_b', 'ctx', 'split', 'split_type', 'label', 'endings', 'source_id'])
    # Collect the pieces and join once at the end; appending to one growing string
    # re-copies the accumulated text on every step.
    pieces = []
    # Contrive the prompt as a complete sentence and each of the possible completions as a complete sentence.
    for question in questions:
        pieces.append(" " + str(question["ctx_a"]))
        for ending in question["endings"]:
            pieces.append(" " + question["ctx_b"] + " " + ending)
    return "".join(pieces)

In [7]:
def compute_readability_indices(questions, verbose = True):
    '''
    Score the readability of a benchmark with our Reading_Level class
    @param questions list: A list of dictionaries, one dictionary per benchmark question
    @param verbose bool: True if this function should print the computed values
    @return dictionary: key is the readability metric, value is the corresponding score
    '''
    # All questions are flattened into one text before scoring.
    corpus = build_text_from_questions(questions)
    scores = Reading_Level.compute_readability_indices(corpus)

    if not verbose:
        return scores

    print("Readability Indices:")
    for metric, value in scores.items():
        print(f"{metric}: {value:.2f}")
    return scores

In [8]:
def generate_wordcloud(questions):
    '''
    Render a word cloud from the questions in a benchmark
    @param questions list: List of dictionaries, each dictionary is a question from the benchmark
    @return None
    '''
    # Words listed here are passed to the word cloud as extra stop words.
    extra_stopwords = {"Numerical", "options"}
    corpus = build_text_from_questions(questions)

    cloud = Wordcloud()
    cloud.generate01(corpus, myStopwords=extra_stopwords)

In [9]:
def compute_word_frequency(questions, verbose = True, min_percentage = 1.0):
    '''
    Count how often each word occurs in the benchmark questions
    @param questions list: List of dictionaries, each dictionary is a question from the benchmark
    @param verbose bool: True if the function should print the word frequencies
    @param min_percentage float: the smallest percentage that should be printed. Defaults to 1.0
    @return tuple (dictionary, float): ({word:# of times that word appears over all questions}, total words)
                                       The dictionary is ordered from most to least frequent.
    '''
    corpus = build_text_from_questions(questions)
    raw_frequency, total = Word_Processing.compute_word_frequency(corpus)

    # Most frequent words first.
    ranked_pairs = sorted(raw_frequency.items(), key=lambda pair: pair[1], reverse = True)
    sorted_word_frequency = dict(ranked_pairs)
    total = float(total)

    if verbose:
        print("Word Frequency:")
        for word, occurrences in sorted_word_frequency.items():
            share = (occurrences / total)* 100
            # Only words making up at least min_percentage of the text are listed.
            if share >= min_percentage:
                print(word, ":", occurrences, ",", f"{share:.2f}")

    return (sorted_word_frequency, total)

In [10]:
def compute_longest_words(questions, verbose = True, min_length = 12, max_printed = 25):
    '''
    Compute the longest words appearing across all the questions
    @param questions list: List of dictionaries, each dictionary is a question from the benchmark
    @param verbose bool: True if the function should print the words
    @param min_length int: minimum word length handed to Word_Processing.compute_longest_words. Defaults to 12
    @param max_printed int: how many of the longest words are printed when verbose. Defaults to 25
    @return tuple (dictionary, float): (the per-word values reported by Word_Processing.compute_longest_words,
                                       ordered longest word first; the count it reported, as a float).
                                       The per-word values are presumably occurrence counts - confirm against Word_Processing.
    '''
    text = build_text_from_questions(questions)

    word_length, count = Word_Processing.compute_longest_words(text, min_length = min_length)
    # Order by the length of the word itself (the key), longest first.
    sorted_word_lengths = dict(sorted(word_length.items(), key=lambda item: len(item[0]), reverse = True))
    count = float(count)
    if verbose:
        print("Longest Words:")
        for key in list(sorted_word_lengths)[:max_printed]:
            value = sorted_word_lengths[key]
            # Columns: word length, word, its value, and that value as a percentage of count.
            print(len(key), ", ", key, ":", value, ",", '{0:.2f}'.format((value / count)* 100))

    return (sorted_word_lengths, count)

In [11]:
def find_word(questions, word):
    '''
    Thin wrapper that forwards to Word_Processing.find_word.
    @param questions: the benchmark questions to search (presumably the list of question dictionaries - confirm against Word_Processing)
    @param word: the word to look for
    @return: whatever Word_Processing.find_word returns, unchanged
    '''
    return Word_Processing.find_word(questions, word)

In [23]:
def find_missing_words(benchmark_name, questions, verbose = True, very_verbose = False):
    """
    Find words not in the English dictionary. Numbers are ignored in this function.
    Every missing word is also written to <benchmark_name>/results/words_not_in_dictionary.txt,
    relative to the current working directory.
    @param benchmark_name str: Name of the benchmark; used as the folder name of the results file
    @param questions list: The data to be processed, one dictionary per question
    @param verbose bool: If true, print some information about the first 10 words not found in the dictionary. Default to True
    @param very_verbose bool: If true, print the questions containing the first 10 words that were not found in the dictionary. Default to False.

    @return dictionary: The unique words. key and value are both the unique word
    """
    # Local import so the function does not rely on `os` arriving through a star import.
    import os

    text = build_text_from_questions(questions)
    print("Text built...")
    english = Word_Processing.load_dictionary()
    print("Dictionary loaded...")

    words = Word_Processing.split_text(text)
    print("Text split...")
    words_not_found = dict()
    print("Processing word list...")
    for word in words:
        try:
            # Anything float() can parse counts as a number and is skipped.
            # NOTE: float() also accepts "nan", "inf" and "infinity", so those words are skipped too.
            float(word)
        except ValueError:
            # Not a number: the dictionary is looked up with the upper-cased word.
            if word.upper() not in english:
                words_not_found[word] = word

    # Write all the missing words to a text file.
    # os.path.join picks the separator of the running platform instead of hard-coding backslashes.
    output_path = os.path.join(".", benchmark_name, "results", "words_not_in_dictionary.txt")
    write_dict_keys_to_file(words_not_found, output_path, length = len(words_not_found))

    # Both verbose modes report the same first 10 words, in insertion order.
    first_missing = list(words_not_found)[:10]

    if verbose:
        print(len(words_not_found), "words not in dictionary")
        print("First 10 words not in dictionary:")
        for key in first_missing:
            print(key)
    if very_verbose:
        print("First 10 words not in dictionary and the questions those words appear in:")
        for key in first_missing:
            print(key)
            print(key , "found in", find_word(questions, key))

    return words_not_found

In [32]:
def demo():
    '''
    Run the whole HellaSwag analysis: read the questions, report words missing from
    the dictionary, print one random question, then compute the readability indices,
    the word cloud, the word frequencies and the longest words.
    @return None
    '''
    benchmark_name = "HellaSwag"
    input_files = ['hellaswag_train.json']

    json_processor = HellaSwag_Json_Processor(benchmark_name, input_files)
    questions = json_processor.read_data()
    # NOTE(review): the helpers below index each question with the raw HellaSwag keys
    # ("ctx_a", "ctx_b", "endings", ...). Confirm that read_data() returns records in
    # that shape; otherwise they fail with a KeyError.

    random_question = get_random_question(questions)

    # The returned dictionary is not needed here; the function prints a summary and writes the words to disk.
    find_missing_words(benchmark_name, questions)

    print("Random question:")
    print_question(random_question)

    # These helpers take the question list as their first argument; only
    # find_missing_words needs the benchmark name.
    compute_readability_indices(questions)

    generate_wordcloud(questions)

    compute_word_frequency(questions)

    compute_longest_words(questions)

In [33]:
# Run the demo pipeline when this cell is executed.
demo()

KeyError: 'ctx_a'

In [None]:
# if __name__ == "__main__":
#     demo()

In [34]:
# Inspect the first record.
# NOTE(review): my_questions is not assigned in any of the cells visible here - confirm where it is defined.
my_questions[0]

{'input': 'Then, the man writes over the snow covering the window of a car, and a woman wearing winter clothes smiles.',
 'topic': 'Removing ice from car',
 'target': 'then , the man adds wax to the windshield and cuts it.\nthen , a person board a ski lift, while two men supporting the head of the person wearing winter clothes snow as the we girls sled.\nthen , the man puts on a christmas coat, knitted with netting.\nthen , the man continues removing the snow on his car.\n'}