In [None]:
#| default_exp utilities

In [None]:
#|export

def find_first_sublist_index(lst, sublist):
    """
    Locate the earliest position at which `sublist` appears contiguously inside `lst`.

    Args:
        lst (list): The sequence being scanned.
        sublist (list): The contiguous run of elements to look for.

    Returns:
        index (int): Start position of the first match. -1 is returned when there is no
            match, when `sublist` is empty, or when `sublist` is longer than `lst`.
    """
    width = len(sublist)
    # An empty pattern counts as "not found" rather than matching at position 0.
    if not width or len(lst) < width:
        return -1

    last_start = len(lst) - width
    matching_starts = (
        start
        for start in range(last_start + 1)
        if lst[start:start + width] == sublist
    )
    # The generator is lazy, so scanning stops at the first hit.
    return next(matching_starts, -1)

def find_sublist_indices(lst, sublist):
    """
    Collect every position at which `sublist` appears contiguously inside `lst`.

    Overlapping occurrences are all reported.

    Args:
        lst (list): The sequence being scanned.
        sublist (list): The contiguous run of elements to look for.

    Returns:
        indices (list): Start positions of all matches in ascending order. The list is
            empty when there is no match, when `sublist` is empty, or when `sublist`
            is longer than `lst`.
    """
    width = len(sublist)
    # An empty pattern yields no matches instead of matching at every position.
    if not width or len(lst) < width:
        return []

    candidate_starts = range(len(lst) - width + 1)
    return [
        start
        for start in candidate_starts
        if lst[start:start + width] == sublist
    ]

In [None]:
#| export

from IPython.display import display, Markdown

def generate_markdown_table(data, column_names, skip_empty_rows=False):
    """
    Render column-oriented data as a markdown table.

    Args:
        data (list): One sequence of cell values per column; `data[j]` supplies the cells
            shown under `column_names[j]`. Columns may differ in length; shorter columns
            are padded with blank cells. An empty `data` produces a table with only the
            header and separator rows.
        column_names (list): Header label for each column. Its length determines how many
            columns are rendered, so `data` must provide at least that many columns
            whenever there is at least one row to render.
        skip_empty_rows (bool): When True, omit every row that contains at least one empty
            cell (a value equal to "" or a padded cell), not only rows that are
            entirely empty.

    Returns:
        Markdown: The table wrapped in `IPython.display.Markdown` so it renders in a notebook.
    """
    num_columns = len(column_names)

    header_row = "|" + "".join(f" {name} |" for name in column_names)
    separator_row = "|" + " --- |" * num_columns
    lines = [header_row, separator_row]

    # `default=0` keeps an empty `data` from raising ValueError inside max().
    num_rows = max((len(column) for column in data), default=0)
    for i in range(num_rows):
        cells = []
        has_empty_entry = False
        for j in range(num_columns):
            if i < len(data[j]):
                value = data[j][i]
                cells.append(f" {value} |")
                if value == "":
                    has_empty_entry = True
            else:
                # This column ran out of values: pad with a blank cell.
                cells.append("   |")
                has_empty_entry = True

        if skip_empty_rows and has_empty_entry:
            continue
        lines.append("|" + "".join(cells))

    return Markdown("".join(f"{line}\n" for line in lines))

In [None]:
# Demo fixture for generate_markdown_table.
# `response_units` holds 14 entries, one of which is an empty string.
response_units = [
    "I think one meaning is the part of the day between morning and evening",
    "The metal device used to secure a door",
    "To feel a strong desire for something",
        "All cats are mammals.",
    "The sun sets in the west.",
    "She received a brand new bicycle for her birthday.",
    "He is a software engineer.",
    "",
    "The Earth revolves around the sun.",
        "All dogs are mammals.",
    "The cat is on the roof.",
    "She enjoys reading mystery novels.",
    "The earth is flat.",
    "Tom is taking a nap.",
]

# `target_items` holds 15 entries and ends with an empty string, so it is one entry
# longer than `response_units`.
target_items = [
    "a period of 24 hours, especially from twelve o'clock one night to twelve o'clock the next night",
    "a part of the day when it is light, between the time when the sun rises and the time when the sun sets",
    "a metal object that you put into a hole on a door and turn to lock it",
    "to want something very much",
        "Mammals include cats.",
    "The sun rises in the east.",
    "She got a new bike on her birthday.",
    "He works as a software developer.",
    "The sun is orbited by the Earth.",
        "Dogs are a type of mammal.",
    "The cat is resting on top of the building.",
    "She likes to read books about solving crimes.",
    "The earth is round.",
    "Tom is currently asleep.",
    ""
]

# Render both lists side by side. The shorter column is padded with a blank cell, and
# rows containing empty cells are kept because skip_empty_rows=False.
generate_markdown_table([response_units, target_items], ["Response Units", "Target Items"], skip_empty_rows=False)

| Response Units | Target Items |
| --- | --- |
| I think one meaning is the part of the day between morning and evening | a period of 24 hours, especially from twelve o'clock one night to twelve o'clock the next night |
| The metal device used to secure a door | a part of the day when it is light, between the time when the sun rises and the time when the sun sets |
| To feel a strong desire for something | a metal object that you put into a hole on a door and turn to lock it |
| All cats are mammals. | to want something very much |
| The sun sets in the west. | Mammals include cats. |
| She received a brand new bicycle for her birthday. | The sun rises in the east. |
| He is a software engineer. | She got a new bike on her birthday. |
|  | He works as a software developer. |
| The Earth revolves around the sun. | The sun is orbited by the Earth. |
| All dogs are mammals. | Dogs are a type of mammal. |
| The cat is on the roof. | The cat is resting on top of the building. |
| She enjoys reading mystery novels. | She likes to read books about solving crimes. |
| The earth is flat. | The earth is round. |
| Tom is taking a nap. | Tom is currently asleep. |
|   |  |


In [None]:
#| export

def invert_match_mapping(original_keys, target_keys, matching):
    """
    Inverts the match mapping between original_keys and target_keys.

    Given a matching represented as a list of target keys corresponding to each original key,
    this function inverts the mapping and creates a list of original keys corresponding to each target key.
    Unmatched target keys are represented by empty strings.

    Args:
        original_keys (List[str]): A list of original keys (e.g., response_units).
        target_keys (List[str]): A list of target keys (e.g., target_items).
        matching (List[str]): A list of matched target keys, one for each original key, aligned
            position-by-position with original_keys. Use empty strings for unmatched original keys.
            If original_keys and matching differ in length, the extra entries of the longer
            one are ignored.

    Returns:
        List[str]: A new list with the same length and order as target_keys, where each position
                   contains the original key matched to that target key.
                   Unmatched target keys are represented by empty strings. When several original
                   keys are matched to the same target key, the last one wins. Matched keys that
                   do not appear in target_keys are ignored.
    """
    inverted_mapping = {target_key: "" for target_key in target_keys}
    for original_key, matched_target_key in zip(original_keys, matching):
        # Only record matches that name a known target key. Recording unknown keys
        # would introduce entries with no position in target_keys.
        if matched_target_key and matched_target_key in inverted_mapping:
            inverted_mapping[matched_target_key] = original_key

    # Index by target_keys rather than by the dict so the result always lines up
    # one-to-one with target_keys, even when target_keys contains duplicates.
    return [inverted_mapping[target_key] for target_key in target_keys]

In [None]:
#|export
import numpy as np
from typing import List

def items_by_ordering(items: List[str], ordering: np.ndarray) -> List[str]:
    """
    Reorders a list of items based on the provided ordering.

    Args:
        items (List[str]): A list of items to be reordered.
        ordering (np.ndarray): A numpy array (or list) with the same length as items, where the
                               value at position k is the output position assigned to items[k]
                               (e.g., the index of the response unit that item is matched to).
                               Use np.nan for items to be excluded.

    Returns:
        List[str]: The included items sorted by their assigned position. Positions that no item
                   was assigned to, and items that are empty strings, are omitted. If two items
                   share a position, the later one in `items` replaces the earlier one.
                   An empty list is returned when `ordering` is empty or all NaN.
    """
    ordering = np.asarray(ordering, dtype=float)
    included = ~np.isnan(ordering)

    # With nothing to place, np.nanmax would return NaN (or raise on an empty array)
    # and int(NaN) would fail, so handle that case up front.
    if not included.any():
        return []

    ordered_items = [""] * (int(ordering[included].max()) + 1)
    for target_index, response_index in enumerate(ordering):
        if not np.isnan(response_index):
            ordered_items[int(response_index)] = items[target_index]

    # Slots left at "" are unassigned positions; drop them.
    return [each for each in ordered_items if each]

In [None]:
# Target item matched to each response unit, aligned position-by-position with
# `response_units`. Per invert_match_mapping's contract, "" marks a response unit
# with no match (here, the blank response unit at index 7). Without that placeholder,
# zip() pairs every later response unit with the wrong target item and drops the last one.
matched_items = ["a period of 24 hours, especially from twelve o'clock one night to twelve o'clock the next night",
 'a metal object that you put into a hole on a door and turn to lock it',
 'to want something very much',
 'Mammals include cats.',
 'a part of the day when it is light, between the time when the sun rises and the time when the sun sets',
 'She got a new bike on her birthday.',
 'He works as a software developer.',
 '',
 'The sun is orbited by the Earth.',
 'Mammals include cats.',
 'The cat is resting on top of the building.',
 'She likes to read books about solving crimes.',
 'The earth is round.',
 'Tom is currently asleep.']

# Guard against the two lists drifting out of alignment again.
assert len(matched_items) == len(response_units)

inverted_mapping = invert_match_mapping(response_units, target_items, matched_items)

generate_markdown_table([target_items, inverted_mapping], ["Target Item", "Matched Response Unit"])

| Target Item | Matched Response Unit |
| --- | --- |
| a period of 24 hours, especially from twelve o'clock one night to twelve o'clock the next night | I think one meaning is the part of the day between morning and evening |
| a part of the day when it is light, between the time when the sun rises and the time when the sun sets | The sun sets in the west. |
| a metal object that you put into a hole on a door and turn to lock it | The metal device used to secure a door |
| to want something very much | To feel a strong desire for something |
| Mammals include cats. | All dogs are mammals. |
| The sun rises in the east. |  |
| She got a new bike on her birthday. | She received a brand new bicycle for her birthday. |
| He works as a software developer. | He is a software engineer. |
| The sun is orbited by the Earth. | The Earth revolves around the sun. |
| Dogs are a type of mammal. |  |
| The cat is resting on top of the building. | The cat is on the roof. |
| She likes to read books about solving crimes. | She enjoys reading mystery novels. |
| The earth is round. | The earth is flat. |
| Tom is currently asleep. | Tom is taking a nap. |


In [None]:
#| export

def extract_recalled(origText_recalled_posRec_recText):
    """
    Return the original texts that were recalled, ordered by recall position.

    Args:
        origText_recalled_posRec_recText (str): Newline-separated rows of tab-separated fields.
            Only the first field (the original text) and the third field (the recall position)
            are read. Going by the parameter name, the columns are presumably
            origText, recalled, posRec, recText. Each row is stripped of surrounding
            whitespace before splitting, and blank rows are ignored.

    Returns:
        List[str]: The first field of every row whose third field is not the literal 'NaN',
            sorted by the integer value of that third field. Rows with equal positions
            keep their input order.

    Raises:
        IndexError: If a non-blank row has fewer than three fields.
        ValueError: If a recall position is neither 'NaN' nor an integer literal.
    """
    recalled = []
    for line in origText_recalled_posRec_recText.split('\n'):
        line = line.strip()
        # A trailing newline, blank separator lines, or empty input would otherwise
        # produce a one-field row and fail on the position lookup below.
        if not line:
            continue

        fields = line.split('\t')
        orig_text, pos_rec = fields[0], fields[2]

        # 'NaN' marks an item that was not recalled.
        if pos_rec != 'NaN':
            recalled.append((orig_text, int(pos_rec)))

    # sorted() is stable, so ties on position preserve input order.
    recalled_sorted = sorted(recalled, key=lambda pair: pair[1])
    return [text for text, _ in recalled_sorted]

In [None]:
#| hide
# Run nbdev's export step. Presumably this writes the `#| export` cells to the module
# named by the `#| default_exp` directive at the top of this notebook; confirm against the nbdev docs.
import nbdev; nbdev.nbdev_export()