In [2]:
import project_path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys

In [3]:
%load_ext autoreload
%autoreload 2

**Goal**: convert per-token weights from one tokenization of a text to a different tokenization of the same text.

# Input

In [101]:
from pprint import pprint

# Running example: two lines of text, the second one starting with a space.
text = "ab de gh.\n a bc."

# Character-level view of the text: one record per character, where "t" is
# the character itself and "i" is its index in `text`.
char_tokenization = [
    dict(t=char, i=index) for index, char in enumerate(text)
]

pprint(char_tokenization)

[{'i': 0, 't': 'a'},
 {'i': 1, 't': 'b'},
 {'i': 2, 't': ' '},
 {'i': 3, 't': 'd'},
 {'i': 4, 't': 'e'},
 {'i': 5, 't': ' '},
 {'i': 6, 't': 'g'},
 {'i': 7, 't': 'h'},
 {'i': 8, 't': '.'},
 {'i': 9, 't': '\n'},
 {'i': 10, 't': ' '},
 {'i': 11, 't': 'a'},
 {'i': 12, 't': ' '},
 {'i': 13, 't': 'b'},
 {'i': 14, 't': 'c'},
 {'i': 15, 't': '.'}]


In [102]:
# Four segmentations of `text`; each list concatenates back to the full text.

# One token per character.
tok_char = [*text]

# Words, punctuation and every whitespace character as separate tokens.
tok_space = [
    "ab", " ", "de", " ", "gh", ".", "\n",  # first line of the text
    " ", "a", " ", "bc", ".",               # second line of the text
]

# Irregular segmentation: "ab" is split, and "b " fuses a letter with a space.
tok_1 = [
    "a", "b ", "d", "e", " ", "gh", ".", "\n",  # first line of the text
    " ", "a", " ", "bc", ".",                   # second line of the text
]

# Coarse segmentation with multi-word tokens; "gh.\n" ends with the newline.
tok_2 = [
    "ab de", " ", "gh.\n",  # first line of the text
    " ", "a bc.",           # second line of the text
]

In [103]:
# Display tok_1; note that its second token "b " fuses a letter with a space.
tok_1

['a', 'b ', 'd', 'e', ' ', 'gh', '.', '\n', ' ', 'a', ' ', 'bc', '.']

In [104]:
# Hand-written expected annotation of `tok_2`, one row per token. Columns:
#   i = token index, t = token text, c = start column, l = start line,
#   s = start offset in the text.
tok_2_solution = [
    dict(zip(("i", "t", "c", "l", "s"), row))
    for row in (
        (0, "ab de", 0, 0, 0),
        (1, " ", 5, 0, 5),
        (2, "gh.\n", 6, 0, 6),
        (3, " ", 0, 1, 10),
        (4, "a bc.", 1, 1, 11),
    )
]

In [105]:
from typing import List, Dict, Tuple, Any

def get_tokens_with_col_and_line(text, tokens: List[str]) -> List[Dict[str, Any]]:
    """Annotate each token with its offset, line and column in the text.

    The tokens must tile a prefix of `text`: concatenated in order, they have
    to reproduce the beginning of `text` exactly.

    Args:
        text (str): The text the tokens were produced from.
        tokens (list): The token strings, in order of appearance.

    Returns:
        list of dict, one per token, with the following keys:
            - s: The start index of the token in the text.
            - i: The index of the token in the list of tokens.
            - c: The 0-based column at which the token starts.
            - l: The 0-based line on which the token starts.
            - t: The token text.

    Raises:
        AssertionError: If the concatenated tokens are not a prefix of `text`.
        ValueError: If a token does not match the text at its expected
            position (only reachable when assertions are disabled).
    """
    assert text.startswith("".join(tokens)), (
        "The concatenation of the tokens is not a prefix of the text"
    )
    tokens_with_col_and_line = []
    line = 0
    column = 0
    offset = 0  # index in `text` at which the current token must start
    for i, token in enumerate(tokens):
        # Compare in place at `offset` rather than slicing off the consumed
        # prefix for every token, which copies the remaining text each time.
        if not text.startswith(token, offset):
            raise ValueError(
                f"Token {token} does not match text {text[offset:]}"
            )
        # The position is recorded before advancing: it describes where the
        # token starts, not where it ends.
        tokens_with_col_and_line.append(
            {"s": offset, "i": i, "c": column, "l": line, "t": token}
        )
        offset += len(token)
        n_new_lines_char = token.count("\n")
        if n_new_lines_char > 0:
            line += n_new_lines_char
            # The next token starts after whatever follows the last newline
            # inside this token.
            column = len(token) - (token.rfind("\n") + 1)
        else:
            column += len(token)
    return tokens_with_col_and_line


In [106]:
# Character-level tokenization: one token per character of the text.
get_tokens_with_col_and_line(text, tok_char)

[{'s': 0, 'i': 0, 'c': 0, 'l': 0, 't': 'a'},
 {'s': 1, 'i': 1, 'c': 1, 'l': 0, 't': 'b'},
 {'s': 2, 'i': 2, 'c': 2, 'l': 0, 't': ' '},
 {'s': 3, 'i': 3, 'c': 3, 'l': 0, 't': 'd'},
 {'s': 4, 'i': 4, 'c': 4, 'l': 0, 't': 'e'},
 {'s': 5, 'i': 5, 'c': 5, 'l': 0, 't': ' '},
 {'s': 6, 'i': 6, 'c': 6, 'l': 0, 't': 'g'},
 {'s': 7, 'i': 7, 'c': 7, 'l': 0, 't': 'h'},
 {'s': 8, 'i': 8, 'c': 8, 'l': 0, 't': '.'},
 {'s': 9, 'i': 9, 'c': 9, 'l': 0, 't': '\n'},
 {'s': 10, 'i': 10, 'c': 0, 'l': 1, 't': ' '},
 {'s': 11, 'i': 11, 'c': 1, 'l': 1, 't': 'a'},
 {'s': 12, 'i': 12, 'c': 2, 'l': 1, 't': ' '},
 {'s': 13, 'i': 13, 'c': 3, 'l': 1, 't': 'b'},
 {'s': 14, 'i': 14, 'c': 4, 'l': 1, 't': 'c'},
 {'s': 15, 'i': 15, 'c': 5, 'l': 1, 't': '.'}]

In [107]:
# tok_space keeps every whitespace character as a token of its own.
get_tokens_with_col_and_line(text, tok_space)

[{'s': 0, 'i': 0, 'c': 0, 'l': 0, 't': 'ab'},
 {'s': 2, 'i': 1, 'c': 2, 'l': 0, 't': ' '},
 {'s': 3, 'i': 2, 'c': 3, 'l': 0, 't': 'de'},
 {'s': 5, 'i': 3, 'c': 5, 'l': 0, 't': ' '},
 {'s': 6, 'i': 4, 'c': 6, 'l': 0, 't': 'gh'},
 {'s': 8, 'i': 5, 'c': 8, 'l': 0, 't': '.'},
 {'s': 9, 'i': 6, 'c': 9, 'l': 0, 't': '\n'},
 {'s': 10, 'i': 7, 'c': 0, 'l': 1, 't': ' '},
 {'s': 11, 'i': 8, 'c': 1, 'l': 1, 't': 'a'},
 {'s': 12, 'i': 9, 'c': 2, 'l': 1, 't': ' '},
 {'s': 13, 'i': 10, 'c': 3, 'l': 1, 't': 'bc'},
 {'s': 15, 'i': 11, 'c': 5, 'l': 1, 't': '.'}]

In [91]:
# tok_1 contains the token "b ", which spans a letter and the space after it.
get_tokens_with_col_and_line(text, tok_1)

[{'s': 0, 'i': 0, 'c': 0, 'l': 0, 't': 'a'},
 {'s': 1, 'i': 1, 'c': 1, 'l': 0, 't': 'b '},
 {'s': 3, 'i': 2, 'c': 3, 'l': 0, 't': 'd'},
 {'s': 4, 'i': 3, 'c': 4, 'l': 0, 't': 'e'},
 {'s': 5, 'i': 4, 'c': 5, 'l': 0, 't': ' '},
 {'s': 6, 'i': 5, 'c': 6, 'l': 0, 't': 'gh'},
 {'s': 8, 'i': 6, 'c': 8, 'l': 0, 't': '.'},
 {'s': 9, 'i': 7, 'c': 9, 'l': 0, 't': '\n'},
 {'s': 10, 'i': 8, 'c': 0, 'l': 1, 't': ' '},
 {'s': 11, 'i': 9, 'c': 1, 'l': 1, 't': 'a'},
 {'s': 12, 'i': 10, 'c': 2, 'l': 1, 't': ' '},
 {'s': 13, 'i': 11, 'c': 3, 'l': 1, 't': 'bc'},
 {'s': 15, 'i': 12, 'c': 5, 'l': 1, 't': '.'}]

In [92]:
# tok_2 has multi-character tokens; one of them ends with the newline, so the
# token after it starts on the next line.
get_tokens_with_col_and_line(text, tok_2)

[{'s': 0, 'i': 0, 'c': 0, 'l': 0, 't': 'ab de'},
 {'s': 5, 'i': 1, 'c': 5, 'l': 0, 't': ' '},
 {'s': 6, 'i': 2, 'c': 6, 'l': 0, 't': 'gh.\n'},
 {'s': 10, 'i': 3, 'c': 0, 'l': 1, 't': ' '},
 {'s': 11, 'i': 4, 'c': 1, 'l': 1, 't': 'a bc.'}]

In [93]:
# The hand-written expected result must equal the computed positions for tok_2.
assert tok_2_solution == get_tokens_with_col_and_line(text, tok_2)

In [94]:
def convert_weigths_form_tok_to_tok(tokenization, weights, target_tokenization):
    """Re-express token weights on a different tokenization.

    Works in two steps: the weights are first spread over characters, then
    the per-character weights are gathered over the target tokens. The
    intermediate per-character weights are printed along the way.

    Args:
        tokenization: Source tokens the weights belong to, passed on to
            `convert_weights_from_tok_to_tok_char`.
        weights: The weights of the source tokens.
        target_tokenization: Tokens to express the weights on, passed on to
            `convert_weights_from_tok_char_to_tok`.

    Returns:
        The weights of the target tokens, as returned by
        `convert_weights_from_tok_char_to_tok`.
    """
    per_char_weights = convert_weights_from_tok_to_tok_char(tokenization, weights)
    print("Intermediate char weights")
    pprint(per_char_weights)
    return convert_weights_from_tok_char_to_tok(
        per_char_weights, target_tokenization
    )

def convert_weights_from_tok_to_tok_char(tokenization, weights):
    """Spread each token's weight evenly over the token's characters.

    Note that the weight of a char is derived by the weight of a token divided
    by the number of chars in the token.

    Args:
        tokenization (list of dict): Tokens in text order; only the "t" key
            (the token text) is read.
        weights (list): One weight per token, aligned with `tokenization`.
            The two are paired with `zip`, so surplus entries on either side
            are ignored.

    Returns:
        list: One weight per character, in token order. A zero-length token
        has no characters to carry its weight, so it contributes no entries.
    """
    char_weights = []
    for w, t in zip(weights, tokenization):
        n_chars = len(t["t"])
        if n_chars == 0:
            # Nothing to distribute the weight over; dividing by the length
            # would raise ZeroDivisionError.
            continue
        char_weights.extend([w / n_chars] * n_chars)
    return char_weights

def convert_weights_from_tok_char_to_tok(weights, target_tokenization):
    """Gather per-character weights into one weight per target token.

    A token's weight is the sum of the character weights in the span that
    starts at the token's "s" offset and is as long as its "t" text. Each
    token and its weight are printed as they are computed.

    Args:
        weights (list): Character weights, indexed by position in the text.
        target_tokenization (list of dict): Tokens with the keys "s" (start
            offset) and "t" (token text).

    Returns:
        list: One summed weight per target token, in the given order.
    """
    aggregated = []
    for entry in target_tokenization:
        token_text = entry["t"]
        start = entry["s"]
        stop = start + len(token_text)
        total = sum(weights[start:stop])
        print(f"Token {token_text} has weight {total}")
        aggregated.append(total)
    return aggregated

In [95]:
# Source side: one unit of weight on each of the five tok_2 tokens.
weights_2 = [1, 1, 1, 1, 1]
tokenization_2 = get_tokens_with_col_and_line(text, tok_2)
pprint(tokenization_2)

# Plain print for the label: pprint shows a string's repr, quotes included.
print("Target tokenization 1")
tokenization_1 = get_tokens_with_col_and_line(text, tok_1)
pprint(tokenization_1)

# Move the weights from the tok_2 segmentation onto the tok_1 segmentation.
weights_1 = convert_weigths_form_tok_to_tok(
    tokenization_2, weights_2, tokenization_1)
pprint(weights_1)

[{'c': 0, 'i': 0, 'l': 0, 's': 0, 't': 'ab de'},
 {'c': 5, 'i': 1, 'l': 0, 's': 5, 't': ' '},
 {'c': 6, 'i': 2, 'l': 0, 's': 6, 't': 'gh.\n'},
 {'c': 0, 'i': 3, 'l': 1, 's': 10, 't': ' '},
 {'c': 1, 'i': 4, 'l': 1, 's': 11, 't': 'a bc.'}]
'Target tokenization 1'
[{'c': 0, 'i': 0, 'l': 0, 's': 0, 't': 'a'},
 {'c': 1, 'i': 1, 'l': 0, 's': 1, 't': 'b '},
 {'c': 3, 'i': 2, 'l': 0, 's': 3, 't': 'd'},
 {'c': 4, 'i': 3, 'l': 0, 's': 4, 't': 'e'},
 {'c': 5, 'i': 4, 'l': 0, 's': 5, 't': ' '},
 {'c': 6, 'i': 5, 'l': 0, 's': 6, 't': 'gh'},
 {'c': 8, 'i': 6, 'l': 0, 's': 8, 't': '.'},
 {'c': 9, 'i': 7, 'l': 0, 's': 9, 't': '\n'},
 {'c': 0, 'i': 8, 'l': 1, 's': 10, 't': ' '},
 {'c': 1, 'i': 9, 'l': 1, 's': 11, 't': 'a'},
 {'c': 2, 'i': 10, 'l': 1, 's': 12, 't': ' '},
 {'c': 3, 'i': 11, 'l': 1, 's': 13, 't': 'bc'},
 {'c': 5, 'i': 12, 'l': 1, 's': 15, 't': '.'}]
Intermediate char weights
[0.2,
 0.2,
 0.2,
 0.2,
 0.2,
 1.0,
 0.25,
 0.25,
 0.25,
 0.25,
 1.0,
 0.2,
 0.2,
 0.2,
 0.2,
 0.2]
Token a has we

In [100]:
# Check that the converted weights add up to the same total as the source
# weights; np.isclose tolerates floating-point rounding.
assert np.isclose(sum(weights_1), sum(weights_2))