In [5]:
from collections import Counter

In [6]:
# Sample paragraph that mixes plain ASCII with fullwidth letters, emoji and
# curly quotes, so its UTF-8 encoding contains multi-byte sequences.
text = "Ｕｎｉｃｏｄｅ! 🅤🅝🅘🅒🅞🅓🅔‽ 🇺‌🇳‌🇮‌🇨‌🇴‌🇩‌🇪! 😄 The very name strikes fear and awe into the hearts of programmers worldwide. We all know we ought to “support Unicode” in our software (whatever that means—like using wchar_t for all the strings, right?). But Unicode can be abstruse, and diving into the thousand-page Unicode Standard plus its dozens of supplementary annexes, reports, and notes can be more than a little intimidating. I don’t blame programmers for still finding the whole thing mysterious, even 30 years after Unicode’s inception."

In [7]:
# Iterating a bytes object already yields ints in 0..255, so one list() call
# gives the raw UTF-8 byte values of the sample text.
tokens = list(text.encode("utf-8"))


In [8]:
def train(tokens, n):
    """Learn up to ``n`` byte-pair merges from a token sequence.

    On every round the most frequent pair of adjacent tokens is given a fresh
    id (256 for the first round, 257 for the second, ...) and each
    non-overlapping occurrence of that pair, scanning left to right, is
    replaced by the new id before the next round starts.

    Args:
        tokens: Sequence of integer token ids to learn from. The caller's
            sequence is not modified; the function works on its own lists.
        n: Maximum number of merges to learn.

    Returns:
        dict mapping each new id to the ``(left, right)`` pair it stands for,
        in the order the merges were learned.
    """
    merges = {}

    for step in range(n):
        # Tally every adjacent pair of the current sequence.
        adjacent = Counter(zip(tokens, tokens[1:]))
        if len(adjacent) == 0:
            print("No more pairs to process.")
            break

        # Ties go to the pair that was seen first.
        top_pair = adjacent.most_common(1)[0][0]
        new_id = 256 + step
        merges[new_id] = top_pair

        # Rewrite the sequence with a cursor that jumps two positions
        # whenever it consumes the winning pair.
        merged = []
        pos = 0
        end = len(tokens)
        while pos < end:
            if pos + 1 < end and (tokens[pos], tokens[pos + 1]) == top_pair:
                merged.append(new_id)
                pos += 2
            else:
                merged.append(tokens[pos])
                pos += 1

        tokens = merged

    return merges

In [9]:
def encode(text, replacements):
    """Tokenize ``text`` by applying learned pair merges to its UTF-8 bytes.

    Args:
        text: String to encode.
        replacements: Mapping ``{new_id: (left, right)}``. Merges are applied
            one after another in the mapping's iteration order.

    Returns:
        list of token ids: raw byte values plus merged ids.
    """
    ids = list(text.encode("utf-8"))

    for merged_id, pair in replacements.items():
        # One left-to-right pass per merge; matched pairs never overlap
        # because the cursor jumps past both members.
        rewritten = []
        pos = 0
        end = len(ids)
        while pos < end:
            if pos + 1 < end and (ids[pos], ids[pos + 1]) == pair:
                rewritten.append(merged_id)
                pos += 2
            else:
                rewritten.append(ids[pos])
                pos += 1
        ids = rewritten

    return ids

In [10]:
def decode_tokens(tokens, replacements):
    """Expand merged token ids back to byte values and decode them as UTF-8.

    Args:
        tokens: Iterable of token ids (raw byte values and/or merged ids).
            It is not modified.
        replacements: Mapping ``{merged_id: (left, right)}``. A merged id may
            expand to further merged ids; expansion continues until only ids
            that are not keys of ``replacements`` remain. The mapping must be
            acyclic, otherwise expansion never terminates.

    Returns:
        ``(byte_values, text)`` where ``byte_values`` is the fully expanded
        list restricted to 0..255 and ``text`` is those bytes decoded as
        UTF-8. Malformed byte sequences decode to U+FFFD instead of raising,
        so ``text`` is always a string.
    """
    # Depth-first expansion with an explicit stack: the next token to handle
    # is always on top, so sequences are pushed in reverse order. This visits
    # every token once instead of rescanning the whole list per nesting level.
    pending = list(tokens)[::-1]
    expanded = []
    while pending:
        token = pending.pop()
        if token in replacements:
            pending.extend(tuple(replacements[token])[::-1])
        else:
            expanded.append(token)

    # Best effort: ids that are neither merged ids nor valid byte values are
    # dropped silently so that bytes() below cannot fail on them.
    byte_values = [value for value in expanded if 0 <= value <= 255]

    # errors="replace" never raises UnicodeDecodeError, so no handler is needed.
    decoded = bytes(byte_values).decode("utf-8", errors="replace")

    return byte_values, decoded

In [13]:
# Round-trip demo: learn merges from a sample sentence, encode it, decode it back.
text = "Hi my name is ayush and i am the best person alive in this world whole"
tokens = [*text.encode("utf-8")]  # raw UTF-8 byte values, kept for the final comparison

n = 3  # how many merges to learn
replacements = train(tokens, n)
print("Replacements Mapping:", replacements)

# Compress the sentence with the merges that were just learned.
encoded_tokens = encode(text, replacements)
print("Encoded Tokens:", encoded_tokens)

# Expand the merged ids again and report whether every byte came back.
original_tokens, original_string = decode_tokens(encoded_tokens, replacements)
print("Original Tokens:", original_tokens)
print(
    "The original tokens were successfully reconstructed."
    if original_tokens == tokens
    else "The original tokens were not successfully reconstructed."
)
print("Original String:", original_string)



Replacements Mapping: {256: (32, 97), 257: (101, 32), 258: (257, 105)}
Encoded Tokens: [72, 105, 32, 109, 121, 32, 110, 97, 109, 258, 115, 256, 121, 117, 115, 104, 256, 110, 100, 32, 105, 256, 109, 32, 116, 104, 257, 98, 101, 115, 116, 32, 112, 101, 114, 115, 111, 110, 256, 108, 105, 118, 258, 110, 32, 116, 104, 105, 115, 32, 119, 111, 114, 108, 100, 32, 119, 104, 111, 108, 101]
Original Tokens: [72, 105, 32, 109, 121, 32, 110, 97, 109, 101, 32, 105, 115, 32, 97, 121, 117, 115, 104, 32, 97, 110, 100, 32, 105, 32, 97, 109, 32, 116, 104, 101, 32, 98, 101, 115, 116, 32, 112, 101, 114, 115, 111, 110, 32, 97, 108, 105, 118, 101, 32, 105, 110, 32, 116, 104, 105, 115, 32, 119, 111, 114, 108, 100, 32, 119, 104, 111, 108, 101]
The original tokens were successfully reconstructed.
Original String: Hi my name is ayush and i am the best person alive in this world whole
