#### Sequence predictive coding

* We want to remap each sequence to a compressed version based on linear predictive coding
* e.g. if the model correctly predicts 'sat', 'on' and 'the', then 'The cat sat on the mat' should be remapped to something like 'The cat - - - mat'
* For each token, get the model's prediction from the preceding tokens and compare it to the actual token in the sequence
* If the predicted token equals the actual token, represent that token with a placeholder (i.e. do not store it)
* the final output should be a dictionary of token locations and items, e.g. {0: 'The', 1: 'cat', 5: 'mat'}

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch

#### With pretrained GPT-2:

In [None]:
# Load the pretrained 'gpt2' tokenizer and language-model head.
# Both are bound as module-level names because compress_sequence_with_gpt2
# reads `tokenizer` and `model` from the enclosing scope rather than taking
# them as arguments.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

def compress_sequence_with_gpt2(sequence):
    """Compress a text sequence by dropping tokens the language model predicts.

    The sequence is tokenized with the module-level ``tokenizer`` and scored
    with the module-level ``model``. For every position after the first, the
    model's greedy (argmax) next-token prediction given the preceding tokens
    is compared with the token that actually occurs there. Tokens that were
    predicted correctly are omitted; all other tokens are kept.

    Args:
        sequence: The text to compress.

    Returns:
        A dict mapping token position -> decoded token text for every token
        that must be stored. Position 0 is always present for non-empty
        input, because there is no context to predict it from. An input that
        tokenizes to nothing yields an empty dict.

    Side effects:
        Prints the resulting dict before returning it.
    """
    input_ids = tokenizer.encode(sequence, return_tensors='pt')

    # Nothing to compress: avoid indexing into an empty tensor.
    if input_ids.numel() == 0:
        compressed_sequence = {}
        print(compressed_sequence)
        return compressed_sequence

    token_ids = input_ids[0].tolist()

    # The first token is always stored verbatim.
    compressed_sequence = {0: tokenizer.decode(token_ids[:1])}

    if len(token_ids) > 1:
        with torch.no_grad():  # Inference only; no gradients needed
            # The model is causal, so the logits at position i depend only on
            # tokens 0..i. One pass over the full sequence therefore gives
            # the same next-token predictions as re-running every prefix.
            logits = model(input_ids).logits

        # logits[0, i] scores the token at position i + 1; the final row
        # predicts a token beyond the end of the input and is discarded.
        predicted_ids = logits[0, :-1, :].argmax(dim=-1).tolist()

        pairs = zip(predicted_ids, token_ids[1:])
        for position, (predicted, actual) in enumerate(pairs, start=1):
            if predicted != actual:
                # The model got this one wrong, so it has to be stored.
                compressed_sequence[position] = tokenizer.decode([actual])

    print(compressed_sequence)
    return compressed_sequence

In [None]:
def format_string_from_tokens(tokens_dict, placeholder=" _"):
    """Render a position -> token mapping as a single string.

    Positions from 0 up to the largest key are visited in order. A position
    present in ``tokens_dict`` contributes its token text; a missing position
    contributes ``placeholder``.

    Args:
        tokens_dict: Mapping of integer token position to token text.
        placeholder: Text emitted for each position that has no entry.
            Defaults to ``" _"``.

    Returns:
        The concatenated string, or ``''`` when ``tokens_dict`` is empty.
    """
    # max() would raise ValueError on an empty mapping.
    if not tokens_dict:
        return ''

    last_position = max(tokens_dict)

    # Pieces are concatenated without a separator: the token texts (and the
    # default placeholder) are expected to carry their own leading spaces.
    return ''.join(
        tokens_dict.get(position, placeholder)
        for position in range(last_position + 1)
    )


# Demo on ordinary English text, using whichever `tokenizer`/`model` are
# currently bound at module level.
sequence = "What lovely weather! I went for a walk in the park and the sun was shining. I had to wear a pair of sunglasses."
compressed_sequence = compress_sequence_with_gpt2(sequence)
# NOTE: compress_sequence_with_gpt2 already prints its result, so the dict
# is shown twice.
print(compressed_sequence)

# Show the kept tokens in order, with a placeholder at each dropped position.
formatted_string = format_string_from_tokens(compressed_sequence)
print(formatted_string)

#### With the planning model:

In [None]:
# Rebind the module-level tokenizer and model to the 'rule_model' checkpoint
# (presumably a locally saved fine-tuned model -- confirm the path).
# compress_sequence_with_gpt2 reads these globals, so from here on it scores
# sequences with this model instead of the one loaded earlier.
tokenizer = GPT2Tokenizer.from_pretrained('rule_model')
model = GPT2LMHeadModel.from_pretrained('rule_model')

# Example sequence for this model; it begins with a newline character.
sequence = "\nSTART: yellow fruit, STOP: green, REWARD: animal, SEQUENCE: red animal (2), green vehicle (-1)"
compressed_sequence = compress_sequence_with_gpt2(sequence)
# NOTE: the function above already prints the dict; this prints it again.
print(compressed_sequence)
formatted_string = format_string_from_tokens(compressed_sequence)
print(formatted_string)

#### With the inference model:

In [None]:
# Rebind the module-level tokenizer and model to the 'outputs_graph'
# checkpoint (presumably a locally saved fine-tuned model -- confirm the
# path). compress_sequence_with_gpt2 reads these globals, so it now scores
# sequences with this model.
tokenizer = GPT2Tokenizer.from_pretrained('outputs_graph')
model = GPT2LMHeadModel.from_pretrained('outputs_graph')

# Example sequence for this model; note the trailing space.
sequence = "ab EAST bn SOUTH ty NORTH bn NORTH iu "
compressed_sequence = compress_sequence_with_gpt2(sequence)
# NOTE: the function above already prints the dict; this prints it again.
print(compressed_sequence)
formatted_string = format_string_from_tokens(compressed_sequence)
print(formatted_string)