# In [108]:
import random

class Rule:
    """Common interface for the continuation rules used by the generator.

    Concrete rules override :meth:`apply_rule`; the base version only raises.
    """

    def apply_rule(self, dataset, token):
        """Produce the continuations this rule allows for ``token``.

        Args:
            dataset: The text generated so far.
            token: The token that was just drawn.

        Raises:
            NotImplementedError: Always, because the base class has no behavior.
        """
        message = "Each rule must implement an apply_rule method."
        raise NotImplementedError(message)

class MultipleRule(Rule):
    """Rule whose continuations are repetitions of the drawn token.

    For every factor ``i`` in ``range(1, range_limit)`` the rule proposes
    ``token * multiple * i``.
    """

    def __init__(self, multiple, range_limit=2):
        """Configure the repetition rule.

        Args:
            multiple: Base repetition count applied to the token.
            range_limit: Exclusive upper bound for the factor ``i``. The
                default of 2 reproduces the previously hardcoded behavior,
                where only ``i = 1`` is generated.
        """
        self.multiple = multiple
        # Attribute name kept as ``range`` so existing code reading it still works.
        self.range = range_limit

    def apply_rule(self, dataset, token):
        """Return the list of allowed repetitions of ``token``.

        ``dataset`` is part of the shared rule signature and is not used here.
        """
        return [token * self.multiple * i for i in range(1, self.range)]

class CopyBehindRule(Rule):
    """Placeholder for a rule based on the last character of the dataset.

    The rule is intentionally disabled until its semantics are decided.
    """

    def apply_rule(self, dataset, token):
        """Reject every call; the rule has not been designed yet.

        Raises:
            NotImplementedError: On every call.
        """
        # TODO: the draft idea was to return [token + dataset[-1]] for a
        # non-empty dataset and [] otherwise. That draft sat after the raise
        # and could never execute, so it is recorded here instead of being
        # kept as unreachable code.
        raise NotImplementedError("we need to think through this rule more")

class DatasetGenerator:
    """Generate a random string by drawing tokens and appending rule-approved continuations."""

    def __init__(self, n, rules):
        """Set up the vocabulary and the per-token rules.

        Args:
            n: Vocabulary size; tokens are the ``n`` characters starting at 'A'.
            rules: Mapping from token to a non-empty list of rule objects, each
                providing ``apply_rule(dataset, token)``.
        """
        self.vocabulary = [chr(i) for i in range(65, 65 + n)]
        self.rules = rules

    def generate(self, l_c, l_t):
        """Build a string of exactly ``l_t`` characters (empty if ``l_t <= 0``).

        Args:
            l_c: Context length. It is only reported in error messages here.
            l_t: Target length of the returned string.

        Returns:
            The generated string, truncated to ``l_t`` characters.

        Raises:
            ValueError: If the drawn token has no rules, or if its rules share
                no common continuation.
        """
        dataset = ""
        while len(dataset) < l_t:
            token = random.choice(self.vocabulary)
            rule_set = self.rules.get(token, [])

            if not rule_set:
                # A token without rules is treated as a configuration error.
                raise ValueError(f"No rules defined for token '{token}'.")

            # Only continuations proposed by every rule of the token qualify.
            proposals = [set(rule.apply_rule(dataset, token)) for rule in rule_set]
            common_continuations = set.intersection(*proposals)

            if not common_continuations:
                # No valid continuation found that satisfies all rules
                rule_descriptions = [type(rule).__name__ for rule in rule_set]
                remaining_length = l_t - len(dataset)
                error_message = (
                    f"Token '{token}' with rules {', '.join(rule_descriptions)} cannot be applied. "
                    f"Current dataset: '{dataset}', remaining length: {remaining_length}, "
                    f"required context length: {l_c}, total target length: {l_t}."
                )
                raise ValueError(error_message)

            # Sort before choosing: the iteration order of a set of strings
            # changes between interpreter runs (hash randomization), which
            # would make seeded runs irreproducible.
            dataset += random.choice(sorted(common_continuations))

        return dataset[:l_t]

# Example usage: two tokens, each with a single repetition rule.
n = 2      # vocabulary size
l_c = 5    # length of the prefix printed as "ctx"
l_t = 20   # total length of the generated string
rules = {
    'A': [MultipleRule(2)],
    'B': [MultipleRule(3)]
}
generator = DatasetGenerator(n, rules)
try:
    dataset = generator.generate(l_c, l_t)
except ValueError as e:
    # Generation failed; show the generator's explanation instead of a traceback.
    print(e)
else:
    print(f"ctx = {dataset[:l_c]}")
    print(f"gen = {dataset}")

# Sample output from one run (generation is random, so results vary):
# ctx = BBBAA
# gen = BBBAAAAAAAABBBBBBAAA
