In [2]:
from itertools import product
from collections import defaultdict
from datetime import datetime
import json

# === CONFIGURATION ===
# Text file with one training sentence per line; a word may list
# alternatives separated by '|' (see tokenize).
INPUT_FILE = "training_sentences.txt"
# Text file with one blacklisted bigram (two tokens) per line.
BLACKLIST_FILE = "blacklisted_bigrams.txt"
# Destination of the JSON-serialized model.
OUTPUT_FILE = "htpc_model.json"
PHRASE_CHUNK_SIZE = 3  # number of tokens per phrase

# === TEXT PROCESSING UTILITIES ===

def normalize_token(token):
    """Return *token* lower-cased with punctuation trimmed from both ends.

    Only the characters ``.,!?;:()[]{}"'`` are removed, and only at the
    edges of the token; punctuation inside the token and any surrounding
    whitespace are left untouched.
    """
    edge_punctuation = ".,!?;:()[]{}\"'"
    lowered = token.lower()
    return lowered.strip(edge_punctuation)

def tokenize(sentence):
    """Split a sentence into per-position lists of normalized alternatives.

    The sentence is split on whitespace. Each word may carry several
    alternatives separated by ``|`` (e.g. ``"cat|dog"``); every alternative
    is passed through ``normalize_token``.

    Alternatives that normalize to the empty string (for example a
    free-standing ``","`` or ``"..."``) are dropped, and a word with no
    remaining alternative is skipped entirely, so empty-string tokens never
    reach the vocabulary or the bigram tables.

    Args:
        sentence: Raw sentence text.

    Returns:
        A list with one entry per kept word; each entry is the non-empty
        list of normalized alternatives for that position.
    """
    tokens = []
    for word in sentence.split():
        normalized = (normalize_token(option) for option in word.split('|'))
        # Discard alternatives that consisted solely of punctuation.
        options = [option for option in normalized if option]
        if options:
            tokens.append(options)
    return tokens

def expand_sequences(token_matrix):
    """Return every sequence obtainable by picking one alternative per position.

    ``token_matrix`` is a list of lists of alternatives; the result is the
    full Cartesian product, as a list of tuples in ``itertools.product``
    order.
    """
    combinations = product(*token_matrix)
    # Materialize the lazy product so callers get a concrete list.
    return [*combinations]

def load_blacklist(path):
    """Load blacklisted bigrams from a text file, one bigram per line.

    Each line is split on whitespace and every token is normalized
    individually with ``normalize_token``, the same normalization applied to
    training tokens. Normalizing the whole line at once would only trim
    punctuation at the line's outer edges, leaving entries such as
    ``"hello, world"`` as ``("hello,", "world")``, which can never match a
    training bigram.

    Tokens that normalize to the empty string are discarded. Lines that do
    not yield exactly two tokens after that are ignored.

    Args:
        path: Path of the blacklist file.

    Returns:
        A set of ``(first, second)`` token tuples.
    """
    blacklist = set()
    with open(path, "r") as f:
        for line in f:
            normalized = (normalize_token(part) for part in line.split())
            parts = [part for part in normalized if part]
            if len(parts) == 2:
                blacklist.add(tuple(parts))
    return blacklist

# === MODEL BUILDING ===

def build_token_transitions(sequences, blacklist):
    """Map each token to the successor seen in its most recent allowed bigram.

    Every adjacent pair of every sequence is visited in order; pairs found
    in ``blacklist`` are skipped. A later pair with the same first token
    overwrites the earlier entry, so the last occurrence wins.

    Returns a plain dict of ``token -> next token``.
    """
    successors = {}
    for tokens in sequences:
        for current, following in zip(tokens, tokens[1:]):
            if (current, following) in blacklist:
                continue
            successors[current] = following
    return successors

def build_bigram_memory(sequences, blacklist):
    """Count occurrences of every adjacent token pair not in ``blacklist``.

    Returns a plain dict mapping ``(token, next_token)`` tuples to the
    number of times the pair occurs across all sequences.
    """
    counts = {}
    for tokens in sequences:
        for pair in zip(tokens, tokens[1:]):
            if pair in blacklist:
                continue
            counts[pair] = counts.get(pair, 0) + 1
    return counts

def build_phrase_memory(sequences, chunk_size=3, blacklist=None):
    """Count sliding windows of ``chunk_size`` tokens, keyed by their bigrams.

    Each window is represented as the tuple of its consecutive
    ``(token, next_token)`` pairs, so a window of ``chunk_size`` tokens gives
    a key of ``chunk_size - 1`` bigrams. Sequences shorter than
    ``chunk_size`` contribute nothing. When ``blacklist`` is non-empty, any
    window containing a blacklisted bigram is skipped.

    Returns a plain dict mapping each key to its occurrence count.
    """
    counts = {}
    for tokens in sequences:
        window_total = len(tokens) - chunk_size + 1
        if window_total < 1:
            # Sequence is shorter than one full window.
            continue
        for start in range(window_total):
            phrase = tuple(
                (tokens[pos], tokens[pos + 1])
                for pos in range(start, start + chunk_size - 1)
            )
            if blacklist and any(pair in blacklist for pair in phrase):
                continue
            counts[phrase] = counts.get(phrase, 0) + 1
    return counts

def convert_for_json(model):
    """Turn the in-memory model into a JSON-serializable dictionary.

    Tuple keys cannot be JSON object keys, so they are flattened to strings:
    a bigram ``(a, b)`` becomes ``"a|||b"``, and a phrase (a tuple of
    bigrams) becomes its bigrams rendered as ``"a__b"`` and joined with
    ``"|||"``. The vocabulary itself is not emitted, only its size, together
    with the current local timestamp and the sequence count.
    """
    metadata = {
        'trained_on': datetime.now().isoformat(),
        'num_sentences': model['num_sentences'],
        'vocab_size': len(model['vocab']),
    }
    transitions = model['token_transitions']

    bigram_entries = {}
    for pair, count in model['bigram_memory'].items():
        bigram_entries[f"{pair[0]}|||{pair[1]}"] = count

    phrase_entries = {}
    for phrase, count in model['phrase_memory'].items():
        rendered = [f"{a}__{b}" for (a, b) in phrase]
        phrase_entries["|||".join(rendered)] = count

    return {
        'metadata': metadata,
        'token_transitions': transitions,
        'bigram_memory': bigram_entries,
        'phrase_memory': phrase_entries,
    }

# === TRAINING PIPELINE ===

def train_htpc_from_file(input_path, blacklist_path, output_path, chunk_size=3):
    """Train the model from a sentence file and write it out as JSON.

    Every line of ``input_path`` is tokenized and expanded into all
    combinations of its ``|`` alternatives. The expanded sequences are then
    used to build the token-transition map, the bigram counts and the phrase
    counts, with bigrams listed in ``blacklist_path`` excluded. The result is
    serialized to ``output_path`` and a short summary is printed.

    Lines that produce no tokens (blank lines, for instance) are skipped.
    Expanding an empty token matrix would otherwise yield one empty sequence
    per such line and inflate the reported sequence count.

    Args:
        input_path: Training file, one sentence per line.
        blacklist_path: File of blacklisted bigrams, one per line.
        output_path: Where the JSON model is written.
        chunk_size: Number of tokens per phrase window.
    """
    # Read and expand training data
    expanded_sequences = []
    with open(input_path, "r") as f:
        for line in f:
            matrix = tokenize(line)
            if not matrix:
                # itertools.product() of zero factors yields a single empty
                # tuple, which is not a real training sequence.
                continue
            expanded_sequences.extend(expand_sequences(matrix))

    blacklist = load_blacklist(blacklist_path)
    vocab = {tok for seq in expanded_sequences for tok in seq}

    token_transitions = build_token_transitions(expanded_sequences, blacklist)
    bigram_memory = build_bigram_memory(expanded_sequences, blacklist)
    phrase_memory = build_phrase_memory(expanded_sequences, chunk_size, blacklist)

    model = {
        'token_transitions': token_transitions,
        'bigram_memory': bigram_memory,
        'phrase_memory': phrase_memory,
        'vocab': vocab,
        # Counts expanded sequences, i.e. after alternatives are multiplied out.
        'num_sentences': len(expanded_sequences)
    }

    json_model = convert_for_json(model)
    with open(output_path, "w") as f:
        json.dump(json_model, f, indent=2)

    print(f"✅ Trained on {len(expanded_sequences)} expanded sequences.")
    print(f"📘 Vocabulary size: {len(vocab)} tokens.")
    print(f"🧠 Model saved to: {output_path}")
    print(f"🚫 Blacklisted bigrams: {len(blacklist)}")

# === RUN ===

if __name__ == "__main__":
    # Script entry point: train with the module-level configuration.
    train_htpc_from_file(
        input_path=INPUT_FILE,
        blacklist_path=BLACKLIST_FILE,
        output_path=OUTPUT_FILE,
        chunk_size=PHRASE_CHUNK_SIZE,
    )


✅ Trained on 22 expanded sequences.
📘 Vocabulary size: 18 tokens.
🧠 Model saved to: htpc_model.json
🚫 Blacklisted bigrams: 0
