In [8]:
import json
from collections import defaultdict
from datetime import datetime

# === CONFIGURATION ===
# Defaults used when this file is run as a script (see the __main__ guard).

# Training corpus: read line by line; every non-blank line is tokenized as
# one sentence.
INPUT_FILE = "training_sentences.txt"
# Blacklist: each line is tokenized, and lines yielding exactly two tokens
# are treated as a bigram to exclude from the model.
BLACKLIST_FILE = "blacklisted_bigrams.txt"
# Destination for the JSON-serialized model.
OUTPUT_FILE = "htpc_model.json"
# Passed as ``chunk_size`` to the phrase-memory builder: the number of
# consecutive tokens covered by one phrase window.
PHRASE_CHUNK_SIZE = 3

def normalize_token(token):
    """Return *token* lowercased with surrounding punctuation trimmed.

    Only leading and trailing characters are removed; punctuation inside
    the token (e.g. the apostrophe in ``don't``) is kept. The result may be
    an empty string when the token consists solely of punctuation.
    """
    edge_punctuation = ".,!?;:()[]{}\"'"
    lowered = token.lower()
    return lowered.strip(edge_punctuation)

def tokenize(sentence):
    """Split *sentence* on whitespace and normalize each token.

    Args:
        sentence: Raw text of one sentence.

    Returns:
        A list of normalized, non-empty tokens in their original order.
    """
    # ``str.split()`` with no arguments already ignores leading/trailing
    # whitespace and never yields empty strings, so emptiness has to be
    # checked *after* normalization: a punctuation-only token such as "..."
    # or "(" normalizes to "" and must not become a vocabulary entry.
    normalized = (normalize_token(tok) for tok in sentence.split())
    return [tok for tok in normalized if tok]

def load_blacklist(path):
    """Load the set of blacklisted bigrams from a text file.

    Each line is tokenized with ``tokenize``; a line contributes a bigram
    only when it yields exactly two tokens. All other lines (blank, one
    token, three or more tokens) are silently skipped.

    Args:
        path: Path of the blacklist file.

    Returns:
        A set of ``(first_token, second_token)`` tuples.
    """
    blacklist = set()
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale default encoding.
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            parts = tokenize(line)
            if len(parts) == 2:
                blacklist.add(tuple(parts))
    return blacklist

def build_token_transitions(sentences, blacklist):
    """Map each token to the token that most recently followed it.

    Sentences are scanned in order; for every adjacent pair that is not in
    *blacklist*, the first token's entry is overwritten with the second
    token, so the last qualifying occurrence wins.

    Args:
        sentences: Iterable of token lists.
        blacklist: Container of ``(token, next_token)`` pairs to skip.

    Returns:
        A plain dict of ``token -> next_token``.
    """
    successor_of = {}
    for tokens in sentences:
        for current, following in zip(tokens, tokens[1:]):
            if (current, following) in blacklist:
                continue
            successor_of[current] = following
    return successor_of

def build_bigram_memory(sentences, blacklist):
    """Count occurrences of every adjacent token pair across all sentences.

    Args:
        sentences: Iterable of token lists.
        blacklist: Container of ``(token, next_token)`` pairs to exclude.

    Returns:
        A plain dict of ``(token, next_token) -> occurrence count``.
    """
    tally = {}
    for tokens in sentences:
        for pair in zip(tokens, tokens[1:]):
            if pair in blacklist:
                continue
            tally[pair] = tally.get(pair, 0) + 1
    return tally

def build_phrase_memory(sentences, chunk_size=2, blacklist=None):
    """Count sliding-window phrases, each stored as a tuple of its bigrams.

    A window of *chunk_size* consecutive tokens is represented by the
    ``chunk_size - 1`` adjacent pairs it contains. Windows that contain any
    blacklisted pair are not counted.

    Args:
        sentences: Iterable of token lists.
        chunk_size: Number of tokens per window.
        blacklist: Optional container of pairs to exclude; ``None`` or an
            empty container disables filtering.

    Returns:
        A plain dict of ``((t0, t1), (t1, t2), ...) -> occurrence count``.
    """
    tally = {}
    pairs_per_phrase = chunk_size - 1
    for tokens in sentences:
        # A sentence shorter than the window gives a non-positive count,
        # so range() is empty and the sentence contributes nothing.
        window_count = len(tokens) - chunk_size + 1
        for start in range(window_count):
            phrase = tuple(
                (tokens[k], tokens[k + 1])
                for k in range(start, start + pairs_per_phrase)
            )
            if blacklist and any(pair in blacklist for pair in phrase):
                continue
            tally[phrase] = tally.get(phrase, 0) + 1
    return tally

def convert_for_json(model):
    """Turn the in-memory model into a structure ``json.dump`` can write.

    JSON object keys must be strings, so the tuple keys of the bigram and
    phrase tables are flattened: a bigram becomes ``"first|||second"`` and a
    phrase becomes its bigrams rendered as ``"first__second"`` and joined
    with ``"|||"``.

    Args:
        model: Dict with the keys ``'num_sentences'``, ``'vocab'``,
            ``'token_transitions'``, ``'bigram_memory'`` and
            ``'phrase_memory'``.

    Returns:
        A dict with the keys ``'metadata'``, ``'token_transitions'``,
        ``'bigram_memory'`` and ``'phrase_memory'``, in that order.
    """
    metadata = {
        'trained_on': datetime.now().isoformat(),
        'num_sentences': model['num_sentences'],
        'vocab_size': len(model['vocab']),
    }
    transitions = model['token_transitions']

    bigram_table = {}
    for pair, count in model['bigram_memory'].items():
        bigram_table[f"{pair[0]}|||{pair[1]}"] = count

    phrase_table = {}
    for phrase, count in model['phrase_memory'].items():
        flattened = "|||".join(f"{a}__{b}" for (a, b) in phrase)
        phrase_table[flattened] = count

    return {
        'metadata': metadata,
        'token_transitions': transitions,
        'bigram_memory': bigram_table,
        'phrase_memory': phrase_table,
    }

def train_htpc_from_file(input_path, blacklist_path, output_path, chunk_size=2):
    """Train the model from a sentence file and save it as JSON.

    Reads one sentence per non-blank line of *input_path*, builds the
    token-transition, bigram and phrase tables (skipping blacklisted
    bigrams), writes the serialized model to *output_path* and prints a
    short summary.

    Args:
        input_path: Text file with one training sentence per line.
        blacklist_path: Text file with one two-token bigram per line.
        output_path: Destination path for the JSON model.
        chunk_size: Number of tokens per phrase window.

    Raises:
        OSError: If any of the files cannot be opened.
    """
    # Explicit UTF-8 keeps decoding independent of the platform locale;
    # iterating the handle avoids materializing the raw lines first.
    with open(input_path, "r", encoding="utf-8") as f:
        tokenized_sentences = [tokenize(line) for line in f if line.strip()]
    blacklist = load_blacklist(blacklist_path)

    vocab = {tok for sent in tokenized_sentences for tok in sent}

    token_transitions = build_token_transitions(tokenized_sentences, blacklist)
    bigram_memory = build_bigram_memory(tokenized_sentences, blacklist)
    phrase_memory = build_phrase_memory(tokenized_sentences, chunk_size, blacklist)

    model = {
        'token_transitions': token_transitions,
        'bigram_memory': bigram_memory,
        'phrase_memory': phrase_memory,
        'vocab': vocab,
        'num_sentences': len(tokenized_sentences)
    }

    json_model = convert_for_json(model)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(json_model, f, indent=2)

    # The status icons were stored as mojibake (UTF-8 bytes decoded as
    # cp1252); they are restored to the intended emoji here.
    print(f"✅ HTPC model trained on {len(tokenized_sentences)} sentences.")
    print(f"🧠 Vocabulary size: {len(vocab)} tokens.")
    print(f"📄 Model saved to '{output_path}'.")
    # Reports how many blacklist entries were loaded, not how many bigram
    # occurrences were actually skipped during training.
    print(f"🚫 Ignored {len(blacklist)} blacklisted bigrams.")

# Script entry point: train with the module-level configuration defaults.
# Nothing runs when the file is imported as a module.
if __name__ == "__main__":
    train_htpc_from_file(INPUT_FILE, BLACKLIST_FILE, OUTPUT_FILE, chunk_size=PHRASE_CHUNK_SIZE)


✅ HTPC model trained on 5 sentences.
🧠 Vocabulary size: 19 tokens.
📄 Model saved to 'htpc_model.json'.
🚫 Ignored 0 blacklisted bigrams.
