In [7]:
!pip install -q faiss-cpu datasets pandas sentence-transformers sacrebleu tf-keras


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/Applications/Xcode.app/Contents/Developer/usr/bin/python3 -m pip install --upgrade pip[0m


In [8]:
# -------- CPU ONLY (avant import TF) --------
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
os.environ["TF_FORCE_GPU_ALLOW_GROWTH"] = "false"

# (Colab) installs si besoin :
# !pip install -q faiss-cpu datasets pandas sentence-transformers sacrebleu

import json, datetime, faiss, numpy as np, tensorflow as tf, pandas as pd, math, pathlib
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from tensorflow.keras.layers import Dense, Embedding, MultiHeadAttention, Dropout, LayerNormalization, TextVectorization
from tensorflow.keras import callbacks as Kcb

In [10]:
# =========================
# Données
# =========================
def load_squad_pairs():
    ds = load_dataset("squad", split="train")
    pairs = []
    for it in ds:
        ctx = (it["context"] or "").strip()
        q = (it["question"] or "").strip()
        ans = it["answers"]["text"][0].strip() if it["answers"]["text"] else ""
        if ctx and q and ans:
            pairs.append((f"{ctx}\nQ: {q}", ans))
    print(f"✅ SQuAD: {len(pairs)} paires")
    return pairs

def load_shirayuki_pairs(csv_path="shirayuki.csv"):
    df = pd.read_csv(csv_path)
    pairs = [(str(i).strip(), str(o).strip())
             for i,o in zip(df["guy"], df["girl"])
             if str(i).strip() and str(o).strip()]
    print(f"✅ Shirayuki: {len(pairs)} paires")
    return pairs

def split_pairs(pairs, val_ratio=0.02, seed=42):
    rng = np.random.default_rng(seed)
    idx = np.arange(len(pairs))
    rng.shuffle(idx)
    cut = max(1, int(len(pairs) * (1 - val_ratio)))
    train_idx, val_idx = idx[:cut], idx[cut:]
    train = [pairs[i] for i in train_idx]
    val = [pairs[i] for i in val_idx]
    return train, val

def make_ds_from_pairs(pairs, tokenizer, max_len=96, batch_size=64, shuffle=True):
    X = [x for x,_ in pairs]
    Y = [f"[START] {y} [END]" for _,y in pairs]
    enc = tokenizer(X)
    out = tokenizer(Y)
    dec_in = out[:, :-1]
    dec_tg = out[:, 1:]
    ds = tf.data.Dataset.from_tensor_slices(
        ({"encoder_input": enc, "decoder_input": dec_in}, dec_tg)
    )
    if shuffle:
        ds = ds.shuffle(10000)
    ds = ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)
    steps = math.ceil(len(pairs) / batch_size)
    return ds, steps

def prepare_datasets(pairs, tokenizer=None, vocab_size=20000, max_len=96, batch_size=64, val_ratio=0.02):
    train_pairs, val_pairs = split_pairs(pairs, val_ratio=val_ratio)
    X_all = [x for x,_ in pairs]
    Y_all = [f"[START] {y} [END]" for _,y in pairs]
    if tokenizer is None:
        tokenizer = TextVectorization(
            max_tokens=vocab_size,
            output_sequence_length=max_len,
            standardize="lower_and_strip_punctuation",
            split="whitespace"
        )
        tokenizer.adapt(X_all + Y_all)
    train_ds, train_steps = make_ds_from_pairs(train_pairs, tokenizer, max_len, batch_size, shuffle=True)
    val_ds, val_steps     = make_ds_from_pairs(val_pairs, tokenizer, max_len, batch_size, shuffle=False)
    return tokenizer, train_ds, val_ds, train_steps, val_steps

In [11]:
# =========================
# Mémoire FAISS (RAG light)
# =========================
EMBED_MODEL = SentenceTransformer("all-MiniLM-L6-v2")
EMBED_DIM = 384
MEMORY_FILE = "shirayuki_memory.jsonl"
INDEX_FILE = "shirayuki_faiss.index"
index = faiss.read_index(INDEX_FILE) if os.path.exists(INDEX_FILE) else faiss.IndexFlatL2(EMBED_DIM)

def _encode(text): return np.array([EMBED_MODEL.encode(text)], dtype="float32")

def save_to_memory(user_text, bot_text):
    ts = datetime.datetime.now().isoformat()
    index.add(_encode(user_text))
    with open(MEMORY_FILE, "a", encoding="utf-8") as f:
        f.write(json.dumps({"input": user_text, "response": bot_text, "timestamp": ts}, ensure_ascii=False) + "\n")
    faiss.write_index(index, INDEX_FILE)

def search_memory(query, top_k=3):
    if index.ntotal == 0 or not os.path.exists(MEMORY_FILE): return []
    D, I = index.search(_encode(query), top_k)
    with open(MEMORY_FILE, "r", encoding="utf-8") as f:
        mem = [json.loads(l) for l in f]
    return [mem[i] for i in I[0] if 0 <= i < len(mem)]


In [12]:

# =========================
# Masques (compatibles Keras MHA)
# =========================
PAD = 0
def padding_mask_2d(token_ids):
    return tf.cast(tf.not_equal(token_ids, PAD), tf.float32)   # (B,T)
def self_attention_mask(tokens):
    m = padding_mask_2d(tokens)                                # (B,T)
    return tf.einsum("bi,bj->bij", m, m)                       # (B,T,T)
def look_ahead_matrix(T):
    return tf.linalg.band_part(tf.ones((T, T), dtype=tf.float32), -1, 0)  # (T,T)
def decoder_self_mask(dec_tokens):
    m = padding_mask_2d(dec_tokens)                            # (B,Td)
    pad_pair = tf.einsum("bi,bj->bij", m, m)                   # (B,Td,Td)
    la = look_ahead_matrix(tf.shape(dec_tokens)[1])            # (Td,Td)
    return pad_pair * la                                       # (B,Td,Td)
def cross_attention_mask(dec_tokens, enc_tokens):
    m_dec = padding_mask_2d(dec_tokens)                        # (B,Td)
    m_enc = padding_mask_2d(enc_tokens)                        # (B,Te)
    return tf.einsum("bi,bj->bij", m_dec, m_enc)               # (B,Td,Te)


In [14]:
# =========================
# Modèle Transformer
# =========================
class Block(tf.keras.layers.Layer):
    def __init__(self, d, h, ff, drop=0.1, decoder=False):
        super().__init__()
        self.decoder = decoder
        self.self_att = MultiHeadAttention(num_heads=h, key_dim=d//h, dropout=drop)
        self.ln1 = LayerNormalization(epsilon=1e-6)
        self.do1 = Dropout(drop)
        if decoder:
            self.cross = MultiHeadAttention(num_heads=h, key_dim=d//h, dropout=drop)
            self.ln_c = LayerNormalization(epsilon=1e-6)
            self.do_c = Dropout(drop)
        self.ffn = tf.keras.Sequential([Dense(ff, activation="gelu"), Dense(d)])
        self.ln2 = LayerNormalization(epsilon=1e-6)
        self.do2 = Dropout(drop)
    def call(self, x, enc_out=None, self_mask=None, enc_mask=None, training=False):
        a = self.self_att(x, x, x, attention_mask=self_mask, training=training)
        x = self.ln1(x + self.do1(a, training=training))
        if self.decoder and enc_out is not None:
            a2 = self.cross(x, enc_out, enc_out, attention_mask=enc_mask, training=training)
            x = self.ln_c(x + self.do_c(a2, training=training))
        f = self.ffn(x)
        return self.ln2(x + self.do2(f, training=training))

class Seq2Seq(tf.keras.Model):
    def __init__(self, vocab_size, d=256, h=8, ff=768, max_len=96, L=4, drop=0.1):
        super().__init__()
        self.d, self.max_len = d, max_len
        self.tok_emb = Embedding(vocab_size, d)
        self.pos_emb = Embedding(max_len, d)
        self.enc = [Block(d, h, ff, drop, decoder=False) for _ in range(L)]
        self.dec = [Block(d, h, ff, drop, decoder=True) for _ in range(L)]
        self.final = Dense(vocab_size)
    def _add_pos(self, tok_ids):
        T = tf.shape(tok_ids)[1]
        return self.tok_emb(tok_ids) + self.pos_emb(tf.range(T)[tf.newaxis, :])
    def encode(self, enc_tokens, training=False):
        x = self._add_pos(enc_tokens)
        mask = self_attention_mask(enc_tokens)                 # (B,Te,Te)
        for blk in self.enc:
            x = blk(x, self_mask=mask, training=training)
        return x
    def decode(self, dec_tokens, enc_tokens, enc_out, training=False):
        y = self._add_pos(dec_tokens)
        self_m = decoder_self_mask(dec_tokens)                 # (B,Td,Td)
        cross_m = cross_attention_mask(dec_tokens, enc_tokens) # (B,Td,Te)
        for blk in self.dec:
            y = blk(y, enc_out=enc_out, self_mask=self_m, enc_mask=cross_m, training=training)
        return y
    def call(self, inputs, training=False):
        enc_tokens = inputs["encoder_input"]
        dec_tokens = inputs["decoder_input"]
        enc_out = self.encode(enc_tokens, training=training)
        dec_out = self.decode(dec_tokens, enc_tokens, enc_out, training=training)
        return self.final(dec_out)


In [15]:
# =========================
# Génération
# =========================
def build_generation(tokenizer, model):
    vocab = tokenizer.get_vocabulary()
    tok2id = {t:i for i,t in enumerate(vocab)}
    START = tok2id.get("[START]", 1)
    END = tok2id.get("[END]", 2)

    @tf.function(reduce_retracing=True)
    def _tf_encode(enc_tokens):
        return model.encode(enc_tokens, training=False)
    @tf.function(reduce_retracing=True)
    def _tf_decode(dec_tokens, enc_tokens, enc_out):
        y = model.decode(dec_tokens, enc_tokens, enc_out, training=False)
        return model.final(y)[:, -1, :]

    def generate_response(prompt, max_new_tokens=64, temperature=0.7, top_k=None, use_memory=True, save_mem=True):
        ctx = ""
        if use_memory:
            hits = search_memory(prompt, top_k=3)
            if hits:
                ctx = "\n".join([f"User: {m['input']}\nShirayuki: {m['response']}" for m in hits]) + "\n"
        full_inp = ctx + f"User: {prompt}\nShirayuki:"

        enc_tokens = tokenizer([full_inp])
        enc_out = _tf_encode(enc_tokens)

        y = tf.constant([[START]], dtype=tf.int64)
        for _ in range(max_new_tokens):
            logits = _tf_decode(y, enc_tokens, enc_out)
            if temperature and temperature > 0:
                logits = logits / temperature
                if top_k and top_k > 0:
                    values, indices = tf.math.top_k(logits, k=top_k)
                    probs = tf.nn.softmax(values)
                    next_id_rel = tf.random.categorical(tf.math.log(probs), 1)
                    next_id = tf.gather(indices, next_id_rel, batch_dims=1)
                    next_token = int(next_id.numpy()[0][0])
                else:
                    next_token = int(tf.random.categorical(logits, 1).numpy()[0][0])
            else:
                next_token = int(tf.argmax(logits, axis=-1).numpy()[0])
            if next_token == END: break
            y = tf.concat([y, tf.constant([[next_token]], dtype=tf.int64)], axis=1)

        id2tok = {i:t for i,t in enumerate(vocab)}
        toks = [id2tok.get(int(t), "") for t in y.numpy()[0] if int(t) not in (0, START, END)]
        text = " ".join(toks).strip()
        if save_mem:
            save_to_memory(prompt, text)
        return text or "[Aucune réponse générée]"

    return generate_response

In [23]:
# =========================
# Callbacks
# =========================
def build_callbacks(run_name="run"):
    ts = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    log_dir = pathlib.Path("logs") / f"{run_name}-{ts}"
    ckpt_dir = pathlib.Path("ckpts") / f"{run_name}-{ts}"
    log_dir.mkdir(parents=True, exist_ok=True)
    ckpt_dir.mkdir(parents=True, exist_ok=True)

    # LR schedule: warmup -> cosine
    warmup_epochs = 1
    max_epochs = 50
    base_lr = 1e-3
    min_lr = 1e-5
    def lr_schedule(epoch, lr):
        if epoch < warmup_epochs:
            return base_lr * (epoch + 1) / warmup_epochs
        # cosine decay from base_lr to min_lr
        t = (epoch - warmup_epochs) / max(1, (max_epochs - warmup_epochs))
        return float(min_lr + 0.5*(base_lr - min_lr)*(1 + math.cos(math.pi * t)))

    # Simple sample generation callback (prints 2 prompts)
    def make_gen_cb(gen_fn):
        sample_prompts = ["Hello Shirayuki", "How are you today?"]
        def _on_epoch_end(epoch, logs=None):
            print("\n🧪 Samples:")
            for p in sample_prompts:
                print(" >", p)
                print(" >", gen_fn(p, temperature=0.8, top_k=40))
        return Kcb.LambdaCallback(on_epoch_end=_on_epoch_end)

    cbs = [
        Kcb.TensorBoard(log_dir=str(log_dir), histogram_freq=0, write_graph=True),
        Kcb.BackupAndRestore(backup_dir=str(log_dir / "backup")),
        Kcb.ModelCheckpoint(
            filepath=str(ckpt_dir / "{epoch:02d}-{val_loss:.3f}.weights.h5"),
            save_weights_only=True, monitor="val_loss", mode="min", save_best_only=True, verbose=1
        ),
        Kcb.EarlyStopping(monitor="val_loss", patience=3, restore_best_weights=True, verbose=1),
        Kcb.ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=2, min_lr=1e-5, verbose=1),
        Kcb.LearningRateScheduler(lr_schedule, verbose=0),
        Kcb.CSVLogger(str(log_dir / "training.csv"), append=False),
        Kcb.TerminateOnNaN(),
    ]
    return cbs


In [24]:
# =========================
# Entraînement (CPU)
# =========================
with tf.device("/CPU:0"):
    # Prétrain SQuAD
    squad_pairs = load_squad_pairs()
    tokenizer, squad_train, squad_val, squad_steps, squad_val_steps = prepare_datasets(
        squad_pairs, vocab_size=20000, max_len=96, batch_size=64, val_ratio=0.02
    )

    model = Seq2Seq(vocab_size=tokenizer.vocabulary_size(), d=256, h=8, ff=768, max_len=96, L=4, drop=0.1)
    model.compile(optimizer=tf.keras.optimizers.Adam(1e-3),
                  loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True))
    generate_response = build_generation(tokenizer, model)
    cbs_pre = build_callbacks("pretrain_squad")
    cbs_pre.append((lambda gen: Kcb.LambdaCallback(on_epoch_end=lambda e,l: None))(generate_response))  # placeholder (no-op) to keep list editable
    # Remplace le placeholder par un vrai callback de génération:
    cbs_pre[-1] = (lambda gen: (lambda: None, Kcb.LambdaCallback(on_epoch_end=lambda e,l: [print("\n🧪 Sample:", gen("Hello Shirayuki", temperature=0.8, top_k=40))])))(generate_response)[1]

    print("🚀 Pré-entraînement sur SQuAD...")
    model.fit(
        squad_train,
        validation_data=squad_val,
        epochs=5,
        steps_per_epoch=squad_steps,
        validation_steps=squad_val_steps,
        callbacks=cbs_pre,
        verbose=1
    )

    # Finetune Shirayuki
    shirayuki_pairs = load_shirayuki_pairs("shirayuki.csv")   # <-- assure le fichier présent
    _, sh_train, sh_val, sh_steps, sh_val_steps = prepare_datasets(
        shirayuki_pairs, tokenizer=tokenizer, max_len=96, batch_size=64, val_ratio=0.05
    )
    cbs_ft = build_callbacks("finetune_shirayuki")
    # Génération à chaque epoch sur FT
    def _on_epoch_end_ft(epoch, logs=None):
        print("\n🧪 FT Samples:")
        for p in ["Hello Shirayuki", "Peux-tu m'aider à planifier ma journée ?"]:
            print(" >", p)
            print(" >", generate_response(p, temperature=0.8, top_k=40))
    cbs_ft.append(Kcb.LambdaCallback(on_epoch_end=_on_epoch_end_ft))

    print("🔄 Fine-tuning sur Shirayuki...")
    model.fit(
        sh_train,
        validation_data=sh_val,
        epochs=10,
        steps_per_epoch=sh_steps,
        validation_steps=sh_val_steps,
        callbacks=cbs_ft,
        verbose=1
    )


✅ SQuAD: 87599 paires
🚀 Pré-entraînement sur SQuAD...


ValueError: To use the BackupAndRestore callback, you model must be built before you call `fit()`. Model <Seq2Seq name=seq2_seq_2, built=False> is unbuilt. You can build it beforehand by calling it on a batch of data.

In [20]:
# =========================
# Démo rapide post-entraînement
# =========================
tests = [
    "Hello Shirayuki",
    "How are you today?",
    "What's your favorite music?",
    "Peux-tu m'aider à planifier ma journée ?"
]
for t in tests:
    print("\n> 💬", t)
    print("> 🤖", generate_response(t, temperature=0.8, top_k=40))


> 💬 Hello Shirayuki




> 🤖 phoenician nominative pipil feynmans kindergarten economics rivera convenience breast dualism recorded recited vault establishing loanwords securitization gala boudhanath survival leopards occupancy ring consequent defined vienna circumstances neurological portrayed rob kidnapped 122nd panther extent cyrus cyrus attract tension beidou liechtenstein complete categorize whitehead phenomenology 11 angry reinforced obligations evanston indication institut alloys 1758 accessing gaza tag vocalist 8bit torchbearers fighter came demarcation addressed continue theses

> 💬 How are you today?
> 🤖 widow differentiation organization tablets chanting aonuma carl limited retreated inside jp europes waterfront injustice 72 freedmen tongue athenian emancipation nocturnes suppressing assume brestlitovsk constructive plaza ignored elk male imac seventeen façade backwards untranslated dylan diversity brutality chief genera 510 bank bavarian reacts arisen parisian mistaken cleopatra predicted ethnograp