In [None]:
%%capture
# Install Unsloth and its dependencies. Outside Colab a single `pip install unsloth`
# is used; on Colab the dependencies are installed one group at a time.
import os, re
# Colab is detected by looking for "COLAB_" anywhere in the concatenated env-var names.
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # `v` is the leading run of digits/dots in the torch version string (at least 3 characters).
    import torch; v = re.match(r"[0-9\.]{3,}", str(torch.__version__)).group(0)
    # xformers build selection: 0.0.32.post2 when torch is exactly 2.8.0, otherwise 0.0.29.post3.
    xformers = "xformers==" + ("0.0.32.post2" if v == "2.8.0" else "0.0.29.post3")
    !pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" "huggingface_hub>=0.34.0" hf_transfer
    !pip install --no-deps unsloth
# Pinned versions, installed in both environments after the steps above.
!pip install transformers==4.56.2
!pip install --no-deps trl==0.22.2

# Accès Token
Colle ton token Hugging Face dans la cellule suivante

In [None]:
# Colle ton token ici
# from huggingface_hub import login
# login("ton_token_ici")

In [None]:
# 📦 Installation des dépendances
!pip install -q gdown

# 🔧 Importations et configuration
import os
import json
import torch
import gdown
import shutil
from datetime import datetime
from datasets import load_dataset, concatenate_datasets
from google.colab import drive
from unsloth import FastLanguageModel
from trl import SFTConfig, SFTTrainer
from transformers import DataCollatorForSeq2Seq

# =======================
# 🔹 Configuration Principale
# =======================
class Config:
    """Central settings for the notebook: model, dataset, training targets, hyper-parameters and paths."""
    # Model and data
    BASE_MODEL = "Gems234/Alisia-7B-it-V1.0"
    DATASET_NAME = "HuggingFaceH4/ultrachat_200k"

    # Training targets
    TOTAL_DATASET_SIZE = 50000
    TARGET_COVERAGE = 20000  # NOTE(review): the original note said "20% of 100k", but TOTAL_DATASET_SIZE is 50000 (20000 is 40% of it) - confirm the intended target
    EXAMPLES_PER_SESSION = 336  # 84 steps x 4 grad_accum; NOTE(review): PER_DEVICE_BATCH (2) is not included (84 x 2 x 4 = 672) - confirm

    # Technical parameters
    MAX_SEQ_LENGTH = 2048
    LOAD_IN_4BIT = True
    DTYPE = None
    PER_DEVICE_BATCH = 2
    GRAD_ACCUM = 4
    MAX_STEPS = 84
    LEARNING_RATE = 2e-4
    SEED = 3407

    # Paths (WORK_DIR is local to the runtime; the DRIVE_* paths live under the Drive mount point)
    WORK_DIR = "/content/alisia_collab"
    DRIVE_MOUNT = "/content/drive"
    DRIVE_BASE_PATH = "/content/drive/MyDrive/Alisia_Collab"
    SOURCES_FILE = "sources.json"
    LOCK_TIMEOUT_HOURS = 6

    # Collaborative sources: participant name -> Google Drive folder URL
    MANUAL_SOURCES = {
        "√âlie": "https://drive.google.com/drive/folders/1ndS1XHWkcTp57s1wibxYIH4P_KB6elsw",
        "Jos": "https://drive.google.com/drive/folders/1bw9J5goW1GzT1O7MA2vA0_BC-wAiiVGE"
    }

# Module-level instance read by the helper functions in this notebook.
config = Config()

# üìÅ Montage Google Drive
def setup_drive():
    """Mount Google Drive and make sure the shared project folder exists.

    Mounts Drive at ``config.DRIVE_MOUNT``, creates ``config.DRIVE_BASE_PATH``
    (no error if it is already there) and prints a confirmation line.
    """
    mount_point = config.DRIVE_MOUNT
    drive.mount(mount_point)

    project_dir = config.DRIVE_BASE_PATH
    os.makedirs(project_dir, exist_ok=True)
    print(f"‚úÖ Drive mont√©: {project_dir}")

# Mount Drive right away: load_sources()/save_sources() below read and write
# sources.json under config.DRIVE_BASE_PATH.
setup_drive()

# üîó FONCTION load_sources
def load_sources():
    """Return the shared ``sources.json`` content, creating the file on first use.

    If the file exists under ``config.DRIVE_BASE_PATH`` it is parsed and passed
    through ``migrate_sources_structure``. Otherwise a fresh structure
    (participants, metadata, training_rotation) is saved with ``save_sources``
    and returned.
    """
    sources_path = os.path.join(config.DRIVE_BASE_PATH, config.SOURCES_FILE)

    # First run: no file yet, so build and persist the initial structure.
    if not os.path.exists(sources_path):
        metadata = dict(
            created=datetime.now().isoformat(),
            total_sessions=0,
            last_training_user=None,
            best_performer=None,
            total_collective_steps=0,
            dernier_termine=None,
            heure_dernier_termine=None,
            session_id=0,
        )
        fresh_sources = dict(
            participants={},
            metadata=metadata,
            training_rotation=[],
        )
        save_sources(fresh_sources)
        print("üìÅ Fichier sources.json cr√©√© avec nouvelle structure")
        return fresh_sources

    # Existing file: parse it, then bring it up to the current structure.
    with open(sources_path, "r") as handle:
        stored = json.load(handle)
    return migrate_sources_structure(stored)

def save_sources(sources):
    """Write ``sources`` to the shared ``sources.json`` file as indented JSON.

    Args:
        sources: JSON-serializable structure to persist.

    Raises:
        TypeError, ValueError: if ``sources`` cannot be serialized. The file
            already on disk is left untouched in that case.
        OSError: if the file cannot be opened or written.
    """
    sources_path = os.path.join(config.DRIVE_BASE_PATH, config.SOURCES_FILE)
    # Serialize before opening the file: open(..., "w") truncates immediately,
    # so an error raised while dumping would otherwise wipe the existing state.
    payload = json.dumps(sources, indent=2)
    with open(sources_path, "w") as f:
        f.write(payload)

def migrate_sources_structure(sources):
    """Upgrade a loaded ``sources`` dict in place to the current structure and save it.

    Every missing container or field is added with its default value; values
    that are already present are never overwritten. The result is persisted
    with ``save_sources`` on every call.

    Args:
        sources: dict parsed from ``sources.json`` (mutated in place).

    Returns:
        The same ``sources`` dict, guaranteed to contain ``participants``,
        ``metadata`` and ``training_rotation``.
    """
    print("üîÑ Migration automatique de sources.json...")

    # Callers index sources["participants"] directly, so make sure it exists
    # (older files without it used to be returned unchanged).
    participants = sources.setdefault("participants", {})
    for user_name, user_data in participants.items():
        if "user_id" not in user_data:
            # Assigned before the next iteration, so the next generated id sees this one.
            user_data["user_id"] = generate_user_id(sources)
            print(f"   ‚úÖ ID g√©n√©r√© pour {user_name}: {user_data['user_id']}")
        user_data.setdefault("total_steps", 0)
        user_data.setdefault("performance_score", 0.0)
        if "last_activity" not in user_data:
            user_data["last_activity"] = datetime.now().isoformat()

    if "metadata" not in sources:
        sources["metadata"] = {"created": datetime.now().isoformat()}
    metadata = sources["metadata"]

    # Fill each field independently: a file that lacks "dernier_termine" but
    # already has a "session_id" keeps its session counter.
    metadata_defaults = (
        ("total_sessions", 0),
        ("last_training_user", None),
        ("best_performer", None),
        ("total_collective_steps", 0),
        ("dernier_termine", None),
        ("heure_dernier_termine", None),
        ("session_id", 0),
    )
    for field, default in metadata_defaults:
        metadata.setdefault(field, default)

    sources.setdefault("training_rotation", [])

    save_sources(sources)
    print("‚úÖ Structure de sources.json migr√©e")
    return sources

def generate_user_id(sources):
    """Return a numeric user id (as a string) that no participant uses yet.

    The candidate starts at "number of participants that already have an id,
    plus one" and is incremented until it is free.

    Args:
        sources: the sources dict; a missing or empty ``participants`` entry
            is treated as "no participants".

    Returns:
        str: the new id, e.g. ``"1"`` for an empty registry.
    """
    participants = sources.get("participants") or {}
    # Normalize stored ids to str: an id saved as a JSON number would otherwise
    # never compare equal to the string candidates, producing a duplicate.
    existing_ids = [
        str(user_data["user_id"])
        for user_data in participants.values()
        if user_data.get("user_id")
    ]
    taken = set(existing_ids)

    candidate = len(existing_ids) + 1
    while str(candidate) in taken:
        candidate += 1
    return str(candidate)

# üë§ Syst√®me d'identification
def get_user_identity():
    """Interactively identify the current participant.

    Shows a two-option menu, then asks for a name. Option "1" registers a new
    participant (the name must be non-empty and unused); any other answer is
    treated as "existing user", as before. Invalid input prints a message and
    restarts the dialogue, reloading the sources each time.

    Returns:
        tuple: ``(user_id, user_name)`` of the identified participant.
    """
    # Loop instead of recursing on invalid input; each attempt still re-runs
    # load_sources(), exactly as every retry did before.
    while True:
        sources = load_sources()

        print("üë§ Syst√®me d'identification collaboratif")
        print("1. Nouvel utilisateur")
        print("2. Utilisateur existant")

        choice = input("Choisissez une option (1 ou 2): ").strip()
        user_name = input("Entrez votre nom: ").strip()
        participants = sources["participants"]

        if choice == "1":
            # New user: an empty name used to be registered as participant "".
            if not user_name:
                print("Nom vide : veuillez entrer un nom valide")
                continue
            if user_name in participants:
                print("‚ùå Ce nom existe d√©j√†")
                continue

            user_id = generate_user_id(sources)
            participants[user_name] = {
                "user_id": user_id,
                "drive_url": "",
                "added_date": datetime.now().isoformat(),
                "sessions_completed": 0,
                "total_steps": 0,
                "last_activity": datetime.now().isoformat(),
                "performance_score": 0.0
            }
            save_sources(sources)
            print(f"‚úÖ Nouvel utilisateur cr√©√©: {user_name} (ID: {user_id})")
            return user_id, user_name

        # Existing user (any choice other than "1").
        if user_name not in participants:
            print("‚ùå Utilisateur non trouv√©")
            continue

        user_data = participants[user_name]
        if "user_id" not in user_data:
            # Entry without an id: generate one now and persist it.
            user_data["user_id"] = generate_user_id(sources)
            save_sources(sources)
            print(f"üÜï ID g√©n√©r√© pour {user_name}: {user_data['user_id']}")

        user_id = user_data["user_id"]
        print(f"‚úÖ Utilisateur reconnu: {user_name} (ID: {user_id})")
        return user_id, user_name

# Run the interactive identification now; main_corrected() reads these two
# module-level names when it prints its welcome message.
user_id, user_name = get_user_identity()

‚úÖ Drive mont√©: /content/drive/MyDrive/Alisia_Collab
üîÑ Migration automatique de sources.json...
‚úÖ Structure de sources.json migr√©e
üë§ Syst√®me d'identification collaboratif
1. Nouvel utilisateur
2. Utilisateur existant


In [None]:
def prepare_dataset(ranges, tokenizer):
    """Load slices of the ``train_sft`` split and render each conversation to text.

    Args:
        ranges: list or tuple of ``(start, end)`` pairs; each pair is loaded as
            ``train_sft[start:end]`` from ``config.DATASET_NAME``.
        tokenizer: object exposing ``apply_chat_template``. Its
            ``chat_template`` attribute is overwritten by this function.

    Returns:
        The loaded dataset (slices concatenated when there is not exactly one)
        with an added ``"text"`` column produced by the chat template.

    Raises:
        ValueError: if ``ranges`` is not a list or tuple.
    """
    if not isinstance(ranges, (list, tuple)):
        raise ValueError("‚ùå 'ranges' doit √™tre une liste de tuples (start, end)")

    parts = []
    for start, stop in ranges:
        split_spec = f"train_sft[{start}:{stop}]"
        print(f"üì• Chargement: {split_spec}")
        parts.append(load_dataset(config.DATASET_NAME, split=split_spec))

    if len(parts) != 1:
        dataset = concatenate_datasets(parts)
        print(f"üîó {len(parts)} tranches concat√©n√©es -> {len(dataset)} exemples")
    else:
        dataset = parts[0]

    # ChatML-style template with a fixed system prompt placed before the messages.
    tokenizer.chat_template = (
        "<|im_start|>system\n"
        "You are Alisia, a helpful, precise, and knowledgeable assistant created by the Alisia Research Team.<|im_end|>\n"
        "{% for message in messages %}\n"
        "<|im_start|>{{ message['role'] }}\n"
        "{{ message['content'] }}<|im_end|>\n"
        "{% endfor %}\n"
        "{% if add_generation_prompt %}<|im_start|>assistant\n"
        "{% endif %}"
    )

    def _render_batch(batch):
        # Batched map callback: one rendered string per conversation in the batch.
        rendered = []
        for conversation in batch["messages"]:
            rendered.append(
                tokenizer.apply_chat_template(
                    conversation, tokenize=False, add_generation_prompt=False
                )
            )
        return {"text": rendered}

    dataset = dataset.map(_render_batch, batched=True)
    print(f"‚úÖ Dataset pr√™t pour entra√Ænement : {len(dataset)} exemples")

    return dataset

In [None]:
# ============================
# SYSTÈME COLLABORATIF COMPLET
# ============================

import os
import sys
import json
import shutil
import gdown
from datetime import datetime, timedelta

# --- Vérifications préalables sécurisées ---
def verify_dependencies():
    """Check that the training libraries and ``prepare_dataset`` are available.

    Tries to import the names needed from ``unsloth``, ``trl`` and
    ``transformers`` and checks that ``prepare_dataset`` exists in the module
    globals. Prints a report either way.

    Returns:
        bool: ``True`` when nothing is missing, ``False`` otherwise.
    """
    unavailable = []

    # The imports below are availability probes only; the imported names are not used.
    try:
        from unsloth import FastLanguageModel
    except ImportError:
        unavailable += ["unsloth"]

    try:
        from trl import SFTConfig, SFTTrainer
    except ImportError:
        unavailable += ["trl"]

    try:
        from transformers import DataCollatorForSeq2Seq
    except ImportError:
        unavailable += ["transformers"]

    # prepare_dataset is defined in another cell of the notebook.
    if "prepare_dataset" not in globals():
        unavailable += ["prepare_dataset (d√©fini en cellule 2)"]

    if not unavailable:
        print("‚úÖ Toutes les d√©pendances sont disponibles")
        return True

    print("‚ùå D√©pendances manquantes :")
    print(*[f"   - {entry}" for entry in unavailable], sep="\n")
    print("\nüì¶ Installation recommand√©e :")
    print("!pip install -q unsloth trl transformers bitsandbytes accelerate datasets gdown")
    return False

# --- Configuration sécurisée ---
class SafeConfig:
    """Snapshot of the module-level ``config`` with a fallback for every setting.

    Each attribute is read once, at class-creation time, from ``config``; when
    ``config`` is not defined or lacks the attribute, the default given here is
    used. Several fallbacks differ from ``Config`` (e.g. ``MAX_STEPS`` 100 vs 84,
    ``SEED`` 42 vs 3407).
    """
    # Look `config` up through globals() so that a missing `config` yields the
    # defaults; referencing the bare name raised NameError before any default
    # could apply. getattr(None, name, default) simply returns the default.
    _source = globals().get("config")

    # Paths
    DRIVE_BASE_PATH = getattr(_source, 'DRIVE_BASE_PATH', '/content/drive')
    WORK_DIR = getattr(_source, 'WORK_DIR', '/content')
    SOURCES_FILE = getattr(_source, 'SOURCES_FILE', 'sources.json')

    # Model
    BASE_MODEL = getattr(_source, 'BASE_MODEL', 'Gems234/Alisia-7B-it-V1.0')
    MAX_SEQ_LENGTH = getattr(_source, 'MAX_SEQ_LENGTH', 2048)
    DTYPE = getattr(_source, 'DTYPE', None)
    LOAD_IN_4BIT = getattr(_source, 'LOAD_IN_4BIT', True)

    # Training
    MAX_STEPS = getattr(_source, 'MAX_STEPS', 100)
    LEARNING_RATE = getattr(_source, 'LEARNING_RATE', 2e-4)
    PER_DEVICE_BATCH = getattr(_source, 'PER_DEVICE_BATCH', 1)
    GRAD_ACCUM = getattr(_source, 'GRAD_ACCUM', 1)
    SEED = getattr(_source, 'SEED', 42)

    # Dataset
    EXAMPLES_PER_SESSION = getattr(_source, 'EXAMPLES_PER_SESSION', 100)
    TOTAL_DATASET_SIZE = getattr(_source, 'TOTAL_DATASET_SIZE', 1000)
    TARGET_COVERAGE = getattr(_source, 'TARGET_COVERAGE', 1000)

    # Lock
    LOCK_TIMEOUT_HOURS = getattr(_source, 'LOCK_TIMEOUT_HOURS', 6)

    # Manual sources (fallback)
    MANUAL_SOURCES = getattr(_source, 'MANUAL_SOURCES', {})

    # Keep the helper out of the class namespace.
    del _source

# --- Fonction principale ---
def main_corrected():
    """Entry point: greet the identified user and check the dependencies.

    Reads the module-level ``user_name`` and ``user_id``, prints a start-up
    banner and a welcome line, then prints the "ready" message only when
    ``verify_dependencies()`` succeeds. Always returns ``None``.
    """
    print("üöÄ D√©marrage syst√®me collaboratif...")
    greeting = f"üëã Bienvenue {user_name} (ID: {user_id})!"
    print(greeting)

    # Nothing more to do when a dependency is missing; the check prints its own report.
    if verify_dependencies():
        print("‚úÖ Syst√®me pr√™t pour l'entra√Ænement collaboratif")

# üéØ LANCEMENT
# Entry point: call main_corrected() only when this code runs as the main module.
if __name__ == "__main__":
    main_corrected()

üöÄ D√©marrage syst√®me collaboratif...
üëã Bienvenue √âlie (ID: 1)!
‚úÖ Toutes les d√©pendances sont disponibles
‚úÖ Syst√®me pr√™t pour l'entra√Ænement collaboratif


In [None]:
from huggingface_hub import HfApi
# Hugging Face Hub client; the only use shown in this cell is the commented-out example below.
api = HfApi()

# Example: make the model repository public (uncomment if needed).
# NOTE(review): replace the placeholder token at run time; do not commit a real token.
# api.update_repo_visibility(
#     repo_id="Gems234/Alisia-7B-it-V1.0",
#     private=False,
#     token='ton_token_ici'
# )

# Confirmation message printed once the client has been created.
print("‚úÖ Module Hugging Face charg√©")

‚úÖ Module Hugging Face charg√©


# Objectif du projet
L'objectif est d'atteindre au moins 20% de couverture du dataset UltraChat grâce à un système d'entraînement collaboratif.