# Hugging Face Model Manager
This notebook helps you securely manage your Hugging Face access token and download models or files from the Hub into this container.

In [None]:
import os
from pathlib import Path
from typing import Optional
import ipywidgets as widgets
from huggingface_hub import HfApi, HfFolder, snapshot_download

# Cache location: honours HF_HOME when it is set, otherwise uses the container default.
HF_CACHE = Path(os.environ.get('HF_HOME', '/app/cache')).expanduser()
HF_CACHE.mkdir(exist_ok=True, parents=True)

# Root directory that receives one sub-folder per downloaded repository.
DEST_DIR = Path('/app/models')
DEST_DIR.mkdir(exist_ok=True, parents=True)

# Hub client instance created once for the notebook session.
api = HfApi()

def get_saved_token() -> Optional[str]:
    """Return the token stored by ``HfFolder`` with surrounding whitespace removed.

    Returns:
        The stripped token, or ``None`` when ``HfFolder.get_token()`` returns a
        falsy value (``None`` or an empty string).
    """
    stored = HfFolder.get_token()
    if not stored:
        return None
    return stored.strip()

def save_token(token: str) -> None:
    """Store ``token`` through ``HfFolder.save_token`` after trimming whitespace.

    Args:
        token: The access token as typed by the user.
    """
    cleaned = token.strip()
    HfFolder.save_token(cleaned)

# Show the resolved storage locations so the user can see where files will land.
display(
    widgets.HTML(
        value=f"<b>Hugging Face cache:</b> {HF_CACHE}<br><b>Model destination:</b> {DEST_DIR}"
    )
)

## 1. Manage your Hugging Face token

In [None]:
# Masked input, pre-filled with whatever token is already stored (empty if none).
token_input = widgets.Password(
    value=get_saved_token() or '',
    placeholder='hf_xxx',
    description='Token',
    layout=widgets.Layout(width='60%'),
)
# Single-line feedback area written to by the two button callbacks.
status_label = widgets.Label()
# Action buttons for persisting / removing the token.
save_button = widgets.Button(
    icon='check',
    button_style='success',
    description='Save token',
)
clear_button = widgets.Button(
    icon='trash',
    button_style='warning',
    description='Clear token',
)

def on_save_token(_):
    """Button callback: persist the token typed into ``token_input``.

    Blank input is rejected with a warning. Otherwise the stripped token is
    stored with ``save_token``. A failure while saving is written to
    ``status_label`` (matching how the download and load callbacks report
    errors) instead of escaping the callback, where it may not be shown next
    to the widget.

    Args:
        _: The clicked button (unused).
    """
    token = token_input.value.strip()
    if not token:
        status_label.value = '‚ö†Ô∏è Please enter a token before saving.'
        return
    try:
        save_token(token)
    except Exception as exc:
        status_label.value = f'‚ùå Could not save token: {exc}'
        return
    # Report the path the library itself uses rather than a hard-coded one;
    # fall back to the historical location if the attribute is unavailable.
    token_path = getattr(HfFolder, 'path_token', '~/.huggingface/token')
    status_label.value = f'‚úÖ Token saved to {token_path}'

def on_clear_token(_):
    """Button callback: delete the stored token and empty the input field.

    If ``HfFolder.delete_token`` raises, the error is written to
    ``status_label`` and the input field is left untouched, so the UI does not
    claim the token was cleared when it was not.

    Args:
        _: The clicked button (unused).
    """
    try:
        HfFolder.delete_token()
    except Exception as exc:
        status_label.value = f'‚ùå Could not clear token: {exc}'
        return
    token_input.value = ''
    status_label.value = 'üßπ Token cleared.'

# Connect each button to its callback.
save_button.on_click(on_save_token)
clear_button.on_click(on_clear_token)

# Layout: input on top, the two buttons side by side, status line underneath.
display(
    widgets.VBox(children=[
        token_input,
        widgets.HBox(children=[save_button, clear_button]),
        status_label,
    ])
)

## 2. Download models or files

In [None]:
# Repository to fetch, e.g. "org/name".
repo_input = widgets.Text(
    placeholder='e.g. meta-llama/Llama-3.1-8B-Instruct',
    description='Repo ID',
    layout=widgets.Layout(width='70%'),
)
# Optional branch / tag / commit; an empty value is passed on as None.
revision_input = widgets.Text(
    placeholder='main',
    description='Revision',
    layout=widgets.Layout(width='50%'),
)
# Optional glob restricting which files are downloaded.
pattern_input = widgets.Text(
    placeholder='*.bin (leave empty for all)',
    description='File glob',
    layout=widgets.Layout(width='50%'),
)
download_button = widgets.Button(
    icon='download',
    button_style='primary',
    description='Download',
)
# Result details are printed into the output area; the label carries the status line.
output_area = widgets.Output()
progress = widgets.Label()

def download_model(_):
    """Button callback: download a Hub repository snapshot beneath ``DEST_DIR``.

    Reads the repo ID, revision and file glob from the input widgets, and the
    token from ``token_input`` (falling back to the stored token). The files
    are written to ``DEST_DIR/<repo id with '/' replaced by '--'>``.

    Status is reported through ``progress``; on success the repo ID and the
    local path are printed into ``output_area``. If the download fails and the
    destination directory was created by this call and is still empty, it is
    removed again so that it does not show up as an installed model.

    Args:
        _: The clicked button (unused).
    """
    output_area.clear_output()
    token = token_input.value.strip() or get_saved_token()
    repo_id = repo_input.value.strip()
    if not repo_id:
        progress.value = '‚ö†Ô∏è Please provide a repository ID.'
        return
    if not token:
        progress.value = '‚ö†Ô∏è An access token is required for most repos.'
        return
    progress.value = '‚è≥ Downloading... this may take a while.'

    # Create a subfolder based on the repo name (e.g., "meta-llama/Llama-3.1-8B" -> "meta-llama--Llama-3.1-8B")
    safe_repo_name = repo_id.replace('/', '--')
    model_dest = DEST_DIR / safe_repo_name
    # Remember whether the folder is new so a failed download can be undone.
    created_here = not model_dest.exists()
    model_dest.mkdir(parents=True, exist_ok=True)

    try:
        local_path = snapshot_download(
            repo_id=repo_id,
            revision=revision_input.value.strip() or None,
            cache_dir=str(HF_CACHE),
            local_dir=str(model_dest),
            local_dir_use_symlinks=False,
            allow_patterns=pattern_input.value.strip() or None,
            token=token
        )
    except Exception as exc:
        if created_here:
            try:
                # rmdir only succeeds on an empty directory, so partially
                # downloaded content is never deleted here.
                model_dest.rmdir()
            except OSError:
                pass
        progress.value = f'‚ùå Download failed: {exc}'
        return
    progress.value = '‚úÖ Download complete.'
    with output_area:
        print(f'Model: {repo_id}')
        print(f'Saved to: {local_path}')

# Connect the button, then lay the form out top to bottom.
download_button.on_click(download_model)
display(
    widgets.VBox(children=[
        repo_input,
        revision_input,
        pattern_input,
        download_button,
        progress,
        output_area,
    ])
)

## 3. View installed models

In [None]:
import humanize

def get_dir_size(path: Path) -> int:
    """Return the combined size in bytes of all regular files below ``path``.

    Entries that cannot be inspected (permission denied, removed while the
    walk is in progress, ...) are skipped individually, so one bad entry no
    longer cuts the walk short and under-reports the total.

    Args:
        path: Directory to walk recursively.

    Returns:
        Sum of ``st_size`` over every readable file; ``0`` when ``path`` does
        not exist or nothing could be read.
    """
    total = 0
    try:
        for item in path.rglob('*'):
            try:
                if item.is_file():
                    total += item.stat().st_size
            except OSError:
                # Skip just this entry and keep walking.
                continue
    except OSError:
        # The directory walk itself failed; return what was gathered so far.
        pass
    return total

def count_files(path: Path) -> int:
    """Return the number of regular files below ``path``.

    Entries that cannot be inspected are skipped individually, so a single
    unreadable entry no longer discards the whole count.

    Args:
        path: Directory to walk recursively.

    Returns:
        Number of readable regular files; ``0`` when ``path`` does not exist.
    """
    count = 0
    try:
        for item in path.rglob('*'):
            try:
                if item.is_file():
                    count += 1
            except OSError:
                # Skip just this entry and keep walking.
                continue
    except OSError:
        # The directory walk itself failed; return what was counted so far.
        pass
    return count

def list_installed_models():
    """Print a table of the model directories under ``DEST_DIR`` into ``models_output``.

    Each row shows the directory name (first ``--`` turned back into ``/``),
    its file count and its total size. A "no models" message is printed when
    ``DEST_DIR`` is missing, empty, or contains no sub-directories.
    """
    models_output.clear_output()

    # Missing, empty, or file-only destinations all yield an empty list here
    # and therefore the same message.
    if DEST_DIR.exists():
        found = [entry for entry in DEST_DIR.iterdir() if entry.is_dir()]
    else:
        found = []

    if not found:
        with models_output:
            print('üì¶ No models found in /app/models/')
        return

    with models_output:
        print(f'üì¶ Found {len(found)} model(s) in {DEST_DIR}:\n')
        print(f'{"Model Name":<50} {"Files":<10} {"Size":<15}')
        print('-' * 75)

        for model_dir in sorted(found):
            # Undo the download naming scheme ("org--name" -> "org/name").
            display_name = model_dir.name.replace('--', '/', 1)
            n_files = count_files(model_dir)
            n_bytes = get_dir_size(model_dir)
            if n_bytes > 0:
                size_human = humanize.naturalsize(n_bytes, binary=True)
            else:
                size_human = '0 B'

            print(f'{display_name:<50} {n_files:<10} {size_human:<15}')

# Output area that list_installed_models() prints into, plus a manual refresh control.
models_output = widgets.Output()
refresh_button = widgets.Button(
    icon='refresh',
    button_style='info',
    description='Refresh',
)


def on_refresh(_):
    """Button callback: rebuild the installed-model table."""
    list_installed_models()


refresh_button.on_click(on_refresh)

# Fill the table once up front so it already has content when first shown.
list_installed_models()

display(widgets.VBox(children=[refresh_button, models_output]))

## 4. Load and test a model

Use this section to load a downloaded model onto the available GPU(s), optionally with 4-bit or 8-bit quantization, so that you can test inference with it in the next section.

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import gc

# Session-wide slots for the currently loaded model, its tokenizer and the
# path it was loaded from; all start out empty.
loaded_model = loaded_tokenizer = loaded_model_name = None

def get_model_dirs():
    """Return the sub-directories of ``DEST_DIR`` in sorted order.

    Returns:
        A sorted list of directory paths, or an empty list when ``DEST_DIR``
        does not exist.
    """
    if DEST_DIR.exists():
        return sorted(entry for entry in DEST_DIR.iterdir() if entry.is_dir())
    return []

# Model selection dropdown.
# The choices are a snapshot taken when this cell runs: re-run the cell to pick
# up models downloaded afterwards.
model_dirs = get_model_dirs()
model_choices = [('None', None)] + [(d.name.replace('--', '/', 1), str(d)) for d in model_dirs]

model_dropdown = widgets.Dropdown(
    options=model_choices,
    description='Model:',
    layout=widgets.Layout(width='70%')
)

# The option values ('none' / '4bit' / '8bit') are what load_model() branches on.
# The 'none' mode is labelled float16 because load_model() passes
# torch_dtype=torch.float16 for it, i.e. it is half precision, not full precision.
quantization_dropdown = widgets.Dropdown(
    options=[
        ('No quantization (float16)', 'none'),
        ('4-bit (NF4) - Recommended', '4bit'),
        ('8-bit', '8bit')
    ],
    value='4bit',
    description='Quantization:',
    layout=widgets.Layout(width='50%')
)

load_button = widgets.Button(description='Load Model', button_style='primary', icon='upload')
unload_button = widgets.Button(description='Unload Model', button_style='danger', icon='times')
# Detailed progress goes to the output area; the label carries the one-line status.
load_output = widgets.Output()
load_status = widgets.Label()

def load_model(_):
    """Button callback: load the selected model and tokenizer from local files.

    The model directory comes from ``model_dropdown`` and the quantization
    mode from ``quantization_dropdown``:

    * ``'4bit'`` - NF4 quantization with double quantization, float16 compute.
    * ``'8bit'`` - 8-bit quantization.
    * anything else - no quantization, loaded with ``torch_dtype=torch.float16``.

    Any model that is already loaded is released *before* the new one is
    loaded, so the two never have to fit in memory at the same time. As a
    consequence, if loading fails, no model is loaded afterwards.

    On success the model, tokenizer and model path are stored in the
    ``loaded_model`` / ``loaded_tokenizer`` / ``loaded_model_name`` globals.
    Progress is printed into ``load_output``; the outcome is written to
    ``load_status``. Exceptions are caught and reported, not raised.

    Args:
        _: The clicked button (unused).
    """
    global loaded_model, loaded_tokenizer, loaded_model_name

    load_output.clear_output()

    model_path = model_dropdown.value
    if model_path is None:
        load_status.value = '‚ö†Ô∏è Please select a model.'
        return

    quant_mode = quantization_dropdown.value

    load_status.value = '‚è≥ Loading model... this may take a minute.'

    try:
        # Release the previous model first; otherwise it stays referenced by
        # the globals for the whole duration of the new load.
        if loaded_model is not None:
            loaded_model = None
            loaded_tokenizer = None
            loaded_model_name = None
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

        with load_output:
            print(f'üìÇ Model path: {model_path}')
            print(f'‚öôÔ∏è  Quantization: {quant_mode}')
            print(f'üîß Loading tokenizer...')

        # Load tokenizer
        tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)

        with load_output:
            print(f'‚úÖ Tokenizer loaded')
            print(f'üîß Loading model...')

        # Arguments shared by every mode; the mode-specific ones are added below.
        load_kwargs = {'device_map': "auto", 'local_files_only': True}
        if quant_mode == '4bit':
            load_kwargs['quantization_config'] = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_compute_dtype=torch.float16,
                bnb_4bit_use_double_quant=True
            )
            load_kwargs['torch_dtype'] = torch.float16
        elif quant_mode == '8bit':
            # No torch_dtype is passed in 8-bit mode.
            load_kwargs['quantization_config'] = BitsAndBytesConfig(load_in_8bit=True)
        else:
            load_kwargs['torch_dtype'] = torch.float16

        model = AutoModelForCausalLM.from_pretrained(model_path, **load_kwargs)

        loaded_model = model
        loaded_tokenizer = tokenizer
        loaded_model_name = model_path

        with load_output:
            print(f'‚úÖ Model loaded successfully!')
            print(f'üìä Device map: {model.hf_device_map if hasattr(model, "hf_device_map") else "N/A"}')
            if torch.cuda.is_available():
                print(f'üéÆ GPU memory allocated: {torch.cuda.memory_allocated() / 1024**3:.2f} GB')
                print(f'üéÆ GPU memory reserved: {torch.cuda.memory_reserved() / 1024**3:.2f} GB')

        load_status.value = '‚úÖ Model loaded and ready for inference!'

    except Exception as exc:
        load_status.value = f'‚ùå Load failed: {exc}'
        with load_output:
            import traceback
            traceback.print_exc()

def unload_model(_):
    """Button callback: drop the loaded model and tokenizer and free memory.

    Clears the ``loaded_model`` / ``loaded_tokenizer`` / ``loaded_model_name``
    globals, runs the garbage collector and, when CUDA is available, empties
    the CUDA cache. Progress is printed into ``load_output`` and the outcome
    is written to ``load_status``.

    Args:
        _: The clicked button (unused).
    """
    global loaded_model, loaded_tokenizer, loaded_model_name

    load_output.clear_output()

    if loaded_model is None:
        load_status.value = '‚ö†Ô∏è No model is currently loaded.'
        return

    with load_output:
        print('üßπ Unloading model...')

    # Rebinding the globals to None drops this module's references to the objects.
    loaded_model, loaded_tokenizer, loaded_model_name = None, None, None

    gc.collect()
    has_cuda = torch.cuda.is_available()
    if has_cuda:
        torch.cuda.empty_cache()

    with load_output:
        print('‚úÖ Model unloaded')
        if has_cuda:
            print(f'üéÆ GPU memory allocated: {torch.cuda.memory_allocated() / 1024**3:.2f} GB')

    load_status.value = '‚úÖ Model unloaded, memory cleared.'

# Connect the buttons to their callbacks.
load_button.on_click(load_model)
unload_button.on_click(unload_model)

# Layout: selectors on top, buttons side by side, then status line and log area.
display(
    widgets.VBox(children=[
        model_dropdown,
        quantization_dropdown,
        widgets.HBox(children=[load_button, unload_button]),
        load_status,
        load_output,
    ])
)

## 5. Run inference

Generate text with your loaded model.

In [None]:
# Free-text prompt.
prompt_input = widgets.Textarea(
    placeholder='Enter your prompt here...',
    description='Prompt:',
    layout=widgets.Layout(width='90%', height='100px'),
)

# Upper bound on newly generated tokens (passed as max_new_tokens).
max_tokens_slider = widgets.IntSlider(
    description='Max tokens:',
    min=10,
    max=2048,
    step=10,
    value=100,
    layout=widgets.Layout(width='50%'),
)

# Sampling temperature.
temperature_slider = widgets.FloatSlider(
    description='Temperature:',
    min=0.1,
    max=2.0,
    step=0.1,
    value=0.7,
    layout=widgets.Layout(width='50%'),
)

# Nucleus-sampling threshold (passed as top_p).
top_p_slider = widgets.FloatSlider(
    description='Top-p:',
    min=0.1,
    max=1.0,
    step=0.05,
    value=0.9,
    layout=widgets.Layout(width='50%'),
)

generate_button = widgets.Button(
    icon='play',
    button_style='success',
    description='Generate',
)
# Generated text and timing go to the output area; the label carries the status line.
inference_output = widgets.Output()
inference_status = widgets.Label()

def generate_text(_):
    """Button callback: sample a completion for the prompt with the loaded model.

    Requires a model and tokenizer in the ``loaded_model`` /
    ``loaded_tokenizer`` globals and a non-blank prompt. Sampling settings are
    read from the sliders. The decoded output, elapsed time, number of new
    tokens and tokens per second are printed into ``inference_output``; the
    outcome is written to ``inference_status``. Exceptions are caught and
    reported, not raised.

    Args:
        _: The clicked button (unused).
    """
    inference_output.clear_output()

    model, tokenizer = loaded_model, loaded_tokenizer
    if model is None or tokenizer is None:
        inference_status.value = '‚ö†Ô∏è Please load a model first (see section 4).'
        return

    prompt = prompt_input.value.strip()
    if not prompt:
        inference_status.value = '‚ö†Ô∏è Please enter a prompt.'
        return

    inference_status.value = '‚è≥ Generating...'

    try:
        with inference_output:
            print(f'üéØ Prompt: {prompt}\n')
            print('-' * 80)

        # Encode the prompt and move the tensors to the model's device.
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

        import time
        started = time.time()

        with torch.inference_mode():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_tokens_slider.value,
                temperature=temperature_slider.value,
                top_p=top_p_slider.value,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id
            )

        elapsed = time.time() - started

        # Decode the first (only) returned sequence.
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # New tokens = output length minus prompt length.
        num_tokens = outputs.shape[1] - inputs.input_ids.shape[1]
        if elapsed > 0:
            tokens_per_sec = num_tokens / elapsed
        else:
            tokens_per_sec = 0

        with inference_output:
            print(f'üìù Generated text:\n{generated_text}\n')
            print('-' * 80)
            print(f'‚è±Ô∏è  Time: {elapsed:.2f}s | Tokens: {num_tokens} | Speed: {tokens_per_sec:.2f} tokens/s')

        inference_status.value = f'‚úÖ Generated {num_tokens} tokens in {elapsed:.2f}s'

    except Exception as exc:
        inference_status.value = f'‚ùå Generation failed: {exc}'
        with inference_output:
            import traceback
            traceback.print_exc()

generate_button.on_click(generate_text)

display(widgets.VBox([
    prompt_input,
    max_tokens_slider,
    temperature_slider,
    top_p_slider,
    generate_button,
    inference_status,
    inference_output
]))

## 6. System diagnostics

Check GPU status, CUDA availability, and installed package versions.

In [None]:
import transformers
import accelerate
import bitsandbytes

# Trigger button and the output area the diagnostics report is printed into.
diag_button = widgets.Button(
    icon='stethoscope',
    button_style='info',
    description='Run Diagnostics',
)
diag_output = widgets.Output()

def run_diagnostics(_):
    """Button callback: print an environment report into ``diag_output``.

    The report covers, in order: package versions, CUDA/GPU details with
    per-device memory figures, the bitsandbytes status, and the currently
    loaded model.

    In the bitsandbytes section an import failure is reported as an error,
    while a failure of the CUDA probe alone is reported as "unknown", so the
    section never prints both "Available" and "Error" for the same run.

    Args:
        _: The clicked button (unused).
    """
    diag_output.clear_output()

    with diag_output:
        print('=' * 80)
        print('GPU LLM Environment - System Diagnostics')
        print('=' * 80)
        print()

        # Python & Core Libraries
        print('üì¶ CORE PACKAGES')
        print(f'  PyTorch version: {torch.__version__}')
        print(f'  Transformers version: {transformers.__version__}')
        print(f'  Accelerate version: {accelerate.__version__}')
        print(f'  Bitsandbytes version: {bitsandbytes.__version__}')
        print()

        # CUDA & GPU
        print('üéÆ CUDA & GPU')
        print(f'  CUDA available: {torch.cuda.is_available()}')
        if torch.cuda.is_available():
            print(f'  CUDA version: {torch.version.cuda}')
            print(f'  cuDNN version: {torch.backends.cudnn.version()}')
            print(f'  Number of GPUs: {torch.cuda.device_count()}')
            print()
            for i in range(torch.cuda.device_count()):
                print(f'  GPU {i}: {torch.cuda.get_device_name(i)}')
                props = torch.cuda.get_device_properties(i)
                print(f'    Total memory: {props.total_memory / 1024**3:.2f} GB')
                print(f'    Allocated: {torch.cuda.memory_allocated(i) / 1024**3:.2f} GB')
                print(f'    Reserved: {torch.cuda.memory_reserved(i) / 1024**3:.2f} GB')
                # "Free" is total memory minus what torch reports as reserved.
                # NOTE(review): presumably this ignores memory held by other
                # processes - confirm before relying on it.
                print(f'    Free: {(props.total_memory - torch.cuda.memory_reserved(i)) / 1024**3:.2f} GB')
        else:
            print('  ‚ö†Ô∏è  No GPU detected!')
        print()

        # Bitsandbytes check
        print('üîß BITSANDBYTES')
        try:
            import bitsandbytes as bnb
        except Exception as e:
            print(f'  Status: ‚ùå Error: {e}')
        else:
            print(f'  Status: ‚úÖ Available')
            # NOTE(review): this probe uses a nested attribute path that may not
            # exist in every bitsandbytes release - confirm against the
            # installed version. A failure here does not mean the package is
            # unusable, so it is reported separately from the import status.
            try:
                cuda_support = bnb.cuda_setup.common.get_cuda_lib_handle() is not None
            except Exception as e:
                print(f'  CUDA support: unknown (probe failed: {e})')
            else:
                print(f'  CUDA support: {cuda_support}')
        print()

        # Model status
        print('ü§ñ LOADED MODEL')
        if loaded_model is not None:
            print(f'  Model: {loaded_model_name}')
            print(f'  Device: {loaded_model.device if hasattr(loaded_model, "device") else "N/A"}')
            print(f'  Device map: {loaded_model.hf_device_map if hasattr(loaded_model, "hf_device_map") else "N/A"}')
        else:
            print('  No model currently loaded')
        print()

        print('=' * 80)

# Connect the button and show it above its output area.
diag_button.on_click(run_diagnostics)

display(widgets.VBox(children=[diag_button, diag_output]))