## 1️⃣ Check GPU & Environment

In [None]:
# Check GPU availability
import torch
import subprocess
import psutil

print("üîç Environment Check")
print(f"Python: {torch.__version__}")
print(f"CUDA Available: {torch.cuda.is_available()}")
print(f"GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None'}")
print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f}GB" if torch.cuda.is_available() else "")

# RAM Check
ram = psutil.virtual_memory()
print(f"RAM: {ram.total / 1e9:.2f}GB (Available: {ram.available / 1e9:.2f}GB)")

# NVIDIA SMI
try:
    result = subprocess.run(['nvidia-smi', '--query-gpu=index,name,driver_version,memory.total', '--format=csv,noheader'], 
                          capture_output=True, text=True)
    print(f"\nüìä nvidia-smi Output:\n{result.stdout}")
except Exception as e:
    print(f"nvidia-smi not available: {e}")

## 2️⃣ Install Dependencies

In [None]:
# Install required packages
%pip install -q transformers torch bitsandbytes accelerate peft llama-cpp-python fastapi uvicorn pydantic redis aiohttp psutil

print("‚úÖ Dependencies installed!")

## 3️⃣ Download & Load Qwen-2.5-Coder (Q4_K_M)

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch.cuda

# Clear cached GPU memory and reset the peak-memory counter before loading,
# so the figures printed after the load start from a clean baseline.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()

# NOTE(review): the banner mentions Q4_K_M, while the configuration below
# requests 4-bit NF4 through BitsAndBytesConfig — confirm the label.
print("üì• Loading Qwen-2.5-Coder 7B (Q4_K_M)...")

# Hub identifier of the checkpoint to load
MODEL_ID = "Qwen/Qwen2.5-Coder-7B-Instruct"

# 4-bit NF4 weights with double quantization; compute runs in float16
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

try:
    # Tokenizer first: it is the cheaper of the two downloads
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    print(f"‚úÖ Tokenizer loaded: {MODEL_ID}")

    # Quantized model, placed on devices automatically
    load_options = {
        "quantization_config": bnb_config,
        "device_map": "auto",
        "trust_remote_code": True,
        "torch_dtype": torch.float16,
    }
    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **load_options)
    print(f"‚úÖ Model loaded: {MODEL_ID}")

    # Report current and peak allocation on the GPU, in units of 1e9 bytes
    if torch.cuda.is_available():
        vram_used = torch.cuda.memory_allocated() / 1e9
        vram_total = torch.cuda.get_device_properties(0).total_memory / 1e9
        print(f"\nüìä VRAM Usage: {vram_used:.2f}GB / {vram_total:.2f}GB")
        peak_gb = torch.cuda.max_memory_allocated() / 1e9
        print(f"Memory Peak: {peak_gb:.2f}GB")

except Exception as err:
    # Loading is best-effort in this notebook: report and suggest remedies
    print(f"‚ùå Error loading model: {err}")
    for hint in (
        "\nüí° Try:",
        "1. Restart the runtime: Runtime ‚Üí Restart runtime",
        "2. Use a smaller model: Qwen/Qwen2.5-Coder-1.5B-Instruct",
    ):
        print(hint)

## 4️⃣ Test Model Inference

In [None]:
def generate_response(prompt: str, max_tokens: int = 128, temperature: float = 0.2) -> str:
    """Generate text with the globally loaded Qwen-2.5-Coder model.

    Args:
        prompt: Text handed to the tokenizer and then to the model.
        max_tokens: Upper bound on the number of newly generated tokens.
        temperature: Sampling temperature. A value <= 0 switches to greedy
            decoding, because sampling needs a strictly positive temperature.

    Returns:
        The first generated sequence decoded with special tokens removed.
        If anything fails, a string of the form ``"Error: <message>"`` is
        returned instead of raising.
    """
    try:
        # Move the inputs to wherever the model lives instead of assuming
        # CUDA, so the function also works when the model is not on a GPU.
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

        sampling = temperature > 0
        gen_kwargs = {
            "max_new_tokens": max_tokens,
            "repetition_penalty": 1.05,
            "do_sample": sampling,
        }
        if sampling:
            # Sampling-only knobs; passing them with greedy decoding is pointless.
            gen_kwargs.update(temperature=temperature, top_p=0.9, top_k=40)

        # Inference only: no gradients needed
        with torch.no_grad():
            outputs = model.generate(**inputs, **gen_kwargs)

        return tokenizer.decode(outputs[0], skip_special_tokens=True)
    except Exception as e:
        # Deliberate best-effort: callers in this notebook expect a string.
        return f"Error: {e}"

# Prompts available for a quick smoke test of the loaded model
test_prompts = [
    "# Python function to calculate fibonacci\ndef fibonacci",
    "# Refactor this code:\nfor i in range(len(list)):",
    "# Security audit: find issues in this SQL query:",
]

print("üß™ Testing Model Inference\n")
# Only the first prompt is run, to keep generation short
selected_prompts = test_prompts[:1]
for i, prompt in enumerate(selected_prompts, start=1):
    print(f"Test {i}:")
    print(f"Prompt: {prompt}...")
    response = generate_response(prompt, max_tokens=64)
    # Show at most the first 200 characters of the output
    print(f"Response: {response[:200]}...\n")

## 5️⃣ Set Up FastAPI Backend

In [None]:
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import Optional
import uvicorn
import asyncio
from threading import Thread

# Define request/response models
class GenerateRequest(BaseModel):
    # Request body accepted by POST /api/v1/assistant/generate.
    prompt: str  # text passed to generate_response
    task: str = "general"  # free-form label; echoed back in the response
    max_tokens: int = 256  # forwarded to generate_response as max_tokens
    temperature: float = 0.2  # NOTE(review): not read by the generate route in this file

class GenerateResponse(BaseModel):
    # Response body returned by POST /api/v1/assistant/generate.
    response: str  # text returned by generate_response
    task: str  # copied from the request's task field
    tokens: int  # size of the returned text, as computed by the route
    model: str  # model label string set by the route

# Application metadata passed to the FastAPI constructor
app_metadata = {
    "title": "Colab AI Coder API",
    "description": "Assistant IA bas√© sur Qwen-2.5-Coder 7B",
    "version": "0.1.0",
}

# The application object that the route decorators below attach to
app = FastAPI(**app_metadata)

print("‚úÖ FastAPI app created")

## 6️⃣ Define API Routes

In [None]:
@app.get("/health")
async def health_check():
    """Return a static payload with a status and the model label."""
    payload = {"status": "healthy", "model": "qwen2.5-coder:7b"}
    return payload

@app.post("/api/v1/assistant/generate", response_model=GenerateResponse)
async def generate(request: GenerateRequest):
    """Generate code using Qwen-2.5-Coder.

    Args:
        request: Validated request body; its ``prompt`` and ``max_tokens``
            are passed to ``generate_response`` and ``task`` is echoed back.

    Returns:
        A ``GenerateResponse`` carrying the generated text, the echoed task,
        the tokenizer token count of that text, and the model label.

    Raises:
        HTTPException: Status 500 when generation or token counting raises.
    """
    try:
        # NOTE: request.temperature is part of the schema but is not
        # forwarded to generate_response by this route.
        response = generate_response(request.prompt, max_tokens=request.max_tokens)

        # Count tokens with the model's tokenizer; splitting on whitespace
        # counts words, which is not what the `tokens` field reports.
        token_count = len(tokenizer.encode(response, add_special_tokens=False))

        return GenerateResponse(
            response=response,
            task=request.task,
            tokens=token_count,
            model="qwen2.5-coder:7b-q4_k_m"
        )
    except Exception as e:
        # Keep the original cause attached for server-side tracebacks.
        raise HTTPException(status_code=500, detail=str(e)) from e

@app.get("/api/v1/models/current")
async def get_current_model():
    """Return the model label, a size string, GPU memory figures and a status."""
    # Both figures stay at zero when CUDA is not available
    used_gb = total_gb = 0
    if torch.cuda.is_available():
        used_gb = torch.cuda.memory_allocated() / 1e9
        total_gb = torch.cuda.get_device_properties(0).total_memory / 1e9

    usage_text = f"{used_gb:.2f}GB / {total_gb:.2f}GB"
    model_info = {
        "name": "qwen2.5-coder:7b-q4_k_m",
        "size": "5.2GB",
        "vram_usage": usage_text,
        "status": "ready",
    }
    return model_info

print("‚úÖ Routes defined")

## 7️⃣ Start FastAPI Server

In [None]:
import nest_asyncio
nest_asyncio.apply()

# Start the API server in a background thread, then try to expose it publicly.
def run_server():
    """Run uvicorn serving `app` on 127.0.0.1:8000."""
    uvicorn.run(
        app,
        host="127.0.0.1",
        port=8000,
        log_level="info"
    )

# The server is started BEFORE anything ngrok-related is attempted: the tunnel
# is optional, and a failing ngrok import must not prevent the local API from
# coming up (the fallback message below promises that it is available).
server_thread = Thread(target=run_server, daemon=True)
server_thread.start()

print("‚úÖ FastAPI server started on http://127.0.0.1:8000")

# Get ngrok URL for public access
try:
    # NOTE(review): confirm that google.colab exposes an `ngrok` module in
    # your runtime; if this import fails, the except branch below runs.
    from google.colab import ngrok

    # Setup ngrok tunnel
    public_url = ngrok.connect(8000)
    print(f"üåê Public URL (ngrok): {public_url}")
    print(f"üìö API Docs: {public_url}/docs")

except Exception as e:
    # Best-effort: without a tunnel the API is still reachable locally.
    print(f"‚ö†Ô∏è ngrok not available in this environment: {e}")
    print("Local API will be available on: http://127.0.0.1:8000")

## 8️⃣ Example: Use Assistant

In [None]:
import time

# Pause briefly so the background server thread has time to come up.
# The examples below call generate_response directly, not over HTTP.
time.sleep(2)

# Example 1: complete a function from a commented stub
print("\nüî® Example 1: Generate Python Function\n")
prompt = "\n".join([
    "# Python function to calculate factorial",
    "# Input: n (integer)",
    "# Output: factorial of n",
    "def factorial(n):",
])

response = generate_response(prompt, max_tokens=100)
print("Generated Code:")
print(response)

# Example 2: ask for a refactor of a nested loop
print("\n\nüîß Example 2: Refactor Code\n")
refactor_prompt = "\n".join([
    "# Refactor this code for better performance and readability:",
    "for i in range(len(my_list)):",
    "    for j in range(len(my_list)):",
    "        if my_list[i] == my_list[j]:",
    "            print(my_list[i])",
])

response = generate_response(refactor_prompt, max_tokens=100)
print("Refactored Code:")
print(response)

## 9️⃣ Connect from VS Code

### Steps to connect from VS Code:

1. **Get the API URL:**
   - Copy the public URL printed by the "Start FastAPI Server" cell (step 7), if the ngrok tunnel was created
   - Or use: `http://127.0.0.1:8000` for local testing

2. **Install Colab AI Coder Extension** (coming soon):
   - Open VS Code
   - Go to Extensions (Ctrl+Shift+X)
   - Search for "Colab AI Coder"
   - Click Install

3. **Configure extension:**
   - Open Command Palette (Ctrl+Shift+P)
   - Type "Colab AI Coder: Configure API"
   - Paste the API URL

4. **Use the assistant:**
   - Right-click on code
   - Select "Generate", "Refactor", "Debug", or "Audit"
   - View results in sidebar

### API Endpoints:

```bash
# Health check
curl http://API_URL/health

# Generate code
curl -X POST http://API_URL/api/v1/assistant/generate \
  -H "Content-Type: application/json" \
  -d '{"prompt": "def hello", "task": "generate", "max_tokens": 100}'

# Get current model
curl http://API_URL/api/v1/models/current
```

## 🔟 Monitor Resources

In [None]:
import psutil
import torch

def monitor_resources():
    """Print a snapshot of CPU, RAM, GPU memory and root-disk usage."""
    gb = 1e9  # divisor used for every size printed below
    print("\nüìä Resource Monitor\n")

    # CPU utilisation, sampled with a one-second interval
    print(f"CPU Usage: {psutil.cpu_percent(interval=1)}%")

    # System memory
    mem = psutil.virtual_memory()
    print(f"RAM Usage: {mem.percent}% ({mem.used / gb:.2f}GB / {mem.total / gb:.2f}GB)")

    # GPU memory, reported only when CUDA is available
    if torch.cuda.is_available():
        allocated_gb = torch.cuda.memory_allocated() / gb
        capacity_gb = torch.cuda.get_device_properties(0).total_memory / gb
        share = allocated_gb / capacity_gb * 100
        print(f"GPU VRAM: {allocated_gb:.2f}GB / {capacity_gb:.2f}GB ({share:.1f}%)")
        print(f"GPU Name: {torch.cuda.get_device_name(0)}")

    # Disk usage of the root filesystem
    root_fs = psutil.disk_usage('/')
    print(f"Disk Usage: {root_fs.percent}% ({root_fs.used / gb:.2f}GB / {root_fs.total / gb:.2f}GB)")

monitor_resources()

---

## ✅ Setup Complete!

Your Colab AI Coder environment is ready. You can now:

- 🤖 Generate code using Qwen-2.5-Coder
- 🔧 Refactor and debug code
- 🔒 Audit code for security issues
- 📚 Access the API from VS Code
- 📊 Monitor VRAM and performance

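Keep this notebook's runtime connected to keep the API server available: the server runs in a background thread of the notebook process and stops when the runtime is restarted or disconnected.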

For more info, visit: [Colab AI Coder GitHub](https://github.com/yourusername/colab-ai-coder)