# BGE endpoints on Colab

This notebook exposes HTTP endpoints for embeddings and reranking:
- `GET /health`
- `POST /embed` (BAAI/bge-m3)
- `POST /rerank` (BAAI/bge-reranker-v2-m3)

Request formats:
- `POST /embed` `{'texts': ['text1', 'text2']}`
- `POST /rerank` `{'query': 'q', 'documents': ['d1', 'd2']}`

Public URL is created via ngrok. Set `NGROK_AUTH_TOKEN` in the environment.

Optional protection: set `BGE_API_KEY` and pass header `X-API-Key`.


In [3]:
!pip -q install fastapi "uvicorn[standard]" sentence-transformers transformers pyngrok --upgrade

In [10]:
import os
from pyngrok import ngrok
from google.colab import userdata

# –ü–æ–ø—ã—Ç–∫–∞ –∑–∞–≥—Ä—É–∑–∏—Ç—å —Ç–æ–∫–µ–Ω –∏–∑ —Å–µ–∫—Ä–µ—Ç–æ–≤ Colab
try:
    NGROK_AUTH_TOKEN = userdata.get("NGROK_AUTH_TOKEN")
except Exception:
    NGROK_AUTH_TOKEN = os.environ.get("NGROK_AUTH_TOKEN", "")

if NGROK_AUTH_TOKEN:
    ngrok.set_auth_token(NGROK_AUTH_TOKEN)
    print("Ngrok token set successfully.")
else:
    print("‚ö†Ô∏è NGROK_AUTH_TOKEN –Ω–µ –Ω–∞–π–¥–µ–Ω. –ü—É–±–ª–∏—á–Ω—ã–π URL –Ω–µ –±—É–¥–µ—Ç —Å–æ–∑–¥–∞–Ω.")

Ngrok token set successfully.


In [5]:
import torch
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSequenceClassification

EMBED_MODEL_NAME = "BAAI/bge-m3"
RERANK_MODEL_NAME = "BAAI/bge-reranker-v2-m3"

device = "cuda" if torch.cuda.is_available() else "cpu"

embed_model = SentenceTransformer(EMBED_MODEL_NAME, device=device)
rerank_tokenizer = AutoTokenizer.from_pretrained(RERANK_MODEL_NAME)
rerank_model = AutoModelForSequenceClassification.from_pretrained(RERANK_MODEL_NAME)
rerank_model.eval()
rerank_model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/444 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/795 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 1024, padding_idx=1)
      (position_embeddings): Embedding(8194, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=1024, o

In [11]:
import os
import torch
from fastapi import FastAPI, Header, HTTPException
from pydantic import BaseModel
from typing import List, Optional
from google.colab import userdata

app = FastAPI(title="bge-endpoints")

# –ü–æ–ø—ã—Ç–∫–∞ –∑–∞–≥—Ä—É–∑–∏—Ç—å API –∫–ª—é—á –∏–∑ —Å–µ–∫—Ä–µ—Ç–æ–≤
try:
    API_KEY = userdata.get("BGE_API_KEY")
except Exception:
    API_KEY = os.environ.get("BGE_API_KEY", "")

def _auth(x_api_key: Optional[str]) -> None:
    if API_KEY and x_api_key != API_KEY:
        raise HTTPException(status_code=401, detail="invalid api key")

class EmbedRequest(BaseModel):
    texts: List[str]

class RerankRequest(BaseModel):
    query: str
    documents: List[str]

@app.get("/health")
def health(x_api_key: Optional[str] = Header(default=None, alias="X-API-Key")):
    _auth(x_api_key)
    return {
        'status': 'ok',
        'embed_model': EMBED_MODEL_NAME,
        'rerank_model': RERANK_MODEL_NAME,
        'device': device,
    }

@app.post("/embed")
def embed(req: EmbedRequest, x_api_key: Optional[str] = Header(default=None, alias="X-API-Key")):
    _auth(x_api_key)
    vectors = embed_model.encode(req.texts, normalize_embeddings=True).tolist()
    return {'embeddings': vectors}

@app.post("/rerank")
def rerank(req: RerankRequest, x_api_key: Optional[str] = Header(default=None, alias="X-API-Key")):
    _auth(x_api_key)
    pairs = [(req.query, doc) for doc in req.documents]
    inputs = rerank_tokenizer(
        pairs,
        padding=True,
        truncation=True,
        return_tensors="pt",
        max_length=512,
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        scores = rerank_model(**inputs).logits.view(-1)
    return {'scores': scores.detach().cpu().tolist()}

In [7]:
import threading
import uvicorn

def run():
    uvicorn.run(app, host="0.0.0.0", port=8000)

thread = threading.Thread(target=run, daemon=True)
thread.start()

In [12]:
if not NGROK_AUTH_TOKEN:
    print("Set NGROK_AUTH_TOKEN to enable ngrok public URL.")
else:
    public_url = ngrok.connect(8000).public_url
    print("Public base URL:", public_url)
    print("Embed endpoint:", public_url + "/embed")
    print("Rerank endpoint:", public_url + "/rerank")

Public base URL: https://cc2bb1b6c3b0.ngrok-free.app
Embed endpoint: https://cc2bb1b6c3b0.ngrok-free.app/embed
Rerank endpoint: https://cc2bb1b6c3b0.ngrok-free.app/rerank


## Quick test (curl)

If `BGE_API_KEY` is set, add header `X-API-Key: $BGE_API_KEY`.

```
curl -X GET "$PUBLIC_URL/health"

curl -X POST "$PUBLIC_URL/embed" -H "Content-Type: application/json" \
  -d '{"texts": ["hello world", "bitrix docs"]}'

curl -X POST "$PUBLIC_URL/rerank" -H "Content-Type: application/json" \
  -d '{"query": "bitrix user", "documents": ["CUser class", "CRM lead"]}'
```


In [19]:
print("="*50)
print("üöÄ API INFO & CONFIGURATION")
print("="*50)

if 'public_url' in globals() and public_url:
    print(f"\nüåê Base URL:\n   {public_url}")

    print("\nüìç Endpoints:")
    print(f"   ‚Ä¢ [GET]  Health: {public_url}/health")
    print(f"   ‚Ä¢ [POST] Embed:  {public_url}/embed")
    print(f"   ‚Ä¢ [POST] Rerank: {public_url}/rerank")

    print("\nüîë Authentication:")
    if 'API_KEY' in globals() and API_KEY:
        print(f"   Header: X-API-Key: {API_KEY}")
    else:
        print("   ‚ö†Ô∏è No API Key configured! The endpoint is public.")

    print("\nüìö Usage Example (Python requests):")
    print("-"*30)
    example_key = API_KEY if ('API_KEY' in globals() and API_KEY) else 'YOUR_KEY'
    print(f"import requests\n")
    print(f"url = '{public_url}/embed'")
    print(f"headers = {{'X-API-Key': '{example_key}'}}")
    print(f"payload = {{'texts': ['Hello world']}}")
    print(f"response = requests.post(url, json=payload, headers=headers)")
    print(f"print(response.json())")
    print("-"*30)

    print("\nüñ•Ô∏è Usage Example (cURL):")
    print("-"*30)
    print(f"curl -X POST {public_url}/embed \\")
    print(f"  -H 'X-API-Key: {example_key}' \\")
    print(f"  -H 'Content-Type: application/json' \\")
    print(f"  -d '{{\"texts\": [\"Hello world\"]}}'")
    print("-"*30)
else:
    print("‚ö†Ô∏è Public URL is not generated yet. Please run the ngrok cell above.")

üöÄ API INFO & CONFIGURATION

üåê Base URL:
   https://cc2bb1b6c3b0.ngrok-free.app

üìç Endpoints:
   ‚Ä¢ [GET]  Health: https://cc2bb1b6c3b0.ngrok-free.app/health
   ‚Ä¢ [POST] Embed:  https://cc2bb1b6c3b0.ngrok-free.app/embed
   ‚Ä¢ [POST] Rerank: https://cc2bb1b6c3b0.ngrok-free.app/rerank

üîë Authentication:
   Header: X-API-Key: BgE_My_SecreT!

üìö Usage Example (Python requests):
------------------------------
import requests

url = 'https://cc2bb1b6c3b0.ngrok-free.app/embed'
headers = {'X-API-Key': 'BgE_My_SecreT!'}
payload = {'texts': ['Hello world']}
response = requests.post(url, json=payload, headers=headers)
print(response.json())
------------------------------

üñ•Ô∏è Usage Example (cURL):
------------------------------
curl -X POST https://cc2bb1b6c3b0.ngrok-free.app/embed \
  -H 'X-API-Key: BgE_My_SecreT!' \
  -H 'Content-Type: application/json' \
  -d '{"texts": ["Hello world"]}'
------------------------------


In [16]:
import requests
import json

# –ü—Ä–æ–≤–µ—Ä—è–µ–º, –æ–ø—Ä–µ–¥–µ–ª–µ–Ω–∞ –ª–∏ –ø–µ—Ä–µ–º–µ–Ω–Ω–∞—è URL
if 'public_url' not in globals():
    print("‚ö†Ô∏è –ü–µ—Ä–µ–º–µ–Ω–Ω–∞—è public_url –Ω–µ –Ω–∞–π–¥–µ–Ω–∞. –£–±–µ–¥–∏—Ç–µ—Å—å, —á—Ç–æ —è—á–µ–π–∫–∞ —Å ngrok –∑–∞–ø—É—â–µ–Ω–∞.")
else:
    print(f"Testing API at: {public_url}")

    # –ü–æ–¥–≥–æ—Ç–∞–≤–ª–∏–≤–∞–µ–º –∑–∞–≥–æ–ª–æ–≤–∫–∏
    headers = {"Content-Type": "application/json"}
    if 'API_KEY' in globals() and API_KEY:
        headers["X-API-Key"] = API_KEY
        print(f"üîë Using API Key: {API_KEY[:3]}***")

    # 1. Health Check
    print("\n--- 1. Health Check ---")
    try:
        resp = requests.get(f"{public_url}/health", headers=headers)
        print(f"Status: {resp.status_code}")
        print("Response:", resp.json())
    except Exception as e:
        print(f"Failed: {e}")

    # 2. Embed Test
    print("\n--- 2. Embed Test ---")
    try:
        payload = {"texts": ["hello world", "testing api"]}
        resp = requests.post(f"{public_url}/embed", json=payload, headers=headers)
        if resp.status_code == 200:
            data = resp.json()
            vec_len = len(data['embeddings'][0])
            print(f"‚úÖ Success! Generated {len(data['embeddings'])} vectors. Dimension: {vec_len}")
        else:
            print("‚ùå Error:", resp.text)
    except Exception as e:
        print(f"Failed: {e}")

    # 3. Rerank Test
    print("\n--- 3. Rerank Test ---")
    try:
        payload = {"query": "what is colab?", "documents": ["Colab is a jupyter notebook service", "Apples are red"]}
        resp = requests.post(f"{public_url}/rerank", json=payload, headers=headers)
        if resp.status_code == 200:
            print("‚úÖ Success! Scores:", resp.json()['scores'])
        else:
            print("‚ùå Error:", resp.text)
    except Exception as e:
        print(f"Failed: {e}")

Testing API at: https://cc2bb1b6c3b0.ngrok-free.app
üîë Using API Key: BgE***

--- 1. Health Check ---
INFO:     34.187.252.128:0 - "GET /health HTTP/1.1" 200 OK
Status: 200
Response: {'status': 'ok', 'embed_model': 'BAAI/bge-m3', 'rerank_model': 'BAAI/bge-reranker-v2-m3', 'device': 'cuda'}

--- 2. Embed Test ---
INFO:     34.187.252.128:0 - "POST /embed HTTP/1.1" 200 OK
‚úÖ Success! Generated 2 vectors. Dimension: 1024

--- 3. Rerank Test ---
INFO:     34.187.252.128:0 - "POST /rerank HTTP/1.1" 200 OK
‚úÖ Success! Scores: [6.628366470336914, -11.033652305603027]
