# Upload to Qdrant

Upload `.md` files to the vector database using the [TWYD API](https://twyd.kubeapps.alquimiaai.hostmydemo.online/docs#/).

---

In [None]:
!pip install transformers -q

In [None]:
import json
import os

import requests
from transformers import AutoTokenizer

In [None]:
# Notebook configuration.

# Values expected to be already set in the environment.
TWYD_URL = os.getenv("TWYD_URL")
TWYD_TOKEN = os.getenv("TWYD_TOKEN")
DIR_PATH = "ocr-output/"  # directory that is scanned for .md files

# Per-run values, also read from the environment.
TWYD_TOPIC_ID = os.getenv("topic_id")
TEXT_EMBEDDER = os.getenv("embedding_model", "nomic-ai/nomic-embed-text-v1")
MAX_TOKENS = 8192  # upper bound used when sizing chunks

# Bearer-token header sent with every API request.
headers = {"Authorization": f"Bearer {TWYD_TOKEN}"}

In [None]:
# Load the tokenizer published for the configured embedding model; it is used
# further down to count the tokens of each file.
# NOTE(review): trust_remote_code=True allows transformers to run Python code
# shipped in the model repository — keep TEXT_EMBEDDER pointed at a trusted model.
tokenizer = AutoTokenizer.from_pretrained(TEXT_EMBEDDER, trust_remote_code=True)

In [None]:
# Token-counting helper
def get_tokens_from_file(file_path, tokenizer):
    """Return how many tokens the tokenizer produces for a UTF-8 text file.

    Args:
        file_path: Path of the file to read.
        tokenizer: Object exposing ``tokenize(text)`` that returns a sized
            collection of tokens.

    Returns:
        The number of tokens in the file's full text.
    """
    with open(file_path, encoding="utf-8") as handle:
        contents = handle.read()

    return len(tokenizer.tokenize(contents))

def calculate_optimal_chunk_size(total_tokens, max_tokens=None):
    """Pick a chunk size (in tokens) that stays below the token limit.

    The text is notionally split into 1, 2, 4, 6, ..., 32 equal parts; the
    first split whose part size is strictly below the limit wins.

    NOTE(review): a later cell in this notebook defines a function with the
    same name; when the cells run in order, that later definition is the one
    in effect.

    Args:
        total_tokens: Token count of the whole text.
        max_tokens: Upper bound for a chunk. Defaults to the module-level
            ``MAX_TOKENS`` when omitted.

    Returns:
        The per-part size truncated to an int, or the limit itself when even
        32 parts are not small enough. A token count of 0 yields 0.
    """
    limit = MAX_TOKENS if max_tokens is None else max_tokens

    # One part first, then the even part counts: the same candidates the
    # previous range(0, 34, 2) loop produced through its ZeroDivisionError
    # fallback, without using an exception for control flow.
    for parts in (1, *range(2, 34, 2)):
        part_size = total_tokens / parts
        if part_size < limit:
            return int(part_size)

    return int(limit)

In [None]:
# --- Initialize status tracking, resuming from status.json when it exists ---
status_path = os.path.join(DIR_PATH, "status.json")
status_list = []  # one result entry per processed file
if os.path.exists(status_path):
    # Start from the entries already stored in the status file.
    with open(status_path, encoding="utf-8") as status_file:
        status_list = json.load(status_file)

def save_status():
    """Persist ``status_list`` to ``status_path`` as indented UTF-8 JSON.

    The JSON is first written to a temporary sibling file and then moved over
    ``status_path`` with ``os.replace``. This function runs after every
    processed file, so writing in place would risk leaving a truncated
    status.json if the run is interrupted mid-write, which would make the
    next run fail when it loads the file.
    """
    tmp_path = status_path + ".tmp"
    with open(tmp_path, 'w', encoding='utf-8') as f:
        json.dump(status_list, f, ensure_ascii=False, indent=2)
    # Swap the finished file into place in a single step.
    os.replace(tmp_path, status_path)

def get_tokens_from_file(file_path, tokenizer):
    """Count the tokens that ``tokenizer`` yields for the text at ``file_path``.

    The file is decoded as UTF-8 and tokenized in one piece.

    Args:
        file_path: Path of the file to read.
        tokenizer: Object exposing ``tokenize(text)``.

    Returns:
        The length of the token sequence returned by the tokenizer.
    """
    with open(file_path, encoding="utf-8") as source:
        contents = source.read()
    token_list = tokenizer.tokenize(contents)
    return len(token_list)

def calculate_optimal_chunk_size(total_tokens, max_tokens=None):
    """Return a chunk size (in tokens) strictly below the token limit.

    The text is notionally split into 1, 3, 5, ..., 33 equal parts and the
    size of the first split that falls below the limit is returned.

    NOTE(review): an earlier definition of this function in the notebook tries
    1, 2, 4, ..., 32 parts instead. The odd part counts used here are kept
    as-is because this is the definition in effect when cells run in order;
    confirm which sequence is intended.

    Args:
        total_tokens: Token count of the whole text.
        max_tokens: Upper bound for a chunk. Defaults to the module-level
            ``MAX_TOKENS`` when omitted.

    Returns:
        The per-part size truncated to an int, or the limit itself when even
        33 parts are not small enough. A token count of 0 yields 0.
    """
    limit = MAX_TOKENS if max_tokens is None else max_tokens

    # The part count starts at 1, so no zero-division guard is needed, and a
    # size that passed the "< limit" test needs no further clamping.
    for parts in range(1, 34, 2):
        part_size = total_tokens / parts
        if part_size < limit:
            return int(part_size)

    return int(limit)

In [None]:
# Walk DIR_PATH and send every .md file to TWYD: upload the file, then attach
# it to the target topic with a token-based chunking configuration. One status
# entry per file is appended to status_list and persisted right away.

# Both endpoints are built from one base URL. Previously the upload call used
# TWYD_URL as the host while the association call appended
# ".alquimiaai.hostmydemo.online" to it, so the two could not both resolve for
# the same TWYD_URL value.
# NOTE(review): assumes TWYD_URL holds the full API host name — confirm.
api_base = f"https://{TWYD_URL}/api"

for root, _dirs, files in os.walk(DIR_PATH):
    for filename in files:
        if not filename.lower().endswith('.md'):
            continue

        file_path = os.path.join(root, filename)
        entry = {
            "file": filename,
            "status": None,
            "error": None,
            "topic_id": None
        }

        try:
            # 1) Work out the chunking parameters before any network call, so
            #    a local read/tokenization failure cannot leave a file that
            #    was uploaded but never associated with the topic.
            total_tokens = get_tokens_from_file(file_path, tokenizer)
            chunk_size = calculate_optimal_chunk_size(total_tokens)

            json_body = {
                "separators": ["# ","## ","### ","#### ","##### ","\n\n","\n","> ","- ","* ","---"],
                "isSeparatorRegex": False,
                "chunkSize": chunk_size,
                "chunkOverlap": int(chunk_size * 0.05),  # 5% of the chunk size
                "keepSeparator": True,
                "addStartIndex": False,
                "stripWhitespace": True
            }

            # 2) Upload the file as multipart form data.
            with open(file_path, 'rb') as fp:
                files_payload = {'file': (filename, fp, 'text/markdown')}
                resp = requests.post(
                    f'{api_base}/files/upload',
                    files=files_payload,
                    headers=headers
                )
            resp.raise_for_status()
            file_id = resp.json().get("id")
            if not file_id:
                raise ValueError("No se devolvió 'id' al subir el archivo")

            # 3) Associate the uploaded file with the topic.
            assoc_resp = requests.put(
                f"{api_base}/topics/{TWYD_TOPIC_ID}/add/{file_id}",
                headers={**headers, 'Content-Type': 'application/json'},
                json=json_body
            )
            assoc_resp.raise_for_status()

            # Success
            entry["status"]   = "success"
            entry["topic_id"] = TWYD_TOPIC_ID

            print(f"✅ {filename} → topic {TWYD_TOPIC_ID}")

        except Exception as e:
            # Per-file boundary: record the failure and move on to the next
            # file instead of aborting the whole run.
            entry["status"] = "error"
            entry["error"]  = str(e)
            print(f"❌ {filename}: {e}")

        # Record the outcome and persist it immediately.
        status_list.append(entry)
        save_status()

---