In [1]:
!pip install sentence-transformers datasets torch


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import torch
from sentence_transformers import (
    SentenceTransformer,
    InputExample,
    losses,
    models,
    util,
)
from torch.utils.data import DataLoader
from datasets import load_dataset

# Choose the compute backend. Priority: Apple MPS, then NVIDIA CUDA, and
# finally plain CPU when no accelerator is reported as available.
device = "cpu"
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"

# Report the selected backend with the message that matches it.
print(
    {
        "mps": "✅ Apple GPU detected. Using 'mps' backend.",
        "cuda": "✅ NVIDIA GPU detected. Using 'cuda' backend.",
        "cpu": "⚠️ No GPU detected. Using 'cpu' (Training will be slow).",
    }[device]
)

# Model identifier passed to models.Transformer when the base encoder is built.
BASE_MODEL_NAME = "distilbert-base-uncased"
# Directory the fine-tuned model is saved to and later reloaded from.
OUTPUT_PATH = "output/my-google-qa-model-mps"

# Number of training examples per DataLoader batch.
BATCH_SIZE = 32
# Number of epochs passed to model.fit.
NUM_EPOCHS = 1
# Number of rows taken from the streamed training split.
NUM_SAMPLES = 10000

✅ Apple GPU detected. Using 'mps' backend.


In [3]:
from tqdm.auto import tqdm  # This adds a progress bar so you know it's working

# Build the training DataLoader from the first NUM_SAMPLES rows of the GooAQ
# question/answer dataset.
#
# NUM_SAMPLES and BATCH_SIZE come from the configuration cell. They are
# deliberately NOT redefined here: a second definition silently overrides the
# configuration cell, so edits made there would have no effect.

print("Initializing streaming connection...")

# streaming=True iterates over rows lazily instead of downloading the whole split.
dataset = load_dataset("sentence-transformers/gooaq", split="train", streaming=True)

# Restrict the stream to the first NUM_SAMPLES rows.
dataset_head = dataset.take(NUM_SAMPLES)

print(f"Streaming first {NUM_SAMPLES} rows...")

# One InputExample per (question, answer) pair. Rows where either field is not
# a string are skipped.
train_examples = []

for row in tqdm(dataset_head, total=NUM_SAMPLES, desc="Processing Data"):
    query = row["question"]
    context = row["answer"]

    if isinstance(query, str) and isinstance(context, str):
        train_examples.append(InputExample(texts=[query, context]))

print(f"Successfully created {len(train_examples)} training examples.")

# Fail here with a clear message: a shuffling DataLoader over an empty list
# would otherwise raise an opaque sampler error.
if not train_examples:
    raise RuntimeError(
        "No usable (question, answer) rows were streamed; cannot build a DataLoader."
    )

train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=BATCH_SIZE)

Initializing streaming connection...
Streaming first 10000 rows...


Processing Data:   0%|          | 0/10000 [00:00<?, ?it/s]

Successfully created 10000 training examples.


In [5]:
# Diagnostic cell: verify that huggingface.co is reachable and that the base
# model weights can be downloaded. It is only a troubleshooting aid.
import logging

import requests

# 1. Turn on detailed logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("transformers")
logger.setLevel(logging.INFO)

# Set to True only to discard a corrupted cache entry and fetch every file
# again. When False, cached files are reused and interrupted downloads resume.
FORCE_REDOWNLOAD = False

# 2. Test connection to Hugging Face
# Reset first: a `response` left in the kernel by an earlier run of this cell
# must not let step 3 proceed after this run's request has failed.
response = None
print("Testing connection to huggingface.co...")
try:
    response = requests.get("https://huggingface.co", timeout=5)
    print(f"✅ Connection successful (Status: {response.status_code})")
except requests.RequestException as e:
    print(f"❌ Connection FAILED: {e}")
    print("STOP HERE: You have a firewall or internet issue.")

# 3. Download the base model (interrupted downloads are resumed automatically)
if response is not None and response.status_code == 200:
    print("\nAttempting download with resume support...")
    from transformers import AutoModel

    # `resume_download` is deprecated and ignored by current releases, so it is
    # no longer passed. The result gets its own name so that this check does not
    # overwrite `model`, which later cells expect to be the SentenceTransformer.
    base_model_check = AutoModel.from_pretrained(
        BASE_MODEL_NAME,
        force_download=FORCE_REDOWNLOAD,
    )
    print("✅ Download Success!")
elif response is not None:
    print(f"⚠️ Unexpected status {response.status_code}; skipping download.")

Testing connection to huggingface.co...
✅ Connection successful (Status: 200)

Attempting download with resume support...




KeyboardInterrupt: 

In [None]:

# Assemble the sentence-embedding model: a transformer encoder followed by
# mean pooling over its token embeddings.

# The weights are fetched from the Hub when they are not already cached, so the
# message must not claim a cache-only load.
print(f"Loading {BASE_MODEL_NAME} (downloading it first if it is not cached)...")

word_embedding_model = models.Transformer(BASE_MODEL_NAME)

# Average the token embeddings into one fixed-size vector per input text.
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
)

if device == "mps":
    # MPS is the Metal GPU backend, not the Apple Neural Engine.
    print("Placing model on the Apple GPU (MPS backend)...")

model = SentenceTransformer(
    modules=[word_embedding_model, pooling_model],
    device=device,
)

print(f"✅ Model successfully loaded on device: {model.device}")

Initializing base model: distilbert-base-uncased...


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Cancellation requested; stopping current tasks.


KeyboardInterrupt: 

In [None]:
# Fine-tune `model` on the (question, answer) pairs and save the result to disk.
train_loss = losses.MultipleNegativesRankingLoss(model)

# Warm up over the first 10% of all training steps.
total_steps = len(train_dataloader) * NUM_EPOCHS
warmup_steps = int(total_steps * 0.1)

print("Starting training...")

fit_kwargs = dict(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=NUM_EPOCHS,
    warmup_steps=warmup_steps,
    show_progress_bar=True,
)
model.fit(**fit_kwargs)

# Save the fine-tuned model so the inference cell can reload it from disk.
model.save(OUTPUT_PATH)
print(f"Training finished! Model saved to {OUTPUT_PATH}")

In [None]:
# Sanity check: reload the saved model from disk and rank a few passages
# against one query by cosine similarity.
print("Loading trained model for inference...")
trained_model = SentenceTransformer(OUTPUT_PATH)

test_query = "How do I make coffee?"
# Two passages about coffee mixed with two unrelated distractors.
test_docs = [
    "To brew coffee, grind beans and use hot water in a french press.",  # Relevant
    "The capital of France is Paris.",  # Irrelevant
    "Python is a programming language.",  # Irrelevant
    "Boiling water is essential for extracting flavor from coffee beans.",  # Relevant
]

# Embed the query and the candidate passages with the same model.
query_emb = trained_model.encode(test_query)
doc_embs = trained_model.encode(test_docs)

# Keep the first row of the similarity result: the scores for our single query.
scores = util.cos_sim(query_emb, doc_embs)[0]

# Pair every passage with its score, then order from most to least similar.
results = list(zip(test_docs, scores))
ranked = sorted(results, key=lambda pair: pair[1], reverse=True)

print(f"\nQuery: {test_query}")
print("-" * 50)
for doc, score in ranked:
    print(f"Score: {score:.4f} | {doc}")