In [4]:
# Requires transformers>=4.51.0
# Requires sentence-transformers>=2.7.0

from sentence_transformers import SentenceTransformer

# Load the model
model = SentenceTransformer("Qwen/Qwen3-Embedding-8B")

# We recommend enabling flash_attention_2 for better acceleration and memory saving,
# together with setting `padding_side` to "left":
# model = SentenceTransformer(
#     "Qwen/Qwen3-Embedding-8B",
#     model_kwargs={"attn_implementation": "flash_attention_2", "device_map": "auto"},
#     tokenizer_kwargs={"padding_side": "left"},
# )

# The queries and documents to embed
queries = [
    "What is the capital of China?",
    "Explain gravity",
]
documents = [
    "The capital of China is Beijing.",
    "Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun.",
]

# Encode the queries and documents. Note that queries benefit from using a prompt
# Here we use the prompt called "query" stored under `model.prompts`, but you can
# also pass your own prompt via the `prompt` argument
query_embeddings = model.encode(queries, prompt_name="query")
document_embeddings = model.encode(documents)

# Compute the (cosine) similarity between the query and document embeddings
similarity = model.similarity(query_embeddings, document_embeddings)
print(similarity)
# tensor([[0.7493, 0.0751],
#         [0.0880, 0.6318]])


  from .autonotebook import tqdm as notebook_tqdm
Fetching 4 files: 100%|██████████| 4/4 [01:58<00:00, 29.70s/it] 
Loading checkpoint shards: 100%|██████████| 4/4 [00:02<00:00,  1.99it/s]


tensor([[0.7493, 0.0751],
        [0.0880, 0.6318]])


In [8]:
len(document_embeddings[0])

4096

In [9]:
len(document_embeddings[1])

4096

In [21]:
left = model.encode("left")
right = model.encode("right")
good = model.encode("good")
bad = model.encode("bad")

In [27]:
# Calculate cosine similarity between left and right
cosine_similarity = model.similarity(good, right)
print(f"good, right: {cosine_similarity}")

# Calculate cosine similarity between left and right
cosine_similarity = model.similarity(good, bad)
print(f"good, bad: {cosine_similarity}")



good, right: tensor([[0.7324]])
good, bad: tensor([[0.8374]])


In [28]:
def cos_sim(a, b):
    import numpy as np
    a = a / (np.linalg.norm(a) + 1e-12)
    b = b / (np.linalg.norm(b) + 1e-12)
    return float(np.dot(a, b))

def mean_pool(last_hidden_state, attention_mask):
    # last_hidden_state: [B, T, H], mask: [B, T]
    import torch
    mask = attention_mask.unsqueeze(-1).type_as(last_hidden_state)  # [B, T, 1]
    summed = (last_hidden_state * mask).sum(dim=1)                  # [B, H]
    counts = mask.sum(dim=1).clamp(min=1e-9)                        # [B, 1]
    return summed / counts

In [29]:
cos_sim(good, bad)

0.8373851180076599

In [33]:
# Use E5-large-v2 to check cosine similarity between good and bad
from sentence_transformers import SentenceTransformer
import torch

# Load E5-large-v2 model
e5_model = SentenceTransformer('intfloat/e5-large-v2')

# Encode "good" and "bad"
good_e5 = e5_model.encode("good")
bad_e5 = e5_model.encode("bad")
hot_e5 = e5_model.encode("hot")
cold_e5 = e5_model.encode("cold")

# Calculate cosine similarity using the model's similarity function
print(f"good, bad: {e5_model.similarity(good_e5, bad_e5)}")
print(f"bad, hot_e5: {e5_model.similarity(good_e5, hot_e5)}")


good, bad: tensor([[0.8947]])
bad, hot_e5: tensor([[0.8106]])


In [35]:
# Use SimCSE to check cosine similarity between good and bad
from sentence_transformers import SentenceTransformer

# Load SimCSE model
simcse_model = SentenceTransformer('princeton-nlp/unsup-simcse-bert-large-uncased')

# Encode "good" and "bad"
good_simcse = simcse_model.encode("good")
bad_simcse = simcse_model.encode("bad")
nice_simcse = simcse_model.encode("nice")

# Calculate cosine similarity  the model's similarity function
similarity_simcse = simcse_model.similarity(good_simcse, bad_simcse)
print(f"SimCSE - good和bad之间的cosine similarity: {similarity_simcse}")

# Calculate cosine similarity  the model's similarity function
similarity_simcse = simcse_model.similarity(good_simcse, nice_simcse)
print(f"SimCSE - good和nice之间的cosine similarity: {similarity_simcse}")



No sentence-transformers model found with name princeton-nlp/unsup-simcse-bert-large-uncased. Creating a new one with mean pooling.


SimCSE - good和bad之间的cosine similarity: tensor([[0.7198]])
SimCSE - good和nice之间的cosine similarity: tensor([[0.5724]])


In [None]:
# Run this cell to install the spaCy English model
# Uncomment the line below and run this cell

# !pip install --user https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl


In [41]:
import spacy

# Install spaCy model if not already installed
# Run this if you get an error: python -m spacy download en_core_web_sm

nlp = spacy.load("en_core_web_sm")
text = "We use retrieval augmented generation and ice cream as examples."
doc = nlp(text)

print("Noun phrases:")
for chunk in doc.noun_chunks:
    print(f"  - {chunk.text}")


Noun phrases:
  - We
  - retrieval augmented generation and ice cream
  - examples
