In [2]:
import sys
import os

# Make the directory above the current working directory importable, so
# project packages resolve when the notebook is run from a subfolder.
parent_of_cwd = os.path.join(os.getcwd(), os.pardir)
project_root = os.path.abspath(parent_of_cwd)
if sys.path.count(project_root) == 0:
    sys.path.insert(0, project_root)



In [3]:
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:

# Fetch the checkpoint from the 🤗 Hub
model = SentenceTransformer("all-MiniLM-L6-v2")

# Minimal inference batch: three German titles
sentences = ['Volksvertreter', 'Parlamentarier', 'Oberbürgermeister']

# Encode with the default settings and report the resulting shape
embeddings = model.encode(sentences)
print(embeddings.shape)

(3, 384)


In [3]:

# Fetch the checkpoint from the 🤗 Hub
model = SentenceTransformer("all-MiniLM-L6-v2")

# Same three-title batch as the default-settings run
sentences = ['Volksvertreter', 'Parlamentarier', 'Oberbürgermeister']

# This time explicitly ask encode() to normalize, then report the shape
embeddings = model.encode(
    sentences,
    normalize_embeddings=True,
)
print(embeddings.shape)

(3, 384)


In [4]:
from model import BiEncoder 
import numpy as np

def test_normalization_flag(base_config):
    """Test that the normalize_output flag works correctly."""
    # Test with normalization ON
    base_config.model.proj_dim = None # No projection for simplicity
    model_norm = BiEncoder(base_config.model, base_config.device)
    
    job_titles = ["Data Scientist"]
    job_embs_norm = model_norm.encode_job(job_titles, normalize=True)
    
    # Check that the L2 norm is close to 1
    norm = np.linalg.norm(job_embs_norm, axis=1)
    assert np.allclose(norm, 1.0), "Embeddings should be normalized"

    # Test with normalization OFF
    model_no_norm = BiEncoder(base_config.model, base_config.device)

    job_embs_no_norm = model_no_norm.encode_job(job_titles, normalize=False)

    # Check that the L2 norm is NOT 1
    norm_unnormalized = np.linalg.norm(job_embs_no_norm, axis=1)
    assert not np.allclose(norm_unnormalized, 1.0), "Embeddings should not be normalized"

In [5]:
def base_config(dummy_data_files):
    """Build the baseline ``Config`` that other fixtures start from.

    ``dummy_data_files`` is unpacked as ``(pairs_path, esco_path)``; both
    paths are forwarded to the ``DataConfig``. The model section uses the
    ``all-MiniLM-L6-v2`` checkpoint with ``normalize_output=True``.
    """
    pairs_file, esco_file = dummy_data_files

    model_cfg = ModelConfig(
        hf_id="sentence-transformers/all-MiniLM-L6-v2",
        proj_dim=None,
        asymmetric=False,
        normalize_output=True,
    )
    data_cfg = DataConfig(pairs_path=pairs_file, esco_titles_path=esco_file)
    infer_cfg = InferConfig(batch_size=32, topk=5)
    artifacts_cfg = ArtifactsConfig(run_dir="runs/test")

    return Config(
        seed=42,
        device="cpu",
        model=model_cfg,
        data=data_cfg,
        infer=infer_cfg,
        artifacts=artifacts_cfg,
    )

In [17]:
from src.config import Config, ModelConfig, DataConfig, InferConfig, ArtifactsConfig
import os
import tempfile

# Write two one-line JSONL fixtures into the system temp directory so the
# config below has real files to point at.
temp_dir = tempfile.gettempdir()
pairs_path = os.path.join(temp_dir, "dummy_pairs.jsonl")
esco_path = os.path.join(temp_dir, "dummy_esco.jsonl")

for fixture_path, fixture_line in (
    (pairs_path, '{"skill": "python", "job": "Data Scientist"}\n'),
    (esco_path, '{"title": "Data Scientist"}\n'),
):
    with open(fixture_path, "w") as f:
        f.write(fixture_line)

dummy_data_files = (pairs_path, esco_path)

def create_base_config(dummy_data_files, normalize_output=False):
    """Build a base ``Config`` that callers can further modify.

    Args:
        dummy_data_files: Two-item sequence ``(pairs_path, esco_path)``;
            both paths are forwarded to the ``DataConfig``.
        normalize_output: Value passed to ``ModelConfig.normalize_output``.
            Defaults to ``False``, the value that was previously hard-coded,
            so existing one-argument calls behave as before.

    Returns:
        A ``Config`` with seed 42 on the CPU device, using the
        ``all-MiniLM-L6-v2`` checkpoint without projection.
    """
    pairs_path, esco_path = dummy_data_files
    return Config(
        seed=42,
        device="cpu",
        model=ModelConfig(
            hf_id="sentence-transformers/all-MiniLM-L6-v2",
            proj_dim=None,
            asymmetric=False,
            normalize_output=normalize_output,
        ),
        data=DataConfig(pairs_path=pairs_path, esco_titles_path=esco_path),
        infer=InferConfig(batch_size=32, topk=5),
        artifacts=ArtifactsConfig(run_dir="runs/test"),
    )

# Shared config for the cells below, built from the dummy fixture paths.
base_conf = create_base_config(dummy_data_files)



In [8]:
# Run the normalization check against the shared config.
test_normalization_flag(base_conf)

AssertionError: Embeddings should not be normalized

In [None]:
from model import BiEncoder 
import numpy as np

def test_normalization_flag(base_config):
    """Test that the normalize_output flag works correctly."""
    # Test with normalization ON
    base_config.model.proj_dim = None # No projection for simplicity
    model_norm = BiEncoder(base_config.model, base_config.device)
    
    job_titles = ["Data Scientist"]
    job_embs_norm = model_norm.encode_job(job_titles, normalize=True)
    
    # Check that the L2 norm is close to 1
    norm = np.linalg.norm(job_embs_norm, axis=1)
    assert np.allclose(norm, 1.0), "Embeddings should be normalized"

    # Test with normalization OFF
    model_no_norm = BiEncoder(base_config.model, base_config.device)

    job_embs_no_norm = model_no_norm.encode_job(job_titles, normalize=False)

    # Check that the L2 norm is NOT 1
    norm_unnormalized = np.linalg.norm(job_embs_no_norm, axis=1)
    assert not np.allclose(norm_unnormalized, 1.0), "Embeddings should not be normalized"

In [18]:

# Step-by-step replay of test_normalization_flag on the shared config,
# starting with the encoder used for the normalize=True case.
base_conf.model.proj_dim = None # No projection for simplicity
model_norm = BiEncoder(base_conf.model, base_conf.device)

In [19]:
# Encode a single job title with normalization explicitly requested.
job_titles = ["Data Scientist"]
job_embs_norm = model_norm.encode_job(job_titles, normalize=True)

In [20]:
# L2 norm along axis 1 (assumes a 2-D, one-row-per-title result — TODO confirm);
# every value is expected to be ~1 for normalized embeddings.
norm = np.linalg.norm(job_embs_norm, axis=1)
assert np.allclose(norm, 1.0), "Embeddings should be normalized"

In [21]:
# Second encoder built from the same config, used for the normalize=False case.
model_no_norm = BiEncoder(base_conf.model, base_conf.device)

job_embs_no_norm = model_no_norm.encode_job(job_titles, normalize=False)

In [22]:
# Display the encoder's repr to inspect which modules the wrapped model applies.
model_no_norm

BiEncoder(
  (st_model): SentenceTransformer(
    (0): Transformer({'max_seq_length': 256, 'do_lower_case': False, 'architecture': 'BertModel'})
    (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
    (2): Normalize()
  )
)

In [25]:
# Check that the L2 norm is NOT 1
# NOTE(review): the repr of model_no_norm lists a Normalize() module inside the
# wrapped SentenceTransformer, which presumably is why this assertion fails
# even with normalize=False — confirm against BiEncoder.encode_job.
norm_unnormalized = np.linalg.norm(job_embs_no_norm, axis=1)
assert not np.allclose(norm_unnormalized, 1.0), "Embeddings should not be normalized"

AssertionError: Embeddings should not be normalized

In [28]:
# Print the stock checkpoint's module stack, for comparison with the BiEncoder repr.
print(SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2"))

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False, 'architecture': 'BertModel'})
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)


In [29]:
# Load a fresh copy of the checkpoint for the layer-removal experiment.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Drop the last module (Normalize)
# NOTE(review): this edits the private `_modules` mapping directly and assumes
# the final entry, keyed by its string index, is the Normalize layer — verify
# before reusing with a different checkpoint.
model._modules.pop(str(len(model._modules)-1))

print(model)

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False, 'architecture': 'BertModel'})
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)


In [31]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Pretrained checkpoint whose architecture includes a Normalize layer
model = SentenceTransformer("all-mpnet-base-v2")

texts = ["This is a test sentence.", "Another sentence to encode."]

# Encode the same batch twice: once requesting normalization, once not
emb_norm = model.encode(texts, normalize_embeddings=True)
emb_raw = model.encode(texts, normalize_embeddings=False)

# Report the per-text L2 norm for each variant
for heading, batch in (
    ("With normalization:", emb_norm),
    ("\nWithout normalization:", emb_raw),
):
    print(heading)
    for i, e in enumerate(batch):
        print(f"  Text {i}: norm = {np.linalg.norm(e):.4f}")


With normalization:
  Text 0: norm = 1.0000
  Text 1: norm = 1.0000

Without normalization:
  Text 0: norm = 1.0000
  Text 1: norm = 1.0000


In [32]:
# Print the module stack of the model currently bound to `model`.
print(model)

SentenceTransformer(
  (0): Transformer({'max_seq_length': 384, 'do_lower_case': False, 'architecture': 'MPNetModel'})
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)
