In [None]:
import sys
from pathlib import Path
# Make the directory that contains the Matcher package importable
# (this notebook is in tools/matching_related/).
_cwd = Path.cwd()
# Probe order: the working directory, its parent, then ./tools. The first
# location holding a "Matcher" entry wins; if none does, fall back to the
# parent of the working directory.
_tools = next(
    (
        root
        for root in (_cwd, _cwd.parent, _cwd / "tools")
        if (root / "Matcher").exists()
    ),
    _cwd.parent,
)
# Prepend only once so re-running the cell does not grow sys.path.
if str(_tools) not in sys.path:
    sys.path.insert(0, str(_tools))

In [None]:
import torch
from sentence_transformers import SentenceTransformer
from Matcher.utilities import MatchingUtilities, DataLoader
from Matcher.embedding import EmbeddingUtilities

# Load the model only once per kernel: if `model` already exists from an
# earlier run of this cell, reuse it instead of loading the checkpoint again.
try:
    model
except NameError:
    # Catch NameError specifically: an undefined `model` is the only condition
    # that should trigger a load. A bare `except` would also swallow
    # KeyboardInterrupt/SystemExit raised while the guard is evaluated.
    #
    # Load the weights in half precision (float16) with FlashAttention-2 on
    # the GPU; the tokenizer is configured to pad on the left.
    model = SentenceTransformer(
        "Qwen/Qwen3-Embedding-8B",
        model_kwargs={
            "dtype": torch.float16,
            "attn_implementation": "flash_attention_2",
        },
        tokenizer_kwargs={"padding_side": "left"},
        device="cuda",
    )

# Helpers used by the cells below. The path is relative to the notebook's
# working directory (presumably the project's SQLite database — confirm in
# Matcher.utilities).
matching_utilities = MatchingUtilities("../db.sqlite3")
embedder = EmbeddingUtilities(model)
dataloader = DataLoader()


In [None]:
# load_and_clean_data() yields four frames, unpacked here in this order.
(
    heterosexual_female_df,
    heterosexual_male_df,
    homosexual_female_df,
    homosexual_male_df,
) = matching_utilities.load_and_clean_data()

In [None]:
# Run the embedder over the heterosexual-female frame, then hand the result to
# the data loader under a matching name (presumably persisted — confirm in
# Matcher.utilities.DataLoader.save_data).
embedded_heterosexual_female_df = embedder.transform(
    heterosexual_female_df
)
dataloader.save_data(
    embedded_heterosexual_female_df,
    "embedded_heterosexual_female_df",
)

In [None]:
# Run the embedder over the heterosexual-male frame, then hand the result to
# the data loader under a matching name (presumably persisted — confirm in
# Matcher.utilities.DataLoader.save_data).
embedded_heterosexual_male_df = embedder.transform(
    heterosexual_male_df
)
dataloader.save_data(
    embedded_heterosexual_male_df,
    "embedded_heterosexual_male_df",
)

In [None]:
# Run the embedder over the homosexual-female frame, then hand the result to
# the data loader under a matching name (presumably persisted — confirm in
# Matcher.utilities.DataLoader.save_data).
embedded_homosexual_female_df = embedder.transform(
    homosexual_female_df
)
dataloader.save_data(
    embedded_homosexual_female_df,
    "embedded_homosexual_female_df",
)

In [None]:
# Run the embedder over the homosexual-male frame, then hand the result to
# the data loader under a matching name (presumably persisted — confirm in
# Matcher.utilities.DataLoader.save_data).
embedded_homosexual_male_df = embedder.transform(
    homosexual_male_df
)
dataloader.save_data(
    embedded_homosexual_male_df,
    "embedded_homosexual_male_df",
)