# Fine-Tuning

# Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_poisson_deviance, mean_gamma_deviance
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
)
from sentence_transformers.losses import CoSENTLoss, MatryoshkaLoss
from datasets import Dataset
from peft import LoraConfig, TaskType
import xgboost as xgb
import torch

2026-02-17 09:55:08.386509: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2026-02-17 09:55:09.000760: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2026-02-17 09:55:10.819929: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [2]:
import os
# This enables memory fragmentation handling specifically for AMD HIP
os.environ["PYTORCH_HIP_ALLOC_CONF"] = "expandable_segments:True"

In [3]:
import torch
def print_gpu_utilization():
    if not torch.cuda.is_available():
        print("No GPU detected.")
        return

    # On AMD ROCm, 'cuda' functions query the HIP backend
    reserved = torch.cuda.memory_reserved()
    allocated = torch.cuda.memory_allocated()
    total_memory = torch.cuda.get_device_properties(0).total_memory
    
    print(f"Total GPU Mem: {total_memory / 1024**3:.2f} GB")
    print(f"Reserved (Cached): {reserved / 1024**3:.2f} GB")
    print(f"Allocated (Active): {allocated / 1024**3:.2f} GB")
    print(f"Free (Approx): {(total_memory - reserved) / 1024**3:.2f} GB")
    print("-" * 30)

# Run it
print_gpu_utilization()

Total GPU Mem: 15.82 GB
Reserved (Cached): 0.00 GB
Allocated (Active): 0.00 GB
Free (Approx): 15.82 GB
------------------------------




# Data Import, Clean, and Sample

In [4]:
# ==========================================
# 1. DATA LOADING & PREPROCESSING (STRATIFIED)
# ==========================================
print("Loading freMTPL2freq dataset...")
dataset = fetch_openml(data_id=41214, as_frame=True)
full_df = dataset.frame

# Clean basic types first
full_df['ClaimNb'] = pd.to_numeric(full_df['ClaimNb'])
full_df['Exposure'] = pd.to_numeric(full_df['Exposure'])
full_df['Exposure'] = full_df['Exposure'].clip(upper=1.0)
full_df['Frequency'] = full_df['ClaimNb'] / full_df['Exposure']

# --- CRITICAL STEP: Create Stratification Column ---
# We stratify on whether a claim occurred (Binary) rather than the raw count.
# Stratifying on raw count (0, 1, 2, 3...) often fails because '3' claims is too rare to split.
full_df['has_claim'] = full_df['ClaimNb'] > 0

brand_mapping = {'B1': 'Renault, Nissan, or Citroen', 'B2': 'Renault, Nissan, or Citroen',
                 'B3': 'Volkswagen, Audi, Skoda, or Seat', 'B4': 'Opel, General Motors, or Ford',
                 'B5': 'Opel, General Motors, or Ford','B6': 'Fiat', 'B10':'Mercedes, Chrysler, or BMW',
                 'B11':'Mercedes, Chrysler, or BMW', 'B12': 'Japanese (except Nissan) or Korean', 'B13': 'Other','B14': 'Other' }

region_mapping = {
    "R11": "Île-de-France",
    "R21": "Champagne-Ardenne",
    "R22": "Picardie",
    "R23": "Haute-Normandie",
    "R24": "Centre",
    "R25": "Basse-Normandie",
    "R26": "Bourgogne",
    "R31": "Nord–Pas-de-Calais",
    "R41": "Lorraine",
    "R42": "Alsace",
    "R43": "Franche–Comté",
    "R52": "Pays de la Loire",
    "R53": "Bretagne",
    "R54": "Poitou–Charentes",
    "R72": "Aquitaine",
    "R73": "Midi–Pyrénées",
    "R74": "Limousin",
    "R82": "Rhône–Alpes",
    "R83": "Auvergne",
    "R91": "Languedoc–Roussillon",
    "R93": "Provence–Alpes–Côte d’Azur",
    "R94": "Corse"
}

area_mapping = {
    "A": "rural area",
    "B": "semi-rural area",
    "C": "suburban-fringe area",
    "D": "suburban area",
    "E": "urban area",
    "F": "urban center"
}

gas_mapping = {
    "'Diesel'": "Diesel",
    "'Regular'": "Regular"

}

full_df["VehBrand"] = full_df["VehBrand"].map(brand_mapping)
full_df["Region"] = full_df["Region"].map(region_mapping)
full_df["Area"] = full_df["Area"].map(area_mapping)
full_df["VehGas"] = full_df["VehGas"].map(gas_mapping)

# --- DOWNSAMPLE 50k STRATIFIED ---
# We use train_test_split to 'pick' 50,000 rows while keeping the Claim Ratio intact.
# The 'test_size' is ignored; we just want the 'train' portion to be exactly 50k.
subset_df, _ = train_test_split(
    full_df, 
    train_size=100000, 
    stratify=full_df['has_claim'], # <--- Guarantees representative risk profile
    random_state=42
)

print(f"Stratified Subset Size: {len(subset_df)}")
print(f"Claim Rate in Subset: {subset_df['has_claim'].mean():.4f}")
print(f"Claim Rate in Full Data: {full_df['has_claim'].mean():.4f} (Should match)")

# --- 80/20 TRAIN/TEST SPLIT ---
# Now split your 50k subset into Train/Test for modeling
train_df, test_df = train_test_split(
    subset_df, 
    test_size=0.20,
    random_state=42, 
    stratify=subset_df['has_claim'] # Stratify AGAIN to keep test set fair
)

print(f"Final Train: {len(train_df)} | Final Test: {len(test_df)}")

Loading freMTPL2freq dataset...
Stratified Subset Size: 100000
Claim Rate in Subset: 0.0502
Claim Rate in Full Data: 0.0502 (Should match)
Final Train: 80000 | Final Test: 20000


In [5]:
train_df

Unnamed: 0,IDpol,ClaimNb,Exposure,Area,VehPower,VehAge,DrivAge,BonusMalus,VehBrand,VehGas,Density,Region,Frequency,has_claim
407825,3102671.0,0,0.77,suburban-fringe area,5,2,47,50,"Opel, General Motors, or Ford",Regular,198,Île-de-France,0.0,False
13464,32336.0,0,0.34,urban area,5,12,23,90,"Renault, Nissan, or Citroen",Regular,2951,Lorraine,0.0,False
465592,3210480.0,0,1.00,rural area,10,11,68,50,"Renault, Nissan, or Citroen",Regular,15,Provence–Alpes–Côte d’Azur,0.0,False
475594,3240884.0,0,0.16,urban area,4,2,29,72,"Volkswagen, Audi, Skoda, or Seat",Diesel,5410,Île-de-France,0.0,False
454705,3185379.0,0,1.00,semi-rural area,6,14,47,50,"Renault, Nissan, or Citroen",Regular,63,Centre,0.0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
618889,5059808.0,0,0.57,suburban area,7,1,73,50,Japanese (except Nissan) or Korean,Diesel,533,Provence–Alpes–Côte d’Azur,0.0,False
200926,2003525.0,0,0.36,rural area,5,2,45,50,Japanese (except Nissan) or Korean,Regular,4,Corse,0.0,False
60108,129687.0,0,1.00,semi-rural area,7,16,47,50,"Renault, Nissan, or Citroen",Regular,50,Centre,0.0,False
570243,4161421.0,0,1.00,suburban-fringe area,6,11,80,50,"Renault, Nissan, or Citroen",Diesel,392,Aquitaine,0.0,False


# Create Prompts

In [6]:
# ==========================================
# 2. SERIALIZATION (Tabular -> Text)
# ==========================================
def serialize_row(row):
    """
    Converts a row of insurance covariates into a natural language prompt.
    Uses a fixed template for consistency between Training and Inference.
    """
    # Handling categorical values cleanly
    veh_brand = str(row['VehBrand']).strip()
    veh_gas = str(row['VehGas']).strip()
    area = str(row['Area']).strip()
    region = str(row['Region']).strip()
    
    return (
        f"A policyholder is {row['DrivAge']} years old living in a {area} of {region}, France with a population density {row['Density']} people/km2. "
        f"They drive a {veh_brand} vehicle which runs on {veh_gas} fuel with a vehicle power class of {row['VehPower']} (min = 4, max = 15). "
        f"The vehicle age is {row['VehAge']} years and the driver's bonus-malus score is {row['BonusMalus']} (scored between 50 and 230 with entrance level 100, <100 means bonus, >100 means malus)."
    )

# Apply serialization
print("Serializing rows to text...")
train_df['text_desc'] = train_df.apply(serialize_row, axis=1)
test_df['text_desc'] = test_df.apply(serialize_row, axis=1)

Serializing rows to text...


In [7]:
import random
import pandas as pd

# 1. Define your helper logic (Mappings)
# (Assuming you have these maps based on your code snippet. 
# If not, you can just use row['Area'] directly.)
def get_clean_variables(row):
    return {
        "driv_age": row['DrivAge'],
        "area": row.get('Area', 'Unknown'),       # Replace with your mapping logic if needed
        "region": row.get('Region', 'Unknown'),   # e.g., mapping codes to names
        "density": row['Density'],
        "veh_brand": row.get('VehBrand', 'Unknown'),
        "veh_gas": row.get('VehGas', 'Unknown'),
        "veh_power": row['VehPower'],
        "veh_age": row['VehAge'],
        "bonus": row['BonusMalus']
    }

# --- TEMPLATE 1: The Narrative (Your Original) ---
def template_narrative(v):
    return (
        f"You are an auto insurance underwriter. Evaluate the risk level of a policyholder based strictly on the following insurance-related information. "
        f"A policyholder is {v['driv_age']} years old living in a {v['area']} of {v['region']}, France with a population density {v['density']} people/km2. "
        f"They drive a {v['veh_brand']} vehicle which runs on {v['veh_gas']} fuel with a vehicle power class of {v['veh_power']} (min = 4, max = 15). "
        f"The vehicle age is {v['veh_age']} years and the driver's bonus-malus score is {v['bonus']} (scored between 50 and 230 with entrance level 100, <100 means bonus, >100 means malus)."
    )

# --- TEMPLATE 2: The Analytical (Concise & Metric Focused) ---
# Good for forcing the model to pay attention to numbers
def template_analytical(v):
    return (
        f"You are an auto insurance underwriter. Evaluate the risk level of a policyholder based strictly on the following insurance-related information. "
        f"RISK PROFILE: Driver Age: {v['driv_age']} | Bonus-Malus: {v['bonus']} | Region: {v['region']} ({v['area']}). "
        f"VEHICLE DATA: Brand: {v['veh_brand']} | Fuel: {v['veh_gas']} | Power: {v['veh_power']} | Age: {v['veh_age']} years. "
        f"ENVIRONMENT: Density {v['density']} per km2."
    )

# --- TEMPLATE 3: The Vehicle-First (Reversed Focus) ---
# Changes the sentence structure completely to prevent position bias
def template_reversed(v):
    return (
        f"You are an auto insurance underwriter. Evaluate the risk level of a policyholder based strictly on the following insurance-related information. "
        f"Insured Vehicle: A {v['veh_age']}-year-old {v['veh_brand']} ({v['veh_gas']} fuel, Power Class {v['veh_power']}). "
        f"Operated by a {v['driv_age']}-year-old driver with a bonus-malus score of {v['bonus']}. "
        f"Location: {v['region']} ({v['area']} zone), Density: {v['density']}."
    )

# List of all templates
templates = [template_narrative, template_analytical, template_reversed]

# 2. Generate the Augmented Dataset
# We use the "Expand" strategy (Tripling the dataset size)
augmented_texts = []
augmented_labels = [] # Assuming you want to keep the target variables aligned
augmented_weights = []

# Loop through your original dataframe
for idx, row in train_df.iterrows():
    # Pre-calculate variables once
    vars_dict = get_clean_variables(row)
    
    # Target variables (ClaimNb, Exposure)
    label = row['ClaimNb'] 
    weight = row['Exposure']
    
    # Apply ALL templates to every row
    for func in templates:
        augmented_texts.append(func(vars_dict))
        augmented_labels.append(label)
        augmented_weights.append(weight)

# 3. Create the new Training DataFrame
# This dataframe is now 3x larger than your original
train_df_aug = pd.DataFrame({
    'text_desc': augmented_texts,
    'ClaimNb': augmented_labels,
    'Exposure': augmented_weights
})

print(f"Original Size: {len(train_df)}")
print(f"Augmented Size: {len(train_df_aug)}")
print("Sample:", train_df_aug['text_desc'].iloc[1]) # Check the second template

Original Size: 80000
Augmented Size: 240000
Sample: You are an auto insurance underwriter. Evaluate the risk level of a policyholder based strictly on the following insurance-related information. RISK PROFILE: Driver Age: 47 | Bonus-Malus: 50 | Region: Île-de-France (suburban-fringe area). VEHICLE DATA: Brand: Opel, General Motors, or Ford | Fuel: Regular | Power: 5 | Age: 2 years. ENVIRONMENT: Density 198 per km2.


In [8]:
train_df_aug

Unnamed: 0,text_desc,ClaimNb,Exposure
0,You are an auto insurance underwriter. Evaluat...,0,0.77
1,You are an auto insurance underwriter. Evaluat...,0,0.77
2,You are an auto insurance underwriter. Evaluat...,0,0.77
3,You are an auto insurance underwriter. Evaluat...,0,0.34
4,You are an auto insurance underwriter. Evaluat...,0,0.34
...,...,...,...
239995,You are an auto insurance underwriter. Evaluat...,0,1.00
239996,You are an auto insurance underwriter. Evaluat...,0,1.00
239997,You are an auto insurance underwriter. Evaluat...,0,0.50
239998,You are an auto insurance underwriter. Evaluat...,0,0.50


### Example Prompt

A policyholder is 35 years old living in a suburban-fringe area of Poitou–Charentes, France with a population density 233 people/km2. They drive a Opel, General Motors, or Ford vehicle which runs on Diesel fuel with a vehicle power class of 5 (min = 4, max = 15). The vehicle age is 7 years and the driver's bonus-malus score is 50 (scored between 50 and 230 with entrance level 100, <100 means bonus, >100 means malus).

# Pair Generation

In [9]:
# ==========================================
# 3. PAIR GENERATION (Metric Learning)
# ==========================================
def generate_pairs(dataframe, num_pairs=30000):
    # --- FIX 1: Calculate Frequency on the fly ---
    # We work on a copy to avoid SettingWithCopy warnings on the main df
    df = dataframe.copy()
    df['Frequency'] = df['ClaimNb'] / df['Exposure']
    
    # 1. Filter out noise (Low exposure rows are unreliable)
    valid_df = df[df['Exposure'] > 0.08]
    
    # 2. Pre-calculate indices
    n_contrastive = num_pairs // 2
    n_random = num_pairs - n_contrastive
    
    # 3. Vectorized Sampling
    zeros = valid_df[valid_df['ClaimNb'] == 0]
    claims = valid_df[valid_df['ClaimNb'] > 0]
    
    # Sample A and B for the contrastive set (Zero vs Claim)
    zeros_batch = zeros.sample(n=n_contrastive, replace=True)
    claims_batch = claims.sample(n=n_contrastive, replace=True)
    
    # Sample A and B for the random set
    # Note: Since your DF is now 3x larger, replace=True is perfectly fine
    rand_a_batch = valid_df.sample(n=n_random, replace=True)
    rand_b_batch = valid_df.sample(n=n_random, replace=True)
    
    # 4. Concatenate
    list_a = pd.concat([zeros_batch, rand_a_batch], ignore_index=True)
    list_b = pd.concat([claims_batch, rand_b_batch], ignore_index=True)
    
    # 5. Vectorized Calculation
    freq_a = list_a['Frequency'].to_numpy()
    freq_b = list_b['Frequency'].to_numpy()
    
    diffs = np.abs(freq_a - freq_b)
    scores = 1.0 / (1.0 + 2.0 * diffs)
    
    # 6. Build Dataset
    return Dataset.from_dict({
        "sentence1": list_a['text_desc'].tolist(),
        "sentence2": list_b['text_desc'].tolist(),
        "score": scores
    })

ft_dataset = generate_pairs(train_df_aug, num_pairs=30000)

# Model Setup

In [10]:
# ==========================================
# 4. MODEL SETUP (Qwen + LoRA)
# ==========================================
# Using Qwen2.5-0.5B as a proxy for the requested "0.6B" model.
# NOTE: Ensure you have access to this model on HF Hub or use local path.
model_id = "Qwen/Qwen3-Embedding-0.6B" 

print(f"Loading base model: {model_id}")
model = SentenceTransformer(model_id, trust_remote_code=True, device="cuda" if torch.cuda.is_available() else "cpu")

# Apply LoRA to make fine-tuning feasible and prevent catastrophic forgetting
peft_config = LoraConfig(
    task_type=TaskType.FEATURE_EXTRACTION,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
)

# Attach adapter
model[0].auto_model.add_adapter(peft_config)
model[0].auto_model.enable_input_require_grads()

Loading base model: Qwen/Qwen3-Embedding-0.6B


  return t.to(


In [11]:
# Replace line 23 with this block to verify LoRA is working
trainable_params = 0
all_param = 0
for name, param in model[0].auto_model.named_parameters():
    all_param += param.numel()
    if param.requires_grad:
        trainable_params += param.numel()

print(f"trainable params: {trainable_params:,} || all params: {all_param:,} || trainable%: {100 * trainable_params / all_param:.2f}%")

trainable params: 5,046,272 || all params: 600,822,784 || trainable%: 0.84%


# Fine-Tuning CoSENT Loss

In [12]:
import torch

def print_gpu_utilization():
    if not torch.cuda.is_available():
        print("No GPU detected.")
        return

    # On AMD ROCm, 'cuda' functions query the HIP backend
    reserved = torch.cuda.memory_reserved()
    allocated = torch.cuda.memory_allocated()
    total_memory = torch.cuda.get_device_properties(0).total_memory
    
    print(f"Total GPU Mem: {total_memory / 1024**3:.2f} GB")
    print(f"Reserved (Cached): {reserved / 1024**3:.2f} GB")
    print(f"Allocated (Active): {allocated / 1024**3:.2f} GB")
    print(f"Free (Approx): {(total_memory - reserved) / 1024**3:.2f} GB")
    print("-" * 30)

# Run it
print_gpu_utilization()

Total GPU Mem: 15.82 GB
Reserved (Cached): 2.24 GB
Allocated (Active): 2.24 GB
Free (Approx): 13.58 GB
------------------------------


In [13]:
# ==========================================
# 5. FINE-TUNING (CoSENTLoss)
# ==========================================
# CoSENTLoss optimizes the rank order of similarities to match the labels
loss_func = CoSENTLoss(model=model)
loss = MatryoshkaLoss(model, loss_func, [1024, 512, 64, 48])


args = SentenceTransformerTrainingArguments(
    output_dir="./qwen-fremtpl-finetuned_100",
    num_train_epochs=2,          # 1 epoch is sufficient for 15k pairs
    per_device_train_batch_size=8,
    gradient_accumulation_steps=8,
    gradient_checkpointing=True, 
    learning_rate=2e-5,
    fp16=True,                   # Use Mixed Precision
    logging_steps=50,
    save_strategy="no",          # Skip saving checkpoints to save disk space for this demo
)

trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=ft_dataset,
    loss=loss,
)

print("Starting Fine-Tuning...")
trainer.train()

# Save locally (Adapters only)
model.save_pretrained("./qwen-fremtpl-final_100")
print("Fine-tuning complete. Model saved.")

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Starting Fine-Tuning...


  attn_output = torch.nn.functional.scaled_dot_product_attention(


Step,Training Loss
50,12.47
100,12.3002
150,12.3204
200,12.3284
250,12.391
300,12.2173
350,12.2903
400,12.2794
450,12.3013
500,12.2587


  attn_output = torch.nn.functional.scaled_dot_product_attention(


Fine-tuning complete. Model saved.


# Downsteam GLM

# Using Fine-Tuned Model (after training)

In [14]:
import torch
from sentence_transformers import SentenceTransformer

# 1. Faster Loading Config
model = SentenceTransformer(
    "Qwen/Qwen3-Embedding-0.6B", 
    trust_remote_code=True, 
    device="cuda",
    model_kwargs={
        "torch_dtype": torch.float16,   # Critical for speed/VRAM
        "attn_implementation": "sdpa"   # Faster attention
    }
)

# 2. Load & Merge
model.load_adapter("./qwen-fremtpl-final_100")

# 3. Restrict Length
#model.max_seq_length = 512

`torch_dtype` is deprecated! Use `dtype` instead!


In [15]:
from sentence_transformers import SentenceTransformer

# Point simply to the FOLDER path. 
# The library will automatically look for 'modules.json' and 'adapter_model.safetensors' inside.
#model = SentenceTransformer("./qwen-fremtpl-final", trust_remote_code=True)

# Test it immediately
print("Model loaded successfully!")
with model.truncate_sentence_embeddings(truncate_dim=64):
    embeddings_truncated = model.encode(["hello there", "hiya"])
assert embeddings_truncated.shape[-1] == 64

Model loaded successfully!


In [16]:
embeddings_truncated

array([[-0.01862  , -0.007812 , -0.01043  , -0.0416   ,  0.00637  ,
        -0.00972  , -0.03528  ,  0.03114  , -0.0937   ,  0.01201  ,
         0.002016 , -0.03111  ,  0.1039   , -0.00924  , -0.05087  ,
         0.0857   , -0.02142  ,  0.0631   ,  0.10956  , -0.0799   ,
         0.03464  ,  0.0085   , -0.04028  ,  0.1392   ,  0.006653 ,
        -0.00534  , -0.0316   ,  0.10675  ,  0.0215   , -0.01288  ,
         0.02158  ,  0.03146  , -0.00978  , -0.002033 , -0.03592  ,
        -0.01346  , -0.01267  ,  0.0004709, -0.03094  ,  0.0586   ,
        -0.01813  ,  0.02397  ,  0.0669   , -0.01537  , -0.003511 ,
        -0.0279   ,  0.05136  , -0.00895  , -0.000533 , -0.006157 ,
        -0.028    , -0.02452  ,  0.011826 ,  0.008286 ,  0.008125 ,
        -0.05804  ,  0.05334  , -0.004417 ,  0.04184  , -0.00953  ,
        -0.0937   ,  0.051    , -0.02545  ,  0.00911  ],
       [-0.001335 ,  0.0176   , -0.01573  , -0.0507   ,  0.03174  ,
        -0.02054  ,  0.004562 ,  0.0933   , -0.0811   ,  0.

Model is outputting embeddings of dim 64

### Check VRAM Usage

In [17]:
import torch

def print_gpu_utilization():
    if not torch.cuda.is_available():
        print("No GPU detected.")
        return

    # On AMD ROCm, 'cuda' functions query the HIP backend
    reserved = torch.cuda.memory_reserved()
    allocated = torch.cuda.memory_allocated()
    total_memory = torch.cuda.get_device_properties(0).total_memory
    
    print(f"Total GPU Mem: {total_memory / 1024**3:.2f} GB")
    print(f"Reserved (Cached): {reserved / 1024**3:.2f} GB")
    print(f"Allocated (Active): {allocated / 1024**3:.2f} GB")
    print(f"Free (Approx): {(total_memory - reserved) / 1024**3:.2f} GB")
    print("-" * 30)

# Run it
print_gpu_utilization()

Total GPU Mem: 15.82 GB
Reserved (Cached): 3.54 GB
Allocated (Active): 3.39 GB
Free (Approx): 12.29 GB
------------------------------


Now want to generate embeddings from data

Ensure model is running on GPU

In [18]:
print(model.device)

cuda:0


In [19]:
import torch

# 1. Define the device (On AMD ROCm, we still call it 'cuda' in PyTorch)
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Moving model to: {device}")

# 2. Move the model
model = model.to(device)

Moving model to: cuda


In [22]:
print("Generating embeddings for GLM...")

# Encode the serialized text for BOTH Train and Test sets
# Note: We use the SAME model we just fine-tuned
with model.truncate_sentence_embeddings(truncate_dim=64):
    train_embeddings_64 = model.encode(train_df['text_desc'].tolist(), batch_size=32, show_progress_bar=True)
    test_embeddings_64 = model.encode(test_df['text_desc'].tolist(), batch_size=32, show_progress_bar=True)
assert train_embeddings_64.shape[-1] == 64, test_embeddings_64.shape[-1] == 64 


Generating embeddings for GLM...


Batches:   0%|          | 0/2500 [00:00<?, ?it/s]

Batches:   0%|          | 0/625 [00:00<?, ?it/s]

In [23]:
import numpy as np

# Save everything in one single archive
np.savez(
    "insurance_training_data_100_64.npz", 
    X=train_embeddings_64,           # The Features (Embeddings)
    y=train_df['ClaimNb'].values, # The Target (Counts)
    w=train_df['Exposure'].values # The Weight (Exposure)
)
print("Saved all training data to insurance_training_data_100_64.npz")
np.savez(
    "insurance_testing_data_100_64.npz", 
    X=test_embeddings_64,           # The Features (Embeddings)
    y=test_df['ClaimNb'].values, # The Target (Counts)
    w=test_df['Exposure'].values # The Weight (Exposure)
)

print("Saved all testing data to insurance_testing_data_100_64.npz")

# # --- HOW TO LOAD IT SAFELY ---
# data = np.load("insurance_training_data.npz")

# X_train = data['X']
# y_train = data['y']
# exposure_train = data['w']

# # Verify alignment instantly
# print(f"Features: {X_train.shape}")
# print(f"Targets:  {y_train.shape}")
# # If these lengths match, your order is preserved.

Saved all training data to insurance_training_data_100_64.npz
Saved all testing data to insurance_testing_data_100_64.npz


In [31]:
import numpy as np
train_df = np.load("insurance_training_data_100_1024.npz")

test_df = np.load("insurance_testing_data_100_1024.npz")

Want to have embeddings -- claim count -- exposure

In [32]:
meta_train = [train_df['y'], train_df['w']]