In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

model_id = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    dtype=torch.float16,   # load weights in fp16
    device_map="auto"
)
model.eval()



Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 1536)
    (layers): ModuleList(
      (0-27): 28 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=1536, out_features=1536, bias=True)
          (k_proj): Linear(in_features=1536, out_features=256, bias=True)
          (v_proj): Linear(in_features=1536, out_features=256, bias=True)
          (o_proj): Linear(in_features=1536, out_features=1536, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=1536, out_features=8960, bias=False)
          (up_proj): Linear(in_features=1536, out_features=8960, bias=False)
          (down_proj): Linear(in_features=8960, out_features=1536, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((1536,), eps=1e-06)
    (rotar

In [3]:
import torch
print(torch.cuda.is_available())           # this should say False on AMD
print(torch.backends.mps.is_available())   # Mac only, ignore on WSL
print(torch.version.hip)                   # should not be None on ROCm
print(torch.cuda.device_count())
print(torch.cuda.get_device_name())


True
False
6.4.43484-123eb5128
1
AMD Radeon RX 9070 XT


In [4]:
model.hf_device_map

{'': 0}

In [5]:
from pathlib import Path

In [6]:
BATCH_SIZE   = 16                # adjust to your GPU memory
OUT_DIR      = Path("/home/dkusmenko/research/embedding_v2_5p.ipynb./embeddings")
OUT_DIR.mkdir(exist_ok=True, parents=True)
EMB_DTYPE    = np.float16        # save memory
MAX_TOKENS   = 512  

In [None]:
# def embedding_retrieval(prompt:str):
#     # tokenize the input text
#     inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

#     with torch.no_grad():
#         out = model(**inputs, output_hidden_states=True)

#     # take last hidden layer [B, T, D] in fp32
#     last_layer = out.hidden_states[-1]

#     # convert to float16 to save memory
#     last_layer = last_layer.to(torch.float16)

#     vec_mean  = last_layer.mean(dim=1)
    
#     # return pooled embeddings as numpy array
#     return vec_mean
  

### Load in Data

In [7]:
df = pd.read_csv("freMTPL2freq.csv")

### Add Context

In [8]:
# Create frequency and claim index columns
df["Frequency"] = df["ClaimNb"]/df["Exposure"]
df["ClaimInd"] = df["ClaimNb"] >= 1

In [9]:
brand_mapping = {'B1': 'Renault, Nissan, or Citroen', 'B2': 'Renault, Nissan, or Citroen','B3': 'Volkswagen, Audi, Skoda, or Seat', 'B4': 'Opel, General Motors, or Ford', 'B5': 'Opel, General Motors, or Ford','B6': 'Fiat', 'B10':'Mercedes, Chrysler, or BMW', 'B11':'Mercedes, Chrysler, or BMW', 'B12': 'Japanese (except Nissan) or Korean', 'B13': 'Other','B14': 'Other' }

In [10]:
region_mapping = {
    "R11": "Île-de-France",
    "R21": "Champagne-Ardenne",
    "R22": "Picardie",
    "R23": "Haute-Normandie",
    "R24": "Centre",
    "R25": "Basse-Normandie",
    "R26": "Bourgogne",
    "R31": "Nord–Pas-de-Calais",
    "R41": "Lorraine",
    "R42": "Alsace",
    "R43": "Franche–Comté",
    "R52": "Pays de la Loire",
    "R53": "Bretagne",
    "R54": "Poitou–Charentes",
    "R72": "Aquitaine",
    "R73": "Midi–Pyrénées",
    "R74": "Limousin",
    "R82": "Rhône–Alpes",
    "R83": "Auvergne",
    "R91": "Languedoc–Roussillon",
    "R93": "Provence–Alpes–Côte d’Azur",
    "R94": "Corse"
}


In [11]:
area_mapping = {
    "A": "rural area",
    "B": "semi-rural area",
    "C": "suburban-fringe area",
    "D": "suburban area",
    "E": "urban area",
    "F": "urban center"
}


In [12]:
df["VehBrand"] = df["VehBrand"].map(brand_mapping)
df["Region"] = df["Region"].map(region_mapping)
df["Area"] = df["Area"].map(area_mapping)

### Split to small portion of data

In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df.drop(["Frequency"], axis = 1), df["Frequency"], stratify= df["ClaimInd"], test_size=0.05, random_state=42)

In [14]:
X_test = X_test.reset_index(drop=True)

### Create Prompts and Retreive Embeddings

Prompts have form:

"You are an auto insurance underwriter. A policyholder is {X_test["DrivAge"][i]} years old and lives in a {X_test["Area"][i]} of {X_test["Region"][i]} in France, with a population density of {X_test["Density"][i]} people per square kilometer. They drive a {X_test["VehBrand"][i]} brand vehicle that is {X_test["VehAge"][i]} years old, which takes {X_test["VehGas"][i]} gas and is in power class {X_test["VehPower"][i]}. The policyholder has a Bonus-Malus score of {X_test["BonusMalus"][i]}. What is the risk level for this policyholder?"

In [15]:
prompt_list = []
for i in range(len(X_test)):
    prompt_list.append(f"You are an auto insurance underwriter. A policyholder is {X_test['DrivAge'][i]} years old and lives in a {X_test['Area'][i]} of {X_test['Region'][i]} in France, with a population density of {X_test['Density'][i]} people per square kilometer. They drive a {X_test['VehBrand'][i]} brand vehicle that is {X_test['VehAge'][i]} years old, which takes {X_test['VehGas'][i]} gas and is in power class {X_test['VehPower'][i]}. The policyholder has a Bonus-Malus score of {X_test['BonusMalus'][i]}. What is the risk level for this policyholder?")

In [16]:
def get_prompts():
    # Example dummy prompts
    for p in prompt_list:
        yield p

# ===== BATCH ITERATOR =====
def batch_iter(iterable, batch_size):
    buf = []
    for x in iterable:
        buf.append(x)
        if len(buf) == batch_size:
            yield buf
            buf = []
    if buf:
        yield buf

# ===== MAIN EXTRACTION LOOP =====
for batch_idx, batch_prompts in enumerate(batch_iter(get_prompts(), BATCH_SIZE)):
    # Tokenize & move to device
    inputs = tokenizer(
        batch_prompts,
        padding=True,
        truncation=True,
        max_length=MAX_TOKENS,
        return_tensors="pt"
    ).to(device)

    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)
        last_layer = outputs.hidden_states[-1]      # [B, T, H]
        embeddings = last_layer.mean(dim=1)         # mean pooling → [B, H]
        embeddings = embeddings.cpu().numpy().astype(EMB_DTYPE)

    # Save batch to disk
    np.save(OUT_DIR / f"batch_{batch_idx:06d}.npy", embeddings)
    #print(f"Saved batch {batch_idx}, shape={embeddings.shape}")

print("✅ All embeddings extracted.")

  attn_output = torch.nn.functional.scaled_dot_product_attention(


✅ All embeddings extracted.


In [17]:
import numpy as np
from pathlib import Path

# folder that contains all saved .npy batches
EMB_DIR = Path("/home/dkusmenko/research/embedding_v2_5p.ipynb./embeddings")  # change to your folder

# get all batch files
files = sorted(EMB_DIR.glob("*.npy"))

all_embs = []
for f in files:
    arr = np.load(f)
    all_embs.append(arr)

# stack into one big matrix
emb_matrix = np.vstack(all_embs)
print("Final shape:", emb_matrix.shape)

# optionally save once as a single file
np.save(EMB_DIR / "all_embeddings.npy", emb_matrix)


Final shape: (33901, 1536)


In [18]:
indtest = np.array([np.array(X_test["ClaimInd"])]).T

In [19]:
new_X = np.hstack((emb_matrix, indtest))

In [20]:
pdX = pd.DataFrame(new_X)


### Perform PCA to have dim 44

In [21]:
#stratified pca
from sklearn.model_selection import train_test_split
X_train2, X_test2, y_train2, y_test2 = train_test_split(pdX.drop([1536], axis = 1), y_test, stratify = pdX[1536], test_size=0.2, random_state=42)

In [22]:
scaler = StandardScaler()
X_train2s = scaler.fit_transform(X_train2)
X_test2s = scaler.transform(X_test2)

pca = PCA(n_components=44)
X_train2pc = pca.fit_transform(X_train2s)
X_test2pc = pca.transform(X_test2s)

### Fit GLM to Model

In [23]:
from sklearn.linear_model import PoissonRegressor
from sklearn.model_selection import GridSearchCV

param_grid = {'alpha': [0.0, 0.1, 1.0, 5.0, 10.0, 20.0]}
grid = GridSearchCV(PoissonRegressor(max_iter=1000), param_grid, cv=5)
grid.fit(X_train2pc, y_train2)

print("Best alpha:", grid.best_params_['alpha'])


Best alpha: 1.0


In [24]:
import numpy as np
from sklearn.linear_model import PoissonRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

model = PoissonRegressor(alpha=grid.best_params_['alpha'], max_iter=10000)
model.fit(X_train2pc, y_train2)


0,1,2
,alpha,1.0
,fit_intercept,True
,solver,'lbfgs'
,max_iter,10000
,tol,0.0001
,warm_start,False
,verbose,0


In [25]:
from sklearn.metrics import mean_poisson_deviance

y_pred = model.predict(X_test2pc)
mpd = mean_poisson_deviance(y_test2, y_pred)
print("Mean Poisson Deviance:", mpd)


Mean Poisson Deviance: 1.5785119017617686


### Textbook Model

In [26]:
freq = np.array([np.array(y_test)]).T

In [27]:
red_X = np.hstack((X_test, freq))

In [28]:
new_Xred = pd.DataFrame(red_X, columns=['IDpol', 'ClaimNb', 'Exposure','Area','VehPower','VehAge','DrvAge','BonusMalus','VehBrand','VehGas','Density','Region','ClaimInd','Frequency'])

In [29]:
dat = new_Xred

In [30]:
dat = pd.get_dummies(dat, columns=['VehGas'],drop_first=True)
dat = pd.get_dummies(dat, columns=['VehBrand'],drop_first=True)
dat = pd.get_dummies(dat, columns=['Region'],drop_first=True)

In [31]:
area_remapping = {
    "rural area": 1,
    "semi-rural area": 2,
    "suburban-fringe area": 3,
    "suburban area": 4,
    "urban area": 5,
    "urban center": 6
}


In [32]:
import pandas as pd
import numpy as np

# Assuming dat is a pandas DataFrame
dat["Area"] = dat["Area"].map(area_remapping)

dat['VehPowerGLM'] = pd.Categorical(np.minimum(dat['VehPower'], 9))

dat['VehAgeGLM'] = pd.cut(
    dat['VehAge'],
    bins=[0, 5, 12, 101],
    labels=["0-5", "6-12", "12+"],
    include_lowest=True
)

dat['DrivAgeGLM'] = pd.cut(
    dat['DrvAge'],
    bins=[18, 20, 25, 30, 40, 50, 70, 101],
    labels=["18-20", "21-25", "26-30", "31-40", "41-50", "51-70", "71+"],
    include_lowest=True
)

dat['BonusMalusGLM'] = np.minimum(dat['BonusMalus'], 150)

dat['DensityGLM'] = np.log(dat["Density"].astype(float))


In [33]:
dat = pd.get_dummies(dat, columns=['DrivAgeGLM'],drop_first=True)
dat = pd.get_dummies(dat, columns=['VehAgeGLM'],drop_first=True)
dat = pd.get_dummies(dat, columns=['VehPowerGLM'],drop_first=True)

In [34]:
from sklearn.model_selection import train_test_split
X_traindat, X_testdat, y_traindat, y_testdat = train_test_split(dat.drop(["IDpol", "ClaimNb", "Exposure", "Frequency", "ClaimInd", "VehPower", "VehAge", "DrvAge", "BonusMalus","Density"], axis = 1), dat["Frequency"], stratify= dat["ClaimInd"], test_size=0.2, random_state=42)

In [35]:
from sklearn.linear_model import PoissonRegressor
from sklearn.model_selection import GridSearchCV

param_grid2 = {'alpha': [0.1, 1.0, 5.0, 10.0, 20.0]}
grid2 = GridSearchCV(PoissonRegressor(max_iter=10000), param_grid2, cv=5)
grid2.fit(X_traindat, y_traindat)

print("Best alpha:", grid2.best_params_['alpha'])


Best alpha: 0.1


In [41]:
import numpy as np
from sklearn.linear_model import PoissonRegressor

modelbook = PoissonRegressor(alpha=0.1, max_iter=1000)
modelbook.fit(X_traindat, y_traindat)


0,1,2
,alpha,0.1
,fit_intercept,True
,solver,'lbfgs'
,max_iter,1000
,tol,0.0001
,warm_start,False
,verbose,0


In [42]:
from sklearn.metrics import mean_poisson_deviance

y_pred_book = modelbook.predict(X_testdat)
mpdbook = mean_poisson_deviance(y_testdat, y_pred_book)
print("Mean Poisson Deviance:", mpdbook)


Mean Poisson Deviance: 1.6250425816973921
