In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.impute import SimpleImputer

from tabpfn import TabPFNRegressor


In [None]:
csv_path = "/vol/miltank/projects/practical_sose25/in_context_learning/data/other/radiomics_embeddings_fat.csv"
df = pd.read_csv(csv_path)
print(f"Shape before cleaning: {df.shape}")

# Drop non-feature columns
df = df.drop(columns=['eid'], errors='ignore')
df = df.apply(pd.to_numeric, errors='coerce')

# Drop rows with missing target
df = df.dropna(subset=['age'])

X = df.drop(columns=['age'])
y = df['age'].values.astype(np.float32)

# Handle missing feature values
X = X.fillna(X.median())

print(f"Final dataset shape: {X.shape}")


  df = pd.read_csv(csv_path)


Shape before cleaning: (13230, 8697)
Final dataset shape: (13159, 8695)


In [None]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
imputer = SimpleImputer(strategy="median")
X_imputed = imputer.fit_transform(X_scaled)


pca = PCA(n_components=500)
X_pca = pca.fit_transform(X_imputed)

print(f"PCA reduced shape: {X_pca.shape}")
print(f"Explained variance ratio: {pca.explained_variance_ratio_.sum():.2f}")


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
 7611 7612 7613 7614 7615 7616 7617 7618 7619 7620 7621 7622 7623 7624
 7625 7626 7627 7628 7629 7630 7631 7632 7633 7634 7635 7636 7637 7638
 7639 7640 7641 7642 7643 7644 7645 7646 7647 7648 7649 7650 7651 7652
 7653 7654 7655 7656 7657 7658 7659 7660 7661 7662 7663 7664 7665 7666
 7667 7668]. At least one non-missing value is needed for imputation with strategy='median'.


PCA reduced shape: (13159, 500)
Explained variance ratio: 0.92


In [6]:
X_train, X_temp, y_train, y_temp = train_test_split(X_pca, y, train_size=0.7, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)


In [None]:
import torch
device = 'cuda' if torch.cuda.is_available() else 'cpu'


model = TabPFNRegressor(device=device)
model.fit(X_train, y_train)


y_pred_test = model.predict(X_test)


mae = mean_absolute_error(y_test, y_pred_test)
mse = mean_squared_error(y_test, y_pred_test)
r2 = r2_score(y_test, y_pred_test)

print(f"Test MAE: {mae:.3f}")
print(f"Test MSE: {mse:.3f}")
print(f"Test R2: {r2:.3f}")
