In [1]:
# Imports
import pandas as pd
import numpy as np
import os
import joblib
import logging
from tqdm import tqdm
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [2]:
# Load dataset (adjust path & delimiter accordingly)
dataset_path = '3_merged_data3.txt'
data = pd.read_csv(dataset_path, sep='\t')  # or sep=',' if CSV

# Load p-values file
pval_path = '3_transposed_headers_with_scores.txt'
pvals = pd.read_csv(pval_path, sep='\t')

In [3]:
# Get list of features from dataset columns
dataset_features = set(data.columns)

# Get list of features from p-value file
pval_features = set(pvals['isoform'])

# Find missing and extra features
missing_in_pval = dataset_features - pval_features
extra_in_pval = pval_features - dataset_features

# Print results
print(f"Features in dataset not in p-value file: {len(missing_in_pval)}")
if missing_in_pval:
    print("Missing features:\n", missing_in_pval)

print(f"Extra features in p-value file not in dataset: {len(extra_in_pval)}")
if extra_in_pval:
    print("Extra features:\n", extra_in_pval)


Features in dataset not in p-value file: 2
Missing features:
 {'avg7_calingiri', 'ID'}
Extra features in p-value file not in dataset: 0


In [None]:
# Extract features and target (assuming 'avg7_calingiri' is target)
target_col = 'avg7_calingiri'
feature_cols = [col for col in data.columns if col != target_col and col != 'ID']

X = data[feature_cols]
y = data[target_col]

# Map p-values by feature
pval_map = pvals.set_index('isoform')['p-value_lowest'].to_dict()
print(X.shape)

(149, 33048)


In [None]:
weights = {}
for feat in feature_cols:
    p = pval_map.get(feat, None)
    if p is None:
        weights[feat] = 1.0
    else:
        weights[feat] = 1.0 / (p + 1e-8)


In [28]:
print(type(weights))

<class 'dict'>


In [20]:
# Step 1: Clamp weights to avoid extremes [0.01, 100]
weights_clamped = {
    feat: np.clip(weight, 1e-2, 100)
    for feat, weight in weights.items()
}

# Step 2: Normalize weights to range [0.01, 1]
clamped_vals = np.array(list(weights_clamped.values()))
min_val, max_val = clamped_vals.min(), clamped_vals.max()

weights_norm = {
    feat: ((val - min_val) / (max_val - min_val)) * (1 - 0.01) + 0.01
    for feat, val in weights_clamped.items()
}


In [27]:
print(type(weights_norm))

<class 'dict'>


In [21]:
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [29]:
weights_series = pd.Series(weights_norm)

# Now multiply features by these  weights
X_train_weighted = X_train * weights_series
X_test_weighted = X_test * weights_series
print(X_train_weighted.shape)
print("New weighted features stats:",
      X_train_weighted.min().min(),
      X_train_weighted.max().max(),
      X_train_weighted.std().mean())


(119, 33048)
New weighted features stats: 0.0 4.0 0.20485378840529941


In [30]:
print("Feature variance stats:", np.var(X_train_weighted, axis=0).min(), np.var(X_train_weighted, axis=0).max())
print("Target variance:", np.var(y_train))


Feature variance stats: 0.0 1.260221735753125
Target variance: 1.6101913424193204


In [31]:
from sklearn.feature_selection import VarianceThreshold

# Step 1: Init the selector
selector = VarianceThreshold(threshold=1e-5)

# Step 2: Fit only on TRAIN
X_train_filtered = selector.fit_transform(X_train_weighted)

# Step 3: Use .transform() on VAL and TEST
X_test_filtered = selector.transform(X_test_weighted)


In [25]:
models = {
    'XGBoost': XGBRegressor(n_estimators=100, random_state=42),
    'LightGBM': LGBMRegressor(n_estimators=100, random_state=42),
    'CatBoost': CatBoostRegressor(iterations=100, random_seed=42, verbose=False)
}

In [32]:
print(X_train_filtered.shape)

(119, 29946)


In [26]:
# Setup logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger()

model_save_dir = 'saved_models'
os.makedirs(model_save_dir, exist_ok=True)

results = {}

logger.info(f"Starting training for {len(models)} models...")

for name, model in tqdm(models.items(), desc="Models", unit="model"):
    logger.info(f"Training {name}...")
    try:
        model.fit(X_train_filtered, y_train)
        y_pred = model.predict(X_test_filtered)
        mse = mean_squared_error(y_test, y_pred)
        results[name] = mse

        model_path = os.path.join(model_save_dir, f"{name}.model")
        if name == 'CatBoost':
            model.save_model(model_path, format='cbm')
        else:
            model.save_model(model_path)

        logger.info(f"{name} saved to {model_path} with Test MSE: {mse:.4f}")
        
    except Exception as e:
        logger.error(f"Error training {name}: {e}")





2025-06-04 07:42:54,819 - INFO - Starting training for 3 models...
Models:   0%|          | 0/3 [00:00<?, ?model/s]2025-06-04 07:42:54,822 - INFO - Training XGBoost...
  self.get_booster().save_model(fname)
2025-06-04 07:42:57,461 - INFO - XGBoost saved to saved_models\XGBoost.model with Test MSE: 1.7114
Models:  33%|███▎      | 1/3 [00:02<00:05,  2.64s/model]2025-06-04 07:42:57,462 - INFO - Training LightGBM...
[WinError 2] The system cannot find the file specified
  File "c:\Users\aadi2\AppData\Local\Programs\Python\Python310\lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
  File "c:\Users\aadi2\AppData\Local\Programs\Python\Python310\lib\subprocess.py", line 503, in run
    with Popen(*popenargs, **kwargs) as process:
  File "c:\Users\aadi2\AppData\Local\Programs\Python\Python310\lib\subprocess.py", line 971, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "c:\Users\aad

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.072774 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 49663
[LightGBM] [Info] Number of data points in the train set: 119, number of used features: 24730
[LightGBM] [Info] Start training from score 4.115210


2025-06-04 07:42:59,772 - ERROR - Error training LightGBM: 'LGBMRegressor' object has no attribute 'save_model'
Models:  67%|██████▋   | 2/3 [00:04<00:02,  2.45s/model]2025-06-04 07:42:59,773 - INFO - Training CatBoost...




2025-06-04 07:43:14,467 - INFO - CatBoost saved to saved_models\CatBoost.model with Test MSE: 1.1173
Models: 100%|██████████| 3/3 [00:19<00:00,  6.55s/model]
