<a href="https://colab.research.google.com/github/chumingyzx/Kaggle---Multimodal-Single-Cell-Integration/blob/main/multi_xgb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
import os
import glob

# Mount Google Drive so the competition data is reachable from the Colab VM.
drive.mount('/content/drive')
# Use the 'MyDrive' spelling, matching the path constants defined further
# down in this notebook, so every cell resolves the same directory.
folder_path = '/content/drive/MyDrive/NeurIPS/Data/'
# Listing the folder fails fast if the mount or the data directory is missing.
all_files = os.listdir(folder_path)

In [None]:
if not os.path.exists('/opt/conda/lib/python3.7/site-packages/tables'):
  !pip install tables
if not os.path.exists('/opt/conda/lib/python3.7/site-packages/xgboost'):
  !pip install xgboost

In [None]:
%%time
import pandas as pd
import numpy as np
import gc
import os
import random
import pickle
from sklearn.model_selection import StratifiedKFold,KFold
from scipy.sparse import hstack,vstack,csr_matrix,save_npz,load_npz
from sklearn.decomposition import TruncatedSVD
import xgboost as xgb
# ---------------------------------------------------------------------------
# Working directories (all located on the mounted Google Drive)
# ---------------------------------------------------------------------------
# Raw inputs and precomputed features share the same Data/ directory.
input_path = feature_path = '/content/drive/MyDrive/NeurIPS/Data/'
# model_path and sub_path both point at the project root
# (neither is referenced by the cells visible in this notebook).
model_path = sub_path = '/content/drive/MyDrive/NeurIPS/'

# Transform the targets with truncated SVD

In [None]:
# Compress the target matrix to 1000 components so that one regressor can be
# trained per component instead of per original target column.
train_multi_targets = pd.read_hdf(input_path+'train_multi_targets.h5')
# The default solver is randomized; pin random_state so the reduced targets
# (and every model trained on them) are reproducible across kernel restarts.
tsvd = TruncatedSVD(n_components=1000, random_state=42)
data_reduced = tsvd.fit_transform(train_multi_targets.values)

# Load the SVD features of the sparse input matrix

In [None]:
%%time

# Id frames for the train and test rows; in the visible cells they are used
# for their row counts and for generating fold indices.
train_df = pd.read_feather(feature_path + 'train_multi_inputs_id.feather')
test_df = pd.read_feather(feature_path + 'test_multi_inputs_id.feather')

# Precomputed SVD features for train and test stored in a single array.
# assumes the rows are ordered train first, then test — TODO confirm
multi_inputs_svd = np.load(feature_path + 'multi_inputs_svd_100.npy')
# Cut the stacked array at the train/test boundary (first len(train_df) rows
# are train, the remainder is test).
train_multi_X, test_multi_X = np.split(multi_inputs_svd, [len(train_df)])

# Regression targets are the SVD-reduced target matrix computed earlier.
train_multi_y = data_reduced

In [None]:
%%time

def xgb_kfold(train_df, test_df, train_cite_X, train_cite_y, test_cite_X, folds):
    """Train one XGBoost regressor per fold; return out-of-fold and test predictions.

    Parameters
    ----------
    train_df, test_df : objects exposing ``.shape[0]``
        Only their row counts are used, to size the prediction buffers;
        ``train_df`` is additionally passed to ``folds.split`` to produce
        the train/validation indices.
    train_cite_X : array indexable by integer index arrays
        Training features; indexed with the indices produced from ``train_df``.
    train_cite_y : array indexable by integer index arrays
        Single regression target, indexed with the same indices.
    test_cite_X : array-like
        Test features; predictions for it are accumulated into a buffer of
        length ``test_df.shape[0]``.
    folds : splitter providing ``split(X)`` and ``n_splits``
        For example ``sklearn.model_selection.KFold``.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        ``oof_preds``: one prediction per training row, made by the model
        that held that row out. ``sub_preds``: test predictions averaged
        over the ``folds.n_splits`` fold models.
    """
    params = {
        'objective': 'reg:squarederror',
        'learning_rate': 0.1,
        'max_depth': 6,
        'lambda': 1.0,
        'alpha': 0.1,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        # NOTE(review): 'gpu_hist' needs a GPU-enabled XGBoost build; recent
        # releases appear to prefer tree_method='hist' with device='cuda' —
        # confirm against the installed version before changing.
        'tree_method': 'gpu_hist',
        'verbosity': 0,
        'seed': 42
    }

    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])

    # The test matrix does not depend on the fold, so build it once.
    dtest = xgb.DMatrix(test_cite_X)

    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df)):
        print(f'n_fold: {n_fold}')

        train_x = train_cite_X[train_idx]
        valid_x = train_cite_X[valid_idx]
        train_y = train_cite_y[train_idx]
        valid_y = train_cite_y[valid_idx]

        dtrain = xgb.DMatrix(train_x, label=train_y)
        dvalid = xgb.DMatrix(valid_x, label=valid_y)

        # The validation fold drives early stopping and is also the fold
        # whose out-of-fold predictions are recorded below.
        bst = xgb.train(
            params,
            dtrain,
            num_boost_round=10000,
            evals=[(dvalid, 'valid')],
            early_stopping_rounds=100,
            verbose_eval=1000
        )

        # iteration_range is half-open [begin, end) while best_iteration is a
        # 0-based index, so the end bound must be best_iteration + 1 to
        # include the best tree.
        best_range = (0, bst.best_iteration + 1)
        oof_preds[valid_idx] = bst.predict(dvalid, iteration_range=best_range)
        sub_preds += bst.predict(dtest, iteration_range=best_range) / folds.n_splits

    return oof_preds, sub_preds

In [None]:
%%time
seed = 42
# The same shuffled 5-fold split is reused for every target component.
folds = KFold(n_splits=5, shuffle=True, random_state=seed)
train_preds = []
test_preds = []
# Fit one 5-fold XGBoost model per reduced target component. The loop bound
# comes from the target matrix itself, so it stays correct if the number of
# SVD components is changed where the targets are reduced.
for i in range(train_multi_y.shape[1]):
    print('=====================')
    print(i)
    train_multi_y_single = train_multi_y[:, i]
    oof_preds, sub_preds = xgb_kfold(train_df, test_df, train_multi_X, train_multi_y_single, test_multi_X, folds)
    train_preds.append(oof_preds)
    test_preds.append(sub_preds)

In [None]:
def correlation_score(y_true, y_pred):
    """Mean row-wise Pearson correlation between ``y_true`` and ``y_pred``.

    DataFrame inputs are converted to their underlying arrays. Rows where
    either side has zero standard deviation are left out of the average.
    Returns 0.0 when no row contributes.
    """
    if isinstance(y_true, pd.DataFrame):
        y_true = y_true.values
    if isinstance(y_pred, pd.DataFrame):
        y_pred = y_pred.values

    row_corrs = []
    for row in range(len(y_true)):
        # A constant row has no defined correlation, so only rows with
        # non-zero spread on both sides are scored.
        if np.std(y_true[row]) != 0 and np.std(y_pred[row]) != 0:
            row_corrs.append(np.corrcoef(y_true[row], y_pred[row])[1, 0])

    if not row_corrs:
        return 0.0
    return sum(row_corrs) / len(row_corrs)




In [None]:
# Stack the per-component prediction vectors column-wise, giving one column
# per entry of train_preds / test_preds.
oof_preds = np.stack(train_preds, axis=1)
sub_preds = np.stack(test_preds, axis=1)

# Train rows first, then test rows. The array is deliberately NOT named
# `xgb`: that name is the imported xgboost module, and rebinding it would
# break xgb_kfold on any later re-run of the training cells.
xgb_preds = np.concatenate([oof_preds, sub_preds], axis=0)

# NOTE(review): this rebinds `tsvd`, replacing the decomposition fitted on the
# targets earlier in the notebook — confirm nothing later needs that object.
# random_state fixes the ARPACK starting vector so the saved features are
# reproducible.
tsvd = TruncatedSVD(n_components=100, algorithm='arpack', random_state=42)
xgb_svd = tsvd.fit_transform(xgb_preds)
np.save(feature_path + 'multi_xgb_svd_100.npy', xgb_svd)