<a href="https://colab.research.google.com/github/chumingyzx/Kaggle---Multimodal-Single-Cell-Integration/blob/main/cite_xgb_transformed_sparse_matrix.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
import os
import glob

# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')
# Data folder on the mounted drive, addressed through the "My Drive" path.
folder_path = '/content/drive/My Drive/NeurIPS/Data/'
# Entry names found in that folder; raises immediately if the folder does not exist.
all_files = os.listdir(folder_path)

In [None]:
if not os.path.exists('/opt/conda/lib/python3.7/site-packages/tables'):
  !pip install tables
if not os.path.exists('/opt/conda/lib/python3.7/site-packages/xgboost'):
  !pip install xgboost
# if not os.path.exists('/opt/conda/lib/python3.7/site-packages/tensorflow'):
  # !pip install --upgrade tensorflow

In [None]:
%%time
import pandas as pd
import numpy as np
import gc
import os
import random
import pickle
from sklearn.model_selection import StratifiedKFold,KFold
from scipy.sparse import hstack,vstack,csr_matrix,save_npz,load_npz
from sklearn.decomposition import TruncatedSVD
from scipy.stats import spearmanr
import xgboost as xgb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Input

############################################################################
#----- working directories on the mounted drive -----
############################################################################
# input_path and feature_path currently refer to the same folder.
input_path = feature_path = '/content/drive/MyDrive/NeurIPS/Data/'
# model_path and sub_path currently refer to the same folder.
model_path = sub_path = '/content/drive/MyDrive/NeurIPS/'

# Load the sparse (CSR) input matrix and the training targets

In [None]:
%%time

# Row-id frames for the train and test cells; only their lengths are used below
# to split the stacked feature matrix.
train_df = pd.read_feather(feature_path+'train_cite_inputs_id.feather')
test_df = pd.read_feather(feature_path+'test_cite_inputs_id.feather')

# Single sparse matrix whose first len(train_df) rows are sliced off as train
# and whose remaining rows are treated as test.
cite_inputs_sparse = load_npz(feature_path+"cite_inputs_sparse.npz")
train_cite_y = np.load(feature_path+'train_cite_targets.npy')

# Fail fast on misaligned inputs: the training loop sizes its prediction buffers
# from train_df/test_df, so a row-count mismatch would otherwise only surface
# (or silently misalign rows) after training has started.
n_train, n_test = len(train_df), len(test_df)
assert cite_inputs_sparse.shape[0] == n_train + n_test, (
    f'cite_inputs_sparse has {cite_inputs_sparse.shape[0]} rows, '
    f'expected {n_train} train + {n_test} test'
)
assert train_cite_y.shape[0] == n_train, (
    f'train_cite_targets has {train_cite_y.shape[0]} rows, expected {n_train}'
)

train_cite_X = cite_inputs_sparse[:n_train]
test_cite_X = cite_inputs_sparse[n_train:]

In [None]:
# Show the shapes of the train features, test features and train targets.
(train_cite_X.shape, test_cite_X.shape, train_cite_y.shape)

# Model: K-fold XGBoost regression, one target column at a time

In [None]:
%%time


def xgb_kfold(train_df, test_df, train_cite_X, train_cite_y, test_cite_X, folds):
    """Cross-validated XGBoost regression for a single target.

    For every split produced by ``folds`` a booster is trained with early
    stopping on the held-out part, then used to predict both the held-out
    rows and the full test matrix.

    Args:
        train_df: Object with ``shape[0]`` equal to the number of training
            rows; it is also what ``folds.split`` is called on.
        test_df: Object with ``shape[0]`` equal to the number of test rows.
        train_cite_X: Training features, indexable by an array of row
            positions and accepted by ``xgb.DMatrix``.
        train_cite_y: Training labels, indexable by the same row positions.
        test_cite_X: Test features accepted by ``xgb.DMatrix``.
        folds: Splitter exposing ``split(X)`` and ``n_splits``.

    Returns:
        Tuple ``(oof_preds, sub_preds)``: out-of-fold predictions for every
        training row, and test predictions averaged over the folds.
    """
    params = {
        'objective': 'reg:squarederror',
        'learning_rate': 0.1,
        'max_depth': 5,
        'lambda': 1.0,
        'alpha': 0.1,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        # NOTE(review): newer XGBoost releases prefer device='cuda' together with
        # tree_method='hist' and deprecate 'gpu_hist' — confirm against the
        # installed version before changing.
        'device': 'gpu',
        'tree_method': 'gpu_hist',
        'verbosity': 0,
        'seed': 42
    }

    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])

    # The test matrix does not depend on the fold, so build it only once.
    dtest = xgb.DMatrix(test_cite_X)

    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df)):
        print(f'n_fold: {n_fold}')

        train_x = train_cite_X[train_idx]
        valid_x = train_cite_X[valid_idx]
        train_y = train_cite_y[train_idx]
        valid_y = train_cite_y[valid_idx]

        dtrain = xgb.DMatrix(train_x, label=train_y)
        dvalid = xgb.DMatrix(valid_x, label=valid_y)

        bst = xgb.train(
            params,
            dtrain,
            num_boost_round=10000,
            evals=[(dvalid, 'valid')],
            early_stopping_rounds=100,
            verbose_eval=1000
        )

        # iteration_range is half-open [begin, end) and best_iteration is a
        # 0-based index, so the end must be best_iteration + 1 to include the
        # best tree. Without the +1 the best round is dropped, and a best
        # iteration of 0 yields (0, 0), which XGBoost reads as "use all trees".
        best_range = (0, bst.best_iteration + 1)

        oof_preds[valid_idx] = bst.predict(dvalid, iteration_range=best_range)
        sub_preds += bst.predict(dtest, iteration_range=best_range) / folds.n_splits

    return oof_preds, sub_preds


In [None]:
%%time
seed = 42
# The same shuffled splitter (and therefore the same fold assignment) is reused
# for every target column.
folds = KFold(n_splits= 5, shuffle=True, random_state=seed)
train_preds = []
test_preds = []
# Number of target columns comes from the data rather than a hard-coded 140,
# so the loop stays correct if the target matrix changes width.
n_targets = train_cite_y.shape[1]
for i in range(n_targets):
    print('=====================')
    print(i)
    # Fit an independent single-output model on target column i.
    train_cite_y_single = train_cite_y[:,i]
    oof_preds,sub_preds = xgb_kfold(train_df, test_df, train_cite_X, train_cite_y_single, test_cite_X, folds)
    train_preds.append(oof_preds)
    test_preds.append(sub_preds)

In [None]:
def correlation_score(y_true, y_pred):
    """Scores the predictions according to the competition rules.

    It is assumed that the predictions are not constant.

    Args:
        y_true: 2-D array or DataFrame of true values, one sample per row.
        y_pred: 2-D array or DataFrame of predictions with the same layout.

    Returns:
        The average of each sample's (row's) Pearson correlation coefficient.

    Raises:
        ZeroDivisionError: If ``y_true`` has no rows.
    """
    # isinstance (not an exact type comparison) so DataFrame subclasses are also
    # converted; otherwise y[i] below would select column i instead of row i.
    if isinstance(y_true, pd.DataFrame): y_true = y_true.values
    if isinstance(y_pred, pd.DataFrame): y_pred = y_pred.values
    n_rows = len(y_true)
    # Off-diagonal entry of the 2x2 correlation matrix is the row-wise Pearson r.
    row_corrs = [np.corrcoef(y_true[i], y_pred[i])[1, 0] for i in range(n_rows)]
    return sum(row_corrs) / n_rows

# Width of the prediction matrices, taken from the target array instead of a
# hard-coded 140 so it always matches what correlation_score compares against.
n_targets = train_cite_y.shape[1]

# Column n holds the out-of-fold predictions collected for target n; columns
# without a collected prediction stay at zero.
oof_preds = np.zeros((len(train_df), n_targets))
for n, target_oof in enumerate(train_preds):
    oof_preds[:,n] = target_oof

cv = correlation_score(train_cite_y, oof_preds)
print (cv)

# Same layout for the fold-averaged test predictions.
sub_preds = np.zeros((len(test_df), n_targets))
for n, target_sub in enumerate(test_preds):
    sub_preds[:,n] = target_sub

# Stack train (out-of-fold) rows above test rows so one decomposition covers both.
xgb1 = np.concatenate([oof_preds,sub_preds],axis=0)

# NOTE(review): random_state is fixed for run-to-run repeatability. scikit-learn
# documents it for the randomized solver; with 'arpack' it is harmless and may
# seed the solver's start vector depending on the installed version — confirm.
tsvd = TruncatedSVD(n_components=100, algorithm='arpack', random_state=seed)
xgb1_svd = tsvd.fit_transform(xgb1)
np.save(feature_path+'cite_xgb1_svd_100.npy', xgb1_svd)