In [1]:
import os
import time
import random
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, KBinsDiscretizer
from sklearn.metrics import roc_curve, auc, precision_recall_curve, roc_auc_score
from sklearn.model_selection import KFold, StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier

from sklearn.pipeline import Pipeline

# hide Warning(s)
import warnings 
warnings.filterwarnings('ignore')

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all' #last_expr

In [2]:
def load_data(project, path='./dataset/'):
    """Load the ten tab-separated tables that belong to one project.

    Each table is read from ``<path>/<project>_<suffix>.tsv`` with the first
    column used as the index.

    Parameters
    ----------
    project : str
        Project identifier used as the file-name prefix.
    path : str
        Directory that holds the files. A trailing separator is optional:
        the directory and file name are combined with ``os.path.join``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(data, pairs, train, valid, test, pairTrain, train_pairs,
        pairValid, valid_pairs, features)`` in exactly that order.
    """
    # Order matters: callers unpack the result positionally.
    suffixes = (
        'data', 'pairs', 'train', 'valid', 'test',
        'pairTrain', 'train_pairs', 'pairValid', 'valid_pairs', 'features',
    )
    tables = []
    for suffix in suffixes:
        filename = os.path.join(path, project+'_'+suffix+'.tsv')
        tables.append(pd.read_csv(filename, sep='\t', index_col=0))
    return tuple(tables)

In [3]:
# Project identifiers (file-name prefixes understood by load_data), grouped
# under the two names IBD and CRC.
IBD = [
    '27', '36', '38', '48_1', '59',
]
CRC = [
    '18_2', '26', '28_2', '81_2', '82', '83',
    '87', '91', '93_3', '97', '99_2',
]

# Alternative classifiers that were tried; kept commented out for reference.
#clf = LogisticRegression(random_state=0)
#clf = Pipeline([('scaler', KBinsDiscretizer(encode='ordinal')), ('logist', LogisticRegression(random_state=0))])
#clf = RandomForestClassifier(n_estimators=100, max_features=0.2)

# Active choice: standardize the inputs, then fit a 100-tree random forest.
clf = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('rf', RandomForestClassifier(n_estimators=100, max_features=0.2)),
])

In [4]:
# Interactive setup for a single target project: load all of its tables and
# collect the evaluation sets used by the ad-hoc cells further down.
project = '38'
(data, pairs, train, valid, test,
 pairTrain, train_pairs, pairValid, valid_pairs, features) = load_data(project, path='./dataset/')

# Every other IBD project contributes its full `data` table ...
extend_datasets = {}
for i in IBD:
    if i == project:
        continue
    extend_datasets[i] = load_data(i, path='./dataset/')[0]

# ... followed by the target project's own train/valid/test splits.
extend_datasets.update([
    (project+'_train', train),
    (project+'_valid', valid),
    (project+'_test', test),
])

In [16]:
def train_step(data, features, pairs, ntree=100, nsample=7, min_paired=0.5, random_state=None):
    """Train ``ntree`` decision trees, each on a small subsample of ``data``.

    For every tree one uniform random draw decides how its rows are chosen:

    * draw > ``min_paired``: a stratified random subsample of ``nsample``
      rows, stratified on ``data['Group']``;
    * otherwise: the rows referenced by ``int(nsample/2)+1`` randomly chosen
      rows of ``pairs`` (the union of their 'Case' and 'Control' entries).

    So ``min_paired`` acts as the probability of the pair-based branch.

    Parameters
    ----------
    data : pandas.DataFrame
        Samples in rows; must contain the ``features`` columns and a
        'Group' label column.
    features : list-like
        Column labels used as model inputs.
    pairs : pandas.DataFrame
        Has 'Case' and 'Control' columns whose values are used as integer
        row positions into ``data`` (they are passed to ``data.iloc``).
    ntree : int
        Number of trees to train.
    nsample : int
        Subsample size for the stratified branch; also determines how many
        pairs are drawn in the pair-based branch.
    min_paired : float
        Threshold compared against the uniform draw (see above).
    random_state : int or None
        Seed for the branch choice and the pair sampling. ``None`` (the
        default) keeps using the global ``random`` module state.

    Returns
    -------
    list
        The fitted ``DecisionTreeClassifier`` objects, in training order.

    Raises
    ------
    ValueError
        If the pair-based branch is selected but ``pairs`` has no rows.
    """
    # A private generator makes runs repeatable without touching global state.
    rng = random if random_state is None else random.Random(random_state)
    trees = []
    for i in range(ntree):
        clf = DecisionTreeClassifier(random_state=i, max_features=0.25)
        if rng.random()>min_paired:
            # The "test" side of the split is the nsample-row subsample we
            # train on. Stratify on the same label column that is fitted,
            # instead of assuming the label is the last column of `data`.
            sss = StratifiedShuffleSplit(n_splits=1, test_size=nsample, random_state=i)
            _, index = next(sss.split(data, data['Group']))
            subdata = data.iloc[index, :]
        else:
            if len(pairs)==0:
                raise ValueError('train_step: `pairs` is empty, cannot draw paired samples')
            # Never ask for more pairs than exist (random.sample would raise).
            npairs = min(int(nsample/2)+1, len(pairs))
            rindex = rng.sample(list(pairs.index), npairs)
            # Sorted so the row order does not depend on set iteration order.
            rsamples = sorted(set(pairs.loc[rindex, 'Case']).union(pairs.loc[rindex, 'Control']))
            subdata = data.iloc[rsamples, :]
        clf.fit(subdata[features], subdata['Group'])
        trees.append(clf)
    return trees

def predict_step(trees, data, features):
    """Average the class predictions of all ``trees`` for every row of ``data``.

    Each model's ``predict`` is called on ``data[features]``; the predictions
    are stacked (one row per model) and averaged per sample.

    Returns
    -------
    numpy.ndarray
        Per-sample mean of the predictions, rounded to 5 decimals.
    """
    votes = np.array([member.predict(data[features]) for member in trees])
    mean_vote = votes.mean(0)
    return np.round(mean_vote, 5)

In [6]:
# Grid experiment for the IBD projects.
# For every target project, every subsample size and every min_paired value,
# three feature sets are evaluated in two settings:
#   * Study2Study: train on the target project's `train` split, score on
#     every other project's data and on the target's train/valid/test splits;
#   * LODO: train on the concatenation of all other projects, score on the
#     target project's full `data` table.
projects = ['27', '36', '38', '48_1', '59']
disease = 'IBD'
ntree = 101
nsamples = [3, 5, 7, 9, 11]
min_paireds = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

results = []
for project in projects:
    ### data
    data, pairs, train, valid, test, pairTrain, train_pairs, pairValid, valid_pairs, features = load_data(project, path='./dataset/')
    extend_datasets = {}
    # Collect the pieces first and concatenate once, instead of growing a
    # DataFrame inside the loop.
    lodo_frames = []
    lodo_pair_frames = []
    for i in projects:
        if i == project:
            continue
        extend_datasets[i], extend_pairs = load_data(i, path='./dataset/')[:2]
        temp = extend_datasets[i].copy()
        # Prefix sample ids with the project id so they stay distinct after pooling.
        temp.index = i+'_'+temp.index
        lodo_frames.append(temp)
        extend_pairs = i+'_'+extend_pairs
        lodo_pair_frames.append(extend_pairs)
    LODO_data = pd.concat(lodo_frames, axis=0, sort=False) if lodo_frames else pd.DataFrame()
    LODO_pairs = pd.concat(lodo_pair_frames, axis=0, sort=False) if lodo_pair_frames else pd.DataFrame()
    extend_datasets[project+'_train'] = train
    extend_datasets[project+'_valid'] = valid
    extend_datasets[project+'_test'] = test

    # train_step indexes `data` by position, so translate the prefixed sample
    # ids in LODO_pairs into row positions of LODO_data. A dict lookup replaces
    # the repeated list.index scans; setdefault keeps the first occurrence,
    # which is what list.index returned.
    sample_position = {}
    for position, sample in enumerate(LODO_data.index):
        sample_position.setdefault(sample, position)
    LODO_pairs_index = pd.DataFrame(
        [[sample_position[case], sample_position[control]] for case, control in LODO_pairs.values],
        columns=['Case', 'Control'])

    # The three feature sets depend only on the project, not on the grid point.
    in_overlap = features['overlap_features']==1
    feature_sets = [
        ('overlap', features.loc[in_overlap, :].index),
        ('unmatched', features.loc[in_overlap&(features['unmatched_diff_features']==1), :].index),
        ('matched', features.loc[in_overlap&(features['matched_diff_features']==1), :].index),
    ]

    ###
    for nsample in nsamples:
        for min_paired in min_paireds:
            print('###', disease, project, ntree, nsample, min_paired)

            for feature_label, overlap_features in feature_sets:
                # A tree cannot be fitted on zero features; skip such a set
                # (the CRC cell already guards its 'matched' set this way).
                if len(overlap_features)==0:
                    continue

                ### Study2Study
                trees = train_step(train, overlap_features, train_pairs, ntree=ntree, nsample=nsample, min_paired=min_paired)
                for name, dataset in extend_datasets.items():
                    score = roc_auc_score(dataset['Group'], predict_step(trees, dataset, overlap_features))
                    results.append(['Study2Study', disease, ntree, nsample, min_paired, feature_label, project, name, score])

                ### LODO
                trees = train_step(LODO_data, overlap_features, LODO_pairs_index, ntree=ntree, nsample=nsample, min_paired=min_paired)
                score = roc_auc_score(data['Group'], predict_step(trees, data, overlap_features))
                results.append(['LODO', disease, ntree, nsample, min_paired, feature_label, 'Others', project, score])


results = pd.DataFrame(results, columns=['Type', 'Disease', 'NTree', 'NSample', 'Min_paired_ratio', 'Feature', 'TrainingSet', 'TestSet', 'AUC'])

### IBD 27 101 3 0.0
### IBD 27 101 3 0.1
### IBD 27 101 3 0.2
### IBD 27 101 3 0.3
### IBD 27 101 3 0.4
### IBD 27 101 3 0.5
### IBD 27 101 3 0.6
### IBD 27 101 3 0.7
### IBD 27 101 3 0.8
### IBD 27 101 3 0.9
### IBD 27 101 3 1.0
### IBD 27 101 5 0.0
### IBD 27 101 5 0.1
### IBD 27 101 5 0.2
### IBD 27 101 5 0.3
### IBD 27 101 5 0.4
### IBD 27 101 5 0.5
### IBD 27 101 5 0.6
### IBD 27 101 5 0.7
### IBD 27 101 5 0.8
### IBD 27 101 5 0.9
### IBD 27 101 5 1.0
### IBD 27 101 7 0.0
### IBD 27 101 7 0.1
### IBD 27 101 7 0.2
### IBD 27 101 7 0.3
### IBD 27 101 7 0.4
### IBD 27 101 7 0.5
### IBD 27 101 7 0.6
### IBD 27 101 7 0.7
### IBD 27 101 7 0.8
### IBD 27 101 7 0.9
### IBD 27 101 7 1.0
### IBD 27 101 9 0.0
### IBD 27 101 9 0.1
### IBD 27 101 9 0.2
### IBD 27 101 9 0.3
### IBD 27 101 9 0.4
### IBD 27 101 9 0.5
### IBD 27 101 9 0.6
### IBD 27 101 9 0.7
### IBD 27 101 9 0.8
### IBD 27 101 9 0.9
### IBD 27 101 9 1.0
### IBD 27 101 11 0.0
### IBD 27 101 11 0.1
### IBD 27 101 11 0.2
### IBD 27

In [7]:
# Write the results table as a tab-separated file named after the current disease.
output_name = 'M2. Emsemble_'+disease+'.tsv'
results.to_csv(output_name, sep='\t')

In [17]:
# Grid experiment for the CRC projects.
# For every target project, every subsample size and every min_paired value,
# three feature sets are evaluated in two settings:
#   * Study2Study: train on the target project's `train` split, score on
#     every other project's data and on the target's train/valid/test splits;
#   * LODO: train on the concatenation of all other projects, score on the
#     target project's full `data` table.
projects = ['18_2', '26', '28_2', '81_2', '82', '83', '87', '91', '93_3', '97', '99_2']
disease = 'CRC'
ntree = 101
nsamples = [3, 5, 7, 9, 11]
min_paireds = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

results = []
for project in projects:
    ### data
    data, pairs, train, valid, test, pairTrain, train_pairs, pairValid, valid_pairs, features = load_data(project, path='./dataset/')
    extend_datasets = {}
    # Collect the pieces first and concatenate once, instead of growing a
    # DataFrame inside the loop.
    lodo_frames = []
    lodo_pair_frames = []
    for i in projects:
        if i == project:
            continue
        extend_datasets[i], extend_pairs = load_data(i, path='./dataset/')[:2]
        temp = extend_datasets[i].copy()
        # Prefix sample ids with the project id so they stay distinct after pooling.
        temp.index = i+'_'+temp.index
        lodo_frames.append(temp)
        extend_pairs = i+'_'+extend_pairs
        lodo_pair_frames.append(extend_pairs)
    LODO_data = pd.concat(lodo_frames, axis=0, sort=False) if lodo_frames else pd.DataFrame()
    LODO_pairs = pd.concat(lodo_pair_frames, axis=0, sort=False) if lodo_pair_frames else pd.DataFrame()
    extend_datasets[project+'_train'] = train
    extend_datasets[project+'_valid'] = valid
    extend_datasets[project+'_test'] = test

    # train_step indexes `data` by position, so translate the prefixed sample
    # ids in LODO_pairs into row positions of LODO_data. A dict lookup replaces
    # the repeated list.index scans; setdefault keeps the first occurrence,
    # which is what list.index returned.
    sample_position = {}
    for position, sample in enumerate(LODO_data.index):
        sample_position.setdefault(sample, position)
    LODO_pairs_index = pd.DataFrame(
        [[sample_position[case], sample_position[control]] for case, control in LODO_pairs.values],
        columns=['Case', 'Control'])

    # The three feature sets depend only on the project, not on the grid point.
    in_overlap = features['overlap_features']==1
    feature_sets = [
        ('overlap', features.loc[in_overlap, :].index),
        ('unmatched', features.loc[in_overlap&(features['unmatched_diff_features']==1), :].index),
        ('matched', features.loc[in_overlap&(features['matched_diff_features']==1), :].index),
    ]

    ###
    for nsample in nsamples:
        for min_paired in min_paireds:
            print('###', disease, project, ntree, nsample, min_paired)

            for feature_label, overlap_features in feature_sets:
                # A tree cannot be fitted on zero features; skip such a set.
                # Previously only the 'matched' set was guarded.
                if len(overlap_features)==0:
                    continue

                ### Study2Study
                trees = train_step(train, overlap_features, train_pairs, ntree=ntree, nsample=nsample, min_paired=min_paired)
                for name, dataset in extend_datasets.items():
                    score = roc_auc_score(dataset['Group'], predict_step(trees, dataset, overlap_features))
                    results.append(['Study2Study', disease, ntree, nsample, min_paired, feature_label, project, name, score])

                ### LODO
                trees = train_step(LODO_data, overlap_features, LODO_pairs_index, ntree=ntree, nsample=nsample, min_paired=min_paired)
                score = roc_auc_score(data['Group'], predict_step(trees, data, overlap_features))
                results.append(['LODO', disease, ntree, nsample, min_paired, feature_label, 'Others', project, score])


results = pd.DataFrame(results, columns=['Type', 'Disease', 'NTree', 'NSample', 'Min_paired_ratio', 'Feature', 'TrainingSet', 'TestSet', 'AUC'])
results.to_csv('M2. Emsemble_'+disease+'.tsv', sep='\t')

### CRC 99_2 101 3 0.0
### CRC 99_2 101 3 0.1
### CRC 99_2 101 3 0.2
### CRC 99_2 101 3 0.3
### CRC 99_2 101 3 0.4
### CRC 99_2 101 3 0.5
### CRC 99_2 101 3 0.6
### CRC 99_2 101 3 0.7
### CRC 99_2 101 3 0.8
### CRC 99_2 101 3 0.9
### CRC 99_2 101 3 1.0
### CRC 99_2 101 5 0.0
### CRC 99_2 101 5 0.1
### CRC 99_2 101 5 0.2
### CRC 99_2 101 5 0.3
### CRC 99_2 101 5 0.4
### CRC 99_2 101 5 0.5
### CRC 99_2 101 5 0.6
### CRC 99_2 101 5 0.7
### CRC 99_2 101 5 0.8
### CRC 99_2 101 5 0.9
### CRC 99_2 101 5 1.0
### CRC 99_2 101 7 0.0
### CRC 99_2 101 7 0.1
### CRC 99_2 101 7 0.2
### CRC 99_2 101 7 0.3
### CRC 99_2 101 7 0.4
### CRC 99_2 101 7 0.5
### CRC 99_2 101 7 0.6
### CRC 99_2 101 7 0.7
### CRC 99_2 101 7 0.8
### CRC 99_2 101 7 0.9
### CRC 99_2 101 7 1.0
### CRC 99_2 101 9 0.0
### CRC 99_2 101 9 0.1
### CRC 99_2 101 9 0.2
### CRC 99_2 101 9 0.3
### CRC 99_2 101 9 0.4
### CRC 99_2 101 9 0.5
### CRC 99_2 101 9 0.6
### CRC 99_2 101 9 0.7
### CRC 99_2 101 9 0.8
### CRC 99_2 101 9 0.9
### CRC 99_

In [15]:
# Preview only the first rows instead of rendering the whole pooled table.
LODO_data.head()

In [None]:
# Ad-hoc check: all overlap features, 301 trees, min_paired=0.0.
# Uses whatever `features`, `train`, `train_pairs` and `extend_datasets` are
# currently in the kernel.
is_overlap = features['overlap_features']==1
overlap_features = features.loc[is_overlap, :].index
trees = train_step(train, overlap_features, train_pairs, ntree=301, nsample=7, min_paired=0.0)
for name in extend_datasets:
    dataset = extend_datasets[name]
    votes = predict_step(trees, dataset, overlap_features)
    # Bare expression on purpose: whether it is shown depends on the
    # notebook's ast_node_interactivity setting.
    name, roc_auc_score(dataset['Group'], votes)

In [None]:
# Ad-hoc check: all overlap features, 301 trees, min_paired=0.2.
is_overlap = features['overlap_features']==1
overlap_features = features.loc[is_overlap, :].index
trees = train_step(train, overlap_features, train_pairs, ntree=301, nsample=7, min_paired=0.2)
for name in extend_datasets:
    dataset = extend_datasets[name]
    votes = predict_step(trees, dataset, overlap_features)
    # Bare expression on purpose: display depends on ast_node_interactivity.
    name, roc_auc_score(dataset['Group'], votes)

In [None]:
# Ad-hoc check: all overlap features, 301 trees, min_paired=0.5.
is_overlap = features['overlap_features']==1
overlap_features = features.loc[is_overlap, :].index
trees = train_step(train, overlap_features, train_pairs, ntree=301, nsample=7, min_paired=0.5)
for name in extend_datasets:
    dataset = extend_datasets[name]
    votes = predict_step(trees, dataset, overlap_features)
    # Bare expression on purpose: display depends on ast_node_interactivity.
    name, roc_auc_score(dataset['Group'], votes)

In [None]:
# Ad-hoc check: overlap features that are also flagged matched_diff_features,
# 301 trees, min_paired=0.5.
is_overlap = features['overlap_features']==1
is_matched_diff = features['matched_diff_features']==1
overlap_features = features.loc[is_overlap&is_matched_diff, :].index
trees = train_step(train, overlap_features, train_pairs, ntree=301, nsample=7, min_paired=0.5)
for name in extend_datasets:
    dataset = extend_datasets[name]
    votes = predict_step(trees, dataset, overlap_features)
    # Bare expression on purpose: display depends on ast_node_interactivity.
    name, roc_auc_score(dataset['Group'], votes)

In [None]:
# Ad-hoc check: overlap features that are also flagged unmatched_diff_features,
# 101 trees, train_step's default min_paired.
is_overlap = features['overlap_features']==1
is_unmatched_diff = features['unmatched_diff_features']==1
overlap_features = features.loc[is_overlap&is_unmatched_diff, :].index
trees = train_step(train, overlap_features, train_pairs, ntree=101, nsample=7)
for name in extend_datasets:
    dataset = extend_datasets[name]
    votes = predict_step(trees, dataset, overlap_features)
    # Bare expression on purpose: display depends on ast_node_interactivity.
    name, roc_auc_score(dataset['Group'], votes)

In [None]:
# Ad-hoc check: overlap features that are also flagged matched_diff_features,
# 101 trees, train_step's default min_paired.
is_overlap = features['overlap_features']==1
is_matched_diff = features['matched_diff_features']==1
overlap_features = features.loc[is_overlap&is_matched_diff, :].index
trees = train_step(train, overlap_features, train_pairs, ntree=101, nsample=7)
for name in extend_datasets:
    dataset = extend_datasets[name]
    votes = predict_step(trees, dataset, overlap_features)
    # Bare expression on purpose: display depends on ast_node_interactivity.
    name, roc_auc_score(dataset['Group'], votes)