In [1]:
from module.prepare import *
from itertools import product
from sklearn.externals import joblib
from sklearn import metrics
from sklearn.model_selection import ParameterGrid

import gc
import os
import re
import math
import sys
from collections import Counter
import random
from itertools import islice
import time
import configparser
import json
import datetime

import seaborn as sns
import matplotlib.pyplot as plt
import bokeh

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
import scipy as sp
from scipy import stats

import sklearn
from joblib import dump, load
from sklearn.decomposition import *
from sklearn.feature_selection import *
from sklearn.ensemble import *
from sklearn.model_selection import *
from sklearn.linear_model import *
from sklearn.manifold import *
import sklearn.tree as Tr 
from sklearn.neural_network import MLPClassifier

import lightgbm as lgb
import optuna
import optuna.integration.lightgbm

from module.metrics import *



In [2]:
def getCurrentTime():
    return datetime.datetime.strftime(datetime.datetime.fromtimestamp(time.time()),format='%Y-%m-%d-%H-%M-%S')


def LGBOptuna(trial):
    data, target = sklearn.datasets.load_breast_cancer(return_X_y=True)
    train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.25)
    dtrain = lgb.Dataset(train_x, label=train_y)
 
    param = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
    }
 
    gbm = lgb.train(param, dtrain)
    preds = gbm.predict(test_x)
    pred_labels = np.rint(preds)
    accuracy = sklearn.metrics.accuracy_score(test_y, pred_labels)
    return accuracy
 
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=100)

    print('Number of finished trials:', len(study.trials))
    print('Best trial:', study.best_trial.params)
    return

def LGBTuning(Xtrain,Xtest,Ytrain,Ytest,new_params=None):
    
    clf = lgb.LGBMClassifier(objective='cross_entropy', ### {cross_entropy, binary}
#                              silent=False,
                             verbose=0,
                             random_state=seed,
                             n_jobs=20,
#                              class_weight
                            )
    
    default_params = {
    'learning_rate': [0.1], 
    'boosting_type':['gbdt'], 
    'n_estimators': [500],
    'num_iterations':[1000],
    'max_bin':[256]
    }
    
    if new_params is not None:
        default_params.update(new_params)
    
    arg_str = ''
    for k,v in default_params.items():
        if type(v[0])==str:
            arg_str += k+'='+"'"+v[0]+"',"
        else:
            arg_str += k+'='+str(v[0])+","
    eval(
        'clf.'+clf.set_params.__name__+"("
            +arg_str.rstrip(',')+
            ")"
        )

#     print('DEBUG:: tuning params\n',clf.get_params())
    clf.fit(Xtrain,Ytrain)
    Ypred = clf.predict(Xtest)
    score_train = clf.score(Xtrain,Ytrain)
    print('train score %f'%score_train)
    
    return [Ypred,Ytest,score_train,clf]

In [3]:
cv = 5
generalize_ratio = 1.0/cv
test_ratio = 1.0/cv
tuning_mode = False

if tuning_mode:
    cv = 1

tuning 1

In [None]:
def TuningParametersStage1(fname=getCurrentTime()+'-stage1.csv'):
    res = []
    for DATAID in [3]:
        INFO('data id %d'%DATAID)
        for RNA_K in range(3,7):
            for PROTEIN_K in range(3,7):
                for TOP_RATIO in np.linspace(0.93,0.99,5):
                    start_time = time.time()
                    [data,T] = ReadData(DATAID,PROTEIN_K,RNA_K)
                    [X,Y] = ToMatrix(data,'dense')
                    
                    for _cv in range(5): ### cv testing
                        [X_train,X_test,Y_train,Y_test] = SplitDataset(X,Y,generalize_ratio)
                        [X_train,X_test,Y_train,Y_test] = \
                                    RandomForestDimensionalityReduction(X_train,X_test,Y_train,Y_test,topRatio=TOP_RATIO)
                        r = LGBTuning(X_train,X_test,Y_train,Y_test)
                        r = {
                            'DATAID': DATAID,
                            'test_score':scoreFunction(r[0],r[1]),
                            'train_score':r[2],
                            'RNA_K':RNA_K,
                            'PROTEIN_K':PROTEIN_K,
                            'TOP_RATIO':TOP_RATIO,
                            'cv':_cv,
                        }
                        print('DEBUG:: result ',r)
                        res.append(r)
                    end_time = time.time()
                    print('DEBUG:: time elapsed ',(end_time-start_time)/60)
    df = pd.DataFrame(data=res,columns=['DATAID','test_score','train_score','RNA_K','PROTEIN_K','TOP_RATIO'])
    df.to_csv(os.path.join('./result',fname))
    
    return


In [None]:
TuningParametersStage1('data-3-global-tune-1.csv')

In [6]:
import json
import re

'''
['NPInter10412','reRPI2825','RPI488','RPI2241','RPI1807','LPI43250','EVLncRNAs']
'''
params_1 = {
    0:[(5,5,0.99)],
    1:[(3,4,0.96)], # stage1
    2:[(4,6,0.96)],
    3:[(3,3,0.975),(6,3,0.93),(4,3,0.975)],
    4:[(4,3,0.96)],
    5:[(4,6,0.945)]
}



stage 2

In [5]:
fname_result2 = './result/'+getCurrentTime()+'-stage2-3.csv'

tune_grid = [
    [{
        "boosting_type": ["gbdt"], 
        "learning_rate": [0.1], 
        "n_estimators": [500], 
        "num_iterations": [2000],
    }],
    [{
        'learning_rate': [0.025,0.03,0.035,0.04], ### 0.1
        'boosting_type':['gbdt'], ### goss>gbdt
        'n_estimators': [500],
        'num_iterations':[2000], ### 2000
        'max_depth': [8,9,10], ### <400<675
        'max_bin':[256],
        'colsample_bytree' : [0.95,1], ### 0.75
        'bagging_fraction':[0.9,0.95,1], ### 1
        'bagging_freq':[1,2,3],
        'lambda_l1': [0.075,0.1,0.125],
    }],
    [{
        "bagging_fraction": [0.95], 
        "bagging_freq": [2], 
        "boosting_type": ["gbdt"], 
        "colsample_bytree": [1], 
        "lambda_l1": [0.01], 
        "learning_rate": [0.2], 
        "max_bin": [256], 
        "max_depth": [7], 
        "n_estimators": [500], 
        "num_iterations": [2000],
        
#         'learning_rate': [0.15,0.2,0.25], ### 0.1
#         'boosting_type':['gbdt'], ### goss>gbdt
#         'n_estimators': [500],
#         'num_iterations':[1500,2000], ### 2000
#         'max_depth': [5,7,9], ### <400<675
#         'max_bin':[256],
#         'colsample_bytree' : [0.8,0.9,1], ### 0.75
#         'bagging_fraction':[0.9,0.95,1], ### 1
#         'bagging_freq':[2,3,4],
#         'lambda_l1': [0.005,0.01,0.015],
     }],
    [{
#         "boosting_type": ["gbdt"], 
#         "colsample_bytree": [0.8], 
#         "learning_rate": [0.05], 
#         "max_bin": [256], 
#         "n_estimators": [500], 
#         "num_iterations": [1000],
        
        'learning_rate': [0.15,0.2,0.25], ### 0.1
        'boosting_type':['gbdt'], ### goss>gbdt
        'n_estimators': [500],
        'num_iterations':[2000], ### 2000
        'max_depth': [12,13,14,15], ### <400<675
        'max_bin':[256],
        'colsample_bytree' : [0.7,0.8,1], ### 0.75
        'bagging_fraction':[0.82,0.9,1], ### 1
#         'bagging_freq':[2,3,4],
        'lambda_l1': [0,0.01,0.02],
     }],
    [{
        "boosting_type": ["gbdt"], 
        "learning_rate": [0.01], 
        "max_bin": [256], 
        "n_estimators": [500], 
        "num_iterations": [2000],
     }],
    [{
        "boosting_type": ["gbdt"], 
        "learning_rate": [0.1], 
        "n_estimators": [500], 
        "num_iterations": [1000],
     }]
]

tune_grid = list(map(lambda x:list(ParameterGrid(x)),tune_grid))


In [None]:
tuning_cv2 = 5
tuning_generalize_ratio2 = 1.0/tuning_cv2 if tuning_cv2!=1 else 0.2

df_columns = ['dataid','protein_k','rna_k','top_ratio','training_score','tune_param','scores','cv']
df_result2 = pd.DataFrame([],columns=df_columns)

count = 0
for _dataid in [3]:
    INFO('dataid %d'%_dataid)
    for global_params in params_1[_dataid]:
        
        ### get conf of dataid
        protein_k = global_params[0]
        rna_k = global_params[1]
        top_ratio = global_params[2]
        ### read data
        [data,T] = ReadData(_dataid,protein_k,rna_k)
        [X,Y] = ToMatrix(data,'dense')

        for _cv in range(tuning_cv2):
            
            start_time = time.time()
            
            INFO('tuning cv %d'%_cv)
            
            ### split dataset
            [X_train,X_test,Y_train,Y_test] = SplitDataset(X,Y,tuning_generalize_ratio2)
            ### dimensionality reduction
            [X_train,X_test,Y_train,Y_test] = \
                        RandomForestDimensionalityReduction(X_train,X_test,Y_train,Y_test,topRatio=top_ratio)
            
            for sp in tune_grid[_dataid]:
                sp = dict(map(lambda x:(x,[sp[x]]),sp))
                tune_results = LGBTuning(X_train,X_test,Y_train,Y_test,sp)
                tune_score = scoreFunction(tune_results[0],tune_results[1])
                r = pd.Series({
                                'dataid':_dataid,
                                'protein_k':protein_k,
                                'rna_k':rna_k,
                                'top_ratio':top_ratio,
                                'training_score':tune_results[2],
                                'tune_param':json.dumps(sp),
                                'scores':json.dumps(tune_score),
                                'cv':_cv
                })
                df_result2 = df_result2.append(r,ignore_index=True)
                count += 1
                if count%100==1:
                    print(dict(r))
    
            end_time = time.time()
            print('DEBUG:: time elapsed ',(end_time-start_time)/60)
df_result2.to_csv(fname_result2)
            

optuna

In [None]:
import lightgbm as lgb

import optuna

from boruta import BorutaPy

# 1. Define an objective function to be maximized.
def objective(trial):

    # 2. Suggest values of the hyperparameters using a trial object.
    param = {
        'objective': 'cross_entropy',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'learning_rate':trial.suggest_loguniform('learning_rate', 1e-2, 0.5),
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-15, 1),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-15, 4),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024),
        'max_depth': trial.suggest_int('max_depth', 3, 25),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.5, 1.0),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.5, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 20),
        'min_child_samples': trial.suggest_int('min_child_samples', 3, 30),
        
        'n_estimators': 500,
        'num_iterations': 1000,
        'max_bin': 256,
        'random_state': seed,
        'n_jobs': 20,
    }

    dtrain = lgb.Dataset(X_train,Y_train)
    clf = lgb.train(param, dtrain)
    
    Ypred_test = clf.predict(X_test)
    Ypred_test = np.array(list(map(lambda x:0 if x<=0.5 else 1,Ypred_test)))
    
    Ypred_train = clf.predict(X_train)
    Ypred_train = np.array(list(map(lambda x:0 if x<=0.5 else 1,Ypred_train)))
    
    score_test = scoreFunction(Ypred_test,Y_test)
    score_train = scoreFunction(Ypred_train,Y_train)
    
    trial.set_user_attr('test_scores', score_test)
    trial.set_user_attr('train_scores', score_train)
    
    return score_test['acc']

def objective_nn(trial):

    param = {
        'solver': trial.suggest_categorical('solver', ['adam']),
        'learning_rate_init': trial.suggest_loguniform('learning_rate_init', 1e-5, 0.05),
        'hidden_layer_sizes_1': int(trial.suggest_discrete_uniform('hidden_layer_sizes_1', 300, 1000, 10)),
        'hidden_layer_sizes_2': int(trial.suggest_discrete_uniform('hidden_layer_sizes_2', 100, 400, 10)),
        'hidden_layer_sizes_f': int(trial.suggest_discrete_uniform('hidden_layer_sizes_f', 10, 50, 10)),
#         'hidden_layer_sizes_3': int(trial.suggest_discrete_uniform('hidden_layer_sizes', 50, 1000, 10)),
        'max_iter': int(trial.suggest_discrete_uniform('max_iter', 200, 1000, 100)),
        'alpha': trial.suggest_loguniform('alpha', 1e-5, 1),
#         'activation': trial.suggest_categorical('activation', ['tanh','relu']),
#         'beta_1': trial.suggest_loguniform('learning_rate_init', 1e-5, 0.05),
        'batch_size': int(trial.suggest_discrete_uniform('max_iter', 50, 300, 10)),
        'max_bin': 256,
        'random_state': seed,
        'n_jobs': 20,
    }
    
    clf = MLPClassifier(random_state=param['random_state'], 
                        max_iter=param['max_iter'],
                        hidden_layer_sizes=(param['hidden_layer_sizes_1'],param['hidden_layer_sizes_2'],param['hidden_layer_sizes_f']),
                        learning_rate_init=param['learning_rate_init'],
                        solver=param['solver'],
#                         activation=param['activation'],
                        alpha=param['alpha'],
                        batch_size=param['batch_size'],
                       )
    
    clf.fit(X_train,Y_train)
    
    Ypred_test = clf.predict(X_test)
    Ypred_test = np.array(list(map(lambda x:0 if x<=0.5 else 1,Ypred_test)))
    
    Ypred_train = clf.predict(X_train)
    Ypred_train = np.array(list(map(lambda x:0 if x<=0.5 else 1,Ypred_train)))
    
    score_test = scoreFunction(Ypred_test,Y_test)
    score_train = scoreFunction(Ypred_train,Y_train)
    
    trial.set_user_attr('test_scores', score_test)
    trial.set_user_attr('train_scores', score_train)    
    
    return score_test['acc']
    
_dataid = 3
global_params = params_1[_dataid][0]
### get conf of dataid
protein_k = global_params[0]
rna_k = global_params[1]
top_ratio = global_params[2]
### read data
[data,T] = ReadData(_dataid,protein_k,rna_k)
[X,Y] = ToMatrix(data,'dense')
### split dataset
[X_train,X_test,Y_train,Y_test] = SplitDataset(X,Y,0.2)
### dimensionality reduction
[X_train,X_test,Y_train,Y_test] = \
            RandomForestDimensionalityReduction(X_train,X_test,Y_train,Y_test,topRatio=top_ratio)


# 3. Create a study object and optimize the objective function.

storage = optuna.storages.redis.RedisStorage(
    url='redis://QAZPLMTgv@123@r-bp1ba71b01eb1a94pd.redis.rds.aliyuncs.com:6379/db1',
)
study = optuna.create_study(direction='maximize')
study.optimize(objective_nn, n_trials=500)


read data 3 3
# DEBUG: # DEBUG: **************new dl 3***************
# DEBUG: READ SEQ FROM FILE
# DEBUG: READ CLUSTER FROM FILE
ERROR:: regex  
ERROR:: regex  
# DEBUG: READ PAIR FROM FILE
# DEBUG: GENERATE NEGATIVE PAIR
# DEBUG: negative pair number 2240
INFO::count of negative pairs2240
# DEBUG: PAIR UNION
# DEBUG: EXTRACT FEATURES--PROTEIN
# DEBUG: EXTRACT FEATURES--RNA
# DEBUG: K-MER CALCULATION
# DEBUG: FEATURE UNION
# DEBUG: GARBAGE COLLECTION
MATRIX TRANSFORMATION
DEBUG:: total features count  2375
data shape 4480 2375
rf raw data fit score 0.999721
INFO::dimension remained 2344 0.975000
dimension remained 2344


[I 2020-05-25 18:15:00,582] Finished trial#0 with value: 0.7879464285714286 with parameters: {'solver': 'adam', 'learning_rate_init': 0.008764598659058494, 'hidden_layer_sizes_1': 600.0, 'hidden_layer_sizes_2': 200.0, 'hidden_layer_sizes_f': 30.0, 'max_iter': 1000.0, 'alpha': 0.015000216978308504}. Best is trial#0 with value: 0.7879464285714286.
[I 2020-05-25 18:16:39,615] Finished trial#1 with value: 0.7734375 with parameters: {'solver': 'adam', 'learning_rate_init': 0.00885509443649648, 'hidden_layer_sizes_1': 820.0, 'hidden_layer_sizes_2': 200.0, 'hidden_layer_sizes_f': 30.0, 'max_iter': 500.0, 'alpha': 0.9366619459828381}. Best is trial#0 with value: 0.7879464285714286.
[I 2020-05-25 18:18:03,851] Finished trial#2 with value: 0.8370535714285714 with parameters: {'solver': 'adam', 'learning_rate_init': 0.00027003285183822406, 'hidden_layer_sizes_1': 540.0, 'hidden_layer_sizes_2': 220.0, 'hidden_layer_sizes_f': 10.0, 'max_iter': 900.0, 'alpha': 0.000479006493235985}. Best is trial#2 

In [None]:

# optuna.visualization.plot_intermediate_values(study)
# optuna.visualization.plot_optimization_history(study)
study.best_trial.params
study.best_trial.user_attrs

In [None]:
import redis
r = redis.StrictRedis(host='r-bp1ba71b01eb1a94pd.redis.rds.aliyuncs.com', port=6379, db=1,password='QAZPLMTgv@123')
r.hmset(getCurrentTime()+'@'+str(_dataid),{
    "dataset":str(_dataid),
    "params":json.dumps(study.best_trial.params),
    "train-scores":json.dumps(study.best_trial.user_attrs['train_scores']),
    "test-scores":json.dumps(study.best_trial.user_attrs['test_scores']),
          })


In [None]:
json.dumps(study.best_trial.params)

stage 3

In [None]:
tuning_cv2 = 5
tuning_generalize_ratio2 = 1.0/tuning_cv2 if tuning_cv2!=1 else 0.2

df_columns = ['dataid','protein_k','rna_k','top_ratio','training_score','tune_param','scores']
df_result2 = pd.DataFrame([],columns=df_columns)

for _dataid in [0,1,2,3,4,5]:
    INFO('dataid %d'%_dataid)
    for global_params in params_1[_dataid]:
        start_time = time.time()
        ### get conf of dataid
        protein_k = global_params[0]
        rna_k = global_params[1]
        top_ratio = global_params[2]
        ### read data
        [data,T] = ReadData(_dataid,protein_k,rna_k)
        [X,Y] = ToMatrix(data,'dense')

        for _cv in range(tuning_cv2):
            INFO('tuning cv %d'%_cv)
            
            ### split dataset
            [X_train,X_test,Y_train,Y_test] = SplitDataset(X,Y,tuning_generalize_ratio2)
            ### dimensionality reduction
            [X_train,X_test,Y_train,Y_test] = \
                        RandomForestDimensionalityReduction(X_train,X_test,Y_train,Y_test,topRatio=top_ratio)
            
            for sp in tune_grid[_dataid]:
                sp = dict(map(lambda x:(x,[sp[x]]),sp))
                tune_results = LGBTuning(X_train,X_test,Y_train,Y_test,sp)
                tune_score = scoreFunction(tune_results[0],tune_results[1])
                r = pd.Series({
                                'dataid':_dataid,
                                'protein_k':protein_k,
                                'rna_k':rna_k,
                                'top_ratio':top_ratio,
                                'training_score':tune_results[2],
                                'tune_param':json.dumps(sp),
                                'scores':json.dumps(tune_score),
                })
                df_result2 = df_result2.append(r,ignore_index=True)
    
        end_time = time.time()
        print('DEBUG:: time elapsed ',(end_time-start_time)/60)
df_result2.to_csv(fname_result2)
            

分析

In [None]:

def get_df_from_stage2_result(fpath,group=['dataid']):
    df = pd.read_csv(fpath)
    acc = [json.loads(row['scores'])['acc'] for idx,row in df.iterrows()]
    auc = [json.loads(row['scores'])['auc'] for idx,row in df.iterrows()]
    fpr = [json.loads(row['scores'])['fpr'] for idx,row in df.iterrows()]
    tpr = [json.loads(row['scores'])['tpr'] for idx,row in df.iterrows()]
    mcc = [json.loads(row['scores'])['mcc'] for idx,row in df.iterrows()]
    tnr = [json.loads(row['scores'])['tnr'] for idx,row in df.iterrows()]
    ppv = [json.loads(row['scores'])['ppv'] for idx,row in df.iterrows()]
    f_score = [json.loads(row['scores'])['f_score'] for idx,row in df.iterrows()]
    ap = [json.loads(row['scores'])['ap'] for idx,row in df.iterrows()]
    brier = [json.loads(row['scores'])['brier'] for idx,row in df.iterrows()]
    sensitivity = [json.loads(row['scores'])['sensitivity'] for idx,row in df.iterrows()]
#     print(fpr,tpr)
    df['acc'] = acc
    df['auc'] = auc
    df['fpr'] = np.array(fpr)[:,1]
    df['tpr'] = np.array(tpr)[:,1]
    df['mcc'] = mcc
    df['tnr'] = tnr
    df['ppv'] = ppv
    df['f_score'] = f_score
    df['ap'] = ap
    df['brier'] = brier
    df['sensitivity'] = sensitivity

    a = df.groupby(by=group).agg({
        'acc':np.mean,
        'auc':np.mean,
        'fpr':np.mean,
        'tpr':np.mean,
        'mcc':np.mean,
        'tnr':np.mean,
        'ppv':np.mean,
        'f_score':np.mean,
        'ap':np.mean,
        'brier':np.mean,
        'sensitivity':np.mean,
    }).reset_index()
#     b = a.join(df.set_index(['dataid','acc']),on=['dataid','acc'],how='inner',lsuffix='_left', rsuffix='_right')
    return a

# df_tune1 = get_df_from_stage2_result('./result2020-05-09-16-22-10-stage2.csv')
# df_sub2_tune1 = get_df_from_stage2_result('./result2020-05-09-21-08-01-stage2.csv')
# df_raw = get_df_from_stage2_result('./result2020-05-10-10-30-30-stage2.csv')
# df_tune = get_df_from_stage2_result('./result2020-05-09-21-42-23-stage2.csv')

df = get_df_from_stage2_result('./result/2020-05-13-21-28-20-stage2-3.csv',
                               group=['dataid','protein_k','rna_k','top_ratio','tune_param'])

In [None]:
df.loc[ (df['acc']==df['acc'].max()) | (df['auc']==df['auc'].max()) ].values
# df['acc'].max()

In [None]:
set(df['RNA_K'])

In [None]:
df_tune1

In [None]:
df_sub2_tune1

In [None]:
df_raw

In [None]:
df_tune

In [None]:
fname_optimal_stage1 = './result/2020-05-04-20-55-26-stage1.csv'
df = pd.read_csv(fname_optimal_stage1)
acc = [ float(re.findall('[0-9]+\.[0-9]+',x)[0]) for x in df['test_score'] ]
auc = [ float(re.findall('[0-9]+\.[0-9]+',x)[1]) for x in df['test_score'] ]
df['acc'] = acc
df['auc'] = auc

new_df1 = df.groupby(by=['DATAID','PROTEIN_K','RNA_K']).agg({'acc':np.max,'auc':np.max})
new_df2 = df.groupby(by=['DATAID','PROTEIN_K','RNA_K']).agg({'acc':np.mean,'auc':np.mean})
inspect_df = df.groupby(by=['DATAID']).agg({'acc':np.max,'auc':np.max})
inspect_df.join(df.set_index(['DATAID','acc']),on=['DATAID','acc'],how='inner',lsuffix='_left', rsuffix='_right')

In [None]:
inspect_df

In [None]:
acc = [json.loads(row['scores'])['acc'] for idx,row in df_result2.iterrows()]
auc = [json.loads(row['scores'])['auc'] for idx,row in df_result2.iterrows()]

df_result2['acc'] = acc
df_result2['auc'] = auc

df_result2.groupby(by=['dataid']).agg({'acc':np.mean})
# df_result2

In [5]:
import xgboost as xgb
import catboost as cb
from sklearn.ensemble import BaggingClassifier

DATAID = 3
PROTEIN_K = params_1[DATAID][0][0]
RNA_K = params_1[DATAID][0][1]
topRatio = params_1[DATAID][0][2]
[data,T] = ReadData(DATAID,PROTEIN_K,RNA_K)
[X,Y] = ToMatrix(data,'dense')
[X_train,X_test,Y_train,Y_test] = SplitDataset(X,Y,generalize_ratio)
[X_train,X_test,Y_train,Y_test] = \
    RandomForestDimensionalityReduction(X_train,X_test,Y_train,Y_test,topRatio=topRatio)
r1 = LGBTuning(X_train,X_test,Y_train,Y_test)

read data 3 3
# DEBUG: # DEBUG: **************new dl 3***************
# DEBUG: READ SEQ FROM FILE
# DEBUG: READ CLUSTER FROM FILE
ERROR:: regex  
ERROR:: regex  
# DEBUG: READ PAIR FROM FILE
# DEBUG: GENERATE NEGATIVE PAIR
# DEBUG: negative pair number 2240
INFO::count of negative pairs2240
# DEBUG: PAIR UNION
# DEBUG: EXTRACT FEATURES--PROTEIN
# DEBUG: EXTRACT FEATURES--RNA
# DEBUG: K-MER CALCULATION
# DEBUG: FEATURE UNION
# DEBUG: GARBAGE COLLECTION
MATRIX TRANSFORMATION
DEBUG:: total features count  2375
data shape 4480 2375
rf raw data fit score 0.999721
INFO::dimension remained 2344 0.975000
dimension remained 2344




train score 0.999721


In [6]:
# estimators = [
#     ('cb', Tr.DecisionTreeClassifier()),
#     ('lgb'+str(i), lgb.LGBMClassifier(objective='cross_entropy', random_state=seed, n_jobs=6,max_depth=i)) for i in range(5,20)
#     ('lgb2', lgb.LGBMClassifier(objective='cross_entropy', random_state=seed, n_jobs=6,max_depth=6)),
#     ('lgb3', lgb.LGBMClassifier(objective='cross_entropy', random_state=seed, n_jobs=6,max_depth=3)),
#     ('lgb4', lgb.LGBMClassifier(objective='cross_entropy', random_state=seed, n_jobs=6,max_depth=12)),
#     ('xgb',xgb.XGBClassifier()),
# ]
# clf = StackingClassifier(
#     estimators=estimators, final_estimator=LogisticRegressionCV(cv=10)
# )
clf = BaggingClassifier(base_estimator=MLPClassifier(solver= 'adam',
                    learning_rate_init= 0.0008421977409048232,
                    hidden_layer_sizes= (550,128),
                    max_iter= 300,
                    alpha= 0.4891603457998322,
                    activation= 'relu'
                   ),
                        max_samples=0.7,max_features=0.6,bootstrap_features=True,
                         n_estimators=50, random_state=0, n_jobs=-1).fit(X_train, Y_train)


In [7]:
# clf = MLPClassifier(solver= 'adam',
#                     learning_rate_init= 0.0008421977409048232,
#                     hidden_layer_sizes_1= 930.0,
#                     hidden_layer_sizes_2= 250.0,
#                     max_iter= 300.0,
#                     alpha= 0.4891603457998322,
#                     activation= 'relu'
#                    ).fit(X_train, Y_train)
Y_pred = clf.predict(X_test)
scoreFunction(Y_pred,Y_test)

{'acc': 0.8102678571428571,
 'auc': 0.8102630831127263,
 'fpr': [0.0, 0.1824175824175824, 1.0],
 'tpr': [0.0, 0.8027210884353742, 1.0],
 'mcc': 0.6204148261328282,
 'tnr': 0.8175824175824176,
 'ppv': 0.8100686498855835,
 'f_score': 0.806378132118451,
 'ap': 0.7473574026292439,
 'brier': 0.18973214285714285,
 'sensitivity': 0.8027210884353742}

In [8]:
Ypred = clf.predict(X_train)
scoreFunction(Ypred,Y_train)

{'acc': 0.9793526785714286,
 'auc': 0.9793772919440356,
 'fpr': [0.0, 0.023529411764705882, 1.0],
 'tpr': [0.0, 0.9822123401889938, 1.0],
 'mcc': 0.9587187554867308,
 'tnr': 0.9764705882352941,
 'ppv': 0.9767827529021559,
 'f_score': 0.979490022172949,
 'ap': 0.9683366450128457,
 'brier': 0.020647321428571428,
 'sensitivity': 0.9822123401889938}

In [None]:
import optuna
study_name = 'example-study'  # Unique identifier of the study.
study = optuna.create_study(study_name=study_name, 
                            storage='sqlite:///example.db',
                            load_if_exists=True,
                            pruner=optuna.pruners.MedianPruner()
                           )

study.optimize(objective, n_trials=3)

df = study.trials_dataframe(attrs=('number', 'value', 'params', 'state'))

# study.best_params  # Get best parameters for the objective function.
# study.best_value  # Get best objective value.
# study.best_trial  # Get best trial's information.
# study.trials  # Get all trials' information.

(3584, 2344)